public inbox for gcc-patches@gcc.gnu.org
* [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
@ 2013-10-30  9:54 Jakub Jelinek
  2013-10-30 10:00 ` Ondřej Bílka
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Jakub Jelinek @ 2013-10-30  9:54 UTC (permalink / raw)
  To: Richard Henderson, Uros Bizjak, Kirill Yukhin; +Cc: gcc-patches

Hi!

Yesterday I noticed that for AVX, which allows unaligned memory operands in
AVX arithmetic instructions, we still don't combine unaligned loads with the
AVX arithmetic instructions.  So, for example, for -O2 -mavx -ftree-vectorize
void
f1 (int *__restrict e, int *__restrict f)
{
  int i;
  for (i = 0; i < 1024; i++)
    e[i] = f[i] * 7;
}

void
f2 (int *__restrict e, int *__restrict f)
{
  int i;
  for (i = 0; i < 1024; i++)
    e[i] = f[i];
}
we have:
        vmovdqu (%rsi,%rax), %xmm0
        vpmulld %xmm1, %xmm0, %xmm0
        vmovups %xmm0, (%rdi,%rax)
in the first loop.  Apparently all the MODE_VECTOR_INT and MODE_VECTOR_FLOAT
*mov<mode>_internal patterns (and various others) use misaligned_operand
to decide whether to emit vmovaps or vmovups (etc.).  So, as Richard suggested
on IRC, it isn't necessary to allow UNSPEC_LOADU in the memory operands of
all the various non-move AVX instructions for TARGET_AVX, nor to add extra
patterns to help combine; this patch instead just uses *mov<mode>_internal
in that case (assuming an operand that is initially misaligned_operand
doesn't become !misaligned_operand through RTL optimizations).  Additionally,
the patch attempts to avoid gen_lowpart on the non-MEM lhs of the unaligned
loads, which usually means combine will fail, by doing the load into a
temporary pseudo in that case and then doing a pseudo to pseudo move with
gen_lowpart on the rhs (which will be merged soon after into following
instructions).
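
In expander pseudocode the idea is roughly the following (an illustrative
sketch only, not the exact code in the patch below; the helper name is made
up and op0 is assumed to be a non-MEM register destination, op1 the
misaligned MEM):

  /* Sketch: expand a misaligned vector load without UNSPEC_LOADU and
     without a mode-punning subreg on the lhs.  */
  static void
  expand_misaligned_load_sketch (rtx op0, rtx op1, enum machine_mode lmode)
  {
    rtx dest = op0;
    /* Avoid gen_lowpart on a non-MEM lhs: load into a fresh pseudo in the
       load mode instead and copy back afterwards.  */
    if (GET_MODE (op0) != lmode)
      dest = gen_reg_rtx (lmode);
    /* A plain move: *mov<mode>_internal emits vmovups/vmovdqu when the
       operand is misaligned_operand, and because there is no UNSPEC the
       MEM can later be combined into an AVX arithmetic insn.  */
    emit_insn (gen_rtx_SET (VOIDmode, dest, gen_lowpart (lmode, op1)));
    if (dest != op0)
      /* Pseudo-to-pseudo move with gen_lowpart on the rhs.  */
      emit_move_insn (op0, gen_lowpart (GET_MODE (op0), dest));
  }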

I'll bootstrap/regtest this on x86_64-linux and i686-linux; unfortunately, my
bootstrap/regtest server isn't AVX capable.

2013-10-30  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
	op1 is misaligned_operand, just use *mov<mode>_internal insn
	rather than UNSPEC_LOADU load.
	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
	Avoid gen_lowpart on op0 if it isn't MEM.

--- gcc/config/i386/i386.c.jj	2013-10-30 08:15:38.000000000 +0100
+++ gcc/config/i386/i386.c	2013-10-30 10:20:22.684708729 +0100
@@ -16560,6 +16560,12 @@ ix86_avx256_split_vector_move_misalign (
 	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
 	  emit_move_insn (op0, r);
 	}
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      else if (misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       else
 	emit_insn (load_unaligned (op0, op1));
     }
@@ -16634,7 +16640,7 @@ ix86_avx256_split_vector_move_misalign (
 void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, m;
+  rtx op0, op1, orig_op0 = NULL_RTX, m;
   rtx (*load_unaligned) (rtx, rtx);
   rtx (*store_unaligned) (rtx, rtx);
 
@@ -16647,7 +16653,16 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V16SImode, op0);
+	  if (GET_MODE (op0) != V16SImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V16SImode);
+		}
+	      else
+		op0 = gen_lowpart (V16SImode, op0);
+	    }
 	  op1 = gen_lowpart (V16SImode, op1);
 	  /* FALLTHRU */
 
@@ -16676,6 +16691,8 @@ ix86_expand_vector_move_misalign (enum m
 	    emit_insn (store_unaligned (op0, op1));
 	  else
 	    gcc_unreachable ();
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16692,12 +16709,23 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V32QImode, op0);
+	  if (GET_MODE (op0) != V32QImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V32QImode);
+		}
+	      else
+		op0 = gen_lowpart (V32QImode, op0);
+	    }
 	  op1 = gen_lowpart (V32QImode, op1);
 	  /* FALLTHRU */
 
 	case MODE_VECTOR_FLOAT:
 	  ix86_avx256_split_vector_move_misalign (op0, op1);
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16709,15 +16737,30 @@ ix86_expand_vector_move_misalign (enum m
 
   if (MEM_P (op1))
     {
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      if (TARGET_AVX
+	  && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+	  && misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       /* ??? If we have typed data, then it would appear that using
 	 movdqu is the only way to get unaligned data loaded with
 	 integer type.  */
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
-	  op0 = gen_lowpart (V16QImode, op0);
+	  if (GET_MODE (op0) != V16QImode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V16QImode);
+	    }
 	  op1 = gen_lowpart (V16QImode, op1);
 	  /* We will eventually emit movups based on insn attributes.  */
 	  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	}
       else if (TARGET_SSE2 && mode == V2DFmode)
         {
@@ -16765,9 +16808,16 @@ ix86_expand_vector_move_misalign (enum m
 	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
 	      || optimize_insn_for_size_p ())
 	    {
-	      op0 = gen_lowpart (V4SFmode, op0);
+	      if (GET_MODE (op0) != V4SFmode)
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V4SFmode);
+		}
 	      op1 = gen_lowpart (V4SFmode, op1);
 	      emit_insn (gen_sse_loadups (op0, op1));
+	      if (orig_op0)
+		emit_move_insn (orig_op0,
+				gen_lowpart (GET_MODE (orig_op0), op0));
 	      return;
             }
 

	Jakub

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30  9:54 [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads Jakub Jelinek
@ 2013-10-30 10:00 ` Ondřej Bílka
  2013-10-30 10:34   ` Jakub Jelinek
  2013-10-30 11:01 ` Uros Bizjak
  2013-10-30 17:01 ` [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads Richard Henderson
  2 siblings, 1 reply; 15+ messages in thread
From: Ondřej Bílka @ 2013-10-30 10:00 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Henderson, Uros Bizjak, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 10:47:13AM +0100, Jakub Jelinek wrote:
> Hi!
> 
> Yesterday I noticed that for AVX, which allows unaligned memory operands in
> AVX arithmetic instructions, we still don't combine unaligned loads with the
> AVX arithmetic instructions.  So, for example, for -O2 -mavx -ftree-vectorize
> void
> f1 (int *__restrict e, int *__restrict f)
> {
>   int i;
>   for (i = 0; i < 1024; i++)
>     e[i] = f[i] * 7;
> }
> 
> void
> f2 (int *__restrict e, int *__restrict f)
> {
>   int i;
>   for (i = 0; i < 1024; i++)
>     e[i] = f[i];
> }
> we have:
>         vmovdqu (%rsi,%rax), %xmm0
>         vpmulld %xmm1, %xmm0, %xmm0
>         vmovups %xmm0, (%rdi,%rax)
> in the first loop.  Apparently all the MODE_VECTOR_INT and MODE_VECTOR_FLOAT
> *mov<mode>_internal patterns (and various others) use misaligned_operand
> to decide whether to emit vmovaps or vmovups (etc.).

That is intentional.  On pre-Haswell architectures splitting the load is
faster than a 32 byte load.

See Intel® 64 and IA-32 Architectures Optimization Reference Manual for
details.

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30 10:00 ` Ondřej Bílka
@ 2013-10-30 10:34   ` Jakub Jelinek
  2013-10-30 10:35     ` Jakub Jelinek
  0 siblings, 1 reply; 15+ messages in thread
From: Jakub Jelinek @ 2013-10-30 10:34 UTC (permalink / raw)
  To: Ondřej Bílka
  Cc: Richard Henderson, Uros Bizjak, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 10:53:58AM +0100, Ondřej Bílka wrote:
> > Yesterday I noticed that for AVX, which allows unaligned memory operands in
> > AVX arithmetic instructions, we still don't combine unaligned loads with the
> > AVX arithmetic instructions.  So, for example, for -O2 -mavx -ftree-vectorize
> > void
> > f1 (int *__restrict e, int *__restrict f)
> > {
> >   int i;
> >   for (i = 0; i < 1024; i++)
> >     e[i] = f[i] * 7;
> > }
> > 
> > void
> > f2 (int *__restrict e, int *__restrict f)
> > {
> >   int i;
> >   for (i = 0; i < 1024; i++)
> >     e[i] = f[i];
> > }
> > we have:
> >         vmovdqu (%rsi,%rax), %xmm0
> >         vpmulld %xmm1, %xmm0, %xmm0
> >         vmovups %xmm0, (%rdi,%rax)
> > in the first loop.  Apparently all the MODE_VECTOR_INT and MODE_VECTOR_FLOAT
> > *mov<mode>_internal patterns (and various others) use misaligned_operand
> > to decide whether to emit vmovaps or vmovups (etc.).
> 
> That is intentional.  On pre-Haswell architectures splitting the load is
> faster than a 32 byte load.

But the above is a 16 byte unaligned load.  Furthermore, GCC supports
-mavx256-split-unaligned-load and can emit 32 byte loads either as an
unaligned 32 byte load, or as a merge of 16 byte unaligned loads.  The patch
affects only the cases where we were already emitting 16 byte or 32 byte
unaligned loads rather than split loads.

	Jakub

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30 10:34   ` Jakub Jelinek
@ 2013-10-30 10:35     ` Jakub Jelinek
  2013-10-30 11:01       ` Ondřej Bílka
  0 siblings, 1 reply; 15+ messages in thread
From: Jakub Jelinek @ 2013-10-30 10:35 UTC (permalink / raw)
  To: Ondřej Bílka
  Cc: Richard Henderson, Uros Bizjak, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 11:00:13AM +0100, Jakub Jelinek wrote:
> But the above is a 16 byte unaligned load.  Furthermore, GCC supports
> -mavx256-split-unaligned-load and can emit 32 byte loads either as an
> unaligned 32 byte load, or as a merge of 16 byte unaligned loads.  The patch
> affects only the cases where we were already emitting 16 byte or 32 byte
> unaligned loads rather than split loads.

With my patch, the differences (in all cases only on f1) for
-O2 -mavx -ftree-vectorize are (16 byte unaligned load, not split):
-	vmovdqu	(%rsi,%rax), %xmm0
-	vpmulld	%xmm1, %xmm0, %xmm0
+	vpmulld	(%rsi,%rax), %xmm1, %xmm0
 	vmovups	%xmm0, (%rdi,%rax)
with -O2 -mavx2 -ftree-vectorize (again, load wasn't split):
-	vmovdqu	(%rsi,%rax), %ymm0
-	vpmulld	%ymm1, %ymm0, %ymm0
+	vpmulld	(%rsi,%rax), %ymm1, %ymm0
 	vmovups	%ymm0, (%rdi,%rax)
and with -O2 -mavx2 -mavx256-split-unaligned-load:
 	vmovdqu	(%rsi,%rax), %xmm0
 	vinserti128	$0x1, 16(%rsi,%rax), %ymm0, %ymm0
-	vpmulld	%ymm1, %ymm0, %ymm0
+	vpmulld	%ymm0, %ymm1, %ymm0
 	vmovups	%ymm0, (%rdi,%rax)
(the last change is just giving RTL optimizers more freedom by not
doing the SUBREG on the lhs).

	Jakub

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30 10:35     ` Jakub Jelinek
@ 2013-10-30 11:01       ` Ondřej Bílka
  0 siblings, 0 replies; 15+ messages in thread
From: Ondřej Bílka @ 2013-10-30 11:01 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Henderson, Uros Bizjak, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 11:05:58AM +0100, Jakub Jelinek wrote:
> On Wed, Oct 30, 2013 at 11:00:13AM +0100, Jakub Jelinek wrote:
> > But the above is a 16 byte unaligned load.  Furthermore, GCC supports
> > -mavx256-split-unaligned-load and can emit 32 byte loads either as an
> > unaligned 32 byte load, or as a merge of 16 byte unaligned loads.  The patch
> > affects only the cases where we were already emitting 16 byte or 32 byte
> > unaligned loads rather than split loads.
> 
> With my patch, the differences (in all cases only on f1) for
> -O2 -mavx -ftree-vectorize are (16 byte unaligned load, not split):

My point was that this could mask split loads; thanks for clarifying.

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30  9:54 [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads Jakub Jelinek
  2013-10-30 10:00 ` Ondřej Bílka
@ 2013-10-30 11:01 ` Uros Bizjak
  2013-10-30 11:54   ` Jakub Jelinek
  2013-10-30 17:01 ` [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads Richard Henderson
  2 siblings, 1 reply; 15+ messages in thread
From: Uros Bizjak @ 2013-10-30 11:01 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Henderson, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 10:47 AM, Jakub Jelinek <jakub@redhat.com> wrote:

> Yesterday I noticed that for AVX, which allows unaligned memory operands in
> AVX arithmetic instructions, we still don't combine unaligned loads with the
> AVX arithmetic instructions.  So, for example, for -O2 -mavx -ftree-vectorize

This is actually PR 47754, which fell under the radar for some reason...

> we have:
>         vmovdqu (%rsi,%rax), %xmm0
>         vpmulld %xmm1, %xmm0, %xmm0
>         vmovups %xmm0, (%rdi,%rax)
> in the first loop.  Apparently all the MODE_VECTOR_INT and MODE_VECTOR_FLOAT
> *mov<mode>_internal patterns (and various others) use misaligned_operand
> to decide whether to emit vmovaps or vmovups (etc.).  So, as Richard suggested
> on IRC, it isn't necessary to allow UNSPEC_LOADU in the memory operands of
> all the various non-move AVX instructions for TARGET_AVX, nor to add extra
> patterns to help combine; this patch instead just uses *mov<mode>_internal
> in that case (assuming an operand that is initially misaligned_operand
> doesn't become !misaligned_operand through RTL optimizations).

No worries here.  We will generate movdqa, and it is definitely a GCC bug
if RTL optimizations change a misaligned operand to an aligned one.

> the patch attempts to avoid gen_lowpart on the non-MEM lhs of the unaligned
> loads, which usually means combine will fail, by doing the load into a
> temporary pseudo in that case and then doing a pseudo to pseudo move with
> gen_lowpart on the rhs (which will be merged soon after into following
> instructions).

Is this similar to PR44141? There were similar problems with V4SFmode
subregs, so combine was not able to merge the load into the arithmetic insn.

> I'll bootstrap/regtest this on x86_64-linux and i686-linux; unfortunately, my
> bootstrap/regtest server isn't AVX capable.

I can bootstrap the patch later today on IvyBridge with
--with-arch=core-avx-i --with-cpu=core-avx-i --with-fpmath=avx.

Uros.

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30 11:01 ` Uros Bizjak
@ 2013-10-30 11:54   ` Jakub Jelinek
  2013-10-30 18:02     ` Uros Bizjak
  2013-10-31 17:27     ` [PATCH] Try to avoid vector mode punning in SET_DEST on i?86 Jakub Jelinek
  0 siblings, 2 replies; 15+ messages in thread
From: Jakub Jelinek @ 2013-10-30 11:54 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Richard Henderson, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 11:55:44AM +0100, Uros Bizjak wrote:
> On Wed, Oct 30, 2013 at 10:47 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> 
> > Yesterday I noticed that for AVX, which allows unaligned memory operands in
> > AVX arithmetic instructions, we still don't combine unaligned loads with the
> > AVX arithmetic instructions.  So, for example, for -O2 -mavx -ftree-vectorize
> 
> This is actually PR 47754, which fell under the radar for some reason...

Apparently yes.

> > the patch attempts to avoid gen_lowpart on the non-MEM lhs of the unaligned
> > loads, which usually means combine will fail, by doing the load into a
> > temporary pseudo in that case and then doing a pseudo to pseudo move with
> > gen_lowpart on the rhs (which will be merged soon after into following
> > instructions).
> 
> Is this similar to PR44141? There were similar problems with V4SFmode
> subregs, so combine was not able to merge the load into the arithmetic insn.

From the work on the vectorization last year I remember many cases where
subregs (even equal size) on the LHS of instructions prevented the combiner
or other RTL optimizations from doing their job.  I believe I've changed
some easy places that did that completely unnecessarily, but I certainly
haven't gone through all the code to look for other places where this is
done.

Perhaps let's hack up a checking pass that will, after expansion, walk the
whole IL and complain about same-sized subregs on the LHS of insns, then do
make check with it for a couple of ISAs (e.g. -msse2, -msse4, -mavx, -mavx2)?

> > I'll bootstrap/regtest this on x86_64-linux and i686-linux; unfortunately, my
> > bootstrap/regtest server isn't AVX capable.
> 
> I can bootstrap the patch later today on IvyBridge with
> --with-arch=core-avx-i --with-cpu=core-avx-i --with-fpmath=avx.

That would be greatly appreciated, thanks.

	Jakub

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30  9:54 [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads Jakub Jelinek
  2013-10-30 10:00 ` Ondřej Bílka
  2013-10-30 11:01 ` Uros Bizjak
@ 2013-10-30 17:01 ` Richard Henderson
  2013-10-30 17:47   ` Jakub Jelinek
  2 siblings, 1 reply; 15+ messages in thread
From: Richard Henderson @ 2013-10-30 17:01 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, Kirill Yukhin, gcc-patches

On 10/30/2013 02:47 AM, Jakub Jelinek wrote:
> 2013-10-30  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
> 	op1 is misaligned_operand, just use *mov<mode>_internal insn
> 	rather than UNSPEC_LOADU load.
> 	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
> 	Avoid gen_lowpart on op0 if it isn't MEM.

Ok.


r~

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30 17:01 ` [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads Richard Henderson
@ 2013-10-30 17:47   ` Jakub Jelinek
  0 siblings, 0 replies; 15+ messages in thread
From: Jakub Jelinek @ 2013-10-30 17:47 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Uros Bizjak, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 09:17:04AM -0700, Richard Henderson wrote:
> On 10/30/2013 02:47 AM, Jakub Jelinek wrote:
> > 2013-10-30  Jakub Jelinek  <jakub@redhat.com>
> > 
> > 	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
> > 	op1 is misaligned_operand, just use *mov<mode>_internal insn
> > 	rather than UNSPEC_LOADU load.
> > 	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
> > 	Avoid gen_lowpart on op0 if it isn't MEM.
> 
> Ok.

Testing revealed some testsuite failures, due either to tests matching
insn names in the -dp dump or to tests counting specific FMA insns, where
with the patch there are changes like:
-       vmovupd 0(%r13,%rax), %ymm0
-       vfmadd231pd     %ymm1, %ymm2, %ymm0
+       vmovapd %ymm2, %ymm0
+       vfmadd213pd     0(%r13,%rax), %ymm1, %ymm0

So, here is an updated patch with those testsuite changes and a PR line
added to the ChangeLog.  I'll wait for Uros' test results.

2013-10-30  Jakub Jelinek  <jakub@redhat.com>

	PR target/47754
	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
	op1 is misaligned_operand, just use *mov<mode>_internal insn
	rather than UNSPEC_LOADU load.
	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
	Avoid gen_lowpart on op0 if it isn't MEM.

	* gcc.target/i386/avx256-unaligned-load-1.c: Adjust scan-assembler
	and scan-assembler-not regexps.
	* gcc.target/i386/avx256-unaligned-load-2.c: Likewise.
	* gcc.target/i386/avx256-unaligned-load-3.c: Likewise.
	* gcc.target/i386/avx256-unaligned-load-4.c: Likewise.
	* gcc.target/i386/l_fma_float_1.c: Expect vf{,n}m{add,sub}213*p*
	instead of vf{,n}m{add,sub}231*p*.
	* gcc.target/i386/l_fma_float_3.c: Likewise.
	* gcc.target/i386/l_fma_double_1.c: Likewise.
	* gcc.target/i386/l_fma_double_3.c: Likewise.

--- gcc/config/i386/i386.c.jj	2013-10-30 08:15:38.000000000 +0100
+++ gcc/config/i386/i386.c	2013-10-30 10:20:22.684708729 +0100
@@ -16560,6 +16560,12 @@ ix86_avx256_split_vector_move_misalign (
 	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
 	  emit_move_insn (op0, r);
 	}
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      else if (misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       else
 	emit_insn (load_unaligned (op0, op1));
     }
@@ -16634,7 +16640,7 @@ ix86_avx256_split_vector_move_misalign (
 void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, m;
+  rtx op0, op1, orig_op0 = NULL_RTX, m;
   rtx (*load_unaligned) (rtx, rtx);
   rtx (*store_unaligned) (rtx, rtx);
 
@@ -16647,7 +16653,16 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V16SImode, op0);
+	  if (GET_MODE (op0) != V16SImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V16SImode);
+		}
+	      else
+		op0 = gen_lowpart (V16SImode, op0);
+	    }
 	  op1 = gen_lowpart (V16SImode, op1);
 	  /* FALLTHRU */
 
@@ -16676,6 +16691,8 @@ ix86_expand_vector_move_misalign (enum m
 	    emit_insn (store_unaligned (op0, op1));
 	  else
 	    gcc_unreachable ();
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16692,12 +16709,23 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V32QImode, op0);
+	  if (GET_MODE (op0) != V32QImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V32QImode);
+		}
+	      else
+		op0 = gen_lowpart (V32QImode, op0);
+	    }
 	  op1 = gen_lowpart (V32QImode, op1);
 	  /* FALLTHRU */
 
 	case MODE_VECTOR_FLOAT:
 	  ix86_avx256_split_vector_move_misalign (op0, op1);
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16709,15 +16737,30 @@ ix86_expand_vector_move_misalign (enum m
 
   if (MEM_P (op1))
     {
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      if (TARGET_AVX
+	  && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+	  && misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       /* ??? If we have typed data, then it would appear that using
 	 movdqu is the only way to get unaligned data loaded with
 	 integer type.  */
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
-	  op0 = gen_lowpart (V16QImode, op0);
+	  if (GET_MODE (op0) != V16QImode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V16QImode);
+	    }
 	  op1 = gen_lowpart (V16QImode, op1);
 	  /* We will eventually emit movups based on insn attributes.  */
 	  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	}
       else if (TARGET_SSE2 && mode == V2DFmode)
         {
@@ -16765,9 +16808,16 @@ ix86_expand_vector_move_misalign (enum m
 	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
 	      || optimize_insn_for_size_p ())
 	    {
-	      op0 = gen_lowpart (V4SFmode, op0);
+	      if (GET_MODE (op0) != V4SFmode)
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V4SFmode);
+		}
 	      op1 = gen_lowpart (V4SFmode, op1);
 	      emit_insn (gen_sse_loadups (op0, op1));
+	      if (orig_op0)
+		emit_move_insn (orig_op0,
+				gen_lowpart (GET_MODE (orig_op0), op0));
 	      return;
             }
 
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c.jj	2012-10-16 13:15:44.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c	2013-10-30 17:58:30.312180662 +0100
@@ -14,6 +14,6 @@ avx_test (void)
     c[i] = a[i] * b[i+3];
 }
 
-/* { dg-final { scan-assembler-not "avx_loadups256" } } */
-/* { dg-final { scan-assembler "sse_loadups" } } */
+/* { dg-final { scan-assembler-not "(avx_loadups256|vmovups\[^\n\r]*movv8sf_internal)" } } */
+/* { dg-final { scan-assembler "(sse_loadups|movv4sf_internal)" } } */
 /* { dg-final { scan-assembler "vinsertf128" } } */
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-2.c.jj	2013-05-10 10:36:29.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-2.c	2013-10-30 18:00:19.700628673 +0100
@@ -10,6 +10,6 @@ avx_test (char **cp, char **ep)
     *ap++ = *cp++;
 }
 
-/* { dg-final { scan-assembler-not "avx_loaddqu256" } } */
-/* { dg-final { scan-assembler "sse2_loaddqu" } } */
+/* { dg-final { scan-assembler-not "(avx_loaddqu256|vmovdqu\[^\n\r]*movv32qi_internal)" } } */
+/* { dg-final { scan-assembler "(sse2_loaddqu|vmovdqu\[^\n\r]*movv16qi_internal)" } } */
 /* { dg-final { scan-assembler "vinsert.128" } } */
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-3.c.jj	2012-10-16 13:15:44.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-3.c	2013-10-30 18:01:02.900409927 +0100
@@ -14,6 +14,6 @@ avx_test (void)
     c[i] = a[i] * b[i+3];
 }
 
-/* { dg-final { scan-assembler-not "avx_loadupd256" } } */
-/* { dg-final { scan-assembler "sse2_loadupd" } } */
+/* { dg-final { scan-assembler-not "(avx_loadupd256|vmovupd\[^\n\r]*movv4df_internal)" } } */
+/* { dg-final { scan-assembler "(sse2_loadupd|vmovupd\[^\n\r]*movv2df_internal)" } } */
 /* { dg-final { scan-assembler "vinsertf128" } } */
--- gcc/testsuite/gcc.target/i386/avx256-unaligned-load-4.c.jj	2013-06-10 18:16:38.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx256-unaligned-load-4.c	2013-10-30 18:01:28.121281630 +0100
@@ -14,6 +14,6 @@ avx_test (void)
     b[i] = a[i+3] * 2;
 }
 
-/* { dg-final { scan-assembler "avx_loadups256" } } */
-/* { dg-final { scan-assembler-not "sse_loadups" } } */
+/* { dg-final { scan-assembler "(avx_loadups256|vmovups\[^\n\r]*movv8sf_internal)" } } */
+/* { dg-final { scan-assembler-not "(sse_loadups|vmovups\[^\n\r]*movv4sf_internal)" } } */
 /* { dg-final { scan-assembler-not "vinsertf128" } } */
--- gcc/testsuite/gcc.target/i386/l_fma_float_1.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_float_1.c	2013-10-30 18:09:20.083894747 +0100
@@ -9,13 +9,13 @@
 #include "l_fma_1.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132ss" 60 } } */
 /* { dg-final { scan-assembler-times "vfmadd213ss" 60 } } */
 /* { dg-final { scan-assembler-times "vfmsub132ss" 60 } } */
--- gcc/testsuite/gcc.target/i386/l_fma_float_3.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_float_3.c	2013-10-30 18:09:37.204811080 +0100
@@ -9,13 +9,13 @@
 #include "l_fma_3.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132ps" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231ps" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213ps" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132ss" 60  } } */
 /* { dg-final { scan-assembler-times "vfmadd213ss" 60  } } */
 /* { dg-final { scan-assembler-times "vfmsub132ss" 60  } } */
--- gcc/testsuite/gcc.target/i386/l_fma_double_1.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_double_1.c	2013-10-30 18:08:44.504073698 +0100
@@ -10,13 +10,13 @@ typedef double adouble __attribute__((al
 #include "l_fma_1.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132sd" 28  } } */
 /* { dg-final { scan-assembler-times "vfmadd213sd" 28 } } */
 /* { dg-final { scan-assembler-times "vfmsub132sd" 28 } } */
--- gcc/testsuite/gcc.target/i386/l_fma_double_3.c.jj	2013-08-13 12:20:13.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/l_fma_double_3.c	2013-10-30 18:09:02.270986352 +0100
@@ -10,13 +10,13 @@ typedef double adouble __attribute__((al
 #include "l_fma_3.h"
 
 /* { dg-final { scan-assembler-times "vfmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmadd132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmadd231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmadd213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfnmsub132pd" 4  } } */
-/* { dg-final { scan-assembler-times "vfnmsub231pd" 4  } } */
+/* { dg-final { scan-assembler-times "vfnmsub213pd" 4  } } */
 /* { dg-final { scan-assembler-times "vfmadd132sd" 28 } } */
 /* { dg-final { scan-assembler-times "vfmadd213sd" 28 } } */
 /* { dg-final { scan-assembler-times "vfmsub132sd" 28 } } */


	Jakub

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30 11:54   ` Jakub Jelinek
@ 2013-10-30 18:02     ` Uros Bizjak
  2013-10-30 18:05       ` Uros Bizjak
  2013-10-31 17:27     ` [PATCH] Try to avoid vector mode punning in SET_DEST on i?86 Jakub Jelinek
  1 sibling, 1 reply; 15+ messages in thread
From: Uros Bizjak @ 2013-10-30 18:02 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Henderson, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 12:11 PM, Jakub Jelinek <jakub@redhat.com> wrote:

>> > Yesterday I noticed that for AVX, which allows unaligned memory operands in
>> > AVX arithmetic instructions, we still don't combine unaligned loads with the
>> > AVX arithmetic instructions.  So, for example, for -O2 -mavx -ftree-vectorize
>>
>> This is actually PR 47754, which fell under the radar for some reason...
>
> Apparently yes.
>
>> > the patch attempts to avoid gen_lowpart on the non-MEM lhs of the unaligned
>> > loads, which usually means combine will fail, by doing the load into a
>> > temporary pseudo in that case and then doing a pseudo to pseudo move with
>> > gen_lowpart on the rhs (which will be merged soon after into following
>> > instructions).
>>
>> Is this similar to PR44141? There were similar problems with V4SFmode
>> subregs, so combine was not able to merge the load into the arithmetic insn.
>
>> From the work on the vectorization last year I remember many cases where
>> subregs (even equal size) on the LHS of instructions prevented the combiner
>> or other RTL optimizations from doing their job.  I believe I've changed
>> some easy places that did that completely unnecessarily, but I certainly
>> haven't gone through all the code to look for other places where this is
>> done.
>>
>> Perhaps let's hack up a checking pass that will, after expansion, walk the
>> whole IL and complain about same-sized subregs on the LHS of insns, then do
>> make check with it for a couple of ISAs (e.g. -msse2, -msse4, -mavx, -mavx2)?
>
>> > I'll bootstrap/regtest this on x86_64-linux and i686-linux; unfortunately, my
>> > bootstrap/regtest server isn't AVX capable.
>>
>> I can bootstrap the patch later today on IvyBridge with
>> --with-arch=core-avx-i --with-cpu=core-avx-i --with-fpmath=avx.
>
> That would be greatly appreciated, thanks.

The bootstrap and regression test was OK for x86_64-linux-gnu {,-m32}.

The failures in the attached report are either pre-existing or benign
due to core-avx-i defaulting to AVX.

Please also mention PR44141 in the ChangeLog entry.

Uros.

[-- Attachment #2: mail-report.log.gz --]
[-- Type: application/x-gzip, Size: 9145 bytes --]

* Re: [RFC PATCH] For TARGET_AVX use *mov<mode>_internal for misaligned loads
  2013-10-30 18:02     ` Uros Bizjak
@ 2013-10-30 18:05       ` Uros Bizjak
  0 siblings, 0 replies; 15+ messages in thread
From: Uros Bizjak @ 2013-10-30 18:05 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Henderson, Kirill Yukhin, gcc-patches

On Wed, Oct 30, 2013 at 6:42 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Oct 30, 2013 at 12:11 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>
>>> > Yesterday I noticed that for AVX, which allows unaligned memory operands in
>>> > AVX arithmetic instructions, we still don't combine unaligned loads with the
>>> > AVX arithmetic instructions.  So, for example, for -O2 -mavx -ftree-vectorize
>>>
>>> This is actually PR 47754, which fell under the radar for some reason...
>>
>> Apparently yes.
>>
>>> > the patch attempts to avoid gen_lowpart on the non-MEM lhs of the unaligned
>>> > loads, which usually means combine will fail, by doing the load into a
>>> > temporary pseudo in that case and then doing a pseudo to pseudo move with
>>> > gen_lowpart on the rhs (which will be merged soon after into following
>>> > instructions).
>>>
>>> Is this similar to PR44141? There were similar problems with V4SFmode
>>> subregs, so combine was not able to merge the load into the arithmetic insn.
>>
>> From the work on the vectorization last year I remember many cases where
>> subregs (even equal size) on the LHS of instructions prevented the combiner
>> or other RTL optimizations from doing their job.  I believe I've changed
>> some easy places that did that completely unnecessarily, but I certainly
>> haven't gone through all the code to look for other places where this is
>> done.
>>
>> Perhaps let's hack up a checking pass that will, after expansion, walk the
>> whole IL and complain about same-sized subregs on the LHS of insns, then do
>> make check with it for a couple of ISAs (e.g. -msse2, -msse4, -mavx, -mavx2)?
>>
>>> > I'll bootstrap/regtest this on x86_64-linux and i686-linux; unfortunately, my
>>> > bootstrap/regtest server isn't AVX capable.
>>>
>>> I can bootstrap the patch later today on IvyBridge with
>>> --with-arch=core-avx-i --with-cpu=core-avx-i --with-fpmath=avx.
>>
>> That would be greatly appreciated, thanks.
>
> The bootstrap and regression test was OK for x86_64-linux-gnu {,-m32}.
>
> The failures in the attached report are either pre-existing or benign
> due to core-avx-i defaulting to AVX.

I was referring to the *runtime* failures here.

> Please also mention PR44141 in the ChangeLog entry.

Oops, this should be PR47754.

Thanks,
Uros.

* [PATCH] Try to avoid vector mode punning in SET_DEST on i?86
  2013-10-30 11:54   ` Jakub Jelinek
  2013-10-30 18:02     ` Uros Bizjak
@ 2013-10-31 17:27     ` Jakub Jelinek
  2013-10-31 20:05       ` Richard Henderson
  2013-11-04 18:28       ` James Greenhalgh
  1 sibling, 2 replies; 15+ messages in thread
From: Jakub Jelinek @ 2013-10-31 17:27 UTC (permalink / raw)
  To: Uros Bizjak, Richard Henderson; +Cc: Kirill Yukhin, gcc-patches

Hi!

On Wed, Oct 30, 2013 at 12:11:25PM +0100, Jakub Jelinek wrote:
> > > the patch attempts to avoid gen_lowpart on the non-MEM lhs of the unaligned
> > > loads, which usually means combine will fail, by doing the load into a
> > > temporary pseudo in that case and then doing a pseudo to pseudo move with
> > > gen_lowpart on the rhs (which will be merged soon after into following
> > > instructions).
> > 
> > Is this similar to PR44141? There were similar problems with V4SFmode
> > subregs, so combine was not able to merge the load into the arithmetic insn.
> 
> From the work on the vectorization last year I remember many cases where
> subregs (even equal size) on the LHS of instructions prevented the combiner
> or other RTL optimizations from doing their job.  I believe I've changed
> some easy places that did that completely unnecessarily, but I certainly
> haven't gone through all the code to look for other places where this is
> done.
>
> Perhaps let's hack up a checking pass that will, after expansion, walk the
> whole IL and complain about same-sized subregs on the LHS of insns, then do
> make check with it for a couple of ISAs (e.g. -msse2, -msse4, -mavx, -mavx2)?

So, I've used a hack in cfgexpand.c:

--- cfgexpand.c.jj	2013-10-30 08:15:36.000000000 +0100
+++ cfgexpand.c	2013-10-30 16:19:33.974203309 +0100
@@ -4535,6 +4535,31 @@ expand_stack_alignment (void)
     }
 }
 
+static void
+check_vector_subregs (rtx, const_rtx pat, void *data)
+{
+  rtx x;
+  if (GET_CODE (pat) != SET)
+    return;
+  x = SET_DEST (pat);
+  if (GET_CODE (x) == SUBREG
+      && GET_CODE (SUBREG_REG (x)) == REG
+      && (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT
+	  || GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
+	  || GET_MODE_CLASS (GET_MODE (SUBREG_REG (x))) == MODE_VECTOR_INT
+	  || GET_MODE_CLASS (GET_MODE (SUBREG_REG (x))) == MODE_VECTOR_FLOAT)
+      && GET_MODE_BITSIZE (GET_MODE (x))
+	 == GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (x))))
+    {
+      FILE *f = fopen ("/tmp/subregs", "a");
+      fprintf (f, "%s %d ", main_input_filename ? main_input_filename : "-",
+	       (int) BITS_PER_WORD);
+      print_inline_rtx (f, (rtx) data, 0);
+      putc ('\n', f);
+      fclose (f);
+    }
+}
+
 /* Translate the intermediate representation contained in the CFG
    from GIMPLE trees to RTL.
 
@@ -4922,6 +4947,13 @@ gimple_expand_cfg (void)
   set_block_levels (DECL_INITIAL (cfun->decl), 0);
   default_rtl_profile ();
 
+  {
+    rtx insn;
+    for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+      if (INSN_P (insn))
+	note_stores (PATTERN (insn), check_vector_subregs, insn);
+  }
+
   timevar_pop (TV_POST_EXPAND);
 
   return 0;

which revealed about 18000 vector mode punning subregs in SET_DEST in
x86_64 64-bit i386.exp testing alone; this patch brings that down to
zero (and in the whole make check-gcc down to about 40, which come from
various parts of the middle-end).  The disadvantage of mode punning
subregs in SET_DEST is that CSE, combine, etc. then don't attempt to
optimize those.  Additionally, I've noticed that expand_vec_perm_palignr
would happily clobber *d; that seems undesirable to me, as other places
always modify a copy of the structure.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2013-10-31  Jakub Jelinek  <jakub@redhat.com>

	* optabs.c (expand_vec_perm): Avoid vector mode punning
	SUBREGs in SET_DEST.
	* expmed.c (store_bit_field_1): Likewise.
	* config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
	vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
	vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
	* config/i386/i386.c (ix86_expand_vector_move_misalign,
	ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
	ix86_expand_sse_unpack, ix86_expand_args_builtin,
	ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
	emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
	expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
	expand_vec_perm_vpshufb2_vpermq,
	expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
	expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
	ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
	(expand_vec_perm_palignr): Likewise.  Modify a copy of *d rather
	than *d itself.

--- gcc/optabs.c.jj	2013-10-29 09:25:45.000000000 +0100
+++ gcc/optabs.c	2013-10-31 13:20:40.384808642 +0100
@@ -6624,8 +6624,8 @@ expand_vec_perm (enum machine_mode mode,
 	  icode = direct_optab_handler (vec_perm_const_optab, qimode);
 	  if (icode != CODE_FOR_nothing)
 	    {
-	      tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
-				       gen_lowpart (qimode, v0),
+	      tmp = mode != qimode ? gen_reg_rtx (qimode) : target;
+	      tmp = expand_vec_perm_1 (icode, tmp, gen_lowpart (qimode, v0),
 				       gen_lowpart (qimode, v1), sel_qi);
 	      if (tmp)
 		return gen_lowpart (mode, tmp);
@@ -6674,7 +6674,7 @@ expand_vec_perm (enum machine_mode mode,
 	}
       tmp = gen_rtx_CONST_VECTOR (qimode, vec);
       sel = gen_lowpart (qimode, sel);
-      sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
+      sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);
       gcc_assert (sel != NULL);
 
       /* Add the byte offset to each byte element.  */
@@ -6689,8 +6689,8 @@ expand_vec_perm (enum machine_mode mode,
       gcc_assert (sel_qi != NULL);
     }
 
-  tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
-			   gen_lowpart (qimode, v0),
+  tmp = mode != qimode ? gen_reg_rtx (qimode) : target;
+  tmp = expand_vec_perm_1 (icode, tmp, gen_lowpart (qimode, v0),
 			   gen_lowpart (qimode, v1), sel_qi);
   if (tmp)
     tmp = gen_lowpart (mode, tmp);
--- gcc/expmed.c.jj	2013-09-30 22:13:54.000000000 +0200
+++ gcc/expmed.c	2013-10-30 21:03:28.609613144 +0100
@@ -624,13 +624,28 @@ store_bit_field_1 (rtx str_rtx, unsigned
 	  || (bitsize % BITS_PER_WORD == 0 && bitnum % BITS_PER_WORD == 0)))
     {
       /* Use the subreg machinery either to narrow OP0 to the required
-	 words or to cope with mode punning between equal-sized modes.  */
-      rtx sub = simplify_gen_subreg (fieldmode, op0, GET_MODE (op0),
-				     bitnum / BITS_PER_UNIT);
-      if (sub)
+	 words or to cope with mode punning between equal-sized modes.
+	 In the latter case, use subreg on the rhs side, not lhs.  */
+      rtx sub;
+
+      if (bitsize == GET_MODE_BITSIZE (GET_MODE (op0)))
 	{
-	  emit_move_insn (sub, value);
-	  return true;
+	  sub = simplify_gen_subreg (GET_MODE (op0), value, fieldmode, 0);
+	  if (sub)
+	    {
+	      emit_move_insn (op0, sub);
+	      return true;
+	    }
+	}
+      else
+	{
+	  sub = simplify_gen_subreg (fieldmode, op0, GET_MODE (op0),
+				     bitnum / BITS_PER_UNIT);
+	  if (sub)
+	    {
+	      emit_move_insn (sub, value);
+	      return true;
+	    }
 	}
     }
 
--- gcc/config/i386/sse.md.jj	2013-10-29 09:25:55.000000000 +0100
+++ gcc/config/i386/sse.md	2013-10-31 10:04:26.750274224 +0100
@@ -800,10 +800,13 @@ (define_insn_and_split "movdi_to_sse"
 				  gen_rtx_SUBREG (SImode, operands[1], 4)));
       emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
 					     operands[2]));
-    }
+   }
  else if (memory_operand (operands[1], DImode))
-   emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]),
-				  operands[1], const0_rtx));
+   {
+     rtx tmp = gen_reg_rtx (V2DImode);
+     emit_insn (gen_vec_concatv2di (tmp, operands[1], const0_rtx));
+     emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp));
+   }
  else
    gcc_unreachable ();
 })
@@ -4208,7 +4211,7 @@ (define_expand "vec_pack_sfix_trunc_v2df
    (match_operand:V2DF 2 "nonimmediate_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp0, tmp1;
+  rtx tmp0, tmp1, tmp2;
 
   if (TARGET_AVX && !TARGET_PREFER_AVX128)
     {
@@ -4222,13 +4225,14 @@ (define_expand "vec_pack_sfix_trunc_v2df
     {
       tmp0 = gen_reg_rtx (V4SImode);
       tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
 
       emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1]));
       emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2]));
-      emit_insn
-       (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-				    gen_lowpart (V2DImode, tmp0),
-				    gen_lowpart (V2DImode, tmp1)));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
     }
   DONE;
 })
@@ -4289,7 +4293,7 @@ (define_expand "vec_pack_sfix_v2df"
    (match_operand:V2DF 2 "nonimmediate_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp0, tmp1;
+  rtx tmp0, tmp1, tmp2;
 
   if (TARGET_AVX && !TARGET_PREFER_AVX128)
     {
@@ -4303,13 +4307,14 @@ (define_expand "vec_pack_sfix_v2df"
     {
       tmp0 = gen_reg_rtx (V4SImode);
       tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
 
       emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1]));
       emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2]));
-      emit_insn
-       (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-				    gen_lowpart (V2DImode, tmp0),
-				    gen_lowpart (V2DImode, tmp1)));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
     }
   DONE;
 })
@@ -7328,14 +7333,16 @@ (define_insn "<shift_insn><mode>3"
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "vec_shl_<mode>"
-  [(set (match_operand:VI_128 0 "register_operand")
+  [(set (match_dup 3)
 	(ashift:V1TI
 	 (match_operand:VI_128 1 "register_operand")
-	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))]
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
   "TARGET_SSE2"
 {
-  operands[0] = gen_lowpart (V1TImode, operands[0]);
   operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
 })
 
 (define_insn "<sse2_avx2>_ashl<mode>3"
@@ -7365,14 +7372,16 @@ (define_insn "<sse2_avx2>_ashl<mode>3"
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "vec_shr_<mode>"
-  [(set (match_operand:VI_128 0 "register_operand")
+  [(set (match_dup 3)
 	(lshiftrt:V1TI
 	 (match_operand:VI_128 1 "register_operand")
-	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))]
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
   "TARGET_SSE2"
 {
-  operands[0] = gen_lowpart (V1TImode, operands[0]);
   operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
 })
 
 (define_insn "<sse2_avx2>_lshr<mode>3"
@@ -8542,12 +8551,13 @@ (define_expand "vec_interleave_high<mode
 {
   rtx t1 = gen_reg_rtx (<MODE>mode);
   rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
   emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
   emit_insn (gen_avx2_interleave_high<mode> (t2,  operands[1], operands[2]));
-  emit_insn (gen_avx2_permv2ti
-	     (gen_lowpart (V4DImode, operands[0]),
-	      gen_lowpart (V4DImode, t1),
-	      gen_lowpart (V4DImode, t2), GEN_INT (1 + (3 << 4))));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (1 + (3 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
   DONE;
 })
 
@@ -8559,12 +8569,13 @@ (define_expand "vec_interleave_low<mode>
 {
   rtx t1 = gen_reg_rtx (<MODE>mode);
   rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
   emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
   emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2]));
-  emit_insn (gen_avx2_permv2ti
-	     (gen_lowpart (V4DImode, operands[0]),
-	      gen_lowpart (V4DImode, t1),
-	      gen_lowpart (V4DImode, t2), GEN_INT (0 + (2 << 4))));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (0 + (2 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
   DONE;
 })
 
--- gcc/config/i386/i386.c.jj	2013-10-30 18:49:26.000000000 +0100
+++ gcc/config/i386/i386.c	2013-10-31 13:05:56.690352315 +0100
@@ -16803,6 +16803,8 @@ ix86_expand_vector_move_misalign (enum m
 	}
       else
         {
+	  rtx t;
+
 	  if (TARGET_AVX
 	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
 	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
@@ -16821,18 +16823,22 @@ ix86_expand_vector_move_misalign (enum m
 	      return;
             }
 
+	  if (mode != V4SFmode)
+	    t = gen_reg_rtx (V4SFmode);
+	  else
+	    t = op0;
+	    
 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
-	    emit_move_insn (op0, CONST0_RTX (mode));
+	    emit_move_insn (t, CONST0_RTX (V4SFmode));
 	  else
-	    emit_clobber (op0);
-
-	  if (mode != V4SFmode)
-	    op0 = gen_lowpart (V4SFmode, op0);
+	    emit_clobber (t);
 
 	  m = adjust_address (op1, V2SFmode, 0);
-	  emit_insn (gen_sse_loadlps (op0, op0, m));
+	  emit_insn (gen_sse_loadlps (t, t, m));
 	  m = adjust_address (op1, V2SFmode, 8);
-	  emit_insn (gen_sse_loadhps (op0, op0, m));
+	  emit_insn (gen_sse_loadhps (t, t, m));
+	  if (mode != V4SFmode)
+	    emit_move_insn (op0, gen_lowpart (mode, t));
 	}
     }
   else if (MEM_P (op0))
@@ -20473,6 +20479,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
   else
     {
       rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+      rtx d = dest;
 
       if (!nonimmediate_operand (op_true, mode))
 	op_true = force_reg (mode, op_true);
@@ -20496,7 +20503,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
 	  if (TARGET_SSE4_1)
 	    {
 	      gen = gen_sse4_1_pblendvb;
-	      dest = gen_lowpart (V16QImode, dest);
+	      if (mode != V16QImode)
+		d = gen_reg_rtx (V16QImode);
 	      op_false = gen_lowpart (V16QImode, op_false);
 	      op_true = gen_lowpart (V16QImode, op_true);
 	      cmp = gen_lowpart (V16QImode, cmp);
@@ -20517,7 +20525,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
 	  if (TARGET_AVX2)
 	    {
 	      gen = gen_avx2_pblendvb;
-	      dest = gen_lowpart (V32QImode, dest);
+	      if (mode != V32QImode)
+		d = gen_reg_rtx (V32QImode);
 	      op_false = gen_lowpart (V32QImode, op_false);
 	      op_true = gen_lowpart (V32QImode, op_true);
 	      cmp = gen_lowpart (V32QImode, cmp);
@@ -20528,7 +20537,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp
 	}
 
       if (gen != NULL)
-	emit_insn (gen (dest, op_false, op_true, cmp));
+	{
+	  emit_insn (gen (d, op_false, op_true, cmp));
+	  if (d != dest)
+	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+	}
       else
 	{
 	  op_true = force_reg (mode, op_true);
@@ -20849,8 +20862,7 @@ ix86_expand_int_vcond (rtx operands[])
   else
     {
       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
-      x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
-			       code, cop0, cop1,
+      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
 			       operands[1+negate], operands[2-negate]);
       x = gen_lowpart (data_mode, x);
     }
@@ -20869,7 +20881,7 @@ ix86_expand_vec_perm (rtx operands[])
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx mask = operands[3];
-  rtx t1, t2, t3, t4, vt, vt2, vec[32];
+  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
   enum machine_mode mode = GET_MODE (op0);
   enum machine_mode maskmode = GET_MODE (mask);
   int w, e, i;
@@ -20937,7 +20949,7 @@ ix86_expand_vec_perm (rtx operands[])
 
 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
 	  operands[3] = mask = t1;
-	  target = gen_lowpart (mode, target);
+	  target = gen_reg_rtx (mode);
 	  op0 = gen_lowpart (mode, op0);
 	  op1 = gen_lowpart (mode, op1);
 	}
@@ -20949,7 +20961,12 @@ ix86_expand_vec_perm (rtx operands[])
 	     the high bits of the shuffle elements.  No need for us to
 	     perform an AND ourselves.  */
 	  if (one_operand_shuffle)
-	    emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+	    {
+	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+	      if (target != operands[0])
+		emit_move_insn (operands[0],
+				gen_lowpart (GET_MODE (operands[0]), target));
+	    }
 	  else
 	    {
 	      t1 = gen_reg_rtx (V8SImode);
@@ -21022,13 +21039,13 @@ ix86_expand_vec_perm (rtx operands[])
 	     stands for other 12 bytes.  */
 	  /* The bit whether element is from the same lane or the other
 	     lane is bit 4, so shift it up by 3 to the MSB position.  */
-	  emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
-				    gen_lowpart (V4DImode, mask),
+	  t5 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
 				    GEN_INT (3)));
 	  /* Clear MSB bits from the mask just in case it had them set.  */
 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
 	  /* After this t1 will have MSB set for elements from other lane.  */
-	  emit_insn (gen_xorv32qi3 (t1, t1, vt2));
+	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
 	  /* Clear bits other than MSB.  */
 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
 	  /* Or in the lower bits from mask into t3.  */
@@ -21037,8 +21054,8 @@ ix86_expand_vec_perm (rtx operands[])
 	     lane.  */
 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
 	  /* Swap 128-bit lanes in t3.  */
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					  gen_lowpart (V4DImode, t3),
+	  t6 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
 					  const2_rtx, GEN_INT (3),
 					  const0_rtx, const1_rtx));
 	  /* And or in the lower bits from mask into t1.  */
@@ -21048,15 +21065,20 @@ ix86_expand_vec_perm (rtx operands[])
 	      /* Each of these shuffles will put 0s in places where
 		 element from the other 128-bit lane is needed, otherwise
 		 will shuffle in the requested value.  */
-	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
+	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
+						gen_lowpart (V32QImode, t6)));
 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
 	      /* For t3 the 128-bit lanes are swapped again.  */
-	      emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					      gen_lowpart (V4DImode, t3),
+	      t7 = gen_reg_rtx (V4DImode);
+	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
 					      const2_rtx, GEN_INT (3),
 					      const0_rtx, const1_rtx));
 	      /* And oring both together leads to the result.  */
-	      emit_insn (gen_iorv32qi3 (target, t1, t3));
+	      emit_insn (gen_iorv32qi3 (target, t1,
+					gen_lowpart (V32QImode, t7)));
+	      if (target != operands[0])
+		emit_move_insn (operands[0],
+				gen_lowpart (GET_MODE (operands[0]), target));
 	      return;
 	    }
 
@@ -21064,20 +21086,22 @@ ix86_expand_vec_perm (rtx operands[])
 	  /* Similarly to the above one_operand_shuffle code,
 	     just for repeated twice for each operand.  merge_two:
 	     code will merge the two results together.  */
-	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
-	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
+					    gen_lowpart (V32QImode, t6)));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
+					    gen_lowpart (V32QImode, t6)));
 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
-					  gen_lowpart (V4DImode, t4),
+	  t7 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
 					  const2_rtx, GEN_INT (3),
 					  const0_rtx, const1_rtx));
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					  gen_lowpart (V4DImode, t3),
+	  t8 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
 					  const2_rtx, GEN_INT (3),
 					  const0_rtx, const1_rtx));
-	  emit_insn (gen_iorv32qi3 (t4, t2, t4));
-	  emit_insn (gen_iorv32qi3 (t3, t1, t3));
+	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
+	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
 	  t1 = t4;
 	  t2 = t3;
 	  goto merge_two;
@@ -21146,15 +21170,24 @@ ix86_expand_vec_perm (rtx operands[])
   /* The actual shuffle operations all operate on V16QImode.  */
   op0 = gen_lowpart (V16QImode, op0);
   op1 = gen_lowpart (V16QImode, op1);
-  target = gen_lowpart (V16QImode, target);
 
   if (TARGET_XOP)
     {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
       emit_insn (gen_xop_pperm (target, op0, op1, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
   else if (one_operand_shuffle)
     {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
   else
     {
@@ -21194,7 +21227,9 @@ ix86_expand_vec_perm (rtx operands[])
       mask = expand_simple_binop (maskmode, AND, mask, vt,
 				  NULL_RTX, 0, OPTAB_DIRECT);
 
-      xops[0] = gen_lowpart (mode, operands[0]);
+      if (GET_MODE (target) != mode)
+	target = gen_reg_rtx (mode);
+      xops[0] = target;
       xops[1] = gen_lowpart (mode, t2);
       xops[2] = gen_lowpart (mode, t1);
       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -21202,6 +21237,9 @@ ix86_expand_vec_perm (rtx operands[])
       xops[5] = vt;
       ok = ix86_expand_int_vcond (xops);
       gcc_assert (ok);
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
 }
 
@@ -21280,10 +21318,10 @@ ix86_expand_sse_unpack (rtx dest, rtx sr
       else if (high_p)
 	{
 	  /* Shift higher 8 bytes to lower 8 bytes.  */
-	  tmp = gen_reg_rtx (imode);
-	  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
-					 gen_lowpart (V1TImode, src),
+	  tmp = gen_reg_rtx (V1TImode);
+	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
 					 GEN_INT (64)));
+	  tmp = gen_lowpart (imode, tmp);
 	}
       else
 	tmp = src;
@@ -21324,7 +21362,9 @@ ix86_expand_sse_unpack (rtx dest, rtx sr
 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
 				   src, pc_rtx, pc_rtx);
 
-      emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
+      rtx tmp2 = gen_reg_rtx (imode);
+      emit_insn (unpack (tmp2, src, tmp));
+      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
     }
 }
 
@@ -31967,8 +32007,8 @@ ix86_expand_args_builtin (const struct b
     }
   else
     {
-      target = gen_reg_rtx (rmode);
-      real_target = simplify_gen_subreg (tmode, target, rmode, 0);
+      real_target = gen_reg_rtx (tmode);
+      target = simplify_gen_subreg (rmode, real_target, tmode, 0);
     }
 
   for (i = 0; i < nargs; i++)
@@ -36691,8 +36731,9 @@ ix86_expand_vector_init_duplicate (bool
 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
 
 	  /* Insert the SImode value as low element of a V4SImode vector. */
-	  tmp2 = gen_lowpart (V4SImode, dperm.op0);
+	  tmp2 = gen_reg_rtx (V4SImode);
 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
 
 	  ok = (expand_vec_perm_1 (&dperm)
 		|| expand_vec_perm_broadcast_1 (&dperm));
@@ -36722,9 +36763,10 @@ ix86_expand_vector_init_duplicate (bool
 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
 
-	x = gen_lowpart (wvmode, target);
+	x = gen_reg_rtx (wvmode);
 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
 	gcc_assert (ok);
+	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
 	return ok;
       }
 
@@ -37599,8 +37641,9 @@ ix86_expand_vector_set (bool mmx_ok, rtx
       else
 	{
 	  /* For SSE1, we have to reuse the V4SF code.  */
-	  ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
-				  gen_lowpart (SFmode, val), elt);
+	  rtx t = gen_reg_rtx (V4SFmode);
+	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
+	  emit_move_insn (target, gen_lowpart (mode, t));
 	}
       return;
 
@@ -37918,7 +37961,7 @@ ix86_expand_vector_extract (bool mmx_ok,
 static void
 emit_reduc_half (rtx dest, rtx src, int i)
 {
-  rtx tem;
+  rtx tem, d = dest;
   switch (GET_MODE (src))
     {
     case V4SFmode:
@@ -37935,8 +37978,8 @@ emit_reduc_half (rtx dest, rtx src, int
     case V8HImode:
     case V4SImode:
     case V2DImode:
-      tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
-				gen_lowpart (V1TImode, src),
+      d = gen_reg_rtx (V1TImode);
+      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
 				GEN_INT (i / 2));
       break;
     case V8SFmode:
@@ -37957,19 +38000,26 @@ emit_reduc_half (rtx dest, rtx src, int
     case V8SImode:
     case V4DImode:
       if (i == 256)
-	tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
-				 gen_lowpart (V4DImode, src),
-				 gen_lowpart (V4DImode, src),
-				 const1_rtx);
-      else
-	tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
-				  gen_lowpart (V2TImode, src),
-				  GEN_INT (i / 2));
+	{
+	  if (GET_MODE (dest) != V4DImode)
+	    d = gen_reg_rtx (V4DImode);
+	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
+				   gen_lowpart (V4DImode, src),
+				   const1_rtx);
+	}
+      else
+	{
+	  d = gen_reg_rtx (V2TImode);
+	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
+				    GEN_INT (i / 2));
+	}
       break;
     default:
       gcc_unreachable ();
     }
   emit_insn (tem);
+  if (d != dest)
+    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
 }
 
 /* Expand a vector reduction.  FN is the binary pattern to reduce;
@@ -39462,6 +39512,8 @@ expand_vec_perm_blend (struct expand_vec
 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
 	    else
 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+	    if (target != d->target)
+	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 	    return true;
 	  }
 
@@ -39471,7 +39523,7 @@ expand_vec_perm_blend (struct expand_vec
       /* FALLTHRU */
 
     do_subreg:
-      target = gen_lowpart (vmode, target);
+      target = gen_reg_rtx (vmode);
       op0 = gen_lowpart (vmode, op0);
       op1 = gen_lowpart (vmode, op1);
       break;
@@ -39525,7 +39577,7 @@ expand_vec_perm_blend (struct expand_vec
 
 		vmode = V32QImode;
 		nelt = 32;
-		target = gen_lowpart (vmode, target);
+		target = gen_reg_rtx (vmode);
 		op0 = gen_lowpart (vmode, op0);
 		op1 = gen_lowpart (vmode, op1);
 		goto finish_pblendvb;
@@ -39558,6 +39610,8 @@ expand_vec_perm_blend (struct expand_vec
   x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
   x = gen_rtx_SET (VOIDmode, target, x);
   emit_insn (x);
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 
   return true;
 }
@@ -39663,13 +39717,17 @@ expand_vec_perm_pshufb (struct expand_ve
 
 	      /* Use vperm2i128 insn.  The pattern uses
 		 V4DImode instead of V2TImode.  */
-	      target = gen_lowpart (V4DImode, d->target);
+	      target = d->target;
+	      if (d->vmode != V4DImode)
+		target = gen_reg_rtx (V4DImode);
 	      op0 = gen_lowpart (V4DImode, d->op0);
 	      op1 = gen_lowpart (V4DImode, d->op1);
 	      rperm[0]
 		= GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
 			   || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
 	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+	      if (target != d->target)
+		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 	      return true;
 	    }
 	  return false;
@@ -39704,9 +39762,15 @@ expand_vec_perm_pshufb (struct expand_ve
 		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
 		  if (d->testing_p)
 		    return true;
-		  return expand_vselect (gen_lowpart (V4DImode, d->target),
-					 gen_lowpart (V4DImode, d->op0),
-					 perm, 4, false);
+		  target = gen_reg_rtx (V4DImode);
+		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
+				      perm, 4, false))
+		    {
+		      emit_move_insn (d->target,
+				      gen_lowpart (d->vmode, target));
+		      return true;
+		    }
+		  return false;
 		}
 
 	      /* Next see if vpermd can be used.  */
@@ -39758,7 +39822,9 @@ expand_vec_perm_pshufb (struct expand_ve
 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
   vperm = force_reg (vmode, vperm);
 
-  target = gen_lowpart (vmode, d->target);
+  target = d->target;
+  if (d->vmode != vmode)
+    target = gen_reg_rtx (vmode);
   op0 = gen_lowpart (vmode, d->op0);
   if (d->one_operand_p)
     {
@@ -39776,6 +39842,8 @@ expand_vec_perm_pshufb (struct expand_ve
       op1 = gen_lowpart (vmode, d->op1);
       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
     }
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 
   return true;
 }
@@ -39975,7 +40043,8 @@ expand_vec_perm_palignr (struct expand_v
   unsigned i, nelt = d->nelt;
   unsigned min, max;
   bool in_order, ok;
-  rtx shift;
+  rtx shift, target;
+  struct expand_vec_perm_d dcopy;
 
   /* Even with AVX, palignr only operates on 128-bit vectors.  */
   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
@@ -39998,29 +40067,33 @@ expand_vec_perm_palignr (struct expand_v
   if (d->testing_p)
     return true;
 
+  dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
-				  gen_lowpart (TImode, d->op1),
+  target = gen_reg_rtx (TImode);
+  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
 				  gen_lowpart (TImode, d->op0), shift));
 
-  d->op0 = d->op1 = d->target;
-  d->one_operand_p = true;
+  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
+  dcopy.one_operand_p = true;
 
   in_order = true;
   for (i = 0; i < nelt; ++i)
     {
-      unsigned e = d->perm[i] - min;
+      unsigned e = dcopy.perm[i] - min;
       if (e != i)
 	in_order = false;
-      d->perm[i] = e;
+      dcopy.perm[i] = e;
     }
 
   /* Test for the degenerate case where the alignment by itself
      produces the desired permutation.  */
   if (in_order)
-    return true;
+    {
+      emit_move_insn (d->target, dcopy.op0);
+      return true;
+    }
 
-  ok = expand_vec_perm_1 (d);
+  ok = expand_vec_perm_1 (&dcopy);
   gcc_assert (ok);
 
   return ok;
@@ -40274,10 +40347,10 @@ expand_vec_perm_interleave2 (struct expa
       else
 	dfinal.perm[i] = e;
     }
-  dfinal.op0 = gen_reg_rtx (dfinal.vmode);
+  dremap.target = gen_reg_rtx (dremap.vmode);
+  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
   dfinal.op1 = dfinal.op0;
   dfinal.one_operand_p = true;
-  dremap.target = dfinal.op0;
 
   /* Test if the final remap can be done with a single insn.  For V4SFmode or
      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
@@ -40294,7 +40367,6 @@ expand_vec_perm_interleave2 (struct expa
 
   if (dremap.vmode != dfinal.vmode)
     {
-      dremap.target = gen_lowpart (dremap.vmode, dremap.target);
       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
     }
@@ -40745,8 +40817,12 @@ expand_vec_perm_pshufb2 (struct expand_v
   op = gen_lowpart (V16QImode, d->op1);
   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
 
-  op = gen_lowpart (V16QImode, d->target);
+  op = d->target;
+  if (d->vmode != V16QImode)
+    op = gen_reg_rtx (V16QImode);
   emit_insn (gen_iorv16qi3 (op, l, h));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40812,8 +40888,12 @@ expand_vec_perm_vpshufb2_vpermq (struct
   op = gen_lowpart (V32QImode, d->op0);
   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
 
-  op = gen_lowpart (V32QImode, d->target);
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40889,10 +40969,11 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
   emit_insn (gen_iorv32qi3 (ior, l, h));
 
   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
-  op = gen_lowpart (V4DImode, d->target);
+  op = gen_reg_rtx (V4DImode);
   ior = gen_lowpart (V4DImode, ior);
   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
 				  const1_rtx, GEN_INT (3)));
+  emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40903,7 +40984,7 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
 static bool
 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 {
-  rtx t1, t2, t3;
+  rtx t1, t2, t3, t4, t5;
 
   switch (d->vmode)
     {
@@ -41015,10 +41096,17 @@ expand_vec_perm_even_odd_1 (struct expan
 	{
 	  struct expand_vec_perm_d d_copy = *d;
 	  d_copy.vmode = V4DFmode;
-	  d_copy.target = gen_lowpart (V4DFmode, d->target);
+	  d_copy.target = gen_reg_rtx (V4DFmode);
 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
-	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V4DImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
 	}
 
       t1 = gen_reg_rtx (V4DImode);
@@ -41041,44 +41129,51 @@ expand_vec_perm_even_odd_1 (struct expan
 	{
 	  struct expand_vec_perm_d d_copy = *d;
 	  d_copy.vmode = V8SFmode;
-	  d_copy.target = gen_lowpart (V8SFmode, d->target);
+	  d_copy.target = gen_reg_rtx (V8SFmode);
 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
-	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V8SImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
 	}
 
       t1 = gen_reg_rtx (V8SImode);
       t2 = gen_reg_rtx (V8SImode);
+      t3 = gen_reg_rtx (V4DImode);
+      t4 = gen_reg_rtx (V4DImode);
+      t5 = gen_reg_rtx (V4DImode);
 
       /* Shuffle the lanes around into
 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
-				    gen_lowpart (V4DImode, d->op0),
+      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
 				    gen_lowpart (V4DImode, d->op1),
 				    GEN_INT (0x20)));
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
-				    gen_lowpart (V4DImode, d->op0),
+      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
 				    gen_lowpart (V4DImode, d->op1),
 				    GEN_INT (0x31)));
 
       /* Swap the 2nd and 3rd position in each lane into
 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
-      emit_insn (gen_avx2_pshufdv3 (t1, t1,
+      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
-      emit_insn (gen_avx2_pshufdv3 (t2, t2,
+      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
 
       /* Now an vpunpck[lh]qdq will produce
 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
       if (odd)
-	t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
-					   gen_lowpart (V4DImode, t1),
+	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
 					   gen_lowpart (V4DImode, t2));
       else
-	t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
-					  gen_lowpart (V4DImode, t1),
+	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
 					  gen_lowpart (V4DImode, t2));
       emit_insn (t3);
+      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
       break;
 
     default:
@@ -41116,7 +41211,7 @@ expand_vec_perm_broadcast_1 (struct expa
   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
   enum machine_mode vmode = d->vmode;
   unsigned char perm2[4];
-  rtx op0 = d->op0;
+  rtx op0 = d->op0, dest;
   bool ok;
 
   switch (vmode)
@@ -41162,9 +41257,11 @@ expand_vec_perm_broadcast_1 (struct expa
       while (vmode != V4SImode);
 
       memset (perm2, elt, 4);
-      ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
-			   d->testing_p);
+      dest = gen_reg_rtx (V4SImode);
+      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
       gcc_assert (ok);
+      if (!d->testing_p)
+	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
       return true;
 
     case V32QImode:
@@ -41306,8 +41403,12 @@ expand_vec_perm_vpshufb4_vpermq2 (struct
     }
 
   gcc_assert (l[0] && l[1]);
-  op = gen_lowpart (V32QImode, d->target);
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
   return true;
 }
 
@@ -41875,7 +41976,9 @@ ix86_expand_mul_widen_hilo (rtx dest, rt
 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
       gcc_assert (t1 && t2);
 
-      ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+      t3 = gen_reg_rtx (mode);
+      ix86_expand_vec_interleave (t3, t1, t2, high_p);
+      emit_move_insn (dest, gen_lowpart (wmode, t3));
       break;
 
     case V16QImode:
@@ -41896,14 +41999,14 @@ ix86_expand_mul_widen_hilo (rtx dest, rt
 void
 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
 {
-  rtx res_1, res_2;
+  rtx res_1, res_2, res_3, res_4;
 
   res_1 = gen_reg_rtx (V4SImode);
   res_2 = gen_reg_rtx (V4SImode);
-  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
-				 op1, op2, true, false);
-  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
-				 op1, op2, true, true);
+  res_3 = gen_reg_rtx (V2DImode);
+  res_4 = gen_reg_rtx (V2DImode);
+  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
+  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
 
   /* Move the results in element 2 down to element 1; we don't care
      what goes in elements 2 and 3.  Then we can merge the parts
@@ -41917,9 +42020,11 @@ ix86_expand_sse2_mulv4si3 (rtx op0, rtx
      In both cases the cost of the reformatting stall was too high
      and the overall sequence slower.  */
 
-  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
+				const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
+				const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
 
@@ -42088,12 +42193,17 @@ ix86_expand_pinsr (rtx *operands)
 	    return false;
 	  }
 
-	dst = gen_lowpart (dstmode, dst);
+	rtx d = dst;
+	if (GET_MODE (dst) != dstmode)
+	  d = gen_reg_rtx (dstmode);
 	src = gen_lowpart (srcmode, src);
 
 	pos /= size;
 
-	emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
+	emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
+			  GEN_INT (1 << pos)));
+	if (d != dst)
+	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
 	return true;
       }
 

	Jakub


* Re: [PATCH] Try to avoid vector mode punning in SET_DEST on i?86
  2013-10-31 17:27     ` [PATCH] Try to avoid vector mode punning in SET_DEST on i?86 Jakub Jelinek
@ 2013-10-31 20:05       ` Richard Henderson
  2013-11-04 18:28       ` James Greenhalgh
  1 sibling, 0 replies; 15+ messages in thread
From: Richard Henderson @ 2013-10-31 20:05 UTC (permalink / raw)
  To: Jakub Jelinek, Uros Bizjak; +Cc: Kirill Yukhin, gcc-patches

On 10/31/2013 09:49 AM, Jakub Jelinek wrote:
> 2013-10-31  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* optabs.c (expand_vec_perm): Avoid vector mode punning
> 	SUBREGs in SET_DEST.
> 	* expmed.c (store_bit_field_1): Likewise.
> 	* config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
> 	vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
> 	vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
> 	* config/i386/i386.c (ix86_expand_vector_move_misalign,
> 	ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
> 	ix86_expand_sse_unpack, ix86_expand_args_builtin,
> 	ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
> 	emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
> 	expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
> 	expand_vec_perm_vpshufb2_vpermq,
> 	expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
> 	expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
> 	ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
> 	(expand_vec_perm_palignr): Likewise.  Modify a copy of *d rather
> 	than *d itself.

Ok.


r~


* Re: [PATCH] Try to avoid vector mode punning in SET_DEST on i?86
  2013-10-31 17:27     ` [PATCH] Try to avoid vector mode punning in SET_DEST on i?86 Jakub Jelinek
  2013-10-31 20:05       ` Richard Henderson
@ 2013-11-04 18:28       ` James Greenhalgh
  2013-11-04 20:27         ` Jakub Jelinek
  1 sibling, 1 reply; 15+ messages in thread
From: James Greenhalgh @ 2013-11-04 18:28 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, Richard Henderson, Kirill Yukhin, gcc-patches

On Thu, Oct 31, 2013 at 04:49:47PM +0000, Jakub Jelinek wrote:

> 2013-10-31  Jakub Jelinek  <jakub@redhat.com>
> 
>         * optabs.c (expand_vec_perm): Avoid vector mode punning
>         SUBREGs in SET_DEST.
>         * expmed.c (store_bit_field_1): Likewise.
>         * config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
>         vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
>         vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
>         * config/i386/i386.c (ix86_expand_vector_move_misalign,
>         ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
>         ix86_expand_sse_unpack, ix86_expand_args_builtin,
>         ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
>         emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
>         expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
>         expand_vec_perm_vpshufb2_vpermq,
>         expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
>         expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
>         ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
>         (expand_vec_perm_palignr): Likewise.  Modify a copy of *d rather
>         than *d itself.
> 
> --- gcc/optabs.c.jj     2013-10-29 09:25:45.000000000 +0100
> +++ gcc/optabs.c        2013-10-31 13:20:40.384808642 +0100
> @@ -6674,7 +6674,7 @@ expand_vec_perm (enum machine_mode mode,
>         }
>        tmp = gen_rtx_CONST_VECTOR (qimode, vec);
>        sel = gen_lowpart (qimode, sel);
> -      sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
> +      sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);
>        gcc_assert (sel != NULL);
> 
>        /* Add the byte offset to each byte element.  */

This hunk causes issues on AArch64 and ARM.

We look to see which permute operation we should generate in
aarch64_expand_vec_perm_const. If we notice that all elements would
select from op0, we copy op0 to op1 and generate the appropriate code.

With this hunk applied we end up selecting from the register
generated by gen_reg_rtx (qimode), rather than from 'sel' as we
intended. Thus we lose the value of 'sel' and everything starts to
go wrong!
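
To make that concrete, here is the pair of calls from the hunk above,
annotated the way I read them (a rough sketch rather than a quote from
optabs.c):

  /* Before the change: a one-operand permutation of SEL by the constant
     vector TMP, written with both vector operands equal to SEL.  A
     backend that notices "every index selects from operand 0" and
     copies op0 into op1 still permutes SEL.  */
  sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);

  /* After the change: operand 0 is a brand new, uninitialized pseudo,
     so that same shortcut permutes the uninitialized register instead
     of SEL, and the value of SEL is lost.  */
  sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);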

The hunk looks suspicious to me (why do we pick a register out of
thin air?), and reverting it fixes the problems I see. Could you
give me a pointer as to what this hunk fixes on i?86?

I don't think I can fix this up in the expander very easily?

Thanks,
James



* Re: [PATCH] Try to avoid vector mode punning in SET_DEST on i?86
  2013-11-04 18:28       ` James Greenhalgh
@ 2013-11-04 20:27         ` Jakub Jelinek
  0 siblings, 0 replies; 15+ messages in thread
From: Jakub Jelinek @ 2013-11-04 20:27 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: Uros Bizjak, Richard Henderson, Kirill Yukhin, gcc-patches

On Mon, Nov 04, 2013 at 06:23:19PM +0000, James Greenhalgh wrote:
> > --- gcc/optabs.c.jj     2013-10-29 09:25:45.000000000 +0100
> > +++ gcc/optabs.c        2013-10-31 13:20:40.384808642 +0100
> > @@ -6674,7 +6674,7 @@ expand_vec_perm (enum machine_mode mode,
> >         }
> >        tmp = gen_rtx_CONST_VECTOR (qimode, vec);
> >        sel = gen_lowpart (qimode, sel);
> > -      sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
> > +      sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);
> >        gcc_assert (sel != NULL);
> > 
> >        /* Add the byte offset to each byte element.  */
> 
> This hunk causes issues on AArch64 and ARM.
> 
> We look to see which permute operation we should generate in
> aarch64_expand_vec_perm_const. If we notice that all elements would
> select from op0, we copy op0 to op1 and generate the appropriate code.

Sorry for that, I missed that expand_vec_perm has a very different
argument order from expand_vec_perm_1 (the former has the target last,
the latter second, right after the icode), so I misread the code as
assigning to sel when it in fact does a single-operand permutation into
a NULL target.  Apparently this code isn't used on x86_64/i686 at all,
so my bootstrap/regtest didn't detect it.
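
For reference, the two argument orders look roughly like this (the
parameter names are mine, not necessarily the ones used in optabs.c):

  /* Target is the last argument.  */
  rtx expand_vec_perm (enum machine_mode mode, rtx v0, rtx v1,
                       rtx sel, rtx target);

  /* Target comes second, right after the insn code.  */
  rtx expand_vec_perm_1 (enum insn_code icode, rtx target,
                         rtx v0, rtx v1, rtx sel);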

I've reverted that one-liner now:
--- ChangeLog	(revision 204357)
+++ ChangeLog	(working copy)
@@ -1,5 +1,8 @@
 2013-11-04  Jakub Jelinek  <jakub@redhat.com>
 
+	* optabs.c (expand_vec_perm): Revert one incorrect line from
+	2013-10-31 change.
+
 	PR tree-optimization/58978
 	* tree-vrp.c (all_imm_uses_in_stmt_or_feed_cond): Don't modify
 	use_stmt by single_imm_use directly.  Only call single_imm_use
--- optabs.c	(revision 204356)
+++ optabs.c	(working copy)
@@ -6674,7 +6674,7 @@ expand_vec_perm (enum machine_mode mode,
 	}
       tmp = gen_rtx_CONST_VECTOR (qimode, vec);
       sel = gen_lowpart (qimode, sel);
-      sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);
+      sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
       gcc_assert (sel != NULL);
 
       /* Add the byte offset to each byte element.  */

	Jakub

