public inbox for binutils@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] x86: Optimize EVEX vector load/store instructions
@ 2019-03-15 23:58 H.J. Lu
  2019-03-17 20:47 ` V2 " H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-03-15 23:58 UTC (permalink / raw)
  To: binutils

When there is no write mask, EVEX 128-bit/256-bit vector register load
and store instructions that use only the lower 16 vector registers can
be encoded as VEX vector register load and store instructions with -O2.

gas/

	PR gas/24348
	* config/tc-i386.c (optimize_encoding): Encode EVEX 128-bit and
	256-bit vector register load/store instructions as VEX vector
	register load/store instructions for -O2.
	(md_parse_option): Set optimize to INT_MAX for -Os.
	* doc/c-i386.texi: Update -O2 documentation.

gas/

	PR gas/24348
	* testsuite/gas/i386/optimize-1.s: Add tests for EVEX vector
	load/store instructions.
	* testsuite/gas/i386/optimize-2.s: Likewise.
	* testsuite/gas/i386/optimize-3.s: Likewise.
	* testsuite/gas/i386/optimize-5.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-3.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-4.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-5.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-6.s: Likewise.
	* testsuite/gas/i386/optimize-1.d: Updated.
	* testsuite/gas/i386/optimize-2.d: Likewise.
	* testsuite/gas/i386/optimize-3.d: Likewise.
	* testsuite/gas/i386/optimize-4.d: Likewise.
	* testsuite/gas/i386/optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-3.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-4.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-6.d: Likewise.

opcodes/

	PR gas/24348
	* i386-opc.tbl: Add Optimize to vmovdqa32, vmovdqa64, vmovdqu8,
	vmovdqu16, vmovdqu32 and vmovdqu64.
	* i386-tbl.h: Regenerated.
---
 gas/config/tc-i386.c                       |  59 +++++++++++-
 gas/doc/c-i386.texi                        |   4 +-
 gas/testsuite/gas/i386/optimize-1.d        |  36 +++++++
 gas/testsuite/gas/i386/optimize-1.s        |  42 +++++++++
 gas/testsuite/gas/i386/optimize-2.d        |  72 ++++++++++++++
 gas/testsuite/gas/i386/optimize-2.s        |  84 +++++++++++++++++
 gas/testsuite/gas/i386/optimize-3.d        |   6 ++
 gas/testsuite/gas/i386/optimize-3.s        |   7 ++
 gas/testsuite/gas/i386/optimize-4.d        |  36 +++++++
 gas/testsuite/gas/i386/optimize-5.d        |  42 +++++++++
 gas/testsuite/gas/i386/optimize-5.s        |   7 ++
 gas/testsuite/gas/i386/x86-64-optimize-2.d |  48 ++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-2.s |  56 +++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-3.d |  90 ++++++++++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-3.s | 105 +++++++++++++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-4.d |   6 ++
 gas/testsuite/gas/i386/x86-64-optimize-4.s |   7 ++
 gas/testsuite/gas/i386/x86-64-optimize-5.d |  54 +++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-5.s |   7 ++
 gas/testsuite/gas/i386/x86-64-optimize-6.d |  54 +++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-6.s |   7 ++
 opcodes/i386-opc.tbl                       |  12 +--
 opcodes/i386-tbl.h                         |  12 +--
 23 files changed, 839 insertions(+), 14 deletions(-)

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index 1b1b0a95da..1028c8d02f 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4056,6 +4056,63 @@ optimize_encoding (void)
 	    i.types[j].bitfield.ymmword = 0;
 	  }
     }
+  else if (optimize > 1
+	   && i.vec_encoding != vex_encoding_evex
+	   && !i.mask
+	   && is_evex_encoding (&i.tm)
+	   && (i.tm.base_opcode == 0x666f
+	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
+	       || i.tm.base_opcode == 0xf36f
+	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
+	       || i.tm.base_opcode == 0xf26f
+	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+	   && i.tm.extension_opcode == None)
+    {
+      /* Optimize: -O2:
+	   VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
+	   vmovdqu32 and vmovdqu64:
+	     EVEX VOP %xmmM, %xmmN
+	       -> VEX VOP %xmmM, %xmmN (M and N < 16)
+	     EVEX VOP %ymmM, %ymmN
+	       -> VEX VOP %ymmM, %ymmN (M and N < 16)
+	     EVEX VOP %xmmM, mem
+	       -> VEX VOP %xmmM, mem (M < 16)
+	     EVEX VOP %ymmM, mem
+	       -> VEX VOP %ymmM, mem (M < 16)
+	     EVEX VOP mem, %xmmN
+	       -> VEX VOP mem, %xmmN (N < 16)
+	     EVEX VOP mem, %ymmN
+	       -> VEX VOP mem, %ymmN (N < 16)
+       */
+      int ymmword = 0;
+      for (j = 0; j < 2; j++)
+	if (i.types[j].bitfield.regsimd)
+	  {
+	    if (i.op[j].regs->reg_num > 15
+		|| i.types[j].bitfield.zmmword)
+	      return;
+	    ymmword = i.types[j].bitfield.ymmword;
+	  }
+
+      if (i.tm.base_opcode == 0xf26f)
+	i.tm.base_opcode = 0xf36f;
+      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+	i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
+      i.tm.opcode_modifier.vex = ymmword ? VEX256 : VEX128;
+      i.tm.opcode_modifier.vexw = VEXW0;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.masking = 0;
+      i.tm.opcode_modifier.disp8memshift = 0;
+      i.memshift = 0;
+      for (j = 0; j < 2; j++)
+	if (operand_type_check (i.types[j], disp)
+	    && i.op[j].disps->X_op == O_constant)
+	  {
+	    i.types[j].bitfield.disp8
+	      = fits_in_disp8 (i.op[j].disps->X_add_number);
+	    break;
+	  }
+    }
 }
 
 /* This is the guts of the machine-dependent assembler.  LINE points to a
@@ -11342,7 +11399,7 @@ md_parse_option (int c, const char *arg)
 	{
 	  optimize_for_space = 1;
 	  /* Turn on all encoding optimizations.  */
-	  optimize = -1;
+	  optimize = INT_MAX;
 	}
       else
 	{
diff --git a/gas/doc/c-i386.texi b/gas/doc/c-i386.texi
index 6c63560dbc..3820d2593a 100644
--- a/gas/doc/c-i386.texi
+++ b/gas/doc/c-i386.texi
@@ -456,7 +456,9 @@ immediate as 32-bit register load instructions with 31-bit or 32-bits
 immediates and encode 64-bit register clearing instructions with 32-bit
 register clearing instructions.  @samp{-O2} includes @samp{-O1}
 optimization plus encodes 256-bit and 512-bit vector register clearing
-instructions with 128-bit vector register clearing instructions.
+instructions with 128-bit vector register clearing instructions, and
+also encodes EVEX 128-bit and 256-bit vector register load/store
+instructions with VEX vector register load/store instructions.
 @samp{-Os} includes @samp{-O2} optimization plus encodes 16-bit, 32-bit
 and 64-bit register tests with immediate as 8-bit register test with
 immediate.  @samp{-O0} turns off this optimization.
diff --git a/gas/testsuite/gas/i386/optimize-1.d b/gas/testsuite/gas/i386/optimize-1.d
index 4358c19c21..70c802c002 100644
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -62,4 +62,40 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-1.s b/gas/testsuite/gas/i386/optimize-1.s
index f61a176de8..6dcfbc2799 100644
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -72,3 +72,45 @@ _start:
 
 	kandnd %k1, %k1, %k5
 	kandnq %k1, %k1, %k5
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	127(%eax), %xmm2
+	vmovdqa64	127(%eax), %xmm2
+	vmovdqu8	127(%eax), %xmm2
+	vmovdqu16	127(%eax), %xmm2
+	vmovdqu32	127(%eax), %xmm2
+	vmovdqu64	127(%eax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%eax)
+	vmovdqa64	%xmm1, 128(%eax)
+	vmovdqu8	%xmm1, 128(%eax)
+	vmovdqu16	%xmm1, 128(%eax)
+	vmovdqu32	%xmm1, 128(%eax)
+	vmovdqu64	%xmm1, 128(%eax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	127(%eax), %ymm2
+	vmovdqa64	127(%eax), %ymm2
+	vmovdqu8	127(%eax), %ymm2
+	vmovdqu16	127(%eax), %ymm2
+	vmovdqu32	127(%eax), %ymm2
+	vmovdqu64	127(%eax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%eax)
+	vmovdqa64	%ymm1, 128(%eax)
+	vmovdqu8	%ymm1, 128(%eax)
+	vmovdqu16	%ymm1, 128(%eax)
+	vmovdqu32	%ymm1, 128(%eax)
+	vmovdqu64	%ymm1, 128(%eax)
diff --git a/gas/testsuite/gas/i386/optimize-2.d b/gas/testsuite/gas/i386/optimize-2.d
index ec989b0e13..68aaaaaab4 100644
--- a/gas/testsuite/gas/i386/optimize-2.d
+++ b/gas/testsuite/gas/i386/optimize-2.d
@@ -16,4 +16,76 @@ Disassembly of section .text:
  +[a-f0-9]+:	f6 c3 7f             	test   \$0x7f,%bl
  +[a-f0-9]+:	f7 c7 7f 00 00 00    	test   \$0x7f,%edi
  +[a-f0-9]+:	66 f7 c7 7f 00       	test   \$0x7f,%di
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f d1    	vmovdqa32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fd 48 6f d1    	vmovdqa64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7f 48 6f d1    	vmovdqu8 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 ff 48 6f d1    	vmovdqu16 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7e 48 6f d1    	vmovdqu32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fe 48 6f d1    	vmovdqu64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7d 29 6f d1    	vmovdqa32 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f d1    	vmovdqa64 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 6f 10    	vmovdqa32 \(%eax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f 10    	vmovdqa64 \(%eax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f 10    	vmovdqu8 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f 10    	vmovdqu16 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f 10    	vmovdqu32 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f 10    	vmovdqu64 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 7f 08    	vmovdqa32 %ymm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 7f 08    	vmovdqa64 %ymm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 7f 08    	vmovdqu8 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 7f 08    	vmovdqu16 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 7f 08    	vmovdqu32 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 7f 08    	vmovdqu64 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 89 6f d1    	vmovdqa32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fd 89 6f d1    	vmovdqa64 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7f 89 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 ff 89 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7e 89 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fe 89 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}\{z\}
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-2.s b/gas/testsuite/gas/i386/optimize-2.s
index b427a741b9..d73f41ba61 100644
--- a/gas/testsuite/gas/i386/optimize-2.s
+++ b/gas/testsuite/gas/i386/optimize-2.s
@@ -11,3 +11,87 @@ _start:
 	test	$0x7f, %bl
 	test	$0x7f, %edi
 	test	$0x7f, %di
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	127(%eax), %xmm2
+	vmovdqa64	127(%eax), %xmm2
+	vmovdqu8	127(%eax), %xmm2
+	vmovdqu16	127(%eax), %xmm2
+	vmovdqu32	127(%eax), %xmm2
+	vmovdqu64	127(%eax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%eax)
+	vmovdqa64	%xmm1, 128(%eax)
+	vmovdqu8	%xmm1, 128(%eax)
+	vmovdqu16	%xmm1, 128(%eax)
+	vmovdqu32	%xmm1, 128(%eax)
+	vmovdqu64	%xmm1, 128(%eax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	127(%eax), %ymm2
+	vmovdqa64	127(%eax), %ymm2
+	vmovdqu8	127(%eax), %ymm2
+	vmovdqu16	127(%eax), %ymm2
+	vmovdqu32	127(%eax), %ymm2
+	vmovdqu64	127(%eax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%eax)
+	vmovdqa64	%ymm1, 128(%eax)
+	vmovdqu8	%ymm1, 128(%eax)
+	vmovdqu16	%ymm1, 128(%eax)
+	vmovdqu32	%ymm1, 128(%eax)
+	vmovdqu64	%ymm1, 128(%eax)
+
+	vmovdqa32	%zmm1, %zmm2
+	vmovdqa64	%zmm1, %zmm2
+	vmovdqu8	%zmm1, %zmm2
+	vmovdqu16	%zmm1, %zmm2
+	vmovdqu32	%zmm1, %zmm2
+	vmovdqu64	%zmm1, %zmm2
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%ymm1, %ymm2{%k1}
+	vmovdqa64	%ymm1, %ymm2{%k1}
+	vmovdqu8	%xmm1, %xmm2{%k1}
+	vmovdqu16	%xmm1, %xmm2{%k1}
+	vmovdqu32	%xmm1, %xmm2{%k1}
+	vmovdqu64	%xmm1, %xmm2{%k1}
+
+	vmovdqa32	(%eax), %ymm2{%k1}
+	vmovdqa64	(%eax), %ymm2{%k1}
+	vmovdqu8	(%eax), %xmm2{%k1}
+	vmovdqu16	(%eax), %xmm2{%k1}
+	vmovdqu32	(%eax), %xmm2{%k1}
+	vmovdqu64	(%eax), %xmm2{%k1}
+
+	vmovdqa32	%ymm1, (%eax){%k1}
+	vmovdqa64	%ymm1, (%eax){%k1}
+	vmovdqu8	%xmm1, (%eax){%k1}
+	vmovdqu16	%xmm1, (%eax){%k1}
+	vmovdqu32	%xmm1, (%eax){%k1}
+	vmovdqu64	%xmm1, (%eax){%k1}
+
+	vmovdqa32	%xmm1, %xmm2{%k1}{z}
+	vmovdqa64	%xmm1, %xmm2{%k1}{z}
+	vmovdqu8	%xmm1, %xmm2{%k1}{z}
+	vmovdqu16	%xmm1, %xmm2{%k1}{z}
+	vmovdqu32	%xmm1, %xmm2{%k1}{z}
+	vmovdqu64	%xmm1, %xmm2{%k1}{z}
diff --git a/gas/testsuite/gas/i386/optimize-3.d b/gas/testsuite/gas/i386/optimize-3.d
index f251a3626d..cd43243b49 100644
--- a/gas/testsuite/gas/i386/optimize-3.d
+++ b/gas/testsuite/gas/i386/optimize-3.d
@@ -9,4 +9,10 @@ Disassembly of section .text:
 
 0+ <_start>:
  +[a-f0-9]+:	a9 7f 00 00 00       	test   \$0x7f,%eax
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-3.s b/gas/testsuite/gas/i386/optimize-3.s
index 536bf0cfb2..a70893c15d 100644
--- a/gas/testsuite/gas/i386/optimize-3.s
+++ b/gas/testsuite/gas/i386/optimize-3.s
@@ -4,3 +4,10 @@
 	.text
 _start:
 	{nooptimize} testl $0x7f, %eax
+
+	{nooptimize} vmovdqa32	%ymm1, %ymm2
+	{nooptimize} vmovdqa64	%ymm1, %ymm2
+	{nooptimize} vmovdqu8	%xmm1, %xmm2
+	{nooptimize} vmovdqu16	%xmm1, %xmm2
+	{nooptimize} vmovdqu32	%xmm1, %xmm2
+	{nooptimize} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/optimize-4.d b/gas/testsuite/gas/i386/optimize-4.d
index 9f99dadf34..2df84654d6 100644
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -62,6 +62,42 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-5.d b/gas/testsuite/gas/i386/optimize-5.d
index cfd0df04a4..ecc1ab139a 100644
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -62,6 +62,48 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-5.s b/gas/testsuite/gas/i386/optimize-5.s
index 66c762bd3b..77d60edb69 100644
--- a/gas/testsuite/gas/i386/optimize-5.s
+++ b/gas/testsuite/gas/i386/optimize-5.s
@@ -6,3 +6,10 @@
 
 	{evex} vandnpd %zmm1, %zmm1, %zmm5
 	{evex} vandnpd %ymm1, %ymm1, %ymm5
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.d b/gas/testsuite/gas/i386/x86-64-optimize-2.d
index f374619d4a..067df076f7 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.d
@@ -106,4 +106,52 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 e1 f5 08 fb c1    	vpsubq %xmm1,%xmm1,%xmm16
  +[a-f0-9]+:	62 b1 f5 40 fb c9    	vpsubq %zmm17,%zmm17,%zmm1
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.s b/gas/testsuite/gas/i386/x86-64-optimize-2.s
index 10ce788ffb..1275610e55 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.s
@@ -114,3 +114,59 @@ _start:
 	vpsubq %ymm1, %ymm1, %ymm16
 	vpsubq %zmm17, %zmm17, %zmm1
 	vpsubq %ymm17, %ymm17, %ymm1
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%xmm11, %xmm12
+	vmovdqa64	%xmm11, %xmm12
+	vmovdqu8	%xmm11, %xmm12
+	vmovdqu16	%xmm11, %xmm12
+	vmovdqu32	%xmm11, %xmm12
+	vmovdqu64	%xmm11, %xmm12
+
+	vmovdqa32	127(%rax), %xmm2
+	vmovdqa64	127(%rax), %xmm2
+	vmovdqu8	127(%rax), %xmm2
+	vmovdqu16	127(%rax), %xmm2
+	vmovdqu32	127(%rax), %xmm2
+	vmovdqu64	127(%rax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%rax)
+	vmovdqa64	%xmm1, 128(%rax)
+	vmovdqu8	%xmm1, 128(%rax)
+	vmovdqu16	%xmm1, 128(%rax)
+	vmovdqu32	%xmm1, 128(%rax)
+	vmovdqu64	%xmm1, 128(%rax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	%ymm11, %ymm12
+	vmovdqa64	%ymm11, %ymm12
+	vmovdqu8	%ymm11, %ymm12
+	vmovdqu16	%ymm11, %ymm12
+	vmovdqu32	%ymm11, %ymm12
+	vmovdqu64	%ymm11, %ymm12
+
+	vmovdqa32	127(%rax), %ymm2
+	vmovdqa64	127(%rax), %ymm2
+	vmovdqu8	127(%rax), %ymm2
+	vmovdqu16	127(%rax), %ymm2
+	vmovdqu32	127(%rax), %ymm2
+	vmovdqu64	127(%rax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%rax)
+	vmovdqa64	%ymm1, 128(%rax)
+	vmovdqu8	%ymm1, 128(%rax)
+	vmovdqu16	%ymm1, 128(%rax)
+	vmovdqu32	%ymm1, 128(%rax)
+	vmovdqu64	%ymm1, 128(%rax)
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.d b/gas/testsuite/gas/i386/x86-64-optimize-3.d
index b46f728dd8..35a53e0f4b 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d
@@ -24,4 +24,94 @@ Disassembly of section .text:
  +[a-f0-9]+:	41 f6 c1 7f          	test   \$0x7f,%r9b
  +[a-f0-9]+:	41 f6 c1 7f          	test   \$0x7f,%r9b
  +[a-f0-9]+:	41 f6 c1 7f          	test   \$0x7f,%r9b
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 b1 7d 08 6f d5    	vmovdqa32 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 fd 08 6f d5    	vmovdqa64 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 7f 08 6f d5    	vmovdqu8 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 ff 08 6f d5    	vmovdqu16 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 7e 08 6f d5    	vmovdqu32 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 fe 08 6f d5    	vmovdqu64 %xmm21,%xmm2
+ +[a-f0-9]+:	62 f1 7d 48 6f d1    	vmovdqa32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fd 48 6f d1    	vmovdqa64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7f 48 6f d1    	vmovdqu8 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 ff 48 6f d1    	vmovdqu16 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7e 48 6f d1    	vmovdqu32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fe 48 6f d1    	vmovdqu64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7d 29 6f d1    	vmovdqa32 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f d1    	vmovdqa64 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 6f 10    	vmovdqa32 \(%rax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f 10    	vmovdqa64 \(%rax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f 10    	vmovdqu8 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f 10    	vmovdqu16 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f 10    	vmovdqu32 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f 10    	vmovdqu64 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 7f 08    	vmovdqa32 %ymm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 7f 08    	vmovdqa64 %ymm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 7f 08    	vmovdqu8 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 7f 08    	vmovdqu16 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 7f 08    	vmovdqu32 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 7f 08    	vmovdqu64 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 89 6f d1    	vmovdqa32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fd 89 6f d1    	vmovdqa64 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7f 89 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 ff 89 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7e 89 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fe 89 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}\{z\}
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.s b/gas/testsuite/gas/i386/x86-64-optimize-3.s
index 61c150a87c..688f9623b2 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.s
@@ -19,3 +19,108 @@ _start:
 	test	$0x7f, %r9d
 	test	$0x7f, %r9w
 	test	$0x7f, %r9b
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%xmm11, %xmm12
+	vmovdqa64	%xmm11, %xmm12
+	vmovdqu8	%xmm11, %xmm12
+	vmovdqu16	%xmm11, %xmm12
+	vmovdqu32	%xmm11, %xmm12
+	vmovdqu64	%xmm11, %xmm12
+
+	vmovdqa32	127(%rax), %xmm2
+	vmovdqa64	127(%rax), %xmm2
+	vmovdqu8	127(%rax), %xmm2
+	vmovdqu16	127(%rax), %xmm2
+	vmovdqu32	127(%rax), %xmm2
+	vmovdqu64	127(%rax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%rax)
+	vmovdqa64	%xmm1, 128(%rax)
+	vmovdqu8	%xmm1, 128(%rax)
+	vmovdqu16	%xmm1, 128(%rax)
+	vmovdqu32	%xmm1, 128(%rax)
+	vmovdqu64	%xmm1, 128(%rax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	%ymm11, %ymm12
+	vmovdqa64	%ymm11, %ymm12
+	vmovdqu8	%ymm11, %ymm12
+	vmovdqu16	%ymm11, %ymm12
+	vmovdqu32	%ymm11, %ymm12
+	vmovdqu64	%ymm11, %ymm12
+
+	vmovdqa32	127(%rax), %ymm2
+	vmovdqa64	127(%rax), %ymm2
+	vmovdqu8	127(%rax), %ymm2
+	vmovdqu16	127(%rax), %ymm2
+	vmovdqu32	127(%rax), %ymm2
+	vmovdqu64	127(%rax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%rax)
+	vmovdqa64	%ymm1, 128(%rax)
+	vmovdqu8	%ymm1, 128(%rax)
+	vmovdqu16	%ymm1, 128(%rax)
+	vmovdqu32	%ymm1, 128(%rax)
+	vmovdqu64	%ymm1, 128(%rax)
+
+	vmovdqa32	%xmm21, %xmm2
+	vmovdqa64	%xmm21, %xmm2
+	vmovdqu8	%xmm21, %xmm2
+	vmovdqu16	%xmm21, %xmm2
+	vmovdqu32	%xmm21, %xmm2
+	vmovdqu64	%xmm21, %xmm2
+
+	vmovdqa32	%zmm1, %zmm2
+	vmovdqa64	%zmm1, %zmm2
+	vmovdqu8	%zmm1, %zmm2
+	vmovdqu16	%zmm1, %zmm2
+	vmovdqu32	%zmm1, %zmm2
+	vmovdqu64	%zmm1, %zmm2
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%ymm1, %ymm2{%k1}
+	vmovdqa64	%ymm1, %ymm2{%k1}
+	vmovdqu8	%xmm1, %xmm2{%k1}
+	vmovdqu16	%xmm1, %xmm2{%k1}
+	vmovdqu32	%xmm1, %xmm2{%k1}
+	vmovdqu64	%xmm1, %xmm2{%k1}
+
+	vmovdqa32	(%rax), %ymm2{%k1}
+	vmovdqa64	(%rax), %ymm2{%k1}
+	vmovdqu8	(%rax), %xmm2{%k1}
+	vmovdqu16	(%rax), %xmm2{%k1}
+	vmovdqu32	(%rax), %xmm2{%k1}
+	vmovdqu64	(%rax), %xmm2{%k1}
+
+	vmovdqa32	%ymm1, (%rax){%k1}
+	vmovdqa64	%ymm1, (%rax){%k1}
+	vmovdqu8	%xmm1, (%rax){%k1}
+	vmovdqu16	%xmm1, (%rax){%k1}
+	vmovdqu32	%xmm1, (%rax){%k1}
+	vmovdqu64	%xmm1, (%rax){%k1}
+
+	vmovdqa32	%xmm1, %xmm2{%k1}{z}
+	vmovdqa64	%xmm1, %xmm2{%k1}{z}
+	vmovdqu8	%xmm1, %xmm2{%k1}{z}
+	vmovdqu16	%xmm1, %xmm2{%k1}{z}
+	vmovdqu32	%xmm1, %xmm2{%k1}{z}
+	vmovdqu64	%xmm1, %xmm2{%k1}{z}
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-4.d b/gas/testsuite/gas/i386/x86-64-optimize-4.d
index 10e7b02d3a..18fdeb1442 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-4.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-4.d
@@ -9,4 +9,10 @@ Disassembly of section .text:
 
 0+ <_start>:
  +[a-f0-9]+:	a9 7f 00 00 00       	test   \$0x7f,%eax
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-4.s b/gas/testsuite/gas/i386/x86-64-optimize-4.s
index 0c4fdcecc5..b6d872db2c 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-4.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-4.s
@@ -4,3 +4,10 @@
 	.text
 _start:
 	{nooptimize} testl $0x7f, %eax
+
+	{nooptimize} vmovdqa32	%ymm1, %ymm2
+	{nooptimize} vmovdqa64	%ymm1, %ymm2
+	{nooptimize} vmovdqu8	%xmm1, %xmm2
+	{nooptimize} vmovdqu16	%xmm1, %xmm2
+	{nooptimize} vmovdqu32	%xmm1, %xmm2
+	{nooptimize} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-5.d b/gas/testsuite/gas/i386/x86-64-optimize-5.d
index 085f7f29f2..012237df57 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-5.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-5.d
@@ -106,6 +106,60 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 e1 f5 08 fb c1    	vpsubq %xmm1,%xmm1,%xmm16
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-5.s b/gas/testsuite/gas/i386/x86-64-optimize-5.s
index 6b4ff103ab..9756ae815c 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-5.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-5.s
@@ -4,3 +4,10 @@
 
 	{evex} vandnpd %zmm1, %zmm1, %zmm5
 	{evex} vandnpd %ymm1, %ymm1, %ymm5
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-6.d b/gas/testsuite/gas/i386/x86-64-optimize-6.d
index 0d52c8fcbb..aca119e4f9 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-6.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-6.d
@@ -106,6 +106,60 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 e1 f5 08 fb c1    	vpsubq %xmm1,%xmm1,%xmm16
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-6.s b/gas/testsuite/gas/i386/x86-64-optimize-6.s
index 70ccbc41be..7c403fcc86 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-6.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-6.s
@@ -6,3 +6,10 @@
 
 	{evex} vandnpd %zmm1, %zmm1, %zmm5
 	{evex} vandnpd %ymm1, %ymm1, %ymm5
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
diff --git a/opcodes/i386-opc.tbl b/opcodes/i386-opc.tbl
index 1194dcd1c0..26a68d8cbe 100644
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -3709,11 +3709,11 @@ vmovd, 2, 0x666E, None, 1, CpuAVX512F, D|Modrm|EVex=2|VexOpcode=0|Disp8MemShift=
 
 vmovddup, 2, 0xF212, None, 1, CpuAVX512F, Modrm|Masking=3|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegYMM|RegZMM|Unspecified|BaseIndex, RegYMM|RegZMM }
 
-vmovdqa64, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
-vmovdqa32, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqa64, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqa32, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vmovntdq, 2, 0x66E7, None, 1, CpuAVX512F, Modrm|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM, XMMword|YMMword|ZMMword|Unspecified|BaseIndex }
-vmovdqu32, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
-vmovdqu64, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu32, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu64, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 
 vmovhlps, 3, 0x12, None, 1, CpuAVX512F, Modrm|EVex=4|VexOpcode=0|VexVVVV=1|VexW=1|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM, RegXMM, RegXMM }
 vmovlhps, 3, 0x16, None, 1, CpuAVX512F, Modrm|EVex=4|VexOpcode=0|VexVVVV=1|VexW=1|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM, RegXMM, RegXMM }
@@ -4190,8 +4190,8 @@ kshiftrq, 3, 0x6631, None, 1, CpuAVX512BW, Modrm|Vex=1|VexOpcode=2|VexW=2|No_bSu
 
 vdbpsadbw, 4, 0x6642, None, 1, CpuAVX512BW, Modrm|Masking=3|VexOpcode=2|VexVVVV=1|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Imm8, RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
-vmovdqu8, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
-vmovdqu16, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu8, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu16, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 
 vpabsb, 2, 0x661C, None, 1, CpuAVX512BW, Modrm|Masking=3|VexOpcode=1|VexWIG|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vpmaxsb, 3, 0x663C, None, 1, CpuAVX512BW, Modrm|Masking=3|VexOpcode=1|VexWIG|VexVVVV=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
diff --git a/opcodes/i386-tbl.h b/opcodes/i386-tbl.h
index 81575df3f2..bd33eb5ce5 100644
--- a/opcodes/i386-tbl.h
+++ b/opcodes/i386-tbl.h
@@ -60123,7 +60123,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -60139,7 +60139,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -60155,7 +60155,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -60171,7 +60171,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -63555,7 +63555,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -63571,7 +63571,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
-- 
2.20.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* V2 [PATCH] x86: Optimize EVEX vector load/store instructions
  2019-03-15 23:58 [PATCH] x86: Optimize EVEX vector load/store instructions H.J. Lu
@ 2019-03-17 20:47 ` H.J. Lu
  2019-03-18 13:49   ` Jan Beulich
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-03-17 20:47 UTC (permalink / raw)
  To: binutils

On Sat, Mar 16, 2019 at 07:54:14AM +0800, H.J. Lu wrote:
> When there is no write mask, we can encode lower 16 128-bit/256-bit
> vector register load and store instructions as VEX vector register
> load and store instructions with -O2.
> 
> gas/
> 
> 	PR gas/24348
> 	* config/tc-i386.c (optimize_encoding): Encode EVEX 128-bit and
> 	256-bit vector register load/store instructions as VEX vector
> 	register load/store instructions for -O2.
> 	(md_parse_option): Set optimize to INT_MAX for -Os.
> 	* doc/c-i386.texi: Update -O2 documentation.
> 

This is the patch I am checking in.


H.J.
----
When there is no write mask, EVEX load and store instructions that use
only the lower 16 128-bit/256-bit vector registers can be encoded as
VEX vector load and store instructions with -O1.

gas/

	PR gas/24348
	* config/tc-i386.c (optimize_encoding): Encode EVEX 128-bit and
	256-bit vector register load/store instructions as VEX vector
	register load/store instructions for -O1.
	* doc/c-i386.texi: Update -O1 documentation.
	* testsuite/gas/i386/i386.exp: Run PR gas/24348 tests.
	* testsuite/gas/i386/optimize-1.s: Add tests for EVEX vector
	load/store instructions.
	* testsuite/gas/i386/optimize-2.s: Likewise.
	* testsuite/gas/i386/optimize-3.s: Likewise.
	* testsuite/gas/i386/optimize-5.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-3.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-4.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-5.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-6.s: Likewise.
	* testsuite/gas/i386/optimize-1.d: Updated.
	* testsuite/gas/i386/optimize-2.d: Likewise.
	* testsuite/gas/i386/optimize-3.d: Likewise.
	* testsuite/gas/i386/optimize-4.d: Likewise.
	* testsuite/gas/i386/optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-3.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-4.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-6.d: Likewise.
	* testsuite/gas/i386/optimize-7.d: New file.
	* testsuite/gas/i386/optimize-7.s: Likewise.
	* testsuite/gas/i386/x86-64-optimize-8.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-8.s: Likewise.

opcodes/

	PR gas/24348
	* i386-opc.tbl: Add Optimize to vmovdqa32, vmovdqa64, vmovdqu8,
	vmovdqu16, vmovdqu32 and vmovdqu64.
	* i386-tbl.h: Regenerated.
---
 gas/config/tc-i386.c                        |  50 ++++++++++
 gas/doc/c-i386.texi                         |   4 +-
 gas/testsuite/gas/i386/i386.exp             |   2 +
 gas/testsuite/gas/i386/optimize-1.d         |  36 +++++++
 gas/testsuite/gas/i386/optimize-1.s         |  42 ++++++++
 gas/testsuite/gas/i386/optimize-1a.d        |  36 +++++++
 gas/testsuite/gas/i386/optimize-2.d         |  72 ++++++++++++++
 gas/testsuite/gas/i386/optimize-2.s         |  84 ++++++++++++++++
 gas/testsuite/gas/i386/optimize-3.d         |   6 ++
 gas/testsuite/gas/i386/optimize-3.s         |   7 ++
 gas/testsuite/gas/i386/optimize-4.d         |  36 +++++++
 gas/testsuite/gas/i386/optimize-5.d         |  42 ++++++++
 gas/testsuite/gas/i386/optimize-5.s         |   7 ++
 gas/testsuite/gas/i386/optimize-7.d         |  12 +++
 gas/testsuite/gas/i386/optimize-7.s         |   6 ++
 gas/testsuite/gas/i386/x86-64-optimize-2.d  |  48 +++++++++
 gas/testsuite/gas/i386/x86-64-optimize-2.s  |  56 +++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-2a.d |  48 +++++++++
 gas/testsuite/gas/i386/x86-64-optimize-3.d  |  90 +++++++++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-3.s  | 105 ++++++++++++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-4.d  |   6 ++
 gas/testsuite/gas/i386/x86-64-optimize-4.s  |   7 ++
 gas/testsuite/gas/i386/x86-64-optimize-5.d  |  54 ++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-5.s  |   7 ++
 gas/testsuite/gas/i386/x86-64-optimize-6.d  |  54 ++++++++++
 gas/testsuite/gas/i386/x86-64-optimize-6.s  |   7 ++
 gas/testsuite/gas/i386/x86-64-optimize-8.d  |  12 +++
 gas/testsuite/gas/i386/x86-64-optimize-8.s  |   6 ++
 opcodes/i386-opc.tbl                        |  12 +--
 opcodes/i386-tbl.h                          |  12 +--
 30 files changed, 953 insertions(+), 13 deletions(-)
 create mode 100644 gas/testsuite/gas/i386/optimize-7.d
 create mode 100644 gas/testsuite/gas/i386/optimize-7.s
 create mode 100644 gas/testsuite/gas/i386/x86-64-optimize-8.d
 create mode 100644 gas/testsuite/gas/i386/x86-64-optimize-8.s

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index 856c18d672..fa060759ae 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4075,6 +4075,56 @@ optimize_encoding (void)
 	    i.types[j].bitfield.ymmword = 0;
 	  }
     }
+  else if ((cpu_arch_flags.bitfield.cpuavx
+	    || cpu_arch_isa_flags.bitfield.cpuavx)
+	   && i.vec_encoding != vex_encoding_evex
+	   && !i.types[0].bitfield.zmmword
+	   && !i.mask
+	   && is_evex_encoding (&i.tm)
+	   && (i.tm.base_opcode == 0x666f
+	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
+	       || i.tm.base_opcode == 0xf36f
+	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
+	       || i.tm.base_opcode == 0xf26f
+	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+	   && i.tm.extension_opcode == None)
+    {
+      /* Optimize: -O1:
+	   VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
+	   vmovdqu32 and vmovdqu64:
+	     EVEX VOP %xmmM, %xmmN
+	       -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
+	     EVEX VOP %ymmM, %ymmN
+	       -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
+	     EVEX VOP %xmmM, mem
+	       -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
+	     EVEX VOP %ymmM, mem
+	       -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
+	     EVEX VOP mem, %xmmN
+	       -> VEX vmovdqa|vmovdqu mem, %xmmN (N < 16)
+	     EVEX VOP mem, %ymmN
+	       -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
+       */
+      if (i.tm.base_opcode == 0xf26f)
+	i.tm.base_opcode = 0xf36f;
+      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+	i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
+      i.tm.opcode_modifier.vex
+	= i.types[0].bitfield.ymmword ? VEX256 : VEX128;
+      i.tm.opcode_modifier.vexw = VEXW0;
+      i.tm.opcode_modifier.evex = 0;
+      i.tm.opcode_modifier.masking = 0;
+      i.tm.opcode_modifier.disp8memshift = 0;
+      i.memshift = 0;
+      for (j = 0; j < 2; j++)
+	if (operand_type_check (i.types[j], disp)
+	    && i.op[j].disps->X_op == O_constant)
+	  {
+	    i.types[j].bitfield.disp8
+	      = fits_in_disp8 (i.op[j].disps->X_add_number);
+	    break;
+	  }
+    }
 }
 
 /* This is the guts of the machine-dependent assembler.  LINE points to a
diff --git a/gas/doc/c-i386.texi b/gas/doc/c-i386.texi
index 7e5f5c257e..4acd5ff616 100644
--- a/gas/doc/c-i386.texi
+++ b/gas/doc/c-i386.texi
@@ -456,7 +456,9 @@ immediate as 32-bit register load instructions with 31-bit or 32-bits
 immediates, encode 64-bit register clearing instructions with 32-bit
 register clearing instructions and encode 256-bit/512-bit VEX/EVEX
 vector register clearing instructions with 128-bit VEX vector register
-clearing instructions.  @samp{-O2} includes @samp{-O1} optimization plus
+clearing instructions as well as encode 128-bit/256-bit EVEX vector
+register load/store instructions with VEX vector register load/store
+instructions.  @samp{-O2} includes @samp{-O1} optimization plus
 encodes 256-bit/512-bit EVEX vector register clearing instructions with
 128-bit EVEX vector register clearing instructions.
 @samp{-Os} includes @samp{-O2} optimization plus encodes 16-bit, 32-bit
diff --git a/gas/testsuite/gas/i386/i386.exp b/gas/testsuite/gas/i386/i386.exp
index 798bfb564a..3067b4a1f1 100644
--- a/gas/testsuite/gas/i386/i386.exp
+++ b/gas/testsuite/gas/i386/i386.exp
@@ -476,6 +476,7 @@ if [expr ([istarget "i*86-*-*"] ||  [istarget "x86_64-*-*"]) && [gas_32_check]]
     run_dump_test "optimize-6a"
     run_dump_test "optimize-6b"
     run_dump_test "optimize-6c"
+    run_dump_test "optimize-7"
 
     # These tests require support for 8 and 16 bit relocs,
     # so we only run them for ELF and COFF targets.
@@ -990,6 +991,7 @@ if [expr ([istarget "i*86-*-*"] || [istarget "x86_64-*-*"]) && [gas_64_check]] t
     run_dump_test "x86-64-optimize-7a"
     run_dump_test "x86-64-optimize-7b"
     run_dump_test "x86-64-optimize-7c"
+    run_dump_test "x86-64-optimize-8"
 
     if { ![istarget "*-*-aix*"]
       && ![istarget "*-*-beos*"]
diff --git a/gas/testsuite/gas/i386/optimize-1.d b/gas/testsuite/gas/i386/optimize-1.d
index 4358c19c21..70c802c002 100644
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -62,4 +62,40 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-1.s b/gas/testsuite/gas/i386/optimize-1.s
index f61a176de8..6dcfbc2799 100644
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -72,3 +72,45 @@ _start:
 
 	kandnd %k1, %k1, %k5
 	kandnq %k1, %k1, %k5
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	127(%eax), %xmm2
+	vmovdqa64	127(%eax), %xmm2
+	vmovdqu8	127(%eax), %xmm2
+	vmovdqu16	127(%eax), %xmm2
+	vmovdqu32	127(%eax), %xmm2
+	vmovdqu64	127(%eax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%eax)
+	vmovdqa64	%xmm1, 128(%eax)
+	vmovdqu8	%xmm1, 128(%eax)
+	vmovdqu16	%xmm1, 128(%eax)
+	vmovdqu32	%xmm1, 128(%eax)
+	vmovdqu64	%xmm1, 128(%eax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	127(%eax), %ymm2
+	vmovdqa64	127(%eax), %ymm2
+	vmovdqu8	127(%eax), %ymm2
+	vmovdqu16	127(%eax), %ymm2
+	vmovdqu32	127(%eax), %ymm2
+	vmovdqu64	127(%eax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%eax)
+	vmovdqa64	%ymm1, 128(%eax)
+	vmovdqu8	%ymm1, 128(%eax)
+	vmovdqu16	%ymm1, 128(%eax)
+	vmovdqu32	%ymm1, 128(%eax)
+	vmovdqu64	%ymm1, 128(%eax)
diff --git a/gas/testsuite/gas/i386/optimize-1a.d b/gas/testsuite/gas/i386/optimize-1a.d
index e6e6d81fe4..cee2383d84 100644
--- a/gas/testsuite/gas/i386/optimize-1a.d
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -63,4 +63,40 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-2.d b/gas/testsuite/gas/i386/optimize-2.d
index e8a516997a..19467f5c01 100644
--- a/gas/testsuite/gas/i386/optimize-2.d
+++ b/gas/testsuite/gas/i386/optimize-2.d
@@ -17,4 +17,76 @@ Disassembly of section .text:
  +[a-f0-9]+:	f7 c7 7f 00 00 00    	test   \$0x7f,%edi
  +[a-f0-9]+:	66 f7 c7 7f 00       	test   \$0x7f,%di
  +[a-f0-9]+:	c5 f1 55 e9          	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f d1    	vmovdqa32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fd 48 6f d1    	vmovdqa64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7f 48 6f d1    	vmovdqu8 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 ff 48 6f d1    	vmovdqu16 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7e 48 6f d1    	vmovdqu32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fe 48 6f d1    	vmovdqu64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7d 29 6f d1    	vmovdqa32 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f d1    	vmovdqa64 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 6f 10    	vmovdqa32 \(%eax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f 10    	vmovdqa64 \(%eax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f 10    	vmovdqu8 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f 10    	vmovdqu16 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f 10    	vmovdqu32 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f 10    	vmovdqu64 \(%eax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 7f 08    	vmovdqa32 %ymm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 7f 08    	vmovdqa64 %ymm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 7f 08    	vmovdqu8 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 7f 08    	vmovdqu16 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 7f 08    	vmovdqu32 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 7f 08    	vmovdqu64 %xmm1,\(%eax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 89 6f d1    	vmovdqa32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fd 89 6f d1    	vmovdqa64 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7f 89 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 ff 89 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7e 89 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fe 89 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}\{z\}
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-2.s b/gas/testsuite/gas/i386/optimize-2.s
index c9b57a8dd1..0a4fb23167 100644
--- a/gas/testsuite/gas/i386/optimize-2.s
+++ b/gas/testsuite/gas/i386/optimize-2.s
@@ -13,3 +13,87 @@ _start:
 	test	$0x7f, %di
 
 	vandnpd	%zmm1, %zmm1, %zmm5
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	127(%eax), %xmm2
+	vmovdqa64	127(%eax), %xmm2
+	vmovdqu8	127(%eax), %xmm2
+	vmovdqu16	127(%eax), %xmm2
+	vmovdqu32	127(%eax), %xmm2
+	vmovdqu64	127(%eax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%eax)
+	vmovdqa64	%xmm1, 128(%eax)
+	vmovdqu8	%xmm1, 128(%eax)
+	vmovdqu16	%xmm1, 128(%eax)
+	vmovdqu32	%xmm1, 128(%eax)
+	vmovdqu64	%xmm1, 128(%eax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	127(%eax), %ymm2
+	vmovdqa64	127(%eax), %ymm2
+	vmovdqu8	127(%eax), %ymm2
+	vmovdqu16	127(%eax), %ymm2
+	vmovdqu32	127(%eax), %ymm2
+	vmovdqu64	127(%eax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%eax)
+	vmovdqa64	%ymm1, 128(%eax)
+	vmovdqu8	%ymm1, 128(%eax)
+	vmovdqu16	%ymm1, 128(%eax)
+	vmovdqu32	%ymm1, 128(%eax)
+	vmovdqu64	%ymm1, 128(%eax)
+
+	vmovdqa32	%zmm1, %zmm2
+	vmovdqa64	%zmm1, %zmm2
+	vmovdqu8	%zmm1, %zmm2
+	vmovdqu16	%zmm1, %zmm2
+	vmovdqu32	%zmm1, %zmm2
+	vmovdqu64	%zmm1, %zmm2
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%ymm1, %ymm2{%k1}
+	vmovdqa64	%ymm1, %ymm2{%k1}
+	vmovdqu8	%xmm1, %xmm2{%k1}
+	vmovdqu16	%xmm1, %xmm2{%k1}
+	vmovdqu32	%xmm1, %xmm2{%k1}
+	vmovdqu64	%xmm1, %xmm2{%k1}
+
+	vmovdqa32	(%eax), %ymm2{%k1}
+	vmovdqa64	(%eax), %ymm2{%k1}
+	vmovdqu8	(%eax), %xmm2{%k1}
+	vmovdqu16	(%eax), %xmm2{%k1}
+	vmovdqu32	(%eax), %xmm2{%k1}
+	vmovdqu64	(%eax), %xmm2{%k1}
+
+	vmovdqa32	%ymm1, (%eax){%k1}
+	vmovdqa64	%ymm1, (%eax){%k1}
+	vmovdqu8	%xmm1, (%eax){%k1}
+	vmovdqu16	%xmm1, (%eax){%k1}
+	vmovdqu32	%xmm1, (%eax){%k1}
+	vmovdqu64	%xmm1, (%eax){%k1}
+
+	vmovdqa32	%xmm1, %xmm2{%k1}{z}
+	vmovdqa64	%xmm1, %xmm2{%k1}{z}
+	vmovdqu8	%xmm1, %xmm2{%k1}{z}
+	vmovdqu16	%xmm1, %xmm2{%k1}{z}
+	vmovdqu32	%xmm1, %xmm2{%k1}{z}
+	vmovdqu64	%xmm1, %xmm2{%k1}{z}
diff --git a/gas/testsuite/gas/i386/optimize-3.d b/gas/testsuite/gas/i386/optimize-3.d
index f251a3626d..cd43243b49 100644
--- a/gas/testsuite/gas/i386/optimize-3.d
+++ b/gas/testsuite/gas/i386/optimize-3.d
@@ -9,4 +9,10 @@ Disassembly of section .text:
 
 0+ <_start>:
  +[a-f0-9]+:	a9 7f 00 00 00       	test   \$0x7f,%eax
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-3.s b/gas/testsuite/gas/i386/optimize-3.s
index 536bf0cfb2..a70893c15d 100644
--- a/gas/testsuite/gas/i386/optimize-3.s
+++ b/gas/testsuite/gas/i386/optimize-3.s
@@ -4,3 +4,10 @@
 	.text
 _start:
 	{nooptimize} testl $0x7f, %eax
+
+	{nooptimize} vmovdqa32	%ymm1, %ymm2
+	{nooptimize} vmovdqa64	%ymm1, %ymm2
+	{nooptimize} vmovdqu8	%xmm1, %xmm2
+	{nooptimize} vmovdqu16	%xmm1, %xmm2
+	{nooptimize} vmovdqu32	%xmm1, %xmm2
+	{nooptimize} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/optimize-4.d b/gas/testsuite/gas/i386/optimize-4.d
index 9f99dadf34..2df84654d6 100644
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -62,6 +62,42 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-5.d b/gas/testsuite/gas/i386/optimize-5.d
index cfd0df04a4..ecc1ab139a 100644
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -62,6 +62,48 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 f4 47 e9          	kxorw  %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
  +[a-f0-9]+:	c5 f4 42 e9          	kandnw %k1,%k1,%k5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-5.s b/gas/testsuite/gas/i386/optimize-5.s
index 66c762bd3b..77d60edb69 100644
--- a/gas/testsuite/gas/i386/optimize-5.s
+++ b/gas/testsuite/gas/i386/optimize-5.s
@@ -6,3 +6,10 @@
 
 	{evex} vandnpd %zmm1, %zmm1, %zmm5
 	{evex} vandnpd %ymm1, %ymm1, %ymm5
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/optimize-7.d b/gas/testsuite/gas/i386/optimize-7.d
new file mode 100644
index 0000000000..92ca7a6c75
--- /dev/null
+++ b/gas/testsuite/gas/i386/optimize-7.d
@@ -0,0 +1,12 @@
+#as: -O2 -march=+noavx
+#objdump: -drw
+#name: optimized encoding 7 with -O2
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+#pass
diff --git a/gas/testsuite/gas/i386/optimize-7.s b/gas/testsuite/gas/i386/optimize-7.s
new file mode 100644
index 0000000000..261b4afa27
--- /dev/null
+++ b/gas/testsuite/gas/i386/optimize-7.s
@@ -0,0 +1,6 @@
+# Check instructions with optimized encoding
+
+	.allow_index_reg
+	.text
+_start:
+	vmovdqa32	%ymm1, %ymm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.d b/gas/testsuite/gas/i386/x86-64-optimize-2.d
index fa031e8893..7d7340fae0 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.d
@@ -106,4 +106,52 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 e1 f5 08 fb c1    	vpsubq %xmm1,%xmm1,%xmm16
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.s b/gas/testsuite/gas/i386/x86-64-optimize-2.s
index 10ce788ffb..1275610e55 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.s
@@ -114,3 +114,59 @@ _start:
 	vpsubq %ymm1, %ymm1, %ymm16
 	vpsubq %zmm17, %zmm17, %zmm1
 	vpsubq %ymm17, %ymm17, %ymm1
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%xmm11, %xmm12
+	vmovdqa64	%xmm11, %xmm12
+	vmovdqu8	%xmm11, %xmm12
+	vmovdqu16	%xmm11, %xmm12
+	vmovdqu32	%xmm11, %xmm12
+	vmovdqu64	%xmm11, %xmm12
+
+	vmovdqa32	127(%rax), %xmm2
+	vmovdqa64	127(%rax), %xmm2
+	vmovdqu8	127(%rax), %xmm2
+	vmovdqu16	127(%rax), %xmm2
+	vmovdqu32	127(%rax), %xmm2
+	vmovdqu64	127(%rax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%rax)
+	vmovdqa64	%xmm1, 128(%rax)
+	vmovdqu8	%xmm1, 128(%rax)
+	vmovdqu16	%xmm1, 128(%rax)
+	vmovdqu32	%xmm1, 128(%rax)
+	vmovdqu64	%xmm1, 128(%rax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	%ymm11, %ymm12
+	vmovdqa64	%ymm11, %ymm12
+	vmovdqu8	%ymm11, %ymm12
+	vmovdqu16	%ymm11, %ymm12
+	vmovdqu32	%ymm11, %ymm12
+	vmovdqu64	%ymm11, %ymm12
+
+	vmovdqa32	127(%rax), %ymm2
+	vmovdqa64	127(%rax), %ymm2
+	vmovdqu8	127(%rax), %ymm2
+	vmovdqu16	127(%rax), %ymm2
+	vmovdqu32	127(%rax), %ymm2
+	vmovdqu64	127(%rax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%rax)
+	vmovdqa64	%ymm1, 128(%rax)
+	vmovdqu8	%ymm1, 128(%rax)
+	vmovdqu16	%ymm1, 128(%rax)
+	vmovdqu32	%ymm1, 128(%rax)
+	vmovdqu64	%ymm1, 128(%rax)
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2a.d b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
index 9c6466d4ae..532a1458bc 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2a.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
@@ -107,4 +107,52 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 e1 f5 28 fb c1    	vpsubq %ymm1,%ymm1,%ymm16
  +[a-f0-9]+:	62 b1 f5 40 fb c9    	vpsubq %zmm17,%zmm17,%zmm1
  +[a-f0-9]+:	62 b1 f5 20 fb c9    	vpsubq %ymm17,%ymm17,%ymm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.d b/gas/testsuite/gas/i386/x86-64-optimize-3.d
index f85c0af05e..74336a4fe2 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d
@@ -25,4 +25,94 @@ Disassembly of section .text:
  +[a-f0-9]+:	41 f6 c1 7f          	test   \$0x7f,%r9b
  +[a-f0-9]+:	41 f6 c1 7f          	test   \$0x7f,%r9b
  +[a-f0-9]+:	c5 f1 55 e9          	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 b1 7d 08 6f d5    	vmovdqa32 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 fd 08 6f d5    	vmovdqa64 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 7f 08 6f d5    	vmovdqu8 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 ff 08 6f d5    	vmovdqu16 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 7e 08 6f d5    	vmovdqu32 %xmm21,%xmm2
+ +[a-f0-9]+:	62 b1 fe 08 6f d5    	vmovdqu64 %xmm21,%xmm2
+ +[a-f0-9]+:	62 f1 7d 48 6f d1    	vmovdqa32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fd 48 6f d1    	vmovdqa64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7f 48 6f d1    	vmovdqu8 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 ff 48 6f d1    	vmovdqu16 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7e 48 6f d1    	vmovdqu32 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 fe 48 6f d1    	vmovdqu64 %zmm1,%zmm2
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7d 29 6f d1    	vmovdqa32 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f d1    	vmovdqa64 %ymm1,%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 6f 10    	vmovdqa32 \(%rax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 6f 10    	vmovdqa64 \(%rax\),%ymm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 6f 10    	vmovdqu8 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 6f 10    	vmovdqu16 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 6f 10    	vmovdqu32 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 6f 10    	vmovdqu64 \(%rax\),%xmm2\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 29 7f 08    	vmovdqa32 %ymm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fd 29 7f 08    	vmovdqa64 %ymm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7f 09 7f 08    	vmovdqu8 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 ff 09 7f 08    	vmovdqu16 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7e 09 7f 08    	vmovdqu32 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 fe 09 7f 08    	vmovdqu64 %xmm1,\(%rax\)\{%k1\}
+ +[a-f0-9]+:	62 f1 7d 89 6f d1    	vmovdqa32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fd 89 6f d1    	vmovdqa64 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7f 89 6f d1    	vmovdqu8 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 ff 89 6f d1    	vmovdqu16 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 7e 89 6f d1    	vmovdqu32 %xmm1,%xmm2\{%k1\}\{z\}
+ +[a-f0-9]+:	62 f1 fe 89 6f d1    	vmovdqu64 %xmm1,%xmm2\{%k1\}\{z\}
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.s b/gas/testsuite/gas/i386/x86-64-optimize-3.s
index 4a52a25ddd..d9c2eb86cb 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.s
@@ -21,3 +21,108 @@ _start:
 	test	$0x7f, %r9b
 
 	vandnpd	%zmm1, %zmm1, %zmm5
+
+	vmovdqa32	%xmm1, %xmm2
+	vmovdqa64	%xmm1, %xmm2
+	vmovdqu8	%xmm1, %xmm2
+	vmovdqu16	%xmm1, %xmm2
+	vmovdqu32	%xmm1, %xmm2
+	vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%xmm11, %xmm12
+	vmovdqa64	%xmm11, %xmm12
+	vmovdqu8	%xmm11, %xmm12
+	vmovdqu16	%xmm11, %xmm12
+	vmovdqu32	%xmm11, %xmm12
+	vmovdqu64	%xmm11, %xmm12
+
+	vmovdqa32	127(%rax), %xmm2
+	vmovdqa64	127(%rax), %xmm2
+	vmovdqu8	127(%rax), %xmm2
+	vmovdqu16	127(%rax), %xmm2
+	vmovdqu32	127(%rax), %xmm2
+	vmovdqu64	127(%rax), %xmm2
+
+	vmovdqa32	%xmm1, 128(%rax)
+	vmovdqa64	%xmm1, 128(%rax)
+	vmovdqu8	%xmm1, 128(%rax)
+	vmovdqu16	%xmm1, 128(%rax)
+	vmovdqu32	%xmm1, 128(%rax)
+	vmovdqu64	%xmm1, 128(%rax)
+
+	vmovdqa32	%ymm1, %ymm2
+	vmovdqa64	%ymm1, %ymm2
+	vmovdqu8	%ymm1, %ymm2
+	vmovdqu16	%ymm1, %ymm2
+	vmovdqu32	%ymm1, %ymm2
+	vmovdqu64	%ymm1, %ymm2
+
+	vmovdqa32	%ymm11, %ymm12
+	vmovdqa64	%ymm11, %ymm12
+	vmovdqu8	%ymm11, %ymm12
+	vmovdqu16	%ymm11, %ymm12
+	vmovdqu32	%ymm11, %ymm12
+	vmovdqu64	%ymm11, %ymm12
+
+	vmovdqa32	127(%rax), %ymm2
+	vmovdqa64	127(%rax), %ymm2
+	vmovdqu8	127(%rax), %ymm2
+	vmovdqu16	127(%rax), %ymm2
+	vmovdqu32	127(%rax), %ymm2
+	vmovdqu64	127(%rax), %ymm2
+
+	vmovdqa32	%ymm1, 128(%rax)
+	vmovdqa64	%ymm1, 128(%rax)
+	vmovdqu8	%ymm1, 128(%rax)
+	vmovdqu16	%ymm1, 128(%rax)
+	vmovdqu32	%ymm1, 128(%rax)
+	vmovdqu64	%ymm1, 128(%rax)
+
+	vmovdqa32	%xmm21, %xmm2
+	vmovdqa64	%xmm21, %xmm2
+	vmovdqu8	%xmm21, %xmm2
+	vmovdqu16	%xmm21, %xmm2
+	vmovdqu32	%xmm21, %xmm2
+	vmovdqu64	%xmm21, %xmm2
+
+	vmovdqa32	%zmm1, %zmm2
+	vmovdqa64	%zmm1, %zmm2
+	vmovdqu8	%zmm1, %zmm2
+	vmovdqu16	%zmm1, %zmm2
+	vmovdqu32	%zmm1, %zmm2
+	vmovdqu64	%zmm1, %zmm2
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
+
+	vmovdqa32	%ymm1, %ymm2{%k1}
+	vmovdqa64	%ymm1, %ymm2{%k1}
+	vmovdqu8	%xmm1, %xmm2{%k1}
+	vmovdqu16	%xmm1, %xmm2{%k1}
+	vmovdqu32	%xmm1, %xmm2{%k1}
+	vmovdqu64	%xmm1, %xmm2{%k1}
+
+	vmovdqa32	(%rax), %ymm2{%k1}
+	vmovdqa64	(%rax), %ymm2{%k1}
+	vmovdqu8	(%rax), %xmm2{%k1}
+	vmovdqu16	(%rax), %xmm2{%k1}
+	vmovdqu32	(%rax), %xmm2{%k1}
+	vmovdqu64	(%rax), %xmm2{%k1}
+
+	vmovdqa32	%ymm1, (%rax){%k1}
+	vmovdqa64	%ymm1, (%rax){%k1}
+	vmovdqu8	%xmm1, (%rax){%k1}
+	vmovdqu16	%xmm1, (%rax){%k1}
+	vmovdqu32	%xmm1, (%rax){%k1}
+	vmovdqu64	%xmm1, (%rax){%k1}
+
+	vmovdqa32	%xmm1, %xmm2{%k1}{z}
+	vmovdqa64	%xmm1, %xmm2{%k1}{z}
+	vmovdqu8	%xmm1, %xmm2{%k1}{z}
+	vmovdqu16	%xmm1, %xmm2{%k1}{z}
+	vmovdqu32	%xmm1, %xmm2{%k1}{z}
+	vmovdqu64	%xmm1, %xmm2{%k1}{z}
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-4.d b/gas/testsuite/gas/i386/x86-64-optimize-4.d
index 10e7b02d3a..18fdeb1442 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-4.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-4.d
@@ -9,4 +9,10 @@ Disassembly of section .text:
 
 0+ <_start>:
  +[a-f0-9]+:	a9 7f 00 00 00       	test   \$0x7f,%eax
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-4.s b/gas/testsuite/gas/i386/x86-64-optimize-4.s
index 0c4fdcecc5..b6d872db2c 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-4.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-4.s
@@ -4,3 +4,10 @@
 	.text
 _start:
 	{nooptimize} testl $0x7f, %eax
+
+	{nooptimize} vmovdqa32	%ymm1, %ymm2
+	{nooptimize} vmovdqa64	%ymm1, %ymm2
+	{nooptimize} vmovdqu8	%xmm1, %xmm2
+	{nooptimize} vmovdqu16	%xmm1, %xmm2
+	{nooptimize} vmovdqu32	%xmm1, %xmm2
+	{nooptimize} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-5.d b/gas/testsuite/gas/i386/x86-64-optimize-5.d
index 085f7f29f2..012237df57 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-5.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-5.d
@@ -106,6 +106,60 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 e1 f5 08 fb c1    	vpsubq %xmm1,%xmm1,%xmm16
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-5.s b/gas/testsuite/gas/i386/x86-64-optimize-5.s
index 6b4ff103ab..9756ae815c 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-5.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-5.s
@@ -4,3 +4,10 @@
 
 	{evex} vandnpd %zmm1, %zmm1, %zmm5
 	{evex} vandnpd %ymm1, %ymm1, %ymm5
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-6.d b/gas/testsuite/gas/i386/x86-64-optimize-6.d
index 0d52c8fcbb..aca119e4f9 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-6.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-6.d
@@ -106,6 +106,60 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 e1 f5 08 fb c1    	vpsubq %xmm1,%xmm1,%xmm16
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
  +[a-f0-9]+:	62 b1 f5 00 fb c9    	vpsubq %xmm17,%xmm17,%xmm1
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 f9 6f d1          	vmovdqa %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c5 fa 6f d1          	vmovdqu %xmm1,%xmm2
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 79 6f e3       	vmovdqa %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c4 41 7a 6f e3       	vmovdqu %xmm11,%xmm12
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 6f 50 7f       	vmovdqa 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7d 6f e3       	vmovdqa %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c4 41 7e 6f e3       	vmovdqu %ymm11,%ymm12
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 6f 50 7f       	vmovdqa 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 fd 28 6f d1    	vmovdqa64 %ymm1,%ymm2
+ +[a-f0-9]+:	62 f1 7f 08 6f d1    	vmovdqu8 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 ff 08 6f d1    	vmovdqu16 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 7e 08 6f d1    	vmovdqu32 %xmm1,%xmm2
+ +[a-f0-9]+:	62 f1 fe 08 6f d1    	vmovdqu64 %xmm1,%xmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-6.s b/gas/testsuite/gas/i386/x86-64-optimize-6.s
index 70ccbc41be..7c403fcc86 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-6.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-6.s
@@ -6,3 +6,10 @@
 
 	{evex} vandnpd %zmm1, %zmm1, %zmm5
 	{evex} vandnpd %ymm1, %ymm1, %ymm5
+
+	{evex} vmovdqa32	%ymm1, %ymm2
+	{evex} vmovdqa64	%ymm1, %ymm2
+	{evex} vmovdqu8		%xmm1, %xmm2
+	{evex} vmovdqu16	%xmm1, %xmm2
+	{evex} vmovdqu32	%xmm1, %xmm2
+	{evex} vmovdqu64	%xmm1, %xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-8.d b/gas/testsuite/gas/i386/x86-64-optimize-8.d
new file mode 100644
index 0000000000..46efa5229d
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-8.d
@@ -0,0 +1,12 @@
+#as: -O2 -march=+noavx
+#objdump: -drw
+#name: x86-64 optimized encoding 8 with -O2
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+0+ <_start>:
+ +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
+#pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-8.s b/gas/testsuite/gas/i386/x86-64-optimize-8.s
new file mode 100644
index 0000000000..4b9865a91b
--- /dev/null
+++ b/gas/testsuite/gas/i386/x86-64-optimize-8.s
@@ -0,0 +1,6 @@
+# Check 64bit instructions with optimized encoding
+
+	.allow_index_reg
+	.text
+_start:
+	vmovdqa32	%ymm1, %ymm2
diff --git a/opcodes/i386-opc.tbl b/opcodes/i386-opc.tbl
index 1194dcd1c0..26a68d8cbe 100644
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -3709,11 +3709,11 @@ vmovd, 2, 0x666E, None, 1, CpuAVX512F, D|Modrm|EVex=2|VexOpcode=0|Disp8MemShift=
 
 vmovddup, 2, 0xF212, None, 1, CpuAVX512F, Modrm|Masking=3|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegYMM|RegZMM|Unspecified|BaseIndex, RegYMM|RegZMM }
 
-vmovdqa64, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
-vmovdqa32, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqa64, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqa32, 2, 0x666F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vmovntdq, 2, 0x66E7, None, 1, CpuAVX512F, Modrm|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM, XMMword|YMMword|ZMMword|Unspecified|BaseIndex }
-vmovdqu32, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
-vmovdqu64, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu32, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu64, 2, 0xF36F, None, 1, CpuAVX512F, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 
 vmovhlps, 3, 0x12, None, 1, CpuAVX512F, Modrm|EVex=4|VexOpcode=0|VexVVVV=1|VexW=1|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM, RegXMM, RegXMM }
 vmovlhps, 3, 0x16, None, 1, CpuAVX512F, Modrm|EVex=4|VexOpcode=0|VexVVVV=1|VexW=1|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM, RegXMM, RegXMM }
@@ -4190,8 +4190,8 @@ kshiftrq, 3, 0x6631, None, 1, CpuAVX512BW, Modrm|Vex=1|VexOpcode=2|VexW=2|No_bSu
 
 vdbpsadbw, 4, 0x6642, None, 1, CpuAVX512BW, Modrm|Masking=3|VexOpcode=2|VexVVVV=1|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { Imm8, RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
 
-vmovdqu8, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
-vmovdqu16, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu8, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
+vmovdqu16, 2, 0xF26F, None, 1, CpuAVX512BW, D|Modrm|MaskingMorZ|VexOpcode=0|VexW=2|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf|Optimize, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 
 vpabsb, 2, 0x661C, None, 1, CpuAVX512BW, Modrm|Masking=3|VexOpcode=1|VexWIG|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM }
 vpmaxsb, 3, 0x663C, None, 1, CpuAVX512BW, Modrm|Masking=3|VexOpcode=1|VexWIG|VexVVVV=1|Disp8ShiftVL|CheckRegSize|No_bSuf|No_wSuf|No_lSuf|No_sSuf|No_qSuf|No_ldSuf, { RegXMM|RegYMM|RegZMM|Unspecified|BaseIndex, RegXMM|RegYMM|RegZMM, RegXMM|RegYMM|RegZMM }
diff --git a/opcodes/i386-tbl.h b/opcodes/i386-tbl.h
index 81575df3f2..bd33eb5ce5 100644
--- a/opcodes/i386-tbl.h
+++ b/opcodes/i386-tbl.h
@@ -60123,7 +60123,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -60139,7 +60139,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -60155,7 +60155,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -60171,7 +60171,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -63555,7 +63555,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
@@ -63571,7 +63571,7 @@ const insn_template i386_optab[] =
         0, 0, 0, 0, 0, 0 } },
     { 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+      2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0, 0, 1, 0, 0, 0, 0, 0 },
     { { { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
 	  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
 	  0, 0 } },
-- 
2.20.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: V2 [PATCH] x86: Optimize EVEX vector load/store instructions
  2019-03-17 20:47 ` V2 " H.J. Lu
@ 2019-03-18 13:49   ` Jan Beulich
  2019-03-19  6:21     ` [PATCH] x86: Correct EVEX vector load/store optimization H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: Jan Beulich @ 2019-03-18 13:49 UTC (permalink / raw)
  To: H.J. Lu; +Cc: binutils

>>> On 17.03.19 at 21:47, <hjl.tools@gmail.com> wrote:
> --- a/gas/config/tc-i386.c
> +++ b/gas/config/tc-i386.c
> @@ -4075,6 +4075,56 @@ optimize_encoding (void)
>  	    i.types[j].bitfield.ymmword = 0;
>  	  }
>      }
> +  else if ((cpu_arch_flags.bitfield.cpuavx
> +	    || cpu_arch_isa_flags.bitfield.cpuavx)

Once again a questionable condition, as per earlier replies to
other patches of yours.

> +	   && i.vec_encoding != vex_encoding_evex
> +	   && !i.types[0].bitfield.zmmword
> +	   && !i.mask
> +	   && is_evex_encoding (&i.tm)
> +	   && (i.tm.base_opcode == 0x666f
> +	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
> +	       || i.tm.base_opcode == 0xf36f
> +	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
> +	       || i.tm.base_opcode == 0xf26f
> +	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)

All three of these can be expressed with just a single comparison,
using & or | instead of ^ and (if necessary) adjusting the literal
value compared against.

> +	   && i.tm.extension_opcode == None)
> +    {
> +      /* Optimize: -O1:
> +	   VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
> +	   vmovdqu32 and vmovdqu64:
> +	     EVEX VOP %xmmM, %xmmN
> +	       -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
> +	     EVEX VOP %ymmM, %ymmN
> +	       -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
> +	     EVEX VOP %xmmM, mem
> +	       -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
> +	     EVEX VOP %ymmM, mem
> +	       -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
> +	     EVEX VOP mem, %xmmN
> +	       -> VEX mvmovdqa|vmovdquem, %xmmN (N < 16)

There's some confusion on this line.

> +	     EVEX VOP mem, %ymmN
> +	       -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
> +       */

For the variants with a memory operand I doubt the conversion
is always a win, and it may be against the user request in case of
-Os. This is because of the Disp8 scaling the EVEX encoding permits.

> +      if (i.tm.base_opcode == 0xf26f)
> +	i.tm.base_opcode = 0xf36f;
> +      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
> +	i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;

This again can be expressed without "else if()" afaict.

Jan


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] x86: Correct EVEX vector load/store optimization
  2019-03-18 13:49   ` Jan Beulich
@ 2019-03-19  6:21     ` H.J. Lu
  2019-03-19  8:30       ` Jan Beulich
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-03-19  6:21 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Binutils

[-- Attachment #1: Type: text/plain, Size: 2568 bytes --]

On Mon, Mar 18, 2019 at 9:49 PM Jan Beulich <JBeulich@suse.com> wrote:
>
> >>> On 17.03.19 at 21:47, <hjl.tools@gmail.com> wrote:
> > --- a/gas/config/tc-i386.c
> > +++ b/gas/config/tc-i386.c
> > @@ -4075,6 +4075,56 @@ optimize_encoding (void)
> >           i.types[j].bitfield.ymmword = 0;
> >         }
> >      }
> > +  else if ((cpu_arch_flags.bitfield.cpuavx
> > +         || cpu_arch_isa_flags.bitfield.cpuavx)
>
> Once again a questionable condition, as per earlier replies to
> other patches of yours.

Fixed.

> > +        && i.vec_encoding != vex_encoding_evex
> > +        && !i.types[0].bitfield.zmmword
> > +        && !i.mask
> > +        && is_evex_encoding (&i.tm)
> > +        && (i.tm.base_opcode == 0x666f
> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
> > +            || i.tm.base_opcode == 0xf36f
> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
> > +            || i.tm.base_opcode == 0xf26f
> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
>
> All three of these can be expressed with just a single comparison,
> using & or | instead of ^ and (if necessary) adjusting the literal
> value compared against.

Fixed.

> > +        && i.tm.extension_opcode == None)
> > +    {
> > +      /* Optimize: -O1:
> > +        VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
> > +        vmovdqu32 and vmovdqu64:
> > +          EVEX VOP %xmmM, %xmmN
> > +            -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
> > +          EVEX VOP %ymmM, %ymmN
> > +            -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
> > +          EVEX VOP %xmmM, mem
> > +            -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
> > +          EVEX VOP %ymmM, mem
> > +            -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
> > +          EVEX VOP mem, %xmmN
> > +            -> VEX mvmovdqa|vmovdquem, %xmmN (N < 16)
>
> There's some confusion on this line.
>
> > +          EVEX VOP mem, %ymmN
> > +            -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
> > +       */
>
> For the variants with a memory operand I doubt the conversion
> is always a win, and it may be against the user request in case of
> -Os. This is because of the Disp8 scaling the EVEX encoding permits.

Fixed.

> > +      if (i.tm.base_opcode == 0xf26f)
> > +     i.tm.base_opcode = 0xf36f;
> > +      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
> > +     i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
>
> This again can be expressed without "else if()" afaict.
>

Fixed.

Here is the patch.

Thanks.

-- 
H.J.

[-- Attachment #2: 0001-x86-Correct-EVEX-vector-load-store-optimization.patch --]
[-- Type: text/x-patch, Size: 34028 bytes --]

From 84ecabf0624411c1ab95bfadbd864aa4b226b2e8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 19 Mar 2019 10:56:39 +0800
Subject: [PATCH] x86: Correct EVEX vector load/store optimization

Update EVEX vector load/store optimization:

1. There is no need to check AVX since AVX2 is required for AVX512F.
2. We need to check both operands for a ZMM register, since with AT&T
syntax zmmword may not be set on the first operand.
3. Simplify the Opcode_SIMD_IntD handling: mask the bit off when
checking the opcode and preserve it when rewriting the opcode.
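4. The VEX prefix has 2 or 3 bytes and the EVEX prefix has 4 bytes,
while EVEX Disp8 has 1 byte and VEX Disp32 has 4 bytes.  When a
displacement fits in the scaled EVEX Disp8 but would require Disp32
in the VEX encoding, the EVEX form is shorter, so we choose EVEX Disp8
over VEX Disp32 and leave the instruction unoptimized.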

	* config/tc-i386.c (optimize_encoding): Don't check AVX for
	EVEX vector load/store optimization.  Check both operands for
	ZMM register.  Update EVEX vector load/store opcode check.
	Choose EVEX Disp8 over VEX Disp32.
	* testsuite/gas/i386/optimize-1.d: Updated.
	* testsuite/gas/i386/optimize-1a.d: Likewise.
	* testsuite/gas/i386/optimize-2.d: Likewise.
	* testsuite/gas/i386/optimize-4.d: Likewise.
	* testsuite/gas/i386/optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2a.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2b.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-3.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-6.d: Likewise.
	* testsuite/gas/i386/optimize-1.s: Add ZMM register load
	test.
	* testsuite/gas/i386/x86-64-optimize-2.s: Likewise.
---
 gas/config/tc-i386.c                        | 46 +++++++++++++++------
 gas/testsuite/gas/i386/optimize-1.d         | 25 +++++------
 gas/testsuite/gas/i386/optimize-1.s         |  2 +
 gas/testsuite/gas/i386/optimize-1a.d        | 25 +++++------
 gas/testsuite/gas/i386/optimize-2.d         | 24 +++++------
 gas/testsuite/gas/i386/optimize-4.d         | 25 +++++------
 gas/testsuite/gas/i386/optimize-5.d         | 25 +++++------
 gas/testsuite/gas/i386/x86-64-optimize-2.d  | 25 +++++------
 gas/testsuite/gas/i386/x86-64-optimize-2.s  |  2 +
 gas/testsuite/gas/i386/x86-64-optimize-2a.d | 25 +++++------
 gas/testsuite/gas/i386/x86-64-optimize-2b.d | 25 +++++------
 gas/testsuite/gas/i386/x86-64-optimize-3.d  | 24 +++++------
 gas/testsuite/gas/i386/x86-64-optimize-5.d  | 25 +++++------
 gas/testsuite/gas/i386/x86-64-optimize-6.d  | 25 +++++------
 14 files changed, 178 insertions(+), 145 deletions(-)

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index 3885728de7..3447fe0fa3 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4068,18 +4068,14 @@ optimize_encoding (void)
 	    i.types[j].bitfield.ymmword = 0;
 	  }
     }
-  else if ((cpu_arch_flags.bitfield.cpuavx
-	    || cpu_arch_isa_flags.bitfield.cpuavx)
-	   && i.vec_encoding != vex_encoding_evex
+  else if (i.vec_encoding != vex_encoding_evex
 	   && !i.types[0].bitfield.zmmword
+	   && !i.types[1].bitfield.zmmword
 	   && !i.mask
 	   && is_evex_encoding (&i.tm)
-	   && (i.tm.base_opcode == 0x666f
-	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
-	       || i.tm.base_opcode == 0xf36f
-	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
-	       || i.tm.base_opcode == 0xf26f
-	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+	   && ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0x666f
+	       || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf36f
+	       || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
 	   && i.tm.extension_opcode == None)
     {
       /* Optimize: -O1:
@@ -4098,10 +4094,34 @@ optimize_encoding (void)
 	     EVEX VOP mem, %ymmN
 	       -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
        */
-      if (i.tm.base_opcode == 0xf26f)
-	i.tm.base_opcode = 0xf36f;
-      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
-	i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
+      for (j = 0; j < 2; j++)
+	if (operand_type_check (i.types[j], disp)
+	    && i.op[j].disps->X_op == O_constant)
+	  {
+	    /* Since the VEX prefix has 2 or 3 bytes, the EVEX prefix
+	       has 4 bytes, EVEX Disp8 has 1 byte and VEX Disp32 has 4
+	       bytes, we choose EVEX Disp8 over VEX Disp32.  */
+	    int evex_disp8, vex_disp8;
+	    unsigned int memshift = i.memshift;
+	    offsetT n = i.op[j].disps->X_add_number;
+
+	    evex_disp8 = fits_in_disp8 (n);
+	    i.memshift = 0;
+	    vex_disp8 = fits_in_disp8 (n);
+	    if (evex_disp8 != vex_disp8)
+	      {
+		i.memshift = memshift;
+		return;
+	      }
+
+	    i.types[j].bitfield.disp8 = vex_disp8;
+	    break;
+	  }
+      if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
+	{
+	  i.tm.base_opcode &= Opcode_SIMD_IntD;
+	  i.tm.base_opcode |= 0xf36f;
+	}
       i.tm.opcode_modifier.vex
 	= i.types[0].bitfield.ymmword ? VEX256 : VEX128;
       i.tm.opcode_modifier.vexw = VEXW0;
diff --git a/gas/testsuite/gas/i386/optimize-1.d b/gas/testsuite/gas/i386/optimize-1.d
index 70c802c002..2f40c72a4e 100644
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -74,12 +74,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -92,10 +92,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-1.s b/gas/testsuite/gas/i386/optimize-1.s
index 6dcfbc2799..4c15d16c2a 100644
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -114,3 +114,5 @@ _start:
 	vmovdqu16	%ymm1, 128(%eax)
 	vmovdqu32	%ymm1, 128(%eax)
 	vmovdqu64	%ymm1, 128(%eax)
+
+	vmovdqa32	(%eax), %zmm2
diff --git a/gas/testsuite/gas/i386/optimize-1a.d b/gas/testsuite/gas/i386/optimize-1a.d
index cee2383d84..d7c253a6fa 100644
--- a/gas/testsuite/gas/i386/optimize-1a.d
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -75,12 +75,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -93,10 +93,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-2.d b/gas/testsuite/gas/i386/optimize-2.d
index 19467f5c01..ed61dec6fa 100644
--- a/gas/testsuite/gas/i386/optimize-2.d
+++ b/gas/testsuite/gas/i386/optimize-2.d
@@ -29,12 +29,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -47,12 +47,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
  +[a-f0-9]+:	62 f1 7d 48 6f d1    	vmovdqa32 %zmm1,%zmm2
  +[a-f0-9]+:	62 f1 fd 48 6f d1    	vmovdqa64 %zmm1,%zmm2
  +[a-f0-9]+:	62 f1 7f 48 6f d1    	vmovdqu8 %zmm1,%zmm2
diff --git a/gas/testsuite/gas/i386/optimize-4.d b/gas/testsuite/gas/i386/optimize-4.d
index 2df84654d6..f062ad7717 100644
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -74,12 +74,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -92,12 +92,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-5.d b/gas/testsuite/gas/i386/optimize-5.d
index ecc1ab139a..fdf5561af8 100644
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -74,12 +74,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -92,12 +92,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.d b/gas/testsuite/gas/i386/x86-64-optimize-2.d
index 067df076f7..45b98ae694 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,10 +148,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.s b/gas/testsuite/gas/i386/x86-64-optimize-2.s
index 1275610e55..e5d298225a 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.s
@@ -170,3 +170,5 @@ _start:
 	vmovdqu16	%ymm1, 128(%rax)
 	vmovdqu32	%ymm1, 128(%rax)
 	vmovdqu64	%ymm1, 128(%rax)
+
+	vmovdqa32	(%rax), %zmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2a.d b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
index 532a1458bc..39385b96ec 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2a.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
@@ -125,12 +125,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -149,10 +149,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2b.d b/gas/testsuite/gas/i386/x86-64-optimize-2b.d
index 09474a1016..3eb3a59eac 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2b.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2b.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,10 +148,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.d b/gas/testsuite/gas/i386/x86-64-optimize-3.d
index 74336a4fe2..5e2832df4c 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d
@@ -43,12 +43,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -67,12 +67,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
  +[a-f0-9]+:	62 b1 7d 08 6f d5    	vmovdqa32 %xmm21,%xmm2
  +[a-f0-9]+:	62 b1 fd 08 6f d5    	vmovdqa64 %xmm21,%xmm2
  +[a-f0-9]+:	62 b1 7f 08 6f d5    	vmovdqu8 %xmm21,%xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-5.d b/gas/testsuite/gas/i386/x86-64-optimize-5.d
index 012237df57..5065d650d4 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-5.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-5.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,12 +148,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-6.d b/gas/testsuite/gas/i386/x86-64-optimize-6.d
index aca119e4f9..8ebd9b2475 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-6.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-6.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,12 +148,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
-- 
2.20.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86: Correct EVEX vector load/store optimization
  2019-03-19  6:21     ` [PATCH] x86: Correct EVEX vector load/store optimization H.J. Lu
@ 2019-03-19  8:30       ` Jan Beulich
  2019-03-19  8:48         ` H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: Jan Beulich @ 2019-03-19  8:30 UTC (permalink / raw)
  To: H.J. Lu; +Cc: binutils

>>> On 19.03.19 at 07:20, <hjl.tools@gmail.com> wrote:
> On Mon, Mar 18, 2019 at 9:49 PM Jan Beulich <JBeulich@suse.com> wrote:
>>
>> >>> On 17.03.19 at 21:47, <hjl.tools@gmail.com> wrote:
>> > --- a/gas/config/tc-i386.c
>> > +++ b/gas/config/tc-i386.c
>> > @@ -4075,6 +4075,56 @@ optimize_encoding (void)
>> >           i.types[j].bitfield.ymmword = 0;
>> >         }
>> >      }
>> > +  else if ((cpu_arch_flags.bitfield.cpuavx
>> > +         || cpu_arch_isa_flags.bitfield.cpuavx)
>>
>> Once again a questionable condition, as per earlier replies to
>> other patches of yours.
> 
> Fixed.
> 
>> > +        && i.vec_encoding != vex_encoding_evex
>> > +        && !i.types[0].bitfield.zmmword
>> > +        && !i.mask
>> > +        && is_evex_encoding (&i.tm)
>> > +        && (i.tm.base_opcode == 0x666f
>> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
>> > +            || i.tm.base_opcode == 0xf36f
>> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
>> > +            || i.tm.base_opcode == 0xf26f
>> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
>>
>> All three of these can be expressed with just a single comparison,
>> using & or | instead of ^ and (if necessary) adjusting the literal
>> value compared against.
> 
> Fixed.
> 
>> > +        && i.tm.extension_opcode == None)
>> > +    {
>> > +      /* Optimize: -O1:
>> > +        VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
>> > +        vmovdqu32 and vmovdqu64:
>> > +          EVEX VOP %xmmM, %xmmN
>> > +            -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
>> > +          EVEX VOP %ymmM, %ymmN
>> > +            -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
>> > +          EVEX VOP %xmmM, mem
>> > +            -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
>> > +          EVEX VOP %ymmM, mem
>> > +            -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
>> > +          EVEX VOP mem, %xmmN
>> > +            -> VEX mvmovdqa|vmovdquem, %xmmN (N < 16)
>>
>> There's some confusion on this line.
>>
>> > +          EVEX VOP mem, %ymmN
>> > +            -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
>> > +       */
>>
>> For the variants with a memory operand I doubt the conversion
>> is always a win, and it may be against the user request in case of
>> -Os. This is because of the Disp8 scaling the EVEX encoding permits.
> 
> Fixed.
> 
>> > +      if (i.tm.base_opcode == 0xf26f)
>> > +     i.tm.base_opcode = 0xf36f;
>> > +      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
>> > +     i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
>>
>> This again can be expressed without "else if()" afaict.
>>
> 
> Fixed.
> 
> Here is the patch.

Thanks.

>--- a/gas/config/tc-i386.c
>+++ b/gas/config/tc-i386.c
>@@ -4068,18 +4068,14 @@ optimize_encoding (void)
> 	    i.types[j].bitfield.ymmword = 0;
> 	  }
>     }
>-  else if ((cpu_arch_flags.bitfield.cpuavx
>-	    || cpu_arch_isa_flags.bitfield.cpuavx)
>-	   && i.vec_encoding != vex_encoding_evex
>+  else if (i.vec_encoding != vex_encoding_evex
> 	   && !i.types[0].bitfield.zmmword

Ah, here the remaining cpuavx goes away as well.

>+      if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
>+	{
>+	  i.tm.base_opcode &= Opcode_SIMD_IntD;
>+	  i.tm.base_opcode |= 0xf36f;
>+	}

How about the even simpler

      if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
	i.tm.base_opcode ^= 0xf36f ^ 0xf26f;

?

Jan


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86: Correct EVEX vector load/store optimization
  2019-03-19  8:30       ` Jan Beulich
@ 2019-03-19  8:48         ` H.J. Lu
  2019-03-19  8:52           ` Jan Beulich
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-03-19  8:48 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Binutils

[-- Attachment #1: Type: text/plain, Size: 3814 bytes --]

On Tue, Mar 19, 2019 at 4:30 PM Jan Beulich <JBeulich@suse.com> wrote:
>
> >>> On 19.03.19 at 07:20, <hjl.tools@gmail.com> wrote:
> > On Mon, Mar 18, 2019 at 9:49 PM Jan Beulich <JBeulich@suse.com> wrote:
> >>
> >> >>> On 17.03.19 at 21:47, <hjl.tools@gmail.com> wrote:
> >> > --- a/gas/config/tc-i386.c
> >> > +++ b/gas/config/tc-i386.c
> >> > @@ -4075,6 +4075,56 @@ optimize_encoding (void)
> >> >           i.types[j].bitfield.ymmword = 0;
> >> >         }
> >> >      }
> >> > +  else if ((cpu_arch_flags.bitfield.cpuavx
> >> > +         || cpu_arch_isa_flags.bitfield.cpuavx)
> >>
> >> Once again a questionable condition, as per earlier replies to
> >> other patches of yours.
> >
> > Fixed.
> >
> >> > +        && i.vec_encoding != vex_encoding_evex
> >> > +        && !i.types[0].bitfield.zmmword
> >> > +        && !i.mask
> >> > +        && is_evex_encoding (&i.tm)
> >> > +        && (i.tm.base_opcode == 0x666f
> >> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
> >> > +            || i.tm.base_opcode == 0xf36f
> >> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
> >> > +            || i.tm.base_opcode == 0xf26f
> >> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
> >>
> >> All three of these can be expressed with just a single comparison,
> >> using & or | instead of ^ and (if necessary) adjusting the literal
> >> value compared against.
> >
> > Fixed.
> >
> >> > +        && i.tm.extension_opcode == None)
> >> > +    {
> >> > +      /* Optimize: -O1:
> >> > +        VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
> >> > +        vmovdqu32 and vmovdqu64:
> >> > +          EVEX VOP %xmmM, %xmmN
> >> > +            -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
> >> > +          EVEX VOP %ymmM, %ymmN
> >> > +            -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
> >> > +          EVEX VOP %xmmM, mem
> >> > +            -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
> >> > +          EVEX VOP %ymmM, mem
> >> > +            -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
> >> > +          EVEX VOP mem, %xmmN
> >> > +            -> VEX mvmovdqa|vmovdquem, %xmmN (N < 16)
> >>
> >> There's some confusion on this line.
> >>
> >> > +          EVEX VOP mem, %ymmN
> >> > +            -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
> >> > +       */
> >>
> >> For the variants with a memory operand I doubt the conversion
> >> is always a win, and it may be against the user request in case of
> >> -Os. This is because of the Disp8 scaling the EVEX encoding permits.
> >
> > Fixed.
> >
> >> > +      if (i.tm.base_opcode == 0xf26f)
> >> > +     i.tm.base_opcode = 0xf36f;
> >> > +      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
> >> > +     i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
> >>
> >> This again can be expressed without "else if()" afaict.
> >>
> >
> > Fixed.
> >
> > Here is the patch.
>
> Thanks.
>
> >--- a/gas/config/tc-i386.c
> >+++ b/gas/config/tc-i386.c
> >@@ -4068,18 +4068,14 @@ optimize_encoding (void)
> >           i.types[j].bitfield.ymmword = 0;
> >         }
> >     }
> >-  else if ((cpu_arch_flags.bitfield.cpuavx
> >-          || cpu_arch_isa_flags.bitfield.cpuavx)
> >-         && i.vec_encoding != vex_encoding_evex
> >+  else if (i.vec_encoding != vex_encoding_evex
> >          && !i.types[0].bitfield.zmmword
>
> Ah, here the remaining cpuavx goes away as well.
>
> >+      if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
> >+      {
> >+        i.tm.base_opcode &= Opcode_SIMD_IntD;
> >+        i.tm.base_opcode |= 0xf36f;
> >+      }
>
> How about the even simpler
>
>       if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
>         i.tm.base_opcode ^= 0xf36f ^ 0xf26f;
>

It works.

I am going to check in this patch together with the other 2.

Thanks.

-- 
H.J.

[-- Attachment #2: 0001-x86-Correct-EVEX-vector-load-store-optimization.patch --]
[-- Type: text/x-patch, Size: 33996 bytes --]

From 177fca87fa53139e3a409876c0d9333e6b33780c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 19 Mar 2019 10:56:39 +0800
Subject: [PATCH] x86: Correct EVEX vector load/store optimization

Update EVEX vector load/store optimization:

1. There is no need to check AVX since AVX2 is required for AVX512F.
2. We need to check both operands for a ZMM register since AT&T syntax
may not set zmmword on the first operand.
3. Update Opcode_SIMD_IntD check and set.
4. Since the VEX prefix has 2 or 3 bytes, the EVEX prefix has 4 bytes,
EVEX Disp8 has 1 byte and VEX Disp32 has 4 bytes, the EVEX prefix plus
Disp8 (5 bytes) is shorter than the VEX prefix plus Disp32 (6 or 7
bytes), so we choose EVEX Disp8 over VEX Disp32.

	* config/tc-i386.c (optimize_encoding): Don't check AVX for
	EVEX vector load/store optimization.  Check both operands for
	ZMM register.  Update EVEX vector load/store opcode check.
	Choose EVEX Disp8 over VEX Disp32.
	* testsuite/gas/i386/optimize-1.d: Updated.
	* testsuite/gas/i386/optimize-1a.d: Likewise.
	* testsuite/gas/i386/optimize-2.d: Likewise.
	* testsuite/gas/i386/optimize-4.d: Likewise.
	* testsuite/gas/i386/optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2a.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-2b.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-3.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-5.d: Likewise.
	* testsuite/gas/i386/x86-64-optimize-6.d: Likewise.
	* testsuite/gas/i386/optimize-1.s: Add ZMM register load
	test.
	* testsuite/gas/i386/x86-64-optimize-2.s: Likewise.
---
 gas/config/tc-i386.c                        | 43 ++++++++++++++-------
 gas/testsuite/gas/i386/optimize-1.d         | 25 ++++++------
 gas/testsuite/gas/i386/optimize-1.s         |  2 +
 gas/testsuite/gas/i386/optimize-1a.d        | 25 ++++++------
 gas/testsuite/gas/i386/optimize-2.d         | 24 ++++++------
 gas/testsuite/gas/i386/optimize-4.d         | 25 ++++++------
 gas/testsuite/gas/i386/optimize-5.d         | 25 ++++++------
 gas/testsuite/gas/i386/x86-64-optimize-2.d  | 25 ++++++------
 gas/testsuite/gas/i386/x86-64-optimize-2.s  |  2 +
 gas/testsuite/gas/i386/x86-64-optimize-2a.d | 25 ++++++------
 gas/testsuite/gas/i386/x86-64-optimize-2b.d | 25 ++++++------
 gas/testsuite/gas/i386/x86-64-optimize-3.d  | 24 ++++++------
 gas/testsuite/gas/i386/x86-64-optimize-5.d  | 25 ++++++------
 gas/testsuite/gas/i386/x86-64-optimize-6.d  | 25 ++++++------
 14 files changed, 175 insertions(+), 145 deletions(-)

diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c
index 3885728de7..690fd23ff0 100644
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4068,18 +4068,14 @@ optimize_encoding (void)
 	    i.types[j].bitfield.ymmword = 0;
 	  }
     }
-  else if ((cpu_arch_flags.bitfield.cpuavx
-	    || cpu_arch_isa_flags.bitfield.cpuavx)
-	   && i.vec_encoding != vex_encoding_evex
+  else if (i.vec_encoding != vex_encoding_evex
 	   && !i.types[0].bitfield.zmmword
+	   && !i.types[1].bitfield.zmmword
 	   && !i.mask
 	   && is_evex_encoding (&i.tm)
-	   && (i.tm.base_opcode == 0x666f
-	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
-	       || i.tm.base_opcode == 0xf36f
-	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
-	       || i.tm.base_opcode == 0xf26f
-	       || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
+	   && ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0x666f
+	       || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf36f
+	       || (i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
 	   && i.tm.extension_opcode == None)
     {
       /* Optimize: -O1:
@@ -4098,10 +4094,31 @@ optimize_encoding (void)
 	     EVEX VOP mem, %ymmN
 	       -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
        */
-      if (i.tm.base_opcode == 0xf26f)
-	i.tm.base_opcode = 0xf36f;
-      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
-	i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
+      for (j = 0; j < 2; j++)
+	if (operand_type_check (i.types[j], disp)
+	    && i.op[j].disps->X_op == O_constant)
+	  {
+	    /* Since the VEX prefix has 2 or 3 bytes, the EVEX prefix
+	       has 4 bytes, EVEX Disp8 has 1 byte and VEX Disp32 has 4
+	       bytes, we choose EVEX Disp8 over VEX Disp32.  */
+	    int evex_disp8, vex_disp8;
+	    unsigned int memshift = i.memshift;
+	    offsetT n = i.op[j].disps->X_add_number;
+
+	    evex_disp8 = fits_in_disp8 (n);
+	    i.memshift = 0;
+	    vex_disp8 = fits_in_disp8 (n);
+	    if (evex_disp8 != vex_disp8)
+	      {
+		i.memshift = memshift;
+		return;
+	      }
+
+	    i.types[j].bitfield.disp8 = vex_disp8;
+	    break;
+	  }
+      if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
+	i.tm.base_opcode ^= 0xf36f ^ 0xf26f;
       i.tm.opcode_modifier.vex
 	= i.types[0].bitfield.ymmword ? VEX256 : VEX128;
       i.tm.opcode_modifier.vexw = VEXW0;
diff --git a/gas/testsuite/gas/i386/optimize-1.d b/gas/testsuite/gas/i386/optimize-1.d
index 70c802c002..2f40c72a4e 100644
--- a/gas/testsuite/gas/i386/optimize-1.d
+++ b/gas/testsuite/gas/i386/optimize-1.d
@@ -74,12 +74,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -92,10 +92,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-1.s b/gas/testsuite/gas/i386/optimize-1.s
index 6dcfbc2799..4c15d16c2a 100644
--- a/gas/testsuite/gas/i386/optimize-1.s
+++ b/gas/testsuite/gas/i386/optimize-1.s
@@ -114,3 +114,5 @@ _start:
 	vmovdqu16	%ymm1, 128(%eax)
 	vmovdqu32	%ymm1, 128(%eax)
 	vmovdqu64	%ymm1, 128(%eax)
+
+	vmovdqa32	(%eax), %zmm2
diff --git a/gas/testsuite/gas/i386/optimize-1a.d b/gas/testsuite/gas/i386/optimize-1a.d
index cee2383d84..d7c253a6fa 100644
--- a/gas/testsuite/gas/i386/optimize-1a.d
+++ b/gas/testsuite/gas/i386/optimize-1a.d
@@ -75,12 +75,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -93,10 +93,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-2.d b/gas/testsuite/gas/i386/optimize-2.d
index 19467f5c01..ed61dec6fa 100644
--- a/gas/testsuite/gas/i386/optimize-2.d
+++ b/gas/testsuite/gas/i386/optimize-2.d
@@ -29,12 +29,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -47,12 +47,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
  +[a-f0-9]+:	62 f1 7d 48 6f d1    	vmovdqa32 %zmm1,%zmm2
  +[a-f0-9]+:	62 f1 fd 48 6f d1    	vmovdqa64 %zmm1,%zmm2
  +[a-f0-9]+:	62 f1 7f 48 6f d1    	vmovdqu8 %zmm1,%zmm2
diff --git a/gas/testsuite/gas/i386/optimize-4.d b/gas/testsuite/gas/i386/optimize-4.d
index 2df84654d6..f062ad7717 100644
--- a/gas/testsuite/gas/i386/optimize-4.d
+++ b/gas/testsuite/gas/i386/optimize-4.d
@@ -74,12 +74,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -92,12 +92,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
 #pass
diff --git a/gas/testsuite/gas/i386/optimize-5.d b/gas/testsuite/gas/i386/optimize-5.d
index ecc1ab139a..fdf5561af8 100644
--- a/gas/testsuite/gas/i386/optimize-5.d
+++ b/gas/testsuite/gas/i386/optimize-5.d
@@ -74,12 +74,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%eax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%eax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -92,12 +92,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%eax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%eax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%eax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.d b/gas/testsuite/gas/i386/x86-64-optimize-2.d
index 067df076f7..45b98ae694 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,10 +148,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2.s b/gas/testsuite/gas/i386/x86-64-optimize-2.s
index 1275610e55..e5d298225a 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2.s
@@ -170,3 +170,5 @@ _start:
 	vmovdqu16	%ymm1, 128(%rax)
 	vmovdqu32	%ymm1, 128(%rax)
 	vmovdqu64	%ymm1, 128(%rax)
+
+	vmovdqa32	(%rax), %zmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2a.d b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
index 532a1458bc..39385b96ec 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2a.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2a.d
@@ -125,12 +125,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -149,10 +149,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-2b.d b/gas/testsuite/gas/i386/x86-64-optimize-2b.d
index 09474a1016..3eb3a59eac 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-2b.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-2b.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,10 +148,11 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
 #pass
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-3.d b/gas/testsuite/gas/i386/x86-64-optimize-3.d
index 74336a4fe2..5e2832df4c 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d
@@ -43,12 +43,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -67,12 +67,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
  +[a-f0-9]+:	62 b1 7d 08 6f d5    	vmovdqa32 %xmm21,%xmm2
  +[a-f0-9]+:	62 b1 fd 08 6f d5    	vmovdqa64 %xmm21,%xmm2
  +[a-f0-9]+:	62 b1 7f 08 6f d5    	vmovdqu8 %xmm21,%xmm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-5.d b/gas/testsuite/gas/i386/x86-64-optimize-5.d
index 012237df57..5065d650d4 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-5.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-5.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,12 +148,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
diff --git a/gas/testsuite/gas/i386/x86-64-optimize-6.d b/gas/testsuite/gas/i386/x86-64-optimize-6.d
index aca119e4f9..8ebd9b2475 100644
--- a/gas/testsuite/gas/i386/x86-64-optimize-6.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-6.d
@@ -124,12 +124,12 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
  +[a-f0-9]+:	c5 fa 6f 50 7f       	vmovdqu 0x7f\(%rax\),%xmm2
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 f9 7f 88 80 00 00 00 	vmovdqa %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fa 7f 88 80 00 00 00 	vmovdqu %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 08 7f 48 08 	vmovdqa32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 08 7f 48 08 	vmovdqa64 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 08 7f 48 08 	vmovdqu8 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 08 7f 48 08 	vmovdqu16 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 08 7f 48 08 	vmovdqu32 %xmm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 08 7f 48 08 	vmovdqu64 %xmm1,0x80\(%rax\)
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fd 6f d1          	vmovdqa %ymm1,%ymm2
  +[a-f0-9]+:	c5 fe 6f d1          	vmovdqu %ymm1,%ymm2
@@ -148,12 +148,13 @@ Disassembly of section .text:
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
  +[a-f0-9]+:	c5 fe 6f 50 7f       	vmovdqu 0x7f\(%rax\),%ymm2
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fd 7f 88 80 00 00 00 	vmovdqa %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
- +[a-f0-9]+:	c5 fe 7f 88 80 00 00 00 	vmovdqu %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 28 7f 48 04 	vmovdqa32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fd 28 7f 48 04 	vmovdqa64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7f 28 7f 48 04 	vmovdqu8 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 ff 28 7f 48 04 	vmovdqu16 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7e 28 7f 48 04 	vmovdqu32 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 fe 28 7f 48 04 	vmovdqu64 %ymm1,0x80\(%rax\)
+ +[a-f0-9]+:	62 f1 7d 48 6f 10    	vmovdqa32 \(%rax\),%zmm2
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 f5 08 55 e9    	vandnpd %xmm1,%xmm1,%xmm5
  +[a-f0-9]+:	62 f1 7d 28 6f d1    	vmovdqa32 %ymm1,%ymm2
-- 
2.20.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86: Correct EVEX vector load/store optimization
  2019-03-19  8:48         ` H.J. Lu
@ 2019-03-19  8:52           ` Jan Beulich
  0 siblings, 0 replies; 7+ messages in thread
From: Jan Beulich @ 2019-03-19  8:52 UTC (permalink / raw)
  To: H.J. Lu; +Cc: binutils

>>> On 19.03.19 at 09:48, <hjl.tools@gmail.com> wrote:
> On Tue, Mar 19, 2019 at 4:30 PM Jan Beulich <JBeulich@suse.com> wrote:
>>
>> >>> On 19.03.19 at 07:20, <hjl.tools@gmail.com> wrote:
>> > On Mon, Mar 18, 2019 at 9:49 PM Jan Beulich <JBeulich@suse.com> wrote:
>> >>
>> >> >>> On 17.03.19 at 21:47, <hjl.tools@gmail.com> wrote:
>> >> > --- a/gas/config/tc-i386.c
>> >> > +++ b/gas/config/tc-i386.c
>> >> > @@ -4075,6 +4075,56 @@ optimize_encoding (void)
>> >> >           i.types[j].bitfield.ymmword = 0;
>> >> >         }
>> >> >      }
>> >> > +  else if ((cpu_arch_flags.bitfield.cpuavx
>> >> > +         || cpu_arch_isa_flags.bitfield.cpuavx)
>> >>
>> >> Once again a questionable condition, as per earlier replies to
>> >> other patches of yours.
>> >
>> > Fixed.
>> >
>> >> > +        && i.vec_encoding != vex_encoding_evex
>> >> > +        && !i.types[0].bitfield.zmmword
>> >> > +        && !i.mask
>> >> > +        && is_evex_encoding (&i.tm)
>> >> > +        && (i.tm.base_opcode == 0x666f
>> >> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0x666f
>> >> > +            || i.tm.base_opcode == 0xf36f
>> >> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf36f
>> >> > +            || i.tm.base_opcode == 0xf26f
>> >> > +            || (i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
>> >>
>> >> All three of these can be expressed with just a single comparison,
>> >> using & or | instead of ^ and (if necessary) adjusting the literal
>> >> value compared against.
>> >
>> > Fixed.
>> >
>> >> > +        && i.tm.extension_opcode == None)
>> >> > +    {
>> >> > +      /* Optimize: -O1:
>> >> > +        VOP, one of vmovdqa32, vmovdqa64, vmovdqu8, vmovdqu16,
>> >> > +        vmovdqu32 and vmovdqu64:
>> >> > +          EVEX VOP %xmmM, %xmmN
>> >> > +            -> VEX vmovdqa|vmovdqu %xmmM, %xmmN (M and N < 16)
>> >> > +          EVEX VOP %ymmM, %ymmN
>> >> > +            -> VEX vmovdqa|vmovdqu %ymmM, %ymmN (M and N < 16)
>> >> > +          EVEX VOP %xmmM, mem
>> >> > +            -> VEX vmovdqa|vmovdqu %xmmM, mem (M < 16)
>> >> > +          EVEX VOP %ymmM, mem
>> >> > +            -> VEX vmovdqa|vmovdqu %ymmM, mem (M < 16)
>> >> > +          EVEX VOP mem, %xmmN
>> >> > +            -> VEX mvmovdqa|vmovdquem, %xmmN (N < 16)
>> >>
>> >> There's some confusion on this line.
>> >>
>> >> > +          EVEX VOP mem, %ymmN
>> >> > +            -> VEX vmovdqa|vmovdqu mem, %ymmN (N < 16)
>> >> > +       */
>> >>
>> >> For the variants with a memory operand I doubt the conversion
>> >> is always a win, and it may be against the user request in case of
>> >> -Os. This is because of the Disp8 scaling the EVEX encoding permits.
>> >
>> > Fixed.
>> >
>> >> > +      if (i.tm.base_opcode == 0xf26f)
>> >> > +     i.tm.base_opcode = 0xf36f;
>> >> > +      else if ((i.tm.base_opcode ^ Opcode_SIMD_IntD) == 0xf26f)
>> >> > +     i.tm.base_opcode = 0xf36f ^ Opcode_SIMD_IntD;
>> >>
>> >> This again can be expressed without "else if()" afaict.
>> >>
>> >
>> > Fixed.
>> >
>> > Here is the patch.
>>
>> Thanks.
>>
>> >--- a/gas/config/tc-i386.c
>> >+++ b/gas/config/tc-i386.c
>> >@@ -4068,18 +4068,14 @@ optimize_encoding (void)
>> >           i.types[j].bitfield.ymmword = 0;
>> >         }
>> >     }
>> >-  else if ((cpu_arch_flags.bitfield.cpuavx
>> >-          || cpu_arch_isa_flags.bitfield.cpuavx)
>> >-         && i.vec_encoding != vex_encoding_evex
>> >+  else if (i.vec_encoding != vex_encoding_evex
>> >          && !i.types[0].bitfield.zmmword
>>
>> Ah, here the remaining cpuavx goes away as well.
>>
>> >+      if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
>> >+      {
>> >+        i.tm.base_opcode &= Opcode_SIMD_IntD;
>> >+        i.tm.base_opcode |= 0xf36f;
>> >+      }
>>
>> How about the even simpler
>>
>>       if ((i.tm.base_opcode & ~Opcode_SIMD_IntD) == 0xf26f)
>>         i.tm.base_opcode ^= 0xf36f ^ 0xf26f;
>>
> 
> It works.
> 
> I am going to check in this patch together with the other 2.
> 
> Thanks.

Thank you as well.

Jan

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-03-19  8:52 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-15 23:58 [PATCH] x86: Optimize EVEX vector load/store instructions H.J. Lu
2019-03-17 20:47 ` V2 " H.J. Lu
2019-03-18 13:49   ` Jan Beulich
2019-03-19  6:21     ` [PATCH] x86: Correct EVEX vector load/store optimization H.J. Lu
2019-03-19  8:30       ` Jan Beulich
2019-03-19  8:48         ` H.J. Lu
2019-03-19  8:52           ` Jan Beulich

This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).