public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, testsuite, i386] BMI2 support for GCC
@ 2011-08-19 12:27 Kirill Yukhin
  2011-08-19 13:26 ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Kirill Yukhin @ 2011-08-19 12:27 UTC (permalink / raw)
  To: gcc-patches List, Uros Bizjak, H.J. Lu

[-- Attachment #1: Type: text/plain, Size: 549 bytes --]

Hi guys,
I've prepared a patch which enables BMI2 extensions in GCC

It conforms (hopefully) to Spec which can be found at [1]

I am attaching following files:
 - bmi2.gcc.patch. Bunch of changes to GCC
 - ChangeLog. Entry for ChangeLog in GCC's root directory
 - ChangeLog.testsuite. Entry for ChangeLog in GCC's test suite

Bootstrap is passed
Make-check shows no new fails, my compile-time new tests are passed
Make-check under simulator causes all my new tests to pass

Is it OK for trunk?

[1] - http://software.intel.com/file/36945

Thanks, K

[-- Attachment #2: bmi2.gcc.patch --]
[-- Type: application/octet-stream, Size: 52585 bytes --]

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index b201835..99643d6 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
 
 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_SET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_SET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16
@@ -137,6 +138,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
 #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
 #define OPTION_MASK_ISA_BMI_UNSET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_UNSET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_UNSET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16
@@ -395,6 +397,19 @@ ix86_handle_option (struct gcc_options *opts,
 	}
       return true;
 
+    case OPT_mbmi2:
+      if (value)
+	{
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2_SET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_SET;
+	}
+      else
+	{
+	  opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_BMI2_UNSET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_UNSET;
+	}
+      return true;
+
     case OPT_mtbm:
       if (value)
 	{
diff --git a/gcc/config.gcc b/gcc/config.gcc
index b92ce3d..30cce99 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -352,7 +352,7 @@ i[34567]86-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -364,7 +364,7 @@ x86_64-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h
new file mode 100644
index 0000000..f3ffa52
--- /dev/null
+++ b/gcc/config/i386/bmi2intrin.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di (__X, __Y);
+}
+
+#endif /* __x86_64__  */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index d53743f..5da8fd2 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -67,6 +67,7 @@
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
 #define bit_AVX2	(1 << 5)
+#define bit_BMI2	(1 << 8)
 
 #if defined(__i386__) && defined(__PIC__)
 /* %ebx may be the PIC register.  */
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index b7a1f52..8107ece 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -396,7 +396,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0, has_avx2 = 0;
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
-  unsigned int has_bmi = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
 
   bool arch;
 
@@ -475,6 +475,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 
       has_bmi = ebx & bit_BMI;
       has_avx2 = ebx & bit_AVX2;
+      has_bmi2 = ebx & bit_BMI2;
     }
 
   if (!arch)
@@ -715,6 +716,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *fma4 = has_fma4 ? " -mfma4" : " -mno-fma4";
       const char *xop = has_xop ? " -mxop" : " -mno-xop";
       const char *bmi = has_bmi ? " -mbmi" : " -mno-bmi";
+      const char *bmi2 = has_bmi2 ? " -mbmi2" : " -mno-bmi2";
       const char *tbm = has_tbm ? " -mtbm" : " -mno-tbm";
       const char *avx = has_avx ? " -mavx" : " -mno-avx";
       const char *avx2 = has_avx2 ? " -mavx2" : " -mno-avx2";
@@ -723,8 +725,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
-			popcnt, abm, lwp, fma, fma4, xop, bmi, tbm,
-			avx2, avx, sse4_2, sse4_1, lzcnt, NULL);
+			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 5c1dfe6..d4b0b08 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -273,6 +273,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__ABM__");
   if (isa_flag & OPTION_MASK_ISA_BMI)
     def_or_undef (parse_in, "__BMI__");
+  if (isa_flag & OPTION_MASK_ISA_BMI2)
+    def_or_undef (parse_in, "__BMI2__");
   if (isa_flag & OPTION_MASK_ISA_LZCNT)
     def_or_undef (parse_in, "__LZCNT__");
   if (isa_flag & OPTION_MASK_ISA_TBM)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 53c5944..e8f229a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2664,6 +2664,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mmmx",		OPTION_MASK_ISA_MMX },
     { "-mabm",		OPTION_MASK_ISA_ABM },
     { "-mbmi",		OPTION_MASK_ISA_BMI },
+    { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
@@ -2921,6 +2922,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
 #define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
+#define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -2978,8 +2980,8 @@ ix86_option_override_internal (bool main_args_p)
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
-	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_LZCNT | PTA_FMA
-	| PTA_MOVBE},
+	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
+        | PTA_FMA | PTA_MOVBE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3300,6 +3302,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_TBM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_TBM;
+	if (processor_alias_table[i].flags & PTA_BMI2
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
+	  ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
 	if (processor_alias_table[i].flags & PTA_CX16
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
@@ -4053,6 +4058,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
     IX86_ATTR_ISA ("abm",	OPT_mabm),
     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
+    IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
     IX86_ATTR_ISA ("aes",	OPT_maes),
@@ -13285,6 +13291,7 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'N', print the half mode high register.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */
 
@@ -13294,6 +13301,15 @@ print_reg (rtx x, int code, FILE *file)
   const char *reg;
   bool duplicated = code == 'd' && TARGET_AVX;
 
+  if (code == 'N')
+    {
+      enum machine_mode mode = GET_MODE (x);
+      enum machine_mode half_mode = mode == TImode ? DImode : SImode;
+      x = simplify_gen_subreg (half_mode, x, mode,
+			       GET_MODE_SIZE (half_mode));
+      code = 0;
+    }
+
   gcc_assert (x == pc_rtx
 	      || (REGNO (x) != ARG_POINTER_REGNUM
 		  && REGNO (x) != FRAME_POINTER_REGNUM
@@ -13472,6 +13488,7 @@ get_some_local_dynamic_name (void)
    t --  likewise, print the V8SFmode name of the register.
    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
    y -- print "st(0)" instead of "st" as a register.
+   N -- print the half mode high register.
    d -- print duplicated register operand for AVX instruction.
    D -- print condition for SSE cmp instruction.
    P -- if PIC, print an @PLT suffix.
@@ -13678,6 +13695,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
 	case 'h':
 	case 't':
 	case 'y':
+	case 'N':
 	case 'x':
 	case 'X':
 	case 'P':
@@ -15745,8 +15763,20 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
     }
   else
     {
-      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
-      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+      rtx insn;
+      if (code == ROTATERT
+	  && TARGET_BMI2
+	  && !optimize_function_for_size_p (cfun)
+	  && ((mode == SImode) || (mode == DImode && TARGET_64BIT))
+	  && CONST_INT_P (src2) ) {
+        /* We generatin RORX instruction, freedom of register +
+	   flags not affected  */
+	insn = op;
+      } else {
+	clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+	insn = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob));
+      }
+      emit_insn (insn);
     }
 
   /* Fix up the destination if needed.  */
@@ -24072,6 +24102,13 @@ enum ix86_builtins
   IX86_BUILTIN_BEXTRI32,
   IX86_BUILTIN_BEXTRI64,
 
+  /* BMI2 instructions. */
+  IX86_BUILTIN_BZHI32,
+  IX86_BUILTIN_BZHI64,
+  IX86_BUILTIN_PDEP32,
+  IX86_BUILTIN_PDEP64,
+  IX86_BUILTIN_PEXT32,
+  IX86_BUILTIN_PEXT64,
 
   /* FSGSBASE instructions.  */
   IX86_BUILTIN_RDFSBASE32,
@@ -25046,6 +25083,14 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
+
+  /* BMI2 */
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
 };
 
 /* FMA4 and XOP.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f8a35ba..47442a0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -62,6 +62,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_ROUND	OPTION_ISA_ROUND
 #define TARGET_ABM	OPTION_ISA_ABM
 #define TARGET_BMI	OPTION_ISA_BMI
+#define TARGET_BMI2	OPTION_ISA_BMI2
 #define TARGET_LZCNT	OPTION_ISA_LZCNT
 #define TARGET_TBM	OPTION_ISA_TBM
 #define TARGET_POPCNT	OPTION_ISA_POPCNT
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7ae397..3fa6b5e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -236,6 +236,11 @@
 
   ;; For RDRAND support
   UNSPEC_RDRAND
+
+  ;; For BMI2 support
+  UNSPEC_BZHI
+  UNSPEC_PDEP
+  UNSPEC_PEXT
 ])
 
 (define_c_enum "unspecv" [
@@ -751,14 +756,17 @@
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
+;; Mapping of shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
 ;; Base name for define_insn
-(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")])
+(define_code_attr shift_insn [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
 
 ;; Base name for insn mnemonic.
-(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "shl") (lshiftrt "shr") (ashiftrt "sar")])
 
 ;; Mapping of rotate operators
 (define_code_iterator any_rotate [rotate rotatert])
@@ -777,6 +785,8 @@
 
 ;; Used in signed and unsigned widening multiplications.
 (define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr any_extend [(sign_extend "SIGN_EXTEND")
+			      (zero_extend "ZERO_EXTEND")])
 
 ;; Various insn prefixes for signed and unsigned operations.
 (define_code_attr u [(sign_extend "") (zero_extend "u")
@@ -6837,7 +6847,17 @@
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
 		     (any_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
-	      (clobber (reg:CC FLAGS_REG))])])
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI2 && <any_extend> == ZERO_EXTEND)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})
 
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand" "")
@@ -6849,6 +6869,24 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
+(define_insn "bmi2_umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "register_operand" "d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
+  "TARGET_BMI2"
+{
+  if (<MODE>mode == DImode)
+    return "mulx\t{%2, %q0, %N0|%N0, %q0, %2}";
+  else
+    return "mulx\t{%2, %k0, %N0|%N0, %k0, %2}";
+}
+  [(set_attr "type" "imul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<u>mul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
@@ -9587,15 +9625,43 @@
 
 ;; See comment above `ashl<mode>3' about how this works.
 
-(define_expand "<shiftrt_insn><mode>3"
+(define_expand "<shift_insn><mode>3"
   [(set (match_operand:SDWIM 0 "<shift_operand>" "")
 	(any_shiftrt:SDWIM (match_operand:SDWIM 1 "<shift_operand>" "")
 			   (match_operand:QI 2 "nonmemory_operand" "")))]
   ""
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
+;; Update pattern if BMI2 is available
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "")
+	  (subreg:QI
+	      (match_operand:SI 2 "register_operand" "") 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && !reload_completed"
+  [(set (match_dup 0)
+        (any_shift:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode) {
+    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+    emit_insn (gen_extendsidi2 (tmp, operands[2]));
+    operands[2] = tmp;
+  }
+})
+
+(define_insn "*bmi2_<shift_insn><mode>3"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(any_shift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			 (match_operand:SWI48 2 "register_operand" "r")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "<shift>x\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "<MODE>")])
+
 ;; Avoid useless masking of count operand.
-(define_insn_and_split "*<shiftrt_insn><mode>3_mask"
+(define_insn_and_split "*<shift_insn><mode>3_mask"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
 	(any_shiftrt:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
@@ -9621,7 +9687,7 @@
   [(set_attr "type" "ishift")
    (set_attr "mode" "<MODE>")])
 
-(define_insn_and_split "*<shiftrt_insn><mode>3_doubleword"
+(define_insn_and_split "*<shift_insn><mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=r")
 	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "<S>c")))
@@ -9630,7 +9696,7 @@
   "#"
   "(optimize && flag_peephole2) ? epilogue_completed : reload_completed"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
+  "ix86_split_<shift_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
   [(set_attr "type" "multi")])
 
 ;; By default we don't ask for a scratch register, because when DWImode
@@ -9647,7 +9713,7 @@
    (match_dup 3)]
   "TARGET_CMOVE"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, operands[3], <DWI>mode); DONE;")
+  "ix86_split_<shift_insn> (operands, operands[3], <DWI>mode); DONE;")
 
 (define_insn "x86_64_shrd"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
@@ -9763,7 +9829,7 @@
   DONE;
 })
 
-(define_insn "*<shiftrt_insn><mode>3_1"
+(define_insn "*<shift_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "c<S>")))
@@ -9772,9 +9838,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9786,7 +9852,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_1_zext"
+(define_insn "*<shift_insn>si3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9796,9 +9862,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9810,7 +9876,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn>qi3_1_slp"
+(define_insn "*<shift_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
 	(any_shiftrt:QI (match_dup 0)
 			(match_operand:QI 1 "nonmemory_operand" "cI")))
@@ -9822,9 +9888,9 @@
 {
   if (operands[1] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{b}\t%0";
+    return "<shift>{b}\t%0";
   else
-    return "<shiftrt>{b}\t{%1, %0|%0, %1}";
+    return "<shift>{b}\t{%1, %0|%0, %1}";
 }
   [(set_attr "type" "ishift1")
    (set (attr "length_immediate")
@@ -9839,7 +9905,7 @@
 ;; This pattern can't accept a variable shift count, since shifts by
 ;; zero don't affect the flags.  We assume that shifts by constant
 ;; zero are optimized away.
-(define_insn "*<shiftrt_insn><mode>3_cmp"
+(define_insn "*<shift_insn><mode>3_cmp"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9857,9 +9923,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9871,7 +9937,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_cmp_zext"
+(define_insn "*<shift_insn>si3_cmp_zext"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9889,9 +9955,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9903,7 +9969,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn><mode>3_cconly"
+(define_insn "*<shift_insn><mode>3_cconly"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9919,9 +9985,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -10060,6 +10126,15 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "c<S>")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (ROTATERT, <MODE>mode, operands)"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotate")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<rotate_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
@@ -12346,6 +12421,42 @@
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
    (set_attr "mode" "HI")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_BZHI))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2"
+  "bzhi\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pdep_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PDEP))]
+  "TARGET_BMI2"
+  "pdep\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pext_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PEXT))]
+  "TARGET_BMI2"
+  "pext\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 \f
 ;; Thread-local storage patterns for ELF.
 ;;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 54d7af1..8e4d51b 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -493,6 +493,10 @@ mbmi
 Target Report Mask(ISA_BMI) Var(ix86_isa_flags) Save
 Support BMI built-in functions and code generation
 
+mbmi2
+Target Report Mask(ISA_BMI2) Var(ix86_isa_flags) Save
+Support BMI2 built-in functions and code generation
+
 mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h
index 88456f9..e01ecd2 100644
--- a/gcc/config/i386/x86intrin.h
+++ b/gcc/config/i386/x86intrin.h
@@ -81,6 +81,10 @@
 #include <bmiintrin.h>
 #endif
 
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
 #ifdef __TBM__
 #include <tbmintrin.h>
 #endif
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 786c18d..1900276 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -9693,6 +9693,17 @@ unsigned int __builtin_ia32_bextr_u32(unsigned int, unsigned int);
 unsigned long long __builtin_ia32_bextr_u64 (unsigned long long, unsigned long long);
 @end smallexample
 
+The following built-in functions are available when @option{-mbmi2} is used.
+All of them generate the machine instruction that is part of the name.
+@smallexample
+unsigned int _bzhi_u32 (unsigned int, unsigned int)
+unsigned int _pdep_u32 (unsigned int, unsigned int)
+unsigned int _pext_u32 (unsigned int, unsigned int)
+unsigned long long _bzhi_u64 (unsigned long long, unsigned long long)
+unsigned long long _pdep_u64 (unsigned long long, unsigned long long)
+unsigned long long _pext_u64 (unsigned long long, unsigned long long)
+@end smallexample
+
 The following built-in functions are available when @option{-mlzcnt} is used.
 All of them generate the machine instruction that is part of the name.
 @smallexample
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fdc3297..acf30e3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -607,7 +607,7 @@ Objective-C and Objective-C++ Dialects}.
 -mmmx  -msse  -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
 -mavx2 -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfma @gol
 -msse4a -m3dnow -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop -mlzcnt @gol
--mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
+-mbmi2 -mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double  -mregparm=@var{num}  -msseregparm @gol
@@ -12697,7 +12697,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @itemx -mabm
 @itemx -mno-abm
 @itemx -mbmi
+@itemx -mbmi2
 @itemx -mno-bmi
+@itemx -mno-bmi2
 @itemx -mlzcnt
 @itemx -mno-lzcnt
 @itemx -mtbm
@@ -12709,8 +12711,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @opindex m3dnow
 @opindex mno-3dnow
 These switches enable or disable the use of instructions in the MMX, SSE,
-SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C, FMA,
-SSE4A, FMA4, XOP, LWP, ABM, BMI, LZCNT or 3DNow!@: extended instruction sets.
+SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C,
+FMA, SSE4A, FMA4, XOP, LWP, ABM, BMI, BMI2, LZCNT or 3DNow!
+@: extended instruction sets.
 These extensions are also available as built-in functions: see
 @ref{X86 Built-in Functions}, for details of the functions enabled and
 disabled by these switches.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index ed183c7..5f2eaf9 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 626f972..76d4d19 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
new file mode 100644
index 0000000..aae2353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<32-l; ++i)
+    res &= ~(1 << (31 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0f;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u32 (src, i * 2);
+    res = _bzhi_u32 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
new file mode 100644
index 0000000..79e47a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+#include "bmi2-bzhi32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
new file mode 100644
index 0000000..8db29db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<64-l; ++i)
+    res &= ~(1LL << (63 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0ff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u64 (src, i * 2);
+    res = _bzhi_u64 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
new file mode 100644
index 0000000..dc4a94c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-bzhi64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-check.h b/gcc/testsuite/gcc.target/i386/bmi2-check.h
new file mode 100644
index 0000000..5ffce44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-check.h
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cpuid.h"
+
+static void bmi2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+  bmi2_test ();
+}
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  __cpuid_count (7, 0,  eax, ebx, ecx, edx);
+
+  /* Run BMI2 test only if host has BMI2 support.  */
+  if (ebx & bit_BMI2)
+    {
+      do_test ();
+#ifdef DEBUG
+      printf ("PASSED\n");
+#endif
+    }
+#ifdef DEBUG
+  else
+    printf ("SKIPPED\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
new file mode 100644
index 0000000..e1d49de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { bmi2 && { ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned a, unsigned b)
+{
+  unsigned long long res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i)
+    res += (unsigned long long)(dummy? 0 : a);
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned a = 0xce7ace0;
+  unsigned b = 0xfacefff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u32 (a, b);
+    res = (unsigned long long)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
new file mode 100644
index 0000000..cf3bb08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulsidi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
new file mode 100644
index 0000000..ded3dc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long a, unsigned long long b)
+{
+  unsigned __int128 res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i) {
+    /* Block loop opts  */
+    res += (unsigned __int128)(dummy? 0 : a);
+  }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long a = 0xce7ace0ce7ace0;
+  unsigned long long b = 0xface;
+  unsigned __int128 res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u64 (a, b);
+    res = (unsigned __int128)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
new file mode 100644
index 0000000..592d713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulditi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
new file mode 100644
index 0000000..e44a968
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << k)) >> k) << i;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u32 (src, i*3);
+    res = _pdep_u32 (src, i*3);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
new file mode 100644
index 0000000..87888fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
new file mode 100644
index 0000000..c0074fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  unsigned long long i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << k)) >> k) << i;
+      ++k;
+    }
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u64 (src, ~(i * 3));
+    res = _pdep_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
new file mode 100644
index 0000000..8163c40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
new file mode 100644
index 0000000..f21029f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u32 (src, ~(i * 3));
+    res = _pext_u32 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
new file mode 100644
index 0000000..c4a6dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
new file mode 100644
index 0000000..bad0584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  int i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u64 (src, ~(i * 3));
+    res = _pext_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
new file mode 100644
index 0000000..aaf06c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
new file mode 100644
index 0000000..84618e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_rorx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 31);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x0e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_rorx_u32 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (32 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
new file mode 100644
index 0000000..bb3b28d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxsi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
new file mode 100644
index 0000000..7dc722a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_rorx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 63);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x1e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_rorx_u64 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (64 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
new file mode 100644
index 0000000..2a7a7a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxdi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
new file mode 100644
index 0000000..2bbf016
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_sarx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_sarx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
new file mode 100644
index 0000000..f10d60b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
new file mode 100644
index 0000000..0bb13c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+long long
+calc_sarx_u64 (long long a, int l)
+{
+  long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  long long src = 0xfce7ace0ce7ace0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_sarx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
new file mode 100644
index 0000000..bcf0fd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
new file mode 100644
index 0000000..3f35047
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_shlx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res <<= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shlx_u32 (src, i + 1);
+    res = src << (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
new file mode 100644
index 0000000..215e5d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shlx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashlsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
new file mode 100644
index 0000000..17f0c67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_shrx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shrx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
new file mode 100644
index 0000000..24c53d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
new file mode 100644
index 0000000..022baa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_shrx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_shrx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
new file mode 100644
index 0000000..7830439
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp
index 167b79b..cff8a9a 100644
--- a/gcc/testsuite/gcc.target/i386/i386.exp
+++ b/gcc/testsuite/gcc.target/i386/i386.exp
@@ -206,6 +206,17 @@ proc check_effective_target_bmi { } {
     } "-mbmi" ]
 }
 
+# Return 1 if bmi2 instructions can be compiled.
+proc check_effective_target_bmi2 { } {
+    return [check_no_compiler_messages bmi2 object {
+	unsigned int
+	_bzhi_u32 (unsigned int __X, unsigned int __Y)
+	{
+	    return __builtin_ia32_bzhi_si (__X, __Y);
+	}
+    } "-mbmi2" ]
+}
+
 # If the linker used understands -M <mapfile>, pass it to clear hardware
 # capabilities set by the Sun assembler.
 set clearcap_ldflags "-Wl,-M,$srcdir/$subdir/clearcap.map"

[-- Attachment #3: ChangeLog --]
[-- Type: application/octet-stream, Size: 3015 bytes --]

2011-08-18  Kirill Yukhin  <kirill.yukhin@intel.com>

	* common/config/i386/i386-common.c (OPTION_MASK_ISA_BMI2_SET):
	New.
	(OPTION_MASK_ISA_BMI2_UNSET): Likewise.
	(ix86_handle_option): Handle OPT_mbmi2 case.
	* config.gcc (i[34567]86-*-*): Add bmi22intrin.h.
	(x86_64-*-*): Likewise.
	* config/i386/bmi2intrin.h: New file.
	* config/i386/cpuid.h (bit_BMI2): New.
	* config/i386/driver-i386.c (host_detect_local_cpu): Detect
	BMI2 feature.
	* config/i386/i386-c.c (ix86_target_macros_internal): Define
	__BMI2_ if needed.
	* config/i386/i386.c (ix86_option_override_internal): Handle
	BMI2 option, extend core-avx2.
	(ix86_valid_target_attribute_inner_p): Likewise.
	(print_reg): New code.
	(ix86_print_operand): Likewise.
	(ix86_expand_binary_operator): Generate pattern for BMI2's
	RORX.
	(ix86_builtins): Add IX86_BUILTIN_BZHI32, IX86_BUILTIN_BZHI64,
	IX86_BUILTIN_PDEP32, IX86_BUILTIN_PDEP64, IX86_BUILTIN_PEXT32,
	IX86_BUILTIN_PEXT64.
	(bdesc_args): Add IX86_BUILTIN_BZHI32, IX86_BUILTIN_BZHI64,
	IX86_BUILTIN_PDEP32, IX86_BUILTIN_PDEP64, IX86_BUILTIN_PEXT32,
	IX86_BUILTIN_PEXT64.
	* config/i386/i386.h (TARGET_BMI2): New.
	* config/i386/i386.md (UNSPEC_BZHI) New.
	(UNSPEC_PDEP): Likewise.
	(UNSPEC_PEXT): Likewise.
	(define_code_iterator any_shift): New.
	(define_code_attr shiftrt_insn): Rename to ...
	(define_code_attr shift_insn): ... this.
	(define_code_attr shiftrt): Likewise.
	(define_code_attr shift): Likewise.
	(any_extend): Update.
	(define_insn "bmi2_umul<mode><dwi>3_1"): New.
	(define_expand "<u>mul<mode><dwi>3"): Update.
	(define_expand "<shiftrt_insn><mode>3"): Rename to ...
	(define_expand "<shift_insn><mode>3"): ... this.
	(define_split)<any_shift, BMI2>: New.
	(define_insn "*bmi2_<shift_insn><mode>3"): Likewise.
	(define_insn_and_split "*<shiftrt_insn><mode>3_mask"): Rename
	to ...
	(define_insn_and_split "*<shift_insn><mode>3_mask"): ... this.
	(define_insn_and_split "*<shiftrt_insn><mode>3_doubleword"):
	Rename to ...
	(define_insn_and_split "*<shift_insn><mode>3_doubleword"):
	... this, update.
	(define_peephole2) <CMOVE>: Update.
	(define_insn "*<shiftrt_insn><mode>3_1"): Rename to ...
	(define_insn "*<shift_insn><mode>3_1"): ... this. Update.
	(define_insn "*<shiftrt_insn>si3_1_zext"): Likewise.
	(define_insn "*<shift_insn>si3_1_zext"): Likewise.
	(define_insn "*<shiftrt_insn><mode>3_cmp"): Likewise.
	(define_insn "*<shift_insn><mode>3_cmp"): Likewise.
	(define_insn "*<shiftrt_insn>si3_cmp_zext"): Likewise.
	(define_insn "*<shift_insn>si3_cmp_zext"): Likewise.
	(define_insn "*<shiftrt_insn><mode>3_cconly"): Likewise.
	(define_insn "*<shift_insn><mode>3_cconly"): Likewise.
	(define_insn "*bmi2_rorx<mode>3_1"): New.
	(define_insn "bmi2_bzhi_<mode>3"): Likewise.
	(define_insn "bmi2_pdep_<mode>3"): Likewise.
	(define_insn "bmi2_pext_<mode>3"): Likewise.
	* config/i386/i386.opt (mbmi2): New.
	* config/i386/x86intrin.h: Include bmi2intrin.h when __BMI2__
	is defined.
	* doc/extend.texi: Document BMI2 built-in functions.
	* doc/invoke.texi: Document -mbmi2.

[-- Attachment #4: ChangeLog.testsuite --]
[-- Type: application/octet-stream, Size: 1690 bytes --]

2011-08-18  Kirill Yukhin  <kirill.yukhin@intel.com>

	* g++.dg/other/i386-2.C: Add -mbmi2 check.
	* g++.dg/other/i386-3.C: Likewise.
	* gcc.target/i386/bmi2-bzhi32-1.c: New testcase.
	* gcc.target/i386/bmi2-bzhi32-1a.c: Likewise.
	* gcc.target/i386/bmi2-bzhi64-1.c: Likewise.
	* gcc.target/i386/bmi2-bzhi64-1a.c: Likewise.
	* gcc.target/i386/bmi2-mulx32-1.c: Likewise.
	* gcc.target/i386/bmi2-mulx32-1a.c: Likewise.
	* gcc.target/i386/bmi2-mulx64-1.c: Likewise.
	* gcc.target/i386/bmi2-mulx64-1a.c: Likewise.
	* gcc.target/i386/bmi2-pdep32-1.c: Likewise.
	* gcc.target/i386/bmi2-pdep32-1a.c: Likewise.
	* gcc.target/i386/bmi2-pdep64-1.c: Likewise.
	* gcc.target/i386/bmi2-pdep64-1a.c: Likewise.
	* gcc.target/i386/bmi2-pext32-1.c: Likewise.
	* gcc.target/i386/bmi2-pext32-1a.c: Likewise.
	* gcc.target/i386/bmi2-pext64-1.c: Likewise.
	* gcc.target/i386/bmi2-pext64-1a.c: Likewise.
	* gcc.target/i386/bmi2-rorx32-1.c: Likewise.
	* gcc.target/i386/bmi2-rorx32-1a.c: Likewise.
	* gcc.target/i386/bmi2-rorx64-1.c: Likewise.
	* gcc.target/i386/bmi2-rorx64-1a.c: Likewise.
	* gcc.target/i386/bmi2-sarx32-1.c: Likewise.
	* gcc.target/i386/bmi2-sarx32-1a.c: Likewise.
	* gcc.target/i386/bmi2-sarx64-1.c: Likewise.
	* gcc.target/i386/bmi2-sarx64-1a.c: Likewise.
	* gcc.target/i386/bmi2-shlx32-1.c: Likewise.
	* gcc.target/i386/bmi2-shlx32-1a.c: Likewise.
	* gcc.target/i386/bmi2-shlx64-1.c: Likewise.
	* gcc.target/i386/bmi2-shlx64-1a.c: Likewise.
	* gcc.target/i386/bmi2-shrx32-1.c: Likewise.
	* gcc.target/i386/bmi2-shrx32-1a.c: Likewise.
	* gcc.target/i386/bmi2-shrx64-1.c: Likewise.
	* gcc.target/i386/bmi2-shrx64-1a.c: Likewise.
	* gcc.target/i386/i386.exp (check_effective_target_bmi2): New.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 12:27 [PATCH, testsuite, i386] BMI2 support for GCC Kirill Yukhin
@ 2011-08-19 13:26 ` H.J. Lu
  2011-08-19 14:05   ` Kirill Yukhin
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2011-08-19 13:26 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: gcc-patches List, Uros Bizjak

On Fri, Aug 19, 2011 at 2:23 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
> Hi guys,
> I've prepared a patch which enables BMI2 extensions in GCC
>
> It conforms (hopefully) to Spec which can be found at [1]
>
> I am attaching following files:
>  - bmi2.gcc.patch. Bunch of changes to GCC
>  - ChangeLog. Entry for ChangeLog in GCC's root directory
>  - ChangeLog.testsuite. Entry for ChangeLog in GCC's test suite
>
> Bootstrap is passed
> Make-check shows no new fails, my compile-time new tests are passed
> Make-check under simulator causes all my new tests to pass
>
> Is it OK for trunk?
>
> [1] - http://software.intel.com/file/36945
>
> Thanks, K
>

Incorrect format:

+	  && CONST_INT_P (src2) ) {
+        /* We generatin RORX instruction, freedom of register +
+	   flags not affected  */
+	insn = op;
+      } else {
+	clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+	insn = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob));
+      }

+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode) {
+    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+    emit_insn (gen_extendsidi2 (tmp, operands[2]));
+    operands[2] = tmp;
+  }


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 13:26 ` H.J. Lu
@ 2011-08-19 14:05   ` Kirill Yukhin
  2011-08-19 14:13     ` Jakub Jelinek
  0 siblings, 1 reply; 27+ messages in thread
From: Kirill Yukhin @ 2011-08-19 14:05 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches List, Uros Bizjak

[-- Attachment #1: Type: text/plain, Size: 1457 bytes --]

Thanks, it is fixed.
Update patch is attached.
Is it ok?

Thanks, K

On Fri, Aug 19, 2011 at 4:48 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Fri, Aug 19, 2011 at 2:23 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>> Hi guys,
>> I've prepared a patch which enables BMI2 extensions in GCC
>>
>> It conforms (hopefully) to Spec which can be found at [1]
>>
>> I am attaching following files:
>>  - bmi2.gcc.patch. Bunch of changes to GCC
>>  - ChangeLog. Entry for ChangeLog in GCC's root directory
>>  - ChangeLog.testsuite. Entry for ChangeLog in GCC's test suite
>>
>> Bootstrap is passed
>> Make-check shows no new fails, my compile-time new tests are passed
>> Make-check under simulator causes all my new tests to pass
>>
>> Is it OK for trunk?
>>
>> [1] - http://software.intel.com/file/36945
>>
>> Thanks, K
>>
>
> Incorrect format:
>
> +         && CONST_INT_P (src2) ) {
> +        /* We generatin RORX instruction, freedom of register +
> +          flags not affected  */
> +       insn = op;
> +      } else {
> +       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
> +       insn = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob));
> +      }
>
> +{
> +  if (can_create_pseudo_p () && <MODE>mode != SImode) {
> +    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
> +    emit_insn (gen_extendsidi2 (tmp, operands[2]));
> +    operands[2] = tmp;
> +  }
>
>
> --
> H.J.
>

[-- Attachment #2: bmi2-2.gcc.patch --]
[-- Type: application/octet-stream, Size: 52589 bytes --]

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index b201835..99643d6 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
 
 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_SET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_SET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16
@@ -137,6 +138,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
 #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
 #define OPTION_MASK_ISA_BMI_UNSET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_UNSET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_UNSET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16
@@ -395,6 +397,19 @@ ix86_handle_option (struct gcc_options *opts,
 	}
       return true;
 
+    case OPT_mbmi2:
+      if (value)
+	{
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2_SET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_SET;
+	}
+      else
+	{
+	  opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_BMI2_UNSET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_UNSET;
+	}
+      return true;
+
     case OPT_mtbm:
       if (value)
 	{
diff --git a/gcc/config.gcc b/gcc/config.gcc
index b92ce3d..30cce99 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -352,7 +352,7 @@ i[34567]86-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -364,7 +364,7 @@ x86_64-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h
new file mode 100644
index 0000000..f3ffa52
--- /dev/null
+++ b/gcc/config/i386/bmi2intrin.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di (__X, __Y);
+}
+
+#endif /* __x86_64__  */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index d53743f..5da8fd2 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -67,6 +67,7 @@
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
 #define bit_AVX2	(1 << 5)
+#define bit_BMI2	(1 << 8)
 
 #if defined(__i386__) && defined(__PIC__)
 /* %ebx may be the PIC register.  */
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index b7a1f52..8107ece 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -396,7 +396,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0, has_avx2 = 0;
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
-  unsigned int has_bmi = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
 
   bool arch;
 
@@ -475,6 +475,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 
       has_bmi = ebx & bit_BMI;
       has_avx2 = ebx & bit_AVX2;
+      has_bmi2 = ebx & bit_BMI2;
     }
 
   if (!arch)
@@ -715,6 +716,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *fma4 = has_fma4 ? " -mfma4" : " -mno-fma4";
       const char *xop = has_xop ? " -mxop" : " -mno-xop";
       const char *bmi = has_bmi ? " -mbmi" : " -mno-bmi";
+      const char *bmi2 = has_bmi2 ? " -mbmi2" : " -mno-bmi2";
       const char *tbm = has_tbm ? " -mtbm" : " -mno-tbm";
       const char *avx = has_avx ? " -mavx" : " -mno-avx";
       const char *avx2 = has_avx2 ? " -mavx2" : " -mno-avx2";
@@ -723,8 +725,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
-			popcnt, abm, lwp, fma, fma4, xop, bmi, tbm,
-			avx2, avx, sse4_2, sse4_1, lzcnt, NULL);
+			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 5c1dfe6..d4b0b08 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -273,6 +273,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__ABM__");
   if (isa_flag & OPTION_MASK_ISA_BMI)
     def_or_undef (parse_in, "__BMI__");
+  if (isa_flag & OPTION_MASK_ISA_BMI2)
+    def_or_undef (parse_in, "__BMI2__");
   if (isa_flag & OPTION_MASK_ISA_LZCNT)
     def_or_undef (parse_in, "__LZCNT__");
   if (isa_flag & OPTION_MASK_ISA_TBM)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 53c5944..8a0a5f6 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2664,6 +2664,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mmmx",		OPTION_MASK_ISA_MMX },
     { "-mabm",		OPTION_MASK_ISA_ABM },
     { "-mbmi",		OPTION_MASK_ISA_BMI },
+    { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
@@ -2921,6 +2922,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
 #define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
+#define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -2978,8 +2980,8 @@ ix86_option_override_internal (bool main_args_p)
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
-	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_LZCNT | PTA_FMA
-	| PTA_MOVBE},
+	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
+        | PTA_FMA | PTA_MOVBE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3300,6 +3302,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_TBM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_TBM;
+	if (processor_alias_table[i].flags & PTA_BMI2
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
+	  ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
 	if (processor_alias_table[i].flags & PTA_CX16
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
@@ -4053,6 +4058,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
     IX86_ATTR_ISA ("abm",	OPT_mabm),
     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
+    IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
     IX86_ATTR_ISA ("aes",	OPT_maes),
@@ -13285,6 +13291,7 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'N', print the half mode high register.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */
 
@@ -13294,6 +13301,15 @@ print_reg (rtx x, int code, FILE *file)
   const char *reg;
   bool duplicated = code == 'd' && TARGET_AVX;
 
+  if (code == 'N')
+    {
+      enum machine_mode mode = GET_MODE (x);
+      enum machine_mode half_mode = mode == TImode ? DImode : SImode;
+      x = simplify_gen_subreg (half_mode, x, mode,
+			       GET_MODE_SIZE (half_mode));
+      code = 0;
+    }
+
   gcc_assert (x == pc_rtx
 	      || (REGNO (x) != ARG_POINTER_REGNUM
 		  && REGNO (x) != FRAME_POINTER_REGNUM
@@ -13472,6 +13488,7 @@ get_some_local_dynamic_name (void)
    t --  likewise, print the V8SFmode name of the register.
    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
    y -- print "st(0)" instead of "st" as a register.
+   N -- print the half mode high register.
    d -- print duplicated register operand for AVX instruction.
    D -- print condition for SSE cmp instruction.
    P -- if PIC, print an @PLT suffix.
@@ -13678,6 +13695,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
 	case 'h':
 	case 't':
 	case 'y':
+	case 'N':
 	case 'x':
 	case 'X':
 	case 'P':
@@ -15745,8 +15763,23 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
     }
   else
     {
-      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
-      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+      rtx insn;
+      if (code == ROTATERT
+	  && TARGET_BMI2
+	  && !optimize_function_for_size_p (cfun)
+	  && ((mode == SImode) || (mode == DImode && TARGET_64BIT))
+	  && CONST_INT_P (src2) )
+	{
+	  /* We generatin RORX instruction, freedom of register +
+	     flags not affected  */
+	  insn = op;
+	}
+      else
+	{
+	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+	  insn = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob));
+	}
+      emit_insn (insn);
     }
 
   /* Fix up the destination if needed.  */
@@ -24072,6 +24105,13 @@ enum ix86_builtins
   IX86_BUILTIN_BEXTRI32,
   IX86_BUILTIN_BEXTRI64,
 
+  /* BMI2 instructions. */
+  IX86_BUILTIN_BZHI32,
+  IX86_BUILTIN_BZHI64,
+  IX86_BUILTIN_PDEP32,
+  IX86_BUILTIN_PDEP64,
+  IX86_BUILTIN_PEXT32,
+  IX86_BUILTIN_PEXT64,
 
   /* FSGSBASE instructions.  */
   IX86_BUILTIN_RDFSBASE32,
@@ -25046,6 +25086,14 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
+
+  /* BMI2 */
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
 };
 
 /* FMA4 and XOP.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f8a35ba..47442a0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -62,6 +62,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_ROUND	OPTION_ISA_ROUND
 #define TARGET_ABM	OPTION_ISA_ABM
 #define TARGET_BMI	OPTION_ISA_BMI
+#define TARGET_BMI2	OPTION_ISA_BMI2
 #define TARGET_LZCNT	OPTION_ISA_LZCNT
 #define TARGET_TBM	OPTION_ISA_TBM
 #define TARGET_POPCNT	OPTION_ISA_POPCNT
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7ae397..3fa6b5e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -236,6 +236,11 @@
 
   ;; For RDRAND support
   UNSPEC_RDRAND
+
+  ;; For BMI2 support
+  UNSPEC_BZHI
+  UNSPEC_PDEP
+  UNSPEC_PEXT
 ])
 
 (define_c_enum "unspecv" [
@@ -751,14 +756,17 @@
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
+;; Mapping of shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
 ;; Base name for define_insn
-(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")])
+(define_code_attr shift_insn [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
 
 ;; Base name for insn mnemonic.
-(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "shl") (lshiftrt "shr") (ashiftrt "sar")])
 
 ;; Mapping of rotate operators
 (define_code_iterator any_rotate [rotate rotatert])
@@ -777,6 +785,8 @@
 
 ;; Used in signed and unsigned widening multiplications.
 (define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr any_extend [(sign_extend "SIGN_EXTEND")
+			      (zero_extend "ZERO_EXTEND")])
 
 ;; Various insn prefixes for signed and unsigned operations.
 (define_code_attr u [(sign_extend "") (zero_extend "u")
@@ -6837,7 +6847,17 @@
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
 		     (any_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
-	      (clobber (reg:CC FLAGS_REG))])])
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI2 && <any_extend> == ZERO_EXTEND)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})
 
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand" "")
@@ -6849,6 +6869,24 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
+(define_insn "bmi2_umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "register_operand" "d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
+  "TARGET_BMI2"
+{
+  if (<MODE>mode == DImode)
+    return "mulx\t{%2, %q0, %N0|%N0, %q0, %2}";
+  else
+    return "mulx\t{%2, %k0, %N0|%N0, %k0, %2}";
+}
+  [(set_attr "type" "imul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<u>mul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
@@ -9587,15 +9625,43 @@
 
 ;; See comment above `ashl<mode>3' about how this works.
 
-(define_expand "<shiftrt_insn><mode>3"
+(define_expand "<shift_insn><mode>3"
   [(set (match_operand:SDWIM 0 "<shift_operand>" "")
 	(any_shiftrt:SDWIM (match_operand:SDWIM 1 "<shift_operand>" "")
 			   (match_operand:QI 2 "nonmemory_operand" "")))]
   ""
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
+;; Update pattern if BMI2 is available
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "")
+	  (subreg:QI
+	      (match_operand:SI 2 "register_operand" "") 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && !reload_completed"
+  [(set (match_dup 0)
+        (any_shift:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode) {
+    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+    emit_insn (gen_extendsidi2 (tmp, operands[2]));
+    operands[2] = tmp;
+  }
+})
+
+(define_insn "*bmi2_<shift_insn><mode>3"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(any_shift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			 (match_operand:SWI48 2 "register_operand" "r")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "<shift>x\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "<MODE>")])
+
 ;; Avoid useless masking of count operand.
-(define_insn_and_split "*<shiftrt_insn><mode>3_mask"
+(define_insn_and_split "*<shift_insn><mode>3_mask"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
 	(any_shiftrt:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
@@ -9621,7 +9687,7 @@
   [(set_attr "type" "ishift")
    (set_attr "mode" "<MODE>")])
 
-(define_insn_and_split "*<shiftrt_insn><mode>3_doubleword"
+(define_insn_and_split "*<shift_insn><mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=r")
 	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "<S>c")))
@@ -9630,7 +9696,7 @@
   "#"
   "(optimize && flag_peephole2) ? epilogue_completed : reload_completed"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
+  "ix86_split_<shift_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
   [(set_attr "type" "multi")])
 
 ;; By default we don't ask for a scratch register, because when DWImode
@@ -9647,7 +9713,7 @@
    (match_dup 3)]
   "TARGET_CMOVE"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, operands[3], <DWI>mode); DONE;")
+  "ix86_split_<shift_insn> (operands, operands[3], <DWI>mode); DONE;")
 
 (define_insn "x86_64_shrd"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
@@ -9763,7 +9829,7 @@
   DONE;
 })
 
-(define_insn "*<shiftrt_insn><mode>3_1"
+(define_insn "*<shift_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "c<S>")))
@@ -9772,9 +9838,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9786,7 +9852,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_1_zext"
+(define_insn "*<shift_insn>si3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9796,9 +9862,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9810,7 +9876,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn>qi3_1_slp"
+(define_insn "*<shift_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
 	(any_shiftrt:QI (match_dup 0)
 			(match_operand:QI 1 "nonmemory_operand" "cI")))
@@ -9822,9 +9888,9 @@
 {
   if (operands[1] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{b}\t%0";
+    return "<shift>{b}\t%0";
   else
-    return "<shiftrt>{b}\t{%1, %0|%0, %1}";
+    return "<shift>{b}\t{%1, %0|%0, %1}";
 }
   [(set_attr "type" "ishift1")
    (set (attr "length_immediate")
@@ -9839,7 +9905,7 @@
 ;; This pattern can't accept a variable shift count, since shifts by
 ;; zero don't affect the flags.  We assume that shifts by constant
 ;; zero are optimized away.
-(define_insn "*<shiftrt_insn><mode>3_cmp"
+(define_insn "*<shift_insn><mode>3_cmp"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9857,9 +9923,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9871,7 +9937,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_cmp_zext"
+(define_insn "*<shift_insn>si3_cmp_zext"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9889,9 +9955,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9903,7 +9969,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn><mode>3_cconly"
+(define_insn "*<shift_insn><mode>3_cconly"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9919,9 +9985,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -10060,6 +10126,15 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "c<S>")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (ROTATERT, <MODE>mode, operands)"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotate")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<rotate_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
@@ -12346,6 +12421,42 @@
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
    (set_attr "mode" "HI")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_BZHI))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2"
+  "bzhi\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pdep_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PDEP))]
+  "TARGET_BMI2"
+  "pdep\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pext_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PEXT))]
+  "TARGET_BMI2"
+  "pext\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 \f
 ;; Thread-local storage patterns for ELF.
 ;;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 54d7af1..8e4d51b 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -493,6 +493,10 @@ mbmi
 Target Report Mask(ISA_BMI) Var(ix86_isa_flags) Save
 Support BMI built-in functions and code generation
 
+mbmi2
+Target Report Mask(ISA_BMI2) Var(ix86_isa_flags) Save
+Support BMI2 built-in functions and code generation
+
 mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h
index 88456f9..e01ecd2 100644
--- a/gcc/config/i386/x86intrin.h
+++ b/gcc/config/i386/x86intrin.h
@@ -81,6 +81,10 @@
 #include <bmiintrin.h>
 #endif
 
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
 #ifdef __TBM__
 #include <tbmintrin.h>
 #endif
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 786c18d..1900276 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -9693,6 +9693,17 @@ unsigned int __builtin_ia32_bextr_u32(unsigned int, unsigned int);
 unsigned long long __builtin_ia32_bextr_u64 (unsigned long long, unsigned long long);
 @end smallexample
 
+The following built-in functions are available when @option{-mbmi2} is used.
+All of them generate the machine instruction that is part of the name.
+@smallexample
+unsigned int _bzhi_u32 (unsigned int, unsigned int)
+unsigned int _pdep_u32 (unsigned int, unsigned int)
+unsigned int _pext_u32 (unsigned int, unsigned int)
+unsigned long long _bzhi_u64 (unsigned long long, unsigned long long)
+unsigned long long _pdep_u64 (unsigned long long, unsigned long long)
+unsigned long long _pext_u64 (unsigned long long, unsigned long long)
+@end smallexample
+
 The following built-in functions are available when @option{-mlzcnt} is used.
 All of them generate the machine instruction that is part of the name.
 @smallexample
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fdc3297..acf30e3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -607,7 +607,7 @@ Objective-C and Objective-C++ Dialects}.
 -mmmx  -msse  -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
 -mavx2 -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfma @gol
 -msse4a -m3dnow -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop -mlzcnt @gol
--mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
+-mbmi2 -mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double  -mregparm=@var{num}  -msseregparm @gol
@@ -12697,7 +12697,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @itemx -mabm
 @itemx -mno-abm
 @itemx -mbmi
+@itemx -mbmi2
 @itemx -mno-bmi
+@itemx -mno-bmi2
 @itemx -mlzcnt
 @itemx -mno-lzcnt
 @itemx -mtbm
@@ -12709,8 +12711,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @opindex m3dnow
 @opindex mno-3dnow
 These switches enable or disable the use of instructions in the MMX, SSE,
-SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C, FMA,
-SSE4A, FMA4, XOP, LWP, ABM, BMI, LZCNT or 3DNow!@: extended instruction sets.
+SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C,
+FMA, SSE4A, FMA4, XOP, LWP, ABM, BMI, BMI2, LZCNT or 3DNow!
+@: extended instruction sets.
 These extensions are also available as built-in functions: see
 @ref{X86 Built-in Functions}, for details of the functions enabled and
 disabled by these switches.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index ed183c7..5f2eaf9 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 626f972..76d4d19 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
new file mode 100644
index 0000000..aae2353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<32-l; ++i)
+    res &= ~(1 << (31 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0f;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u32 (src, i * 2);
+    res = _bzhi_u32 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
new file mode 100644
index 0000000..79e47a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+#include "bmi2-bzhi32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
new file mode 100644
index 0000000..8db29db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<64-l; ++i)
+    res &= ~(1LL << (63 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0ff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u64 (src, i * 2);
+    res = _bzhi_u64 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
new file mode 100644
index 0000000..dc4a94c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-bzhi64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-check.h b/gcc/testsuite/gcc.target/i386/bmi2-check.h
new file mode 100644
index 0000000..5ffce44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-check.h
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cpuid.h"
+
+static void bmi2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+  bmi2_test ();
+}
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  __cpuid_count (7, 0,  eax, ebx, ecx, edx);
+
+  /* Run BMI2 test only if host has BMI2 support.  */
+  if (ebx & bit_BMI2)
+    {
+      do_test ();
+#ifdef DEBUG
+      printf ("PASSED\n");
+#endif
+    }
+#ifdef DEBUG
+  else
+    printf ("SKIPPED\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
new file mode 100644
index 0000000..e1d49de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { bmi2 && { ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned a, unsigned b)
+{
+  unsigned long long res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i)
+    res += (unsigned long long)(dummy? 0 : a);
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned a = 0xce7ace0;
+  unsigned b = 0xfacefff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u32 (a, b);
+    res = (unsigned long long)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
new file mode 100644
index 0000000..cf3bb08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulsidi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
new file mode 100644
index 0000000..ded3dc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long a, unsigned long long b)
+{
+  unsigned __int128 res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i) {
+    /* Block loop opts  */
+    res += (unsigned __int128)(dummy? 0 : a);
+  }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long a = 0xce7ace0ce7ace0;
+  unsigned long long b = 0xface;
+  unsigned __int128 res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u64 (a, b);
+    res = (unsigned __int128)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
new file mode 100644
index 0000000..592d713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulditi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
new file mode 100644
index 0000000..e44a968
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << k)) >> k) << i;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u32 (src, i*3);
+    res = _pdep_u32 (src, i*3);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
new file mode 100644
index 0000000..87888fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
new file mode 100644
index 0000000..c0074fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  unsigned long long i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << k)) >> k) << i;
+      ++k;
+    }
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u64 (src, ~(i * 3));
+    res = _pdep_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
new file mode 100644
index 0000000..8163c40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
new file mode 100644
index 0000000..f21029f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u32 (src, ~(i * 3));
+    res = _pext_u32 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
new file mode 100644
index 0000000..c4a6dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
new file mode 100644
index 0000000..bad0584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  int i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u64 (src, ~(i * 3));
+    res = _pext_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
new file mode 100644
index 0000000..aaf06c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
new file mode 100644
index 0000000..84618e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_rorx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 31);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x0e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_rorx_u32 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (32 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
new file mode 100644
index 0000000..bb3b28d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxsi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
new file mode 100644
index 0000000..7dc722a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_rorx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 63);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x1e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_rorx_u64 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (64 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
new file mode 100644
index 0000000..2a7a7a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxdi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
new file mode 100644
index 0000000..2bbf016
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_sarx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_sarx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
new file mode 100644
index 0000000..f10d60b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
new file mode 100644
index 0000000..0bb13c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+long long
+calc_sarx_u64 (long long a, int l)
+{
+  long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  long long src = 0xfce7ace0ce7ace0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_sarx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
new file mode 100644
index 0000000..bcf0fd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
new file mode 100644
index 0000000..3f35047
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_shlx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res <<= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shlx_u32 (src, i + 1);
+    res = src << (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
new file mode 100644
index 0000000..215e5d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shlx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashlsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
new file mode 100644
index 0000000..17f0c67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_shrx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shrx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
new file mode 100644
index 0000000..24c53d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
new file mode 100644
index 0000000..022baa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_shrx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_shrx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
new file mode 100644
index 0000000..7830439
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp
index 167b79b..cff8a9a 100644
--- a/gcc/testsuite/gcc.target/i386/i386.exp
+++ b/gcc/testsuite/gcc.target/i386/i386.exp
@@ -206,6 +206,17 @@ proc check_effective_target_bmi { } {
     } "-mbmi" ]
 }
 
+# Return 1 if bmi2 instructions can be compiled.
+proc check_effective_target_bmi2 { } {
+    return [check_no_compiler_messages bmi2 object {
+	unsigned int
+	_bzhi_u32 (unsigned int __X, unsigned int __Y)
+	{
+	    return __builtin_ia32_bzhi_si (__X, __Y);
+	}
+    } "-mbmi2" ]
+}
+
 # If the linker used understands -M <mapfile>, pass it to clear hardware
 # capabilities set by the Sun assembler.
 set clearcap_ldflags "-Wl,-M,$srcdir/$subdir/clearcap.map"

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 14:05   ` Kirill Yukhin
@ 2011-08-19 14:13     ` Jakub Jelinek
  2011-08-19 14:34       ` Kirill Yukhin
  0 siblings, 1 reply; 27+ messages in thread
From: Jakub Jelinek @ 2011-08-19 14:13 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: H.J. Lu, gcc-patches List, Uros Bizjak

On Fri, Aug 19, 2011 at 05:18:19PM +0400, Kirill Yukhin wrote:
> Thanks, it is fixed.
> Update patch is attached.

+         /* We generatin RORX instruction, freedom of register +                                                                                  
+            flags not affected  */                                                                                                                

comment doesn't look to be correct english (missing verb, missing g at
the end of generating, missing dot at the end of sentence).

	Jakub

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 14:13     ` Jakub Jelinek
@ 2011-08-19 14:34       ` Kirill Yukhin
  2011-08-19 14:38         ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Kirill Yukhin @ 2011-08-19 14:34 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: H.J. Lu, gcc-patches List, Uros Bizjak

[-- Attachment #1: Type: text/plain, Size: 540 bytes --]

Thanks!
Fixed, updated patch is attached.

Is it ok?

Thanks, K

On Fri, Aug 19, 2011 at 5:22 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Fri, Aug 19, 2011 at 05:18:19PM +0400, Kirill Yukhin wrote:
>> Thanks, it is fixed.
>> Update patch is attached.
>
> +         /* We generatin RORX instruction, freedom of register +
> +            flags not affected  */
>
> comment doesn't look to be correct english (missing verb, missing g at
> the end of generating, missing dot at the end of sentence).
>
>        Jakub
>

[-- Attachment #2: bmi2-3.gcc.patch --]
[-- Type: application/octet-stream, Size: 52593 bytes --]

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index b201835..99643d6 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
 
 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_SET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_SET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16
@@ -137,6 +138,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
 #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
 #define OPTION_MASK_ISA_BMI_UNSET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_UNSET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_UNSET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16
@@ -395,6 +397,19 @@ ix86_handle_option (struct gcc_options *opts,
 	}
       return true;
 
+    case OPT_mbmi2:
+      if (value)
+	{
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2_SET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_SET;
+	}
+      else
+	{
+	  opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_BMI2_UNSET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_UNSET;
+	}
+      return true;
+
     case OPT_mtbm:
       if (value)
 	{
diff --git a/gcc/config.gcc b/gcc/config.gcc
index b92ce3d..30cce99 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -352,7 +352,7 @@ i[34567]86-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -364,7 +364,7 @@ x86_64-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h
new file mode 100644
index 0000000..f3ffa52
--- /dev/null
+++ b/gcc/config/i386/bmi2intrin.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di (__X, __Y);
+}
+
+#endif /* __x86_64__  */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index d53743f..5da8fd2 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -67,6 +67,7 @@
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
 #define bit_AVX2	(1 << 5)
+#define bit_BMI2	(1 << 8)
 
 #if defined(__i386__) && defined(__PIC__)
 /* %ebx may be the PIC register.  */
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index b7a1f52..8107ece 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -396,7 +396,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0, has_avx2 = 0;
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
-  unsigned int has_bmi = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
 
   bool arch;
 
@@ -475,6 +475,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 
       has_bmi = ebx & bit_BMI;
       has_avx2 = ebx & bit_AVX2;
+      has_bmi2 = ebx & bit_BMI2;
     }
 
   if (!arch)
@@ -715,6 +716,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *fma4 = has_fma4 ? " -mfma4" : " -mno-fma4";
       const char *xop = has_xop ? " -mxop" : " -mno-xop";
       const char *bmi = has_bmi ? " -mbmi" : " -mno-bmi";
+      const char *bmi2 = has_bmi2 ? " -mbmi2" : " -mno-bmi2";
       const char *tbm = has_tbm ? " -mtbm" : " -mno-tbm";
       const char *avx = has_avx ? " -mavx" : " -mno-avx";
       const char *avx2 = has_avx2 ? " -mavx2" : " -mno-avx2";
@@ -723,8 +725,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
-			popcnt, abm, lwp, fma, fma4, xop, bmi, tbm,
-			avx2, avx, sse4_2, sse4_1, lzcnt, NULL);
+			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 5c1dfe6..d4b0b08 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -273,6 +273,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__ABM__");
   if (isa_flag & OPTION_MASK_ISA_BMI)
     def_or_undef (parse_in, "__BMI__");
+  if (isa_flag & OPTION_MASK_ISA_BMI2)
+    def_or_undef (parse_in, "__BMI2__");
   if (isa_flag & OPTION_MASK_ISA_LZCNT)
     def_or_undef (parse_in, "__LZCNT__");
   if (isa_flag & OPTION_MASK_ISA_TBM)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 53c5944..bdfce4d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2664,6 +2664,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mmmx",		OPTION_MASK_ISA_MMX },
     { "-mabm",		OPTION_MASK_ISA_ABM },
     { "-mbmi",		OPTION_MASK_ISA_BMI },
+    { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
@@ -2921,6 +2922,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
 #define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
+#define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -2978,8 +2980,8 @@ ix86_option_override_internal (bool main_args_p)
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
-	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_LZCNT | PTA_FMA
-	| PTA_MOVBE},
+	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
+        | PTA_FMA | PTA_MOVBE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3300,6 +3302,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_TBM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_TBM;
+	if (processor_alias_table[i].flags & PTA_BMI2
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
+	  ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
 	if (processor_alias_table[i].flags & PTA_CX16
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
@@ -4053,6 +4058,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
     IX86_ATTR_ISA ("abm",	OPT_mabm),
     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
+    IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
     IX86_ATTR_ISA ("aes",	OPT_maes),
@@ -13285,6 +13291,7 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'N', print the half mode high register.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */
 
@@ -13294,6 +13301,15 @@ print_reg (rtx x, int code, FILE *file)
   const char *reg;
   bool duplicated = code == 'd' && TARGET_AVX;
 
+  if (code == 'N')
+    {
+      enum machine_mode mode = GET_MODE (x);
+      enum machine_mode half_mode = mode == TImode ? DImode : SImode;
+      x = simplify_gen_subreg (half_mode, x, mode,
+			       GET_MODE_SIZE (half_mode));
+      code = 0;
+    }
+
   gcc_assert (x == pc_rtx
 	      || (REGNO (x) != ARG_POINTER_REGNUM
 		  && REGNO (x) != FRAME_POINTER_REGNUM
@@ -13472,6 +13488,7 @@ get_some_local_dynamic_name (void)
    t --  likewise, print the V8SFmode name of the register.
    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
    y -- print "st(0)" instead of "st" as a register.
+   N -- print the half mode high register.
    d -- print duplicated register operand for AVX instruction.
    D -- print condition for SSE cmp instruction.
    P -- if PIC, print an @PLT suffix.
@@ -13678,6 +13695,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
 	case 'h':
 	case 't':
 	case 'y':
+	case 'N':
 	case 'x':
 	case 'X':
 	case 'P':
@@ -15745,8 +15763,23 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
     }
   else
     {
-      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
-      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+      rtx insn;
+      if (code == ROTATERT
+	  && TARGET_BMI2
+	  && !optimize_function_for_size_p (cfun)
+	  && ((mode == SImode) || (mode == DImode && TARGET_64BIT))
+	  && CONST_INT_P (src2) )
+	{
+	  /* We're generating RORX instruction: freedom of register +
+	     flags not affected  */
+	  insn = op;
+	}
+      else
+	{
+	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+	  insn = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob));
+	}
+      emit_insn (insn);
     }
 
   /* Fix up the destination if needed.  */
@@ -24072,6 +24105,13 @@ enum ix86_builtins
   IX86_BUILTIN_BEXTRI32,
   IX86_BUILTIN_BEXTRI64,
 
+  /* BMI2 instructions. */
+  IX86_BUILTIN_BZHI32,
+  IX86_BUILTIN_BZHI64,
+  IX86_BUILTIN_PDEP32,
+  IX86_BUILTIN_PDEP64,
+  IX86_BUILTIN_PEXT32,
+  IX86_BUILTIN_PEXT64,
 
   /* FSGSBASE instructions.  */
   IX86_BUILTIN_RDFSBASE32,
@@ -25046,6 +25086,14 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
+
+  /* BMI2 */
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
 };
 
 /* FMA4 and XOP.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f8a35ba..47442a0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -62,6 +62,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_ROUND	OPTION_ISA_ROUND
 #define TARGET_ABM	OPTION_ISA_ABM
 #define TARGET_BMI	OPTION_ISA_BMI
+#define TARGET_BMI2	OPTION_ISA_BMI2
 #define TARGET_LZCNT	OPTION_ISA_LZCNT
 #define TARGET_TBM	OPTION_ISA_TBM
 #define TARGET_POPCNT	OPTION_ISA_POPCNT
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7ae397..3fa6b5e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -236,6 +236,11 @@
 
   ;; For RDRAND support
   UNSPEC_RDRAND
+
+  ;; For BMI2 support
+  UNSPEC_BZHI
+  UNSPEC_PDEP
+  UNSPEC_PEXT
 ])
 
 (define_c_enum "unspecv" [
@@ -751,14 +756,17 @@
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
+;; Mapping of shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
 ;; Base name for define_insn
-(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")])
+(define_code_attr shift_insn [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
 
 ;; Base name for insn mnemonic.
-(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "shl") (lshiftrt "shr") (ashiftrt "sar")])
 
 ;; Mapping of rotate operators
 (define_code_iterator any_rotate [rotate rotatert])
@@ -777,6 +785,8 @@
 
 ;; Used in signed and unsigned widening multiplications.
 (define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr any_extend [(sign_extend "SIGN_EXTEND")
+			      (zero_extend "ZERO_EXTEND")])
 
 ;; Various insn prefixes for signed and unsigned operations.
 (define_code_attr u [(sign_extend "") (zero_extend "u")
@@ -6837,7 +6847,17 @@
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
 		     (any_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
-	      (clobber (reg:CC FLAGS_REG))])])
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI2 && <any_extend> == ZERO_EXTEND)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})
 
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand" "")
@@ -6849,6 +6869,24 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
+(define_insn "bmi2_umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "register_operand" "d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
+  "TARGET_BMI2"
+{
+  if (<MODE>mode == DImode)
+    return "mulx\t{%2, %q0, %N0|%N0, %q0, %2}";
+  else
+    return "mulx\t{%2, %k0, %N0|%N0, %k0, %2}";
+}
+  [(set_attr "type" "imul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<u>mul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
@@ -9587,15 +9625,43 @@
 
 ;; See comment above `ashl<mode>3' about how this works.
 
-(define_expand "<shiftrt_insn><mode>3"
+(define_expand "<shift_insn><mode>3"
   [(set (match_operand:SDWIM 0 "<shift_operand>" "")
 	(any_shiftrt:SDWIM (match_operand:SDWIM 1 "<shift_operand>" "")
 			   (match_operand:QI 2 "nonmemory_operand" "")))]
   ""
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
+;; Update pattern if BMI2 is available
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "")
+	  (subreg:QI
+	      (match_operand:SI 2 "register_operand" "") 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && !reload_completed"
+  [(set (match_dup 0)
+        (any_shift:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode) {
+    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+    emit_insn (gen_extendsidi2 (tmp, operands[2]));
+    operands[2] = tmp;
+  }
+})
+
+(define_insn "*bmi2_<shift_insn><mode>3"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(any_shift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			 (match_operand:SWI48 2 "register_operand" "r")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "<shift>x\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "<MODE>")])
+
 ;; Avoid useless masking of count operand.
-(define_insn_and_split "*<shiftrt_insn><mode>3_mask"
+(define_insn_and_split "*<shift_insn><mode>3_mask"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
 	(any_shiftrt:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
@@ -9621,7 +9687,7 @@
   [(set_attr "type" "ishift")
    (set_attr "mode" "<MODE>")])
 
-(define_insn_and_split "*<shiftrt_insn><mode>3_doubleword"
+(define_insn_and_split "*<shift_insn><mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=r")
 	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "<S>c")))
@@ -9630,7 +9696,7 @@
   "#"
   "(optimize && flag_peephole2) ? epilogue_completed : reload_completed"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
+  "ix86_split_<shift_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
   [(set_attr "type" "multi")])
 
 ;; By default we don't ask for a scratch register, because when DWImode
@@ -9647,7 +9713,7 @@
    (match_dup 3)]
   "TARGET_CMOVE"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, operands[3], <DWI>mode); DONE;")
+  "ix86_split_<shift_insn> (operands, operands[3], <DWI>mode); DONE;")
 
 (define_insn "x86_64_shrd"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
@@ -9763,7 +9829,7 @@
   DONE;
 })
 
-(define_insn "*<shiftrt_insn><mode>3_1"
+(define_insn "*<shift_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "c<S>")))
@@ -9772,9 +9838,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9786,7 +9852,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_1_zext"
+(define_insn "*<shift_insn>si3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9796,9 +9862,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9810,7 +9876,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn>qi3_1_slp"
+(define_insn "*<shift_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
 	(any_shiftrt:QI (match_dup 0)
 			(match_operand:QI 1 "nonmemory_operand" "cI")))
@@ -9822,9 +9888,9 @@
 {
   if (operands[1] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{b}\t%0";
+    return "<shift>{b}\t%0";
   else
-    return "<shiftrt>{b}\t{%1, %0|%0, %1}";
+    return "<shift>{b}\t{%1, %0|%0, %1}";
 }
   [(set_attr "type" "ishift1")
    (set (attr "length_immediate")
@@ -9839,7 +9905,7 @@
 ;; This pattern can't accept a variable shift count, since shifts by
 ;; zero don't affect the flags.  We assume that shifts by constant
 ;; zero are optimized away.
-(define_insn "*<shiftrt_insn><mode>3_cmp"
+(define_insn "*<shift_insn><mode>3_cmp"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9857,9 +9923,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9871,7 +9937,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_cmp_zext"
+(define_insn "*<shift_insn>si3_cmp_zext"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9889,9 +9955,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9903,7 +9969,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn><mode>3_cconly"
+(define_insn "*<shift_insn><mode>3_cconly"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9919,9 +9985,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -10060,6 +10126,15 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "c<S>")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (ROTATERT, <MODE>mode, operands)"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotate")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<rotate_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
@@ -12346,6 +12421,42 @@
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
    (set_attr "mode" "HI")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_BZHI))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2"
+  "bzhi\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pdep_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PDEP))]
+  "TARGET_BMI2"
+  "pdep\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pext_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PEXT))]
+  "TARGET_BMI2"
+  "pext\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 \f
 ;; Thread-local storage patterns for ELF.
 ;;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 54d7af1..8e4d51b 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -493,6 +493,10 @@ mbmi
 Target Report Mask(ISA_BMI) Var(ix86_isa_flags) Save
 Support BMI built-in functions and code generation
 
+mbmi2
+Target Report Mask(ISA_BMI2) Var(ix86_isa_flags) Save
+Support BMI2 built-in functions and code generation
+
 mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h
index 88456f9..e01ecd2 100644
--- a/gcc/config/i386/x86intrin.h
+++ b/gcc/config/i386/x86intrin.h
@@ -81,6 +81,10 @@
 #include <bmiintrin.h>
 #endif
 
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
 #ifdef __TBM__
 #include <tbmintrin.h>
 #endif
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 786c18d..1900276 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -9693,6 +9693,17 @@ unsigned int __builtin_ia32_bextr_u32(unsigned int, unsigned int);
 unsigned long long __builtin_ia32_bextr_u64 (unsigned long long, unsigned long long);
 @end smallexample
 
+The following built-in functions are available when @option{-mbmi2} is used.
+All of them generate the machine instruction that is part of the name.
+@smallexample
+unsigned int _bzhi_u32 (unsigned int, unsigned int)
+unsigned int _pdep_u32 (unsigned int, unsigned int)
+unsigned int _pext_u32 (unsigned int, unsigned int)
+unsigned long long _bzhi_u64 (unsigned long long, unsigned long long)
+unsigned long long _pdep_u64 (unsigned long long, unsigned long long)
+unsigned long long _pext_u64 (unsigned long long, unsigned long long)
+@end smallexample
+
 The following built-in functions are available when @option{-mlzcnt} is used.
 All of them generate the machine instruction that is part of the name.
 @smallexample
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fdc3297..acf30e3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -607,7 +607,7 @@ Objective-C and Objective-C++ Dialects}.
 -mmmx  -msse  -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
 -mavx2 -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfma @gol
 -msse4a -m3dnow -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop -mlzcnt @gol
--mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
+-mbmi2 -mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double  -mregparm=@var{num}  -msseregparm @gol
@@ -12697,7 +12697,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @itemx -mabm
 @itemx -mno-abm
 @itemx -mbmi
+@itemx -mbmi2
 @itemx -mno-bmi
+@itemx -mno-bmi2
 @itemx -mlzcnt
 @itemx -mno-lzcnt
 @itemx -mtbm
@@ -12709,8 +12711,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @opindex m3dnow
 @opindex mno-3dnow
 These switches enable or disable the use of instructions in the MMX, SSE,
-SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C, FMA,
-SSE4A, FMA4, XOP, LWP, ABM, BMI, LZCNT or 3DNow!@: extended instruction sets.
+SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C,
+FMA, SSE4A, FMA4, XOP, LWP, ABM, BMI, BMI2, LZCNT or 3DNow!
+@: extended instruction sets.
 These extensions are also available as built-in functions: see
 @ref{X86 Built-in Functions}, for details of the functions enabled and
 disabled by these switches.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index ed183c7..5f2eaf9 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 626f972..76d4d19 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
new file mode 100644
index 0000000..aae2353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<32-l; ++i)
+    res &= ~(1 << (31 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0f;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u32 (src, i * 2);
+    res = _bzhi_u32 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
new file mode 100644
index 0000000..79e47a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+#include "bmi2-bzhi32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
new file mode 100644
index 0000000..8db29db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<64-l; ++i)
+    res &= ~(1LL << (63 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0ff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u64 (src, i * 2);
+    res = _bzhi_u64 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
new file mode 100644
index 0000000..dc4a94c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-bzhi64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-check.h b/gcc/testsuite/gcc.target/i386/bmi2-check.h
new file mode 100644
index 0000000..5ffce44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-check.h
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cpuid.h"
+
+static void bmi2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+  bmi2_test ();
+}
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  __cpuid_count (7, 0,  eax, ebx, ecx, edx);
+
+  /* Run BMI2 test only if host has BMI2 support.  */
+  if (ebx & bit_BMI2)
+    {
+      do_test ();
+#ifdef DEBUG
+      printf ("PASSED\n");
+#endif
+    }
+#ifdef DEBUG
+  else
+    printf ("SKIPPED\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
new file mode 100644
index 0000000..e1d49de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { bmi2 && { ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned a, unsigned b)
+{
+  unsigned long long res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i)
+    res += (unsigned long long)(dummy? 0 : a);
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned a = 0xce7ace0;
+  unsigned b = 0xfacefff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u32 (a, b);
+    res = (unsigned long long)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
new file mode 100644
index 0000000..cf3bb08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulsidi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
new file mode 100644
index 0000000..ded3dc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long a, unsigned long long b)
+{
+  unsigned __int128 res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i) {
+    /* Block loop opts  */
+    res += (unsigned __int128)(dummy? 0 : a);
+  }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long a = 0xce7ace0ce7ace0;
+  unsigned long long b = 0xface;
+  unsigned __int128 res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u64 (a, b);
+    res = (unsigned __int128)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
new file mode 100644
index 0000000..592d713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulditi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
new file mode 100644
index 0000000..e44a968
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << k)) >> k) << i;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u32 (src, i*3);
+    res = _pdep_u32 (src, i*3);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
new file mode 100644
index 0000000..87888fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
new file mode 100644
index 0000000..c0074fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  unsigned long long i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << k)) >> k) << i;
+      ++k;
+    }
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u64 (src, ~(i * 3));
+    res = _pdep_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
new file mode 100644
index 0000000..8163c40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
new file mode 100644
index 0000000..f21029f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u32 (src, ~(i * 3));
+    res = _pext_u32 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
new file mode 100644
index 0000000..c4a6dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
new file mode 100644
index 0000000..bad0584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  int i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u64 (src, ~(i * 3));
+    res = _pext_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
new file mode 100644
index 0000000..aaf06c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
new file mode 100644
index 0000000..84618e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_rorx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 31);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x0e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_rorx_u32 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (32 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
new file mode 100644
index 0000000..bb3b28d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxsi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
new file mode 100644
index 0000000..7dc722a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_rorx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 63);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x1e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_rorx_u64 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (64 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
new file mode 100644
index 0000000..2a7a7a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxdi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
new file mode 100644
index 0000000..2bbf016
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_sarx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_sarx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
new file mode 100644
index 0000000..f10d60b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
new file mode 100644
index 0000000..0bb13c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+long long
+calc_sarx_u64 (long long a, int l)
+{
+  long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  long long src = 0xfce7ace0ce7ace0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_sarx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
new file mode 100644
index 0000000..bcf0fd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
new file mode 100644
index 0000000..3f35047
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_shlx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res <<= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shlx_u32 (src, i + 1);
+    res = src << (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
new file mode 100644
index 0000000..215e5d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shlx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashlsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
new file mode 100644
index 0000000..17f0c67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_shrx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shrx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
new file mode 100644
index 0000000..24c53d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
new file mode 100644
index 0000000..022baa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_shrx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_shrx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
new file mode 100644
index 0000000..7830439
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp
index 167b79b..cff8a9a 100644
--- a/gcc/testsuite/gcc.target/i386/i386.exp
+++ b/gcc/testsuite/gcc.target/i386/i386.exp
@@ -206,6 +206,17 @@ proc check_effective_target_bmi { } {
     } "-mbmi" ]
 }
 
+# Return 1 if bmi2 instructions can be compiled.
+proc check_effective_target_bmi2 { } {
+    return [check_no_compiler_messages bmi2 object {
+	unsigned int
+	_bzhi_u32 (unsigned int __X, unsigned int __Y)
+	{
+	    return __builtin_ia32_bzhi_si (__X, __Y);
+	}
+    } "-mbmi2" ]
+}
+
 # If the linker used understands -M <mapfile>, pass it to clear hardware
 # capabilities set by the Sun assembler.
 set clearcap_ldflags "-Wl,-M,$srcdir/$subdir/clearcap.map"

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 14:38         ` H.J. Lu
@ 2011-08-19 14:38           ` Kirill Yukhin
  2011-08-19 14:47             ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Kirill Yukhin @ 2011-08-19 14:38 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches List, Uros Bizjak

[-- Attachment #1: Type: text/plain, Size: 1230 bytes --]

Thanks, fixed.

Updated patch is attached.

K

On Fri, Aug 19, 2011 at 6:04 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> No need for () in "(mode == SImode)":
>
> +         && !optimize_function_for_size_p (cfun)
> +         && ((mode == SImode) || (mode == DImode && TARGET_64BIT))
>
> Wrong placement of '{':
>
> +  if (can_create_pseudo_p () && <MODE>mode != SImode) {
> +    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
> +    emit_insn (gen_extendsidi2 (tmp, operands[2]));
> +    operands[2] = tmp;
> +  }
>
>
>
> On Fri, Aug 19, 2011 at 6:53 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>> Thanks!
>> Fixed, updated patch is attached.
>>
>> Is it ok?
>>
>> Thanks, K
>>
>> On Fri, Aug 19, 2011 at 5:22 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>>> On Fri, Aug 19, 2011 at 05:18:19PM +0400, Kirill Yukhin wrote:
>>>> Thanks, it is fixed.
>>>> Update patch is attached.
>>>
>>> +         /* We generatin RORX instruction, freedom of register +
>>> +            flags not affected  */
>>>
>>> comment doesn't look to be correct english (missing verb, missing g at
>>> the end of generating, missing dot at the end of sentence).
>>>
>>>        Jakub
>>>
>>
>
>
>
> --
> H.J.
>

[-- Attachment #2: bmi2-4.gcc.patch --]
[-- Type: application/octet-stream, Size: 52595 bytes --]

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index b201835..99643d6 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
 
 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_SET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_SET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16
@@ -137,6 +138,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
 #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
 #define OPTION_MASK_ISA_BMI_UNSET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_UNSET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_UNSET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16
@@ -395,6 +397,19 @@ ix86_handle_option (struct gcc_options *opts,
 	}
       return true;
 
+    case OPT_mbmi2:
+      if (value)
+	{
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2_SET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_SET;
+	}
+      else
+	{
+	  opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_BMI2_UNSET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_UNSET;
+	}
+      return true;
+
     case OPT_mtbm:
       if (value)
 	{
diff --git a/gcc/config.gcc b/gcc/config.gcc
index b92ce3d..30cce99 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -352,7 +352,7 @@ i[34567]86-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -364,7 +364,7 @@ x86_64-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h
new file mode 100644
index 0000000..f3ffa52
--- /dev/null
+++ b/gcc/config/i386/bmi2intrin.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di (__X, __Y);
+}
+
+#endif /* __x86_64__  */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index d53743f..5da8fd2 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -67,6 +67,7 @@
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
 #define bit_AVX2	(1 << 5)
+#define bit_BMI2	(1 << 8)
 
 #if defined(__i386__) && defined(__PIC__)
 /* %ebx may be the PIC register.  */
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index b7a1f52..8107ece 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -396,7 +396,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0, has_avx2 = 0;
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
-  unsigned int has_bmi = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
 
   bool arch;
 
@@ -475,6 +475,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 
       has_bmi = ebx & bit_BMI;
       has_avx2 = ebx & bit_AVX2;
+      has_bmi2 = ebx & bit_BMI2;
     }
 
   if (!arch)
@@ -715,6 +716,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *fma4 = has_fma4 ? " -mfma4" : " -mno-fma4";
       const char *xop = has_xop ? " -mxop" : " -mno-xop";
       const char *bmi = has_bmi ? " -mbmi" : " -mno-bmi";
+      const char *bmi2 = has_bmi2 ? " -mbmi2" : " -mno-bmi2";
       const char *tbm = has_tbm ? " -mtbm" : " -mno-tbm";
       const char *avx = has_avx ? " -mavx" : " -mno-avx";
       const char *avx2 = has_avx2 ? " -mavx2" : " -mno-avx2";
@@ -723,8 +725,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
-			popcnt, abm, lwp, fma, fma4, xop, bmi, tbm,
-			avx2, avx, sse4_2, sse4_1, lzcnt, NULL);
+			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 5c1dfe6..d4b0b08 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -273,6 +273,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__ABM__");
   if (isa_flag & OPTION_MASK_ISA_BMI)
     def_or_undef (parse_in, "__BMI__");
+  if (isa_flag & OPTION_MASK_ISA_BMI2)
+    def_or_undef (parse_in, "__BMI2__");
   if (isa_flag & OPTION_MASK_ISA_LZCNT)
     def_or_undef (parse_in, "__LZCNT__");
   if (isa_flag & OPTION_MASK_ISA_TBM)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 53c5944..bff1a05 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2664,6 +2664,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mmmx",		OPTION_MASK_ISA_MMX },
     { "-mabm",		OPTION_MASK_ISA_ABM },
     { "-mbmi",		OPTION_MASK_ISA_BMI },
+    { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
@@ -2921,6 +2922,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
 #define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
+#define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -2978,8 +2980,8 @@ ix86_option_override_internal (bool main_args_p)
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
-	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_LZCNT | PTA_FMA
-	| PTA_MOVBE},
+	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
+        | PTA_FMA | PTA_MOVBE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3300,6 +3302,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_TBM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_TBM;
+	if (processor_alias_table[i].flags & PTA_BMI2
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
+	  ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
 	if (processor_alias_table[i].flags & PTA_CX16
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
@@ -4053,6 +4058,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
     IX86_ATTR_ISA ("abm",	OPT_mabm),
     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
+    IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
     IX86_ATTR_ISA ("aes",	OPT_maes),
@@ -13285,6 +13291,7 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'N', print the half mode high register.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */
 
@@ -13294,6 +13301,15 @@ print_reg (rtx x, int code, FILE *file)
   const char *reg;
   bool duplicated = code == 'd' && TARGET_AVX;
 
+  if (code == 'N')
+    {
+      enum machine_mode mode = GET_MODE (x);
+      enum machine_mode half_mode = mode == TImode ? DImode : SImode;
+      x = simplify_gen_subreg (half_mode, x, mode,
+			       GET_MODE_SIZE (half_mode));
+      code = 0;
+    }
+
   gcc_assert (x == pc_rtx
 	      || (REGNO (x) != ARG_POINTER_REGNUM
 		  && REGNO (x) != FRAME_POINTER_REGNUM
@@ -13472,6 +13488,7 @@ get_some_local_dynamic_name (void)
    t --  likewise, print the V8SFmode name of the register.
    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
    y -- print "st(0)" instead of "st" as a register.
+   N -- print the half mode high register.
    d -- print duplicated register operand for AVX instruction.
    D -- print condition for SSE cmp instruction.
    P -- if PIC, print an @PLT suffix.
@@ -13678,6 +13695,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
 	case 'h':
 	case 't':
 	case 'y':
+	case 'N':
 	case 'x':
 	case 'X':
 	case 'P':
@@ -15745,8 +15763,23 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
     }
   else
     {
-      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
-      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+      rtx insn;
+      if (code == ROTATERT
+	  && TARGET_BMI2
+	  && !optimize_function_for_size_p (cfun)
+	  && (mode == SImode || (mode == DImode && TARGET_64BIT))
+	  && CONST_INT_P (src2) )
+	{
+	  /* We're generating RORX instruction: freedom of register +
+	     flags not affected.  */
+	  insn = op;
+	}
+      else
+	{
+	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+	  insn = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob));
+	}
+      emit_insn (insn);
     }
 
   /* Fix up the destination if needed.  */
@@ -24072,6 +24105,13 @@ enum ix86_builtins
   IX86_BUILTIN_BEXTRI32,
   IX86_BUILTIN_BEXTRI64,
 
+  /* BMI2 instructions. */
+  IX86_BUILTIN_BZHI32,
+  IX86_BUILTIN_BZHI64,
+  IX86_BUILTIN_PDEP32,
+  IX86_BUILTIN_PDEP64,
+  IX86_BUILTIN_PEXT32,
+  IX86_BUILTIN_PEXT64,
 
   /* FSGSBASE instructions.  */
   IX86_BUILTIN_RDFSBASE32,
@@ -25046,6 +25086,14 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
+
+  /* BMI2 */
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
 };
 
 /* FMA4 and XOP.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f8a35ba..47442a0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -62,6 +62,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_ROUND	OPTION_ISA_ROUND
 #define TARGET_ABM	OPTION_ISA_ABM
 #define TARGET_BMI	OPTION_ISA_BMI
+#define TARGET_BMI2	OPTION_ISA_BMI2
 #define TARGET_LZCNT	OPTION_ISA_LZCNT
 #define TARGET_TBM	OPTION_ISA_TBM
 #define TARGET_POPCNT	OPTION_ISA_POPCNT
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7ae397..81479f6 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -236,6 +236,11 @@
 
   ;; For RDRAND support
   UNSPEC_RDRAND
+
+  ;; For BMI2 support
+  UNSPEC_BZHI
+  UNSPEC_PDEP
+  UNSPEC_PEXT
 ])
 
 (define_c_enum "unspecv" [
@@ -751,14 +756,17 @@
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
+;; Mapping of shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
 ;; Base name for define_insn
-(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")])
+(define_code_attr shift_insn [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
 
 ;; Base name for insn mnemonic.
-(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "shl") (lshiftrt "shr") (ashiftrt "sar")])
 
 ;; Mapping of rotate operators
 (define_code_iterator any_rotate [rotate rotatert])
@@ -777,6 +785,8 @@
 
 ;; Used in signed and unsigned widening multiplications.
 (define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr any_extend [(sign_extend "SIGN_EXTEND")
+			      (zero_extend "ZERO_EXTEND")])
 
 ;; Various insn prefixes for signed and unsigned operations.
 (define_code_attr u [(sign_extend "") (zero_extend "u")
@@ -6837,7 +6847,17 @@
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
 		     (any_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
-	      (clobber (reg:CC FLAGS_REG))])])
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI2 && <any_extend> == ZERO_EXTEND)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})
 
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand" "")
@@ -6849,6 +6869,24 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
+(define_insn "bmi2_umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "register_operand" "d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
+  "TARGET_BMI2"
+{
+  if (<MODE>mode == DImode)
+    return "mulx\t{%2, %q0, %N0|%N0, %q0, %2}";
+  else
+    return "mulx\t{%2, %k0, %N0|%N0, %k0, %2}";
+}
+  [(set_attr "type" "imul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<u>mul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
@@ -9587,15 +9625,44 @@
 
 ;; See comment above `ashl<mode>3' about how this works.
 
-(define_expand "<shiftrt_insn><mode>3"
+(define_expand "<shift_insn><mode>3"
   [(set (match_operand:SDWIM 0 "<shift_operand>" "")
 	(any_shiftrt:SDWIM (match_operand:SDWIM 1 "<shift_operand>" "")
 			   (match_operand:QI 2 "nonmemory_operand" "")))]
   ""
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
+;; Update pattern if BMI2 is available
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "")
+	  (subreg:QI
+	      (match_operand:SI 2 "register_operand" "") 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && !reload_completed"
+  [(set (match_dup 0)
+        (any_shift:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode)
+  {
+    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+    emit_insn (gen_extendsidi2 (tmp, operands[2]));
+    operands[2] = tmp;
+  }
+})
+
+(define_insn "*bmi2_<shift_insn><mode>3"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(any_shift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			 (match_operand:SWI48 2 "register_operand" "r")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "<shift>x\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "<MODE>")])
+
 ;; Avoid useless masking of count operand.
-(define_insn_and_split "*<shiftrt_insn><mode>3_mask"
+(define_insn_and_split "*<shift_insn><mode>3_mask"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
 	(any_shiftrt:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
@@ -9621,7 +9688,7 @@
   [(set_attr "type" "ishift")
    (set_attr "mode" "<MODE>")])
 
-(define_insn_and_split "*<shiftrt_insn><mode>3_doubleword"
+(define_insn_and_split "*<shift_insn><mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=r")
 	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "<S>c")))
@@ -9630,7 +9697,7 @@
   "#"
   "(optimize && flag_peephole2) ? epilogue_completed : reload_completed"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
+  "ix86_split_<shift_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
   [(set_attr "type" "multi")])
 
 ;; By default we don't ask for a scratch register, because when DWImode
@@ -9647,7 +9714,7 @@
    (match_dup 3)]
   "TARGET_CMOVE"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, operands[3], <DWI>mode); DONE;")
+  "ix86_split_<shift_insn> (operands, operands[3], <DWI>mode); DONE;")
 
 (define_insn "x86_64_shrd"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
@@ -9763,7 +9830,7 @@
   DONE;
 })
 
-(define_insn "*<shiftrt_insn><mode>3_1"
+(define_insn "*<shift_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "c<S>")))
@@ -9772,9 +9839,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9786,7 +9853,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_1_zext"
+(define_insn "*<shift_insn>si3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9796,9 +9863,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9810,7 +9877,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn>qi3_1_slp"
+(define_insn "*<shift_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
 	(any_shiftrt:QI (match_dup 0)
 			(match_operand:QI 1 "nonmemory_operand" "cI")))
@@ -9822,9 +9889,9 @@
 {
   if (operands[1] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{b}\t%0";
+    return "<shift>{b}\t%0";
   else
-    return "<shiftrt>{b}\t{%1, %0|%0, %1}";
+    return "<shift>{b}\t{%1, %0|%0, %1}";
 }
   [(set_attr "type" "ishift1")
    (set (attr "length_immediate")
@@ -9839,7 +9906,7 @@
 ;; This pattern can't accept a variable shift count, since shifts by
 ;; zero don't affect the flags.  We assume that shifts by constant
 ;; zero are optimized away.
-(define_insn "*<shiftrt_insn><mode>3_cmp"
+(define_insn "*<shift_insn><mode>3_cmp"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9857,9 +9924,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9871,7 +9938,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_cmp_zext"
+(define_insn "*<shift_insn>si3_cmp_zext"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9889,9 +9956,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9903,7 +9970,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn><mode>3_cconly"
+(define_insn "*<shift_insn><mode>3_cconly"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9919,9 +9986,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -10060,6 +10127,15 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "c<S>")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (ROTATERT, <MODE>mode, operands)"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotate")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<rotate_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
@@ -12346,6 +12422,42 @@
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
    (set_attr "mode" "HI")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_BZHI))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2"
+  "bzhi\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pdep_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PDEP))]
+  "TARGET_BMI2"
+  "pdep\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pext_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PEXT))]
+  "TARGET_BMI2"
+  "pext\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 \f
 ;; Thread-local storage patterns for ELF.
 ;;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 54d7af1..8e4d51b 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -493,6 +493,10 @@ mbmi
 Target Report Mask(ISA_BMI) Var(ix86_isa_flags) Save
 Support BMI built-in functions and code generation
 
+mbmi2
+Target Report Mask(ISA_BMI2) Var(ix86_isa_flags) Save
+Support BMI2 built-in functions and code generation
+
 mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h
index 88456f9..e01ecd2 100644
--- a/gcc/config/i386/x86intrin.h
+++ b/gcc/config/i386/x86intrin.h
@@ -81,6 +81,10 @@
 #include <bmiintrin.h>
 #endif
 
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
 #ifdef __TBM__
 #include <tbmintrin.h>
 #endif
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 786c18d..1900276 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -9693,6 +9693,17 @@ unsigned int __builtin_ia32_bextr_u32(unsigned int, unsigned int);
 unsigned long long __builtin_ia32_bextr_u64 (unsigned long long, unsigned long long);
 @end smallexample
 
+The following built-in functions are available when @option{-mbmi2} is used.
+All of them generate the machine instruction that is part of the name.
+@smallexample
+unsigned int _bzhi_u32 (unsigned int, unsigned int)
+unsigned int _pdep_u32 (unsigned int, unsigned int)
+unsigned int _pext_u32 (unsigned int, unsigned int)
+unsigned long long _bzhi_u64 (unsigned long long, unsigned long long)
+unsigned long long _pdep_u64 (unsigned long long, unsigned long long)
+unsigned long long _pext_u64 (unsigned long long, unsigned long long)
+@end smallexample
+
 The following built-in functions are available when @option{-mlzcnt} is used.
 All of them generate the machine instruction that is part of the name.
 @smallexample
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fdc3297..acf30e3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -607,7 +607,7 @@ Objective-C and Objective-C++ Dialects}.
 -mmmx  -msse  -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
 -mavx2 -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfma @gol
 -msse4a -m3dnow -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop -mlzcnt @gol
--mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
+-mbmi2 -mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double  -mregparm=@var{num}  -msseregparm @gol
@@ -12697,7 +12697,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @itemx -mabm
 @itemx -mno-abm
 @itemx -mbmi
+@itemx -mbmi2
 @itemx -mno-bmi
+@itemx -mno-bmi2
 @itemx -mlzcnt
 @itemx -mno-lzcnt
 @itemx -mtbm
@@ -12709,8 +12711,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @opindex m3dnow
 @opindex mno-3dnow
 These switches enable or disable the use of instructions in the MMX, SSE,
-SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C, FMA,
-SSE4A, FMA4, XOP, LWP, ABM, BMI, LZCNT or 3DNow!@: extended instruction sets.
+SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C,
+FMA, SSE4A, FMA4, XOP, LWP, ABM, BMI, BMI2, LZCNT or 3DNow!
+@: extended instruction sets.
 These extensions are also available as built-in functions: see
 @ref{X86 Built-in Functions}, for details of the functions enabled and
 disabled by these switches.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index ed183c7..5f2eaf9 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 626f972..76d4d19 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
new file mode 100644
index 0000000..aae2353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<32-l; ++i)
+    res &= ~(1 << (31 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0f;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u32 (src, i * 2);
+    res = _bzhi_u32 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
new file mode 100644
index 0000000..79e47a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+#include "bmi2-bzhi32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
new file mode 100644
index 0000000..8db29db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<64-l; ++i)
+    res &= ~(1LL << (63 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0ff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u64 (src, i * 2);
+    res = _bzhi_u64 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
new file mode 100644
index 0000000..dc4a94c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-bzhi64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-check.h b/gcc/testsuite/gcc.target/i386/bmi2-check.h
new file mode 100644
index 0000000..5ffce44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-check.h
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cpuid.h"
+
+static void bmi2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+  bmi2_test ();
+}
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  __cpuid_count (7, 0,  eax, ebx, ecx, edx);
+
+  /* Run BMI2 test only if host has BMI2 support.  */
+  if (ebx & bit_BMI2)
+    {
+      do_test ();
+#ifdef DEBUG
+      printf ("PASSED\n");
+#endif
+    }
+#ifdef DEBUG
+  else
+    printf ("SKIPPED\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
new file mode 100644
index 0000000..e1d49de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { bmi2 && { ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned a, unsigned b)
+{
+  unsigned long long res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i)
+    res += (unsigned long long)(dummy? 0 : a);
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned a = 0xce7ace0;
+  unsigned b = 0xfacefff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u32 (a, b);
+    res = (unsigned long long)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
new file mode 100644
index 0000000..cf3bb08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulsidi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
new file mode 100644
index 0000000..ded3dc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long a, unsigned long long b)
+{
+  unsigned __int128 res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i) {
+    /* Block loop opts  */
+    res += (unsigned __int128)(dummy? 0 : a);
+  }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long a = 0xce7ace0ce7ace0;
+  unsigned long long b = 0xface;
+  unsigned __int128 res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u64 (a, b);
+    res = (unsigned __int128)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
new file mode 100644
index 0000000..592d713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulditi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
new file mode 100644
index 0000000..e44a968
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << k)) >> k) << i;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u32 (src, i*3);
+    res = _pdep_u32 (src, i*3);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
new file mode 100644
index 0000000..87888fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
new file mode 100644
index 0000000..c0074fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  unsigned long long i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << k)) >> k) << i;
+      ++k;
+    }
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u64 (src, ~(i * 3));
+    res = _pdep_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
new file mode 100644
index 0000000..8163c40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
new file mode 100644
index 0000000..f21029f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u32 (src, ~(i * 3));
+    res = _pext_u32 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
new file mode 100644
index 0000000..c4a6dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
new file mode 100644
index 0000000..bad0584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  int i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u64 (src, ~(i * 3));
+    res = _pext_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
new file mode 100644
index 0000000..aaf06c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
new file mode 100644
index 0000000..84618e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_rorx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 31);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x0e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_rorx_u32 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (32 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
new file mode 100644
index 0000000..bb3b28d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxsi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
new file mode 100644
index 0000000..7dc722a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_rorx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 63);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x1e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_rorx_u64 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (64 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
new file mode 100644
index 0000000..2a7a7a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxdi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
new file mode 100644
index 0000000..2bbf016
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_sarx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_sarx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
new file mode 100644
index 0000000..f10d60b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
new file mode 100644
index 0000000..0bb13c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+long long
+calc_sarx_u64 (long long a, int l)
+{
+  long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  long long src = 0xfce7ace0ce7ace0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_sarx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
new file mode 100644
index 0000000..bcf0fd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
new file mode 100644
index 0000000..3f35047
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_shlx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res <<= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shlx_u32 (src, i + 1);
+    res = src << (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
new file mode 100644
index 0000000..215e5d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shlx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashlsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
new file mode 100644
index 0000000..17f0c67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_shrx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shrx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
new file mode 100644
index 0000000..24c53d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
new file mode 100644
index 0000000..022baa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_shrx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_shrx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
new file mode 100644
index 0000000..7830439
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp
index 167b79b..cff8a9a 100644
--- a/gcc/testsuite/gcc.target/i386/i386.exp
+++ b/gcc/testsuite/gcc.target/i386/i386.exp
@@ -206,6 +206,17 @@ proc check_effective_target_bmi { } {
     } "-mbmi" ]
 }
 
+# Return 1 if bmi2 instructions can be compiled.
+proc check_effective_target_bmi2 { } {
+    return [check_no_compiler_messages bmi2 object {
+	unsigned int
+	_bzhi_u32 (unsigned int __X, unsigned int __Y)
+	{
+	    return __builtin_ia32_bzhi_si (__X, __Y);
+	}
+    } "-mbmi2" ]
+}
+
 # If the linker used understands -M <mapfile>, pass it to clear hardware
 # capabilities set by the Sun assembler.
 set clearcap_ldflags "-Wl,-M,$srcdir/$subdir/clearcap.map"

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 14:34       ` Kirill Yukhin
@ 2011-08-19 14:38         ` H.J. Lu
  2011-08-19 14:38           ` Kirill Yukhin
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2011-08-19 14:38 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: Jakub Jelinek, gcc-patches List, Uros Bizjak

No need for () in "(mode == SImode)":

+	  && !optimize_function_for_size_p (cfun)
+	  && ((mode == SImode) || (mode == DImode && TARGET_64BIT))

Wrong placement of '{':

+  if (can_create_pseudo_p () && <MODE>mode != SImode) {
+    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+    emit_insn (gen_extendsidi2 (tmp, operands[2]));
+    operands[2] = tmp;
+  }



On Fri, Aug 19, 2011 at 6:53 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
> Thanks!
> Fixed, updated patch is attached.
>
> Is it ok?
>
> Thanks, K
>
> On Fri, Aug 19, 2011 at 5:22 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>> On Fri, Aug 19, 2011 at 05:18:19PM +0400, Kirill Yukhin wrote:
>>> Thanks, it is fixed.
>>> Update patch is attached.
>>
>> +         /* We generatin RORX instruction, freedom of register +
>> +            flags not affected  */
>>
>> comment doesn't look to be correct english (missing verb, missing g at
>> the end of generating, missing dot at the end of sentence).
>>
>>        Jakub
>>
>



-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 14:38           ` Kirill Yukhin
@ 2011-08-19 14:47             ` H.J. Lu
  2011-08-19 15:22               ` Kirill Yukhin
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2011-08-19 14:47 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: Jakub Jelinek, gcc-patches List, Uros Bizjak

It is hard to tell.  Can you double check indentation on

+  if (can_create_pseudo_p () && <MODE>mode != SImode)
+  {
+    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+    emit_insn (gen_extendsidi2 (tmp, operands[2]));
+    operands[2] = tmp;
+  }


On Fri, Aug 19, 2011 at 7:13 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
> Thanks, fixed.
>
> Updated patch is attached.
>
> K
>
> On Fri, Aug 19, 2011 at 6:04 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> No need for () in "(mode == SImode)":
>>
>> +         && !optimize_function_for_size_p (cfun)
>> +         && ((mode == SImode) || (mode == DImode && TARGET_64BIT))
>>
>> Wrong placement of '{':
>>
>> +  if (can_create_pseudo_p () && <MODE>mode != SImode) {
>> +    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
>> +    emit_insn (gen_extendsidi2 (tmp, operands[2]));
>> +    operands[2] = tmp;
>> +  }
>>
>>
>>
>> On Fri, Aug 19, 2011 at 6:53 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>>> Thanks!
>>> Fixed, updated patch is attached.
>>>
>>> Is it ok?
>>>
>>> Thanks, K
>>>
>>> On Fri, Aug 19, 2011 at 5:22 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>>>> On Fri, Aug 19, 2011 at 05:18:19PM +0400, Kirill Yukhin wrote:
>>>>> Thanks, it is fixed.
>>>>> Update patch is attached.
>>>>
>>>> +         /* We generatin RORX instruction, freedom of register +
>>>> +            flags not affected  */
>>>>
>>>> comment doesn't look to be correct english (missing verb, missing g at
>>>> the end of generating, missing dot at the end of sentence).
>>>>
>>>>        Jakub
>>>>
>>>
>>
>>
>>
>> --
>> H.J.
>>
>



-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 14:47             ` H.J. Lu
@ 2011-08-19 15:22               ` Kirill Yukhin
  2011-08-19 15:36                 ` Kirill Yukhin
  2011-08-20 20:05                 ` Uros Bizjak
  0 siblings, 2 replies; 27+ messages in thread
From: Kirill Yukhin @ 2011-08-19 15:22 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches List, Uros Bizjak

[-- Attachment #1: Type: text/plain, Size: 1791 bytes --]

On Fri, Aug 19, 2011 at 6:31 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> It is hard to tell.  Can you double check indentation on
>
> +  if (can_create_pseudo_p () && <MODE>mode != SImode)
> +  {
> +    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
> +    emit_insn (gen_extendsidi2 (tmp, operands[2]));
> +    operands[2] = tmp;
> +  }
>
>
> On Fri, Aug 19, 2011 at 7:13 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>> Thanks, fixed.
>>
>> Updated patch is attached.
>>
>> K
>>
>> On Fri, Aug 19, 2011 at 6:04 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> No need for () in "(mode == SImode)":
>>>
>>> +         && !optimize_function_for_size_p (cfun)
>>> +         && ((mode == SImode) || (mode == DImode && TARGET_64BIT))
>>>
>>> Wrong placement of '{':
>>>
>>> +  if (can_create_pseudo_p () && <MODE>mode != SImode) {
>>> +    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
>>> +    emit_insn (gen_extendsidi2 (tmp, operands[2]));
>>> +    operands[2] = tmp;
>>> +  }
>>>
>>>
>>>
>>> On Fri, Aug 19, 2011 at 6:53 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>>>> Thanks!
>>>> Fixed, updated patch is attached.
>>>>
>>>> Is it ok?
>>>>
>>>> Thanks, K
>>>>
>>>> On Fri, Aug 19, 2011 at 5:22 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>>>>> On Fri, Aug 19, 2011 at 05:18:19PM +0400, Kirill Yukhin wrote:
>>>>>> Thanks, it is fixed.
>>>>>> Update patch is attached.
>>>>>
>>>>> +         /* We generatin RORX instruction, freedom of register +
>>>>> +            flags not affected  */
>>>>>
>>>>> comment doesn't look to be correct english (missing verb, missing g at
>>>>> the end of generating, missing dot at the end of sentence).
>>>>>
>>>>>        Jakub
>>>>>
>>>>
>>>
>>>
>>>
>>> --
>>> H.J.
>>>
>>
>
>
>
> --
> H.J.
>

[-- Attachment #2: bmi2-5.gcc.patch --]
[-- Type: application/octet-stream, Size: 52605 bytes --]

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index b201835..99643d6 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
 
 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_SET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_SET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16
@@ -137,6 +138,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
 #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
 #define OPTION_MASK_ISA_BMI_UNSET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_UNSET OPTION_MASK_ISA_BMI2
 #define OPTION_MASK_ISA_TBM_UNSET OPTION_MASK_ISA_TBM
 #define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT
 #define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16
@@ -395,6 +397,19 @@ ix86_handle_option (struct gcc_options *opts,
 	}
       return true;
 
+    case OPT_mbmi2:
+      if (value)
+	{
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2_SET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_SET;
+	}
+      else
+	{
+	  opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_BMI2_UNSET;
+	  opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI2_UNSET;
+	}
+      return true;
+
     case OPT_mtbm:
       if (value)
 	{
diff --git a/gcc/config.gcc b/gcc/config.gcc
index b92ce3d..30cce99 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -352,7 +352,7 @@ i[34567]86-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -364,7 +364,7 @@ x86_64-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       lzcntintrin.h bmiintrin.h tbmintrin.h"
+		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc/config/i386/bmi2intrin.h b/gcc/config/i386/bmi2intrin.h
new file mode 100644
index 0000000..f3ffa52
--- /dev/null
+++ b/gcc/config/i386/bmi2intrin.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2__
+# error "BMI2 instruction set not enabled"
+#endif /* __BMI2__ */
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di (__X, __Y);
+}
+
+#endif /* __x86_64__  */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index d53743f..5da8fd2 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -67,6 +67,7 @@
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
 #define bit_AVX2	(1 << 5)
+#define bit_BMI2	(1 << 8)
 
 #if defined(__i386__) && defined(__PIC__)
 /* %ebx may be the PIC register.  */
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index b7a1f52..8107ece 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -396,7 +396,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0, has_avx2 = 0;
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
-  unsigned int has_bmi = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
 
   bool arch;
 
@@ -475,6 +475,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 
       has_bmi = ebx & bit_BMI;
       has_avx2 = ebx & bit_AVX2;
+      has_bmi2 = ebx & bit_BMI2;
     }
 
   if (!arch)
@@ -715,6 +716,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *fma4 = has_fma4 ? " -mfma4" : " -mno-fma4";
       const char *xop = has_xop ? " -mxop" : " -mno-xop";
       const char *bmi = has_bmi ? " -mbmi" : " -mno-bmi";
+      const char *bmi2 = has_bmi2 ? " -mbmi2" : " -mno-bmi2";
       const char *tbm = has_tbm ? " -mtbm" : " -mno-tbm";
       const char *avx = has_avx ? " -mavx" : " -mno-avx";
       const char *avx2 = has_avx2 ? " -mavx2" : " -mno-avx2";
@@ -723,8 +725,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
-			popcnt, abm, lwp, fma, fma4, xop, bmi, tbm,
-			avx2, avx, sse4_2, sse4_1, lzcnt, NULL);
+			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 5c1dfe6..d4b0b08 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -273,6 +273,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__ABM__");
   if (isa_flag & OPTION_MASK_ISA_BMI)
     def_or_undef (parse_in, "__BMI__");
+  if (isa_flag & OPTION_MASK_ISA_BMI2)
+    def_or_undef (parse_in, "__BMI2__");
   if (isa_flag & OPTION_MASK_ISA_LZCNT)
     def_or_undef (parse_in, "__LZCNT__");
   if (isa_flag & OPTION_MASK_ISA_TBM)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 53c5944..bff1a05 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2664,6 +2664,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mmmx",		OPTION_MASK_ISA_MMX },
     { "-mabm",		OPTION_MASK_ISA_ABM },
     { "-mbmi",		OPTION_MASK_ISA_BMI },
+    { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
@@ -2921,6 +2922,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
 #define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
+#define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -2978,8 +2980,8 @@ ix86_option_override_internal (bool main_args_p)
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
-	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_LZCNT | PTA_FMA
-	| PTA_MOVBE},
+	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
+        | PTA_FMA | PTA_MOVBE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3300,6 +3302,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_TBM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_TBM;
+	if (processor_alias_table[i].flags & PTA_BMI2
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
+	  ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
 	if (processor_alias_table[i].flags & PTA_CX16
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
 	  ix86_isa_flags |= OPTION_MASK_ISA_CX16;
@@ -4053,6 +4058,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
     IX86_ATTR_ISA ("abm",	OPT_mabm),
     IX86_ATTR_ISA ("bmi",	OPT_mbmi),
+    IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
     IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
     IX86_ATTR_ISA ("tbm",	OPT_mtbm),
     IX86_ATTR_ISA ("aes",	OPT_maes),
@@ -13285,6 +13291,7 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'N', print the half mode high register.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */
 
@@ -13294,6 +13301,15 @@ print_reg (rtx x, int code, FILE *file)
   const char *reg;
   bool duplicated = code == 'd' && TARGET_AVX;
 
+  if (code == 'N')
+    {
+      enum machine_mode mode = GET_MODE (x);
+      enum machine_mode half_mode = mode == TImode ? DImode : SImode;
+      x = simplify_gen_subreg (half_mode, x, mode,
+			       GET_MODE_SIZE (half_mode));
+      code = 0;
+    }
+
   gcc_assert (x == pc_rtx
 	      || (REGNO (x) != ARG_POINTER_REGNUM
 		  && REGNO (x) != FRAME_POINTER_REGNUM
@@ -13472,6 +13488,7 @@ get_some_local_dynamic_name (void)
    t --  likewise, print the V8SFmode name of the register.
    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
    y -- print "st(0)" instead of "st" as a register.
+   N -- print the half mode high register.
    d -- print duplicated register operand for AVX instruction.
    D -- print condition for SSE cmp instruction.
    P -- if PIC, print an @PLT suffix.
@@ -13678,6 +13695,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
 	case 'h':
 	case 't':
 	case 'y':
+	case 'N':
 	case 'x':
 	case 'X':
 	case 'P':
@@ -15745,8 +15763,23 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
     }
   else
     {
-      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
-      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+      rtx insn;
+      if (code == ROTATERT
+	  && TARGET_BMI2
+	  && !optimize_function_for_size_p (cfun)
+	  && (mode == SImode || (mode == DImode && TARGET_64BIT))
+	  && CONST_INT_P (src2) )
+	{
+	  /* We're generating RORX instruction: freedom of register +
+	     flags not affected.  */
+	  insn = op;
+	}
+      else
+	{
+	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+	  insn = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob));
+	}
+      emit_insn (insn);
     }
 
   /* Fix up the destination if needed.  */
@@ -24072,6 +24105,13 @@ enum ix86_builtins
   IX86_BUILTIN_BEXTRI32,
   IX86_BUILTIN_BEXTRI64,
 
+  /* BMI2 instructions. */
+  IX86_BUILTIN_BZHI32,
+  IX86_BUILTIN_BZHI64,
+  IX86_BUILTIN_PDEP32,
+  IX86_BUILTIN_PDEP64,
+  IX86_BUILTIN_PEXT32,
+  IX86_BUILTIN_PEXT64,
 
   /* FSGSBASE instructions.  */
   IX86_BUILTIN_RDFSBASE32,
@@ -25046,6 +25086,14 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
   { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
+
+  /* BMI2 */
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
 };
 
 /* FMA4 and XOP.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f8a35ba..47442a0 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -62,6 +62,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_ROUND	OPTION_ISA_ROUND
 #define TARGET_ABM	OPTION_ISA_ABM
 #define TARGET_BMI	OPTION_ISA_BMI
+#define TARGET_BMI2	OPTION_ISA_BMI2
 #define TARGET_LZCNT	OPTION_ISA_LZCNT
 #define TARGET_TBM	OPTION_ISA_TBM
 #define TARGET_POPCNT	OPTION_ISA_POPCNT
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7ae397..05f7666 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -236,6 +236,11 @@
 
   ;; For RDRAND support
   UNSPEC_RDRAND
+
+  ;; For BMI2 support
+  UNSPEC_BZHI
+  UNSPEC_PDEP
+  UNSPEC_PEXT
 ])
 
 (define_c_enum "unspecv" [
@@ -751,14 +756,17 @@
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])
 
+;; Mapping of shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
 ;; Base name for define_insn
-(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")])
+(define_code_attr shift_insn [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
 
 ;; Base name for insn mnemonic.
-(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "shl") (lshiftrt "shr") (ashiftrt "sar")])
 
 ;; Mapping of rotate operators
 (define_code_iterator any_rotate [rotate rotatert])
@@ -777,6 +785,8 @@
 
 ;; Used in signed and unsigned widening multiplications.
 (define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr any_extend [(sign_extend "SIGN_EXTEND")
+			      (zero_extend "ZERO_EXTEND")])
 
 ;; Various insn prefixes for signed and unsigned operations.
 (define_code_attr u [(sign_extend "") (zero_extend "u")
@@ -6837,7 +6847,17 @@
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
 		     (any_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
-	      (clobber (reg:CC FLAGS_REG))])])
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI2 && <any_extend> == ZERO_EXTEND)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})
 
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand" "")
@@ -6849,6 +6869,24 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
+(define_insn "bmi2_umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "register_operand" "d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
+  "TARGET_BMI2"
+{
+  if (<MODE>mode == DImode)
+    return "mulx\t{%2, %q0, %N0|%N0, %q0, %2}";
+  else
+    return "mulx\t{%2, %k0, %N0|%N0, %k0, %2}";
+}
+  [(set_attr "type" "imul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<u>mul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
@@ -9587,15 +9625,44 @@
 
 ;; See comment above `ashl<mode>3' about how this works.
 
-(define_expand "<shiftrt_insn><mode>3"
+(define_expand "<shift_insn><mode>3"
   [(set (match_operand:SDWIM 0 "<shift_operand>" "")
 	(any_shiftrt:SDWIM (match_operand:SDWIM 1 "<shift_operand>" "")
 			   (match_operand:QI 2 "nonmemory_operand" "")))]
   ""
   "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
 
+;; Update pattern if BMI2 is available
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "")
+	  (subreg:QI
+	      (match_operand:SI 2 "register_operand" "") 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && !reload_completed"
+  [(set (match_dup 0)
+        (any_shift:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode)
+    {
+      rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+      emit_insn (gen_extendsidi2 (tmp, operands[2]));
+      operands[2] = tmp;
+    }
+})
+
+(define_insn "*bmi2_<shift_insn><mode>3"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(any_shift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			 (match_operand:SWI48 2 "register_operand" "r")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "<shift>x\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "<MODE>")])
+
 ;; Avoid useless masking of count operand.
-(define_insn_and_split "*<shiftrt_insn><mode>3_mask"
+(define_insn_and_split "*<shift_insn><mode>3_mask"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
 	(any_shiftrt:SWI48
 	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
@@ -9621,7 +9688,7 @@
   [(set_attr "type" "ishift")
    (set_attr "mode" "<MODE>")])
 
-(define_insn_and_split "*<shiftrt_insn><mode>3_doubleword"
+(define_insn_and_split "*<shift_insn><mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=r")
 	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "<S>c")))
@@ -9630,7 +9697,7 @@
   "#"
   "(optimize && flag_peephole2) ? epilogue_completed : reload_completed"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
+  "ix86_split_<shift_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
   [(set_attr "type" "multi")])
 
 ;; By default we don't ask for a scratch register, because when DWImode
@@ -9647,7 +9714,7 @@
    (match_dup 3)]
   "TARGET_CMOVE"
   [(const_int 0)]
-  "ix86_split_<shiftrt_insn> (operands, operands[3], <DWI>mode); DONE;")
+  "ix86_split_<shift_insn> (operands, operands[3], <DWI>mode); DONE;")
 
 (define_insn "x86_64_shrd"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
@@ -9763,7 +9830,7 @@
   DONE;
 })
 
-(define_insn "*<shiftrt_insn><mode>3_1"
+(define_insn "*<shift_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
 			 (match_operand:QI 2 "nonmemory_operand" "c<S>")))
@@ -9772,9 +9839,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9786,7 +9853,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_1_zext"
+(define_insn "*<shift_insn>si3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9796,9 +9863,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9810,7 +9877,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn>qi3_1_slp"
+(define_insn "*<shift_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
 	(any_shiftrt:QI (match_dup 0)
 			(match_operand:QI 1 "nonmemory_operand" "cI")))
@@ -9822,9 +9889,9 @@
 {
   if (operands[1] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{b}\t%0";
+    return "<shift>{b}\t%0";
   else
-    return "<shiftrt>{b}\t{%1, %0|%0, %1}";
+    return "<shift>{b}\t{%1, %0|%0, %1}";
 }
   [(set_attr "type" "ishift1")
    (set (attr "length_immediate")
@@ -9839,7 +9906,7 @@
 ;; This pattern can't accept a variable shift count, since shifts by
 ;; zero don't affect the flags.  We assume that shifts by constant
 ;; zero are optimized away.
-(define_insn "*<shiftrt_insn><mode>3_cmp"
+(define_insn "*<shift_insn><mode>3_cmp"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9857,9 +9924,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9871,7 +9938,7 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_cmp_zext"
+(define_insn "*<shift_insn>si3_cmp_zext"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
@@ -9889,9 +9956,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shift>{l}\t%k0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9903,7 +9970,7 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
-(define_insn "*<shiftrt_insn><mode>3_cconly"
+(define_insn "*<shift_insn><mode>3_cconly"
   [(set (reg FLAGS_REG)
 	(compare
 	  (any_shiftrt:SWI
@@ -9919,9 +9986,9 @@
 {
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
+    return "<shift>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -10060,6 +10127,15 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "c<S>")))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (ROTATERT, <MODE>mode, operands)"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotate")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<rotate_insn><mode>3_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
 	(any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
@@ -12346,6 +12422,42 @@
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
    (set_attr "mode" "HI")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_BZHI))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2"
+  "bzhi\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pdep_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PDEP))]
+  "TARGET_BMI2"
+  "pdep\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pext_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "rm")
+                       (match_operand:SWI48 2 "register_operand" "r")]
+                       UNSPEC_PEXT))]
+  "TARGET_BMI2"
+  "pext\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 \f
 ;; Thread-local storage patterns for ELF.
 ;;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 54d7af1..8e4d51b 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -493,6 +493,10 @@ mbmi
 Target Report Mask(ISA_BMI) Var(ix86_isa_flags) Save
 Support BMI built-in functions and code generation
 
+mbmi2
+Target Report Mask(ISA_BMI2) Var(ix86_isa_flags) Save
+Support BMI2 built-in functions and code generation
+
 mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h
index 88456f9..e01ecd2 100644
--- a/gcc/config/i386/x86intrin.h
+++ b/gcc/config/i386/x86intrin.h
@@ -81,6 +81,10 @@
 #include <bmiintrin.h>
 #endif
 
+#ifdef __BMI2__
+#include <bmi2intrin.h>
+#endif
+
 #ifdef __TBM__
 #include <tbmintrin.h>
 #endif
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 786c18d..1900276 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -9693,6 +9693,17 @@ unsigned int __builtin_ia32_bextr_u32(unsigned int, unsigned int);
 unsigned long long __builtin_ia32_bextr_u64 (unsigned long long, unsigned long long);
 @end smallexample
 
+The following built-in functions are available when @option{-mbmi2} is used.
+All of them generate the machine instruction that is part of the name.
+@smallexample
+unsigned int _bzhi_u32 (unsigned int, unsigned int)
+unsigned int _pdep_u32 (unsigned int, unsigned int)
+unsigned int _pext_u32 (unsigned int, unsigned int)
+unsigned long long _bzhi_u64 (unsigned long long, unsigned long long)
+unsigned long long _pdep_u64 (unsigned long long, unsigned long long)
+unsigned long long _pext_u64 (unsigned long long, unsigned long long)
+@end smallexample
+
 The following built-in functions are available when @option{-mlzcnt} is used.
 All of them generate the machine instruction that is part of the name.
 @smallexample
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fdc3297..acf30e3 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -607,7 +607,7 @@ Objective-C and Objective-C++ Dialects}.
 -mmmx  -msse  -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
 -mavx2 -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfma @gol
 -msse4a -m3dnow -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop -mlzcnt @gol
--mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
+-mbmi2 -mlwp -mthreads  -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double  -mregparm=@var{num}  -msseregparm @gol
@@ -12697,7 +12697,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @itemx -mabm
 @itemx -mno-abm
 @itemx -mbmi
+@itemx -mbmi2
 @itemx -mno-bmi
+@itemx -mno-bmi2
 @itemx -mlzcnt
 @itemx -mno-lzcnt
 @itemx -mtbm
@@ -12709,8 +12711,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @opindex m3dnow
 @opindex mno-3dnow
 These switches enable or disable the use of instructions in the MMX, SSE,
-SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C, FMA,
-SSE4A, FMA4, XOP, LWP, ABM, BMI, LZCNT or 3DNow!@: extended instruction sets.
+SSE2, SSE3, SSSE3, SSE4.1, AVX, AVX2, AES, PCLMUL, FSGSBASE, RDRND, F16C,
+FMA, SSE4A, FMA4, XOP, LWP, ABM, BMI, BMI2, LZCNT or 3DNow!
+@: extended instruction sets.
 These extensions are also available as built-in functions: see
 @ref{X86 Built-in Functions}, for details of the functions enabled and
 disabled by these switches.
diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C
index ed183c7..5f2eaf9 100644
--- a/gcc/testsuite/g++.dg/other/i386-2.C
+++ b/gcc/testsuite/g++.dg/other/i386-2.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C
index 626f972..76d4d19 100644
--- a/gcc/testsuite/g++.dg/other/i386-3.C
+++ b/gcc/testsuite/g++.dg/other/i386-3.C
@@ -1,5 +1,5 @@
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c" } */
 
 /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
    xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
new file mode 100644
index 0000000..aae2353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<32-l; ++i)
+    res &= ~(1 << (31 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0f;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u32 (src, i * 2);
+    res = _bzhi_u32 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
new file mode 100644
index 0000000..79e47a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1a.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+#include "bmi2-bzhi32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
new file mode 100644
index 0000000..8db29db
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<64-l; ++i)
+    res &= ~(1LL << (63 - i));
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0ff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_bzhi_u64 (src, i * 2);
+    res = _bzhi_u64 (src, i * 2);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
new file mode 100644
index 0000000..dc4a94c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-bzhi64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_bzhi_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-check.h b/gcc/testsuite/gcc.target/i386/bmi2-check.h
new file mode 100644
index 0000000..5ffce44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-check.h
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cpuid.h"
+
+static void bmi2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+  bmi2_test ();
+}
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  __cpuid_count (7, 0,  eax, ebx, ecx, edx);
+
+  /* Run BMI2 test only if host has BMI2 support.  */
+  if (ebx & bit_BMI2)
+    {
+      do_test ();
+#ifdef DEBUG
+      printf ("PASSED\n");
+#endif
+    }
+#ifdef DEBUG
+  else
+    printf ("SKIPPED\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
new file mode 100644
index 0000000..e1d49de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target { bmi2 && { ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned a, unsigned b)
+{
+  unsigned long long res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i)
+    res += (unsigned long long)(dummy? 0 : a);
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned a = 0xce7ace0;
+  unsigned b = 0xfacefff;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u32 (a, b);
+    res = (unsigned long long)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
new file mode 100644
index 0000000..cf3bb08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulsidi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
new file mode 100644
index 0000000..ded3dc1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long a, unsigned long long b)
+{
+  unsigned __int128 res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i) {
+    /* Block loop opts  */
+    res += (unsigned __int128)(dummy? 0 : a);
+  }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long a = 0xce7ace0ce7ace0;
+  unsigned long long b = 0xface;
+  unsigned __int128 res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    a = a * (i + 1);
+    b = b / (i + 1);
+
+    res_ref = calc_mul_u64 (a, b);
+    res = (unsigned __int128)a * b;
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
new file mode 100644
index 0000000..592d713
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-mulx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_umulditi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
new file mode 100644
index 0000000..e44a968
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << k)) >> k) << i;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u32 (src, i*3);
+    res = _pdep_u32 (src, i*3);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
new file mode 100644
index 0000000..87888fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
new file mode 100644
index 0000000..c0074fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1.c
@@ -0,0 +1,38 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  unsigned long long i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << k)) >> k) << i;
+      ++k;
+    }
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pdep_u64 (src, ~(i * 3));
+    res = _pdep_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
new file mode 100644
index 0000000..8163c40
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pdep64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pdep64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pdep_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
new file mode 100644
index 0000000..f21029f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+  unsigned res = 0;
+  int i, k = 0;
+
+  for (i=0; i<32; ++i)
+    if (mask & (1 << i)) {
+      res |= ((a & (1 << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7acc;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u32 (src, ~(i * 3));
+    res = _pext_u32 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
new file mode 100644
index 0000000..c4a6dee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_si3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
new file mode 100644
index 0000000..bad0584
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+  unsigned long long res = 0;
+  int i, k = 0;
+
+  for (i=0; i<64; ++i)
+    if (mask & (1LL << i)) {
+      res |= ((a & (1LL << i)) >> i) << k;
+      ++k;
+    }
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned long long i;
+  unsigned long long src = 0xce7acce7acce7ac;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_pext_u64 (src, ~(i * 3));
+    res = _pext_u64 (src, ~(i * 3));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
new file mode 100644
index 0000000..aaf06c1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-pext64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-pext64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_pext_di3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
new file mode 100644
index 0000000..84618e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_rorx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 31);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x0e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_rorx_u32 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (32 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
new file mode 100644
index 0000000..bb3b28d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxsi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
new file mode 100644
index 0000000..7dc722a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_rorx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res = (res >> 1) | ((res&1)<< 63);
+
+  return res;
+}
+
+#define SHIFT_VAL 0x1e
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_rorx_u64 (src, SHIFT_VAL);
+    res = (src >> SHIFT_VAL) | (src << (64 - SHIFT_VAL));
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
new file mode 100644
index 0000000..2a7a7a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-rorx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-rorx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_rorxdi3_1" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
new file mode 100644
index 0000000..2bbf016
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_sarx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_sarx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
new file mode 100644
index 0000000..f10d60b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
new file mode 100644
index 0000000..0bb13c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+long long
+calc_sarx_u64 (long long a, int l)
+{
+  long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  long long src = 0xfce7ace0ce7ace0;
+  long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_sarx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
new file mode 100644
index 0000000..bcf0fd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-sarx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
new file mode 100644
index 0000000..3f35047
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_shlx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res <<= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shlx_u32 (src, i + 1);
+    res = src << (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
new file mode 100644
index 0000000..215e5d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shlx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shlx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_ashlsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
new file mode 100644
index 0000000..17f0c67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_shrx_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned src = 0xce7ace0;
+  unsigned res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_shrx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
new file mode 100644
index 0000000..24c53d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx32-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx32-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrsi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
new file mode 100644
index 0000000..022baa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { bmi2 && { ! ia32 } } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_shrx_u64 (unsigned long long a, int l)
+{
+  unsigned long long res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  unsigned long long src = 0xce7ace0ce7ace0;
+  unsigned long long res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i+1);
+
+    res_ref = calc_shrx_u64 (src, i + 1);
+    res = src >> (i + 1);
+
+    if (res != res_ref)
+      abort();
+  }
+}
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
new file mode 100644
index 0000000..7830439
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-shrx64-1a.c
@@ -0,0 +1,6 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2 -dp" } */
+
+#include "bmi2-shrx64-1.c"
+
+/* { dg-final { scan-assembler-times "bmi2_lshrdi3" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp
index 167b79b..cff8a9a 100644
--- a/gcc/testsuite/gcc.target/i386/i386.exp
+++ b/gcc/testsuite/gcc.target/i386/i386.exp
@@ -206,6 +206,17 @@ proc check_effective_target_bmi { } {
     } "-mbmi" ]
 }
 
+# Return 1 if bmi2 instructions can be compiled.
+proc check_effective_target_bmi2 { } {
+    return [check_no_compiler_messages bmi2 object {
+	unsigned int
+	_bzhi_u32 (unsigned int __X, unsigned int __Y)
+	{
+	    return __builtin_ia32_bzhi_si (__X, __Y);
+	}
+    } "-mbmi2" ]
+}
+
 # If the linker used understands -M <mapfile>, pass it to clear hardware
 # capabilities set by the Sun assembler.
 set clearcap_ldflags "-Wl,-M,$srcdir/$subdir/clearcap.map"

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 15:22               ` Kirill Yukhin
@ 2011-08-19 15:36                 ` Kirill Yukhin
  2011-08-20 20:05                 ` Uros Bizjak
  1 sibling, 0 replies; 27+ messages in thread
From: Kirill Yukhin @ 2011-08-19 15:36 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches List, Uros Bizjak

Done. Patch attached in previous mail

K

On Fri, Aug 19, 2011 at 6:51 PM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
> On Fri, Aug 19, 2011 at 6:31 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> It is hard to tell.  Can you double check indentation on
>>
>> +  if (can_create_pseudo_p () && <MODE>mode != SImode)
>> +  {
>> +    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
>> +    emit_insn (gen_extendsidi2 (tmp, operands[2]));
>> +    operands[2] = tmp;
>> +  }
>>
>>
>> On Fri, Aug 19, 2011 at 7:13 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>>> Thanks, fixed.
>>>
>>> Updated patch is attached.
>>>
>>> K
>>>
>>> On Fri, Aug 19, 2011 at 6:04 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> No need for () in "(mode == SImode)":
>>>>
>>>> +         && !optimize_function_for_size_p (cfun)
>>>> +         && ((mode == SImode) || (mode == DImode && TARGET_64BIT))
>>>>
>>>> Wrong placement of '{':
>>>>
>>>> +  if (can_create_pseudo_p () && <MODE>mode != SImode) {
>>>> +    rtx tmp = gen_rtx_REG (<MODE>mode, 0);
>>>> +    emit_insn (gen_extendsidi2 (tmp, operands[2]));
>>>> +    operands[2] = tmp;
>>>> +  }
>>>>
>>>>
>>>>
>>>> On Fri, Aug 19, 2011 at 6:53 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>>>>> Thanks!
>>>>> Fixed, updated patch is attached.
>>>>>
>>>>> Is it ok?
>>>>>
>>>>> Thanks, K
>>>>>
>>>>> On Fri, Aug 19, 2011 at 5:22 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>>>>>> On Fri, Aug 19, 2011 at 05:18:19PM +0400, Kirill Yukhin wrote:
>>>>>>> Thanks, it is fixed.
>>>>>>> Update patch is attached.
>>>>>>
>>>>>> +         /* We generatin RORX instruction, freedom of register +
>>>>>> +            flags not affected  */
>>>>>>
>>>>>> comment doesn't look to be correct english (missing verb, missing g at
>>>>>> the end of generating, missing dot at the end of sentence).
>>>>>>
>>>>>>        Jakub
>>>>>>
>>>>>
>>>>
>>>>
>>>>
>>>> --
>>>> H.J.
>>>>
>>>
>>
>>
>>
>> --
>> H.J.
>>
>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-19 15:22               ` Kirill Yukhin
  2011-08-19 15:36                 ` Kirill Yukhin
@ 2011-08-20 20:05                 ` Uros Bizjak
  2011-08-20 21:52                   ` Uros Bizjak
  1 sibling, 1 reply; 27+ messages in thread
From: Uros Bizjak @ 2011-08-20 20:05 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: H.J. Lu, Jakub Jelinek, gcc-patches List

On Fri, Aug 19, 2011 at 4:51 PM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:

>>> Updated patch is attached.

Comments inline.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 53c5944..bff1a05 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)

 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
+#define OPTION_MASK_ISA_BMI2_SET OPTION_MASK_ISA_BMI2

Are you sure that -mbmi2 does not imply -mbmi?

@@ -13285,6 +13291,7 @@ put_condition_code (enum rtx_code code, enum
machine_mode mode, int reverse,
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'N', print the half mode high register.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */

   If CODE is 'N', print the high register of a double word register pair.

@@ -13294,6 +13301,15 @@ print_reg (rtx x, int code, FILE *file)
   const char *reg;
   bool duplicated = code == 'd' && TARGET_AVX;

+  if (code == 'N')
+    {
+      enum machine_mode mode = GET_MODE (x);
+      enum machine_mode half_mode = mode == TImode ? DImode : SImode;
+      x = simplify_gen_subreg (half_mode, x, mode,
+			       GET_MODE_SIZE (half_mode));
+      code = 0;
+    }
+

No need to check modes, we _KNOW_ that DWI expands to double word
modes. Also, handling of 'N' should be put a couple of lines lower,
like:

     code = 16;
   else if (code == 't')
     code = 32;
+  else if (code == 'N')
+    {
+      gcc_assert (mode == GET_MODE_WIDER_MODE (word_mode));
+      x = gen_highpart (word_mode, x);
+      code = GET_MODE_SIZE (word_mode);
+    }
   else
     code = GET_MODE_SIZE (GET_MODE (x));

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e7ae397..05f7666 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md

 (define_c_enum "unspecv" [
@@ -751,14 +756,17 @@
 ;; Base name for insn mnemonic.
 (define_code_attr logic [(and "and") (ior "or") (xor "xor")])

+;; Mapping of shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])

 ;; Base name for define_insn
-(define_code_attr shiftrt_insn [(lshiftrt "lshr") (ashiftrt "ashr")])
+(define_code_attr shift_insn [(ashift "ashl") (lshiftrt "lshr")
(ashiftrt "ashr")])

 ;; Base name for insn mnemonic.
-(define_code_attr shiftrt [(lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr shift [(ashift "shl") (lshiftrt "shr") (ashiftrt "sar")])

These renames should be part of another follow-up patch.

 ;; Mapping of rotate operators
 (define_code_iterator any_rotate [rotate rotatert])
@@ -777,6 +785,8 @@

 ;; Used in signed and unsigned widening multiplications.
 (define_code_iterator any_extend [sign_extend zero_extend])
+(define_code_attr any_extend [(sign_extend "SIGN_EXTEND")
+			      (zero_extend "ZERO_EXTEND")])

No. Pattern should be splitted instead.

 ;; Various insn prefixes for signed and unsigned operations.
 (define_code_attr u [(sign_extend "") (zero_extend "u")
@@ -6837,7 +6847,17 @@
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
 		     (any_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
-	      (clobber (reg:CC FLAGS_REG))])])
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI2 && <any_extend> == ZERO_EXTEND)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})

Please split the expander instead!

+;; Update pattern if BMI2 is available
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "")
+	  (subreg:QI
+	      (match_operand:SI 2 "register_operand" "") 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && ix86_binary_operator_ok (<CODE>, <MODE>mode,
operands) && !reload_completed"
+  [(set (match_dup 0)
+        (any_shift:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  if (can_create_pseudo_p () && <MODE>mode != SImode)
+    {
+      rtx tmp = gen_rtx_REG (<MODE>mode, 0);
+      emit_insn (gen_extendsidi2 (tmp, operands[2]));
+      operands[2] = tmp;
+    }
+})

Why splitters? Generate the shifts directly from the expander, fixing
the operands on-the-fly if necessary. Also, do not rename half of the
shift expanders and insn patterns just to introduce *ONE* extra RTX...

@@ -15745,8 +15763,23 @@ ix86_expand_binary_operator (enum rtx_code
code, enum machine_mode mode,
     }

Don't expand RORX through ix86_expand_binary_operator, generate it
directly from expander. You are complicating things with splitters too
much!

I will rewrite this part of i386.md.

@@ -12346,6 +12422,42 @@
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
    (set_attr "mode" "HI")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"

Please put these into "Bit manipulation instructions." section.  Also,
if possible, UNSPECs should not be used. It is much better to describe
pattern with generic RTXes, this way you will enable many existing
optimizations that (obviously) don't know how to handle UNSPECs.

+++ b/gcc/testsuite/gcc.target/i386/bmi2-bzhi32-1.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2" } */
+
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+  unsigned res = a;
+  int i;
+  for (i=0; i<32-l; ++i)
+    res &= ~(1 << (31 - i));
+
+  return res;
+}

Please add spaces around operators (also in other testcases).

+/* { dg-do run { target { bmi2 && { ia32 } } } } */
+/* { dg-options "-mbmi2 -Ofast" } */

Don't use -Ofast in the testsuite.  Use explicit -O2, -ffast-math,
etc. Options come and go from -Ofast.

+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned a, unsigned b)
+{
+  unsigned long long res = 0;
+  volatile unsigned dummy = 0;
+  int i;
+  for (i=0; i<b; ++i)
+    res += (unsigned long long)(dummy? 0 : a);

Spaces.

+    res = (unsigned long long)a * b;

And here.

+  for (i=0; i<b; ++i) {

And here.  Please follow GNU coding standards [1].

+    /* Block loop opts  */
+    res += (unsigned __int128)(dummy? 0 : a);
+  }

Why? There are other ways to suppress unwanted optimizations, using
volatile and asm. See many examples through the testsuite.

+++ b/gcc/testsuite/gcc.target/i386/bmi2-sarx32-1.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target { bmi2 } } } */
+/* { dg-options "-mbmi2 -O2 -dp" } */
+
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+int
+calc_sarx_u32 (int a, int l)
+{
+  int res = a;
+  int i;
+  for (i=0; i<l; ++i)
+    res >>= 1;
+
+  return res;
+}
+
+static void
+bmi2_test ()
+{
+  unsigned i;
+  int src = 0xfce7ace0;
+  int res, res_ref;
+
+  for (i=0; i<5; ++i) {
+    src = src * (i + 1);
+
+    res_ref = calc_sarx_u32 (src, i + 1);
+    res = src >> (i + 1);
+
+    printf ("%x %x\n", res_ref, res);

If you _REALLY_ need debugging printfs in the test, then protect it
with #ifdef DEBUG and put them just before abort (again, see many
examples). But more or less, these printfs are just annoying.


[1] http://www.gnu.org/prep/standards/standards.html

Uros.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 20:05                 ` Uros Bizjak
@ 2011-08-20 21:52                   ` Uros Bizjak
  2011-08-20 22:04                     ` H.J. Lu
                                       ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Uros Bizjak @ 2011-08-20 21:52 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: H.J. Lu, Jakub Jelinek, gcc-patches List

[-- Attachment #1: Type: text/plain, Size: 3236 bytes --]

On Sat, Aug 20, 2011 at 2:09 PM, Uros Bizjak <ubizjak@gmail.com> wrote:

> Don't expand RORX through ix86_expand_binary_operator, generate it
> directly from expander. You are complicating things with splitters too
> much!
>
> I will rewrite this part of i386.md.

So, attached RFC patch handles BMI2 mul, shift and ror stuff.

Some remarks:
- M and N register modifiers are added to print low and high register
of a double word register pair. This is needed for mulx insn.
- ishiftx and rotatex instruction type attributes are added.
- "w" mode attribute is added to add register prefix for word mode.
This is needed to output QImode count register of shift insns.

- mulx is expanded directly from expander, IMO it is always a win to
generate this insn if available.

- Yb register constraint is added to conditionally enable generation
of BMI alternatives in generic shift and rotate patterns. The BMI
variant is generated only if RA chooses it as the most profitable
alternative.
- shift and rotate instructions are split post-reload from generic
patterns to strip flags clobber.
- zero-extended 64bit variants are also handled for shift and rotate insns.
- rotate right AND rotate left instructions are handled through rorx.

2011-08-20  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.md (type): Add ishiftx and rotatex.
	(length_immediate): Handle ishiftx and rotatex.
	(imm_disp): Ditto.
	(w): New mode attribute.

	(mul<mode><dwi>3): Split from <u>mul<mode><dwi>3.
	(umul<mode><dwi>3): Ditto.  Generate bmi2_umul<mode><dwi>3_1 pattern
	for TARGET_BMI2.
	(bmi2_umul<mode><dwi>3_1): New insn pattern.

	(*bmi2_ashl<mode>3_1): New insn pattern.
	(*ashl<mode>3_1): Add ishiftx BMI2 alternative.
	(*ashl<mode>3_1 splitter): New splitter to avoid flags dependency.
	(*bmi2_ashlsi3_1_zext): New insn pattern.
	(*ashlsi3_1_zext): Add ishiftx BMI2 alternative.
	(*ashlsi3_1_zext splitter): New splitter to avoid flags dependency.

	(*bmi2_<shiftrt_insn><mode>3_1): New insn pattern.
	(*<shiftrt_insn><mode>3_1): Add ishiftx BMI2 alternative.
	(*<shiftrt_insn><mode>3_1 splitter): New splitter to avoid
	flags dependency.
	(*bmi2_<shiftrt_insn>si3_1_zext): New insn pattern.
	(*<shiftrt_insn>si3_1_zext): Add ishiftx BMI2 alternative.
	(*<shiftrt_insn>si3_1_zext splitter): New splitter to avoid
	flags dependency.

	(*bmi2_rorx<mode>3_1): New insn pattern.
	(*<rotate_insn><mode>3_1): Add rotatex BMI2 alternative.
	(*rotate<mode>3_1 splitter): New splitter to avoid flags dependency.
	(*rotatert<mode>3_1 splitter): Ditto.
	(*bmi2_rorxsi3_1_zext): New insn pattern.
	(*<rotate_insn>si3_1_zext): Add rotatex BMI2 alternative.
	(*rotatesi3_1_zext  splitter): New splitter to avoid flags dependency.
	(*rotatertsi3_1_zext splitter): Ditto.

	* config/i386/constraints.md (Yb): New register constraint.
	* config/i386/i386.c (print_reg): Handle 'M' and 'N' modifiers.
	(print_operand): Ditto.

The patch is currently in RFC/RFT state, since I have no way to
properly test it. The patch bootstraps OK and regression test is clean
on x86_64-pc-linux-gnu {,-m32}. I tested the patch lightly on provided
testcases, so expected patterns are generated. Oh, and all insn
constraints should be changed from TARGET_BMI to TARGET_BMI2.

Uros.

[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 22714 bytes --]

Index: i386.md
===================================================================
--- i386.md	(revision 177925)
+++ i386.md	(working copy)
@@ -50,6 +50,8 @@
 ;; t --  likewise, print the V8SFmode name of the register.
 ;; h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
 ;; y -- print "st(0)" instead of "st" as a register.
+;; M -- print the low register of a double word register pair.
+;; N -- print the high register of a double word register pair.
 ;; d -- print duplicated register operand for AVX instruction.
 ;; D -- print condition for SSE cmp instruction.
 ;; P -- if PIC, print an @PLT suffix.
@@ -377,7 +379,7 @@
 (define_attr "type"
   "other,multi,
    alu,alu1,negnot,imov,imovx,lea,
-   incdec,ishift,ishift1,rotate,rotate1,imul,idiv,
+   incdec,ishift,ishiftx,ishift1,rotate,rotatex,rotate1,imul,idiv,
    icmp,test,ibr,setcc,icmov,
    push,pop,call,callv,leave,
    str,bitmanip,
@@ -414,8 +416,8 @@
 	   (const_int 0)
 	 (eq_attr "unit" "i387,sse,mmx")
 	   (const_int 0)
-	 (eq_attr "type" "alu,alu1,negnot,imovx,ishift,rotate,ishift1,rotate1,
-			  imul,icmp,push,pop")
+	 (eq_attr "type" "alu,alu1,negnot,imovx,ishift,ishiftx,ishift1,
+			  rotate,rotatex,rotate1,imul,icmp,push,pop")
 	   (symbol_ref "ix86_attr_length_immediate_default (insn, true)")
 	 (eq_attr "type" "imov,test")
 	   (symbol_ref "ix86_attr_length_immediate_default (insn, false)")
@@ -675,7 +677,7 @@
 	      (and (match_operand 0 "memory_displacement_operand" "")
 		   (match_operand 1 "immediate_operand" "")))
 	   (const_string "true")
-	 (and (eq_attr "type" "alu,ishift,rotate,imul,idiv")
+	 (and (eq_attr "type" "alu,ishift,ishiftx,rotate,rotatex,imul,idiv")
 	      (and (match_operand 0 "memory_displacement_operand" "")
 		   (match_operand 2 "immediate_operand" "")))
 	   (const_string "true")
@@ -947,6 +949,9 @@
 ;; Instruction suffix for REX 64bit operators.
 (define_mode_attr rex64suffix [(SI "") (DI "{q}")])
 
+;; Register prefix for word mode.
+(define_mode_attr w [(SI "k") (DI "q")])
+
 ;; This mode iterator allows :P to be used for patterns that operate on
 ;; pointer-sized quantities.  Exactly one of the two alternatives will match.
 (define_mode_iterator P [(SI "Pmode == SImode") (DI "Pmode == DImode")])
@@ -6830,15 +6835,34 @@
    (set_attr "bdver1_decode" "direct")
    (set_attr "mode" "QI")])
 
-(define_expand "<u>mul<mode><dwi>3"
+(define_expand "mul<mode><dwi>3"
   [(parallel [(set (match_operand:<DWI> 0 "register_operand" "")
 		   (mult:<DWI>
-		     (any_extend:<DWI>
+		     (sign_extend:<DWI>
 		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
-		     (any_extend:<DWI>
+		     (sign_extend:<DWI>
 		       (match_operand:DWIH 2 "register_operand" ""))))
 	      (clobber (reg:CC FLAGS_REG))])])
 
+(define_expand "umul<mode><dwi>3"
+  [(parallel [(set (match_operand:<DWI> 0 "register_operand" "")
+		   (mult:<DWI>
+		     (zero_extend:<DWI>
+		       (match_operand:DWIH 1 "nonimmediate_operand" ""))
+		     (zero_extend:<DWI>
+		       (match_operand:DWIH 2 "register_operand" ""))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_BMI)
+    {
+      emit_insn (gen_bmi2_umul<mode><dwi>3_1 (operands[0],
+					      operands[1],
+					      operands[2]));
+      DONE;
+    }
+})
+
 (define_expand "<u>mulqihi3"
   [(parallel [(set (match_operand:HI 0 "register_operand" "")
 		   (mult:HI
@@ -6849,6 +6873,20 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
+(define_insn "bmi2_umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "%d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
+  "TARGET_BMI
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "mulx\t{%2, %M0, %N0|%N0, %M0, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<u>mul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
@@ -9056,16 +9094,26 @@
   [(set_attr "type" "ishift")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*bmi2_ashl<mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+		      (match_operand:QI 2 "register_operand" "r")))]
+  "TARGET_BMI"
+  "salx\t{%<w>2, %1, %0|%0, %1, %<w>2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*ashl<mode>3_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
-	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l")
-		      (match_operand:QI 2 "nonmemory_operand" "c<S>,M")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,Yb")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l,mYb")
+		      (match_operand:QI 2 "nonmemory_operand" "c<S>,M,Yb")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)"
 {
   switch (get_attr_type (insn))
     {
     case TYPE_LEA:
+    case TYPE_ISHIFTX:
       return "#";
 
     case TYPE_ALU:
@@ -9084,6 +9132,8 @@
   [(set (attr "type")
      (cond [(eq_attr "alternative" "1")
 	      (const_string "lea")
+	    (eq_attr "alternative" "2")
+	      (const_string "ishiftx")
             (and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD")
 		          (const_int 0))
 		      (match_operand 0 "register_operand" ""))
@@ -9102,17 +9152,39 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+		      (match_operand:QI 2 "register_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+	(ashift:SWI48 (match_dup 1) (match_dup 2)))])
+
+(define_insn "*bmi2_ashlsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+		     (match_operand:QI 2 "register_operand" "r"))))]
+  "TARGET_64BIT && TARGET_BMI"
+  "salx\t{%k2, %1, %k0|%k0, %1, %k2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "SI")])
+
 (define_insn "*ashlsi3_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,Yb")
 	(zero_extend:DI
-	  (ashift:SI (match_operand:SI 1 "register_operand" "0,l")
-		     (match_operand:QI 2 "nonmemory_operand" "cI,M"))))
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,l,mYb")
+		     (match_operand:QI 2 "nonmemory_operand" "cI,M,Yb"))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (ASHIFT, SImode, operands)"
 {
   switch (get_attr_type (insn))
     {
     case TYPE_LEA:
+    case TYPE_ISHIFTX:
       return "#";
 
     case TYPE_ALU:
@@ -9130,6 +9202,8 @@
   [(set (attr "type")
      (cond [(eq_attr "alternative" "1")
 	      (const_string "lea")
+	    (eq_attr "alternative" "2")
+	      (const_string "ishiftx")
             (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD")
 		     (const_int 0))
 		 (match_operand 2 "const1_operand" ""))
@@ -9147,6 +9221,18 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "")
+		     (match_operand:QI 2 "register_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+  	(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2))))])
+
 (define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm")
 	(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0")
@@ -9763,20 +9849,37 @@
   DONE;
 })
 
+(define_insn "*bmi2_<shiftrt_insn><mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			   (match_operand:QI 2 "register_operand" "r")))]
+  "TARGET_BMI"
+  "<shiftrt>x\t{%<w>2, %1, %0|%0, %1, %<w>2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<shiftrt_insn><mode>3_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
-	(any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
-			 (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,Yb")
+	(any_shiftrt:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0,mYb")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>,Yb")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
 {
-  if (operands[2] == const1_rtx
-      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
-  else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ISHIFTX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<shiftrt>{<imodesuffix>}\t%0";
+      else
+	return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
 }
-  [(set_attr "type" "ishift")
+  [(set_attr "type" "ishift,ishiftx")
    (set (attr "length_immediate")
      (if_then_else
        (and (match_operand 2 "const1_operand" "")
@@ -9786,19 +9889,83 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_1_zext"
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+			   (match_operand:QI 2 "register_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+	(any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))])
+
+(define_insn "*bmi2_<shiftrt_insn>si3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
-	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
-			  (match_operand:QI 2 "nonmemory_operand" "cI"))))
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+			  (match_operand:QI 2 "register_operand" "r"))))]
+  "TARGET_64BIT && TARGET_BMI"
+  "<shiftrt>x\t{%k2, %1, %k0|%k0, %1, %k2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<shiftrt_insn>si3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,Yb")
+	(zero_extend:DI
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,mYb")
+			  (match_operand:QI 2 "nonmemory_operand" "cI,Yb"))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
 {
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ISHIFTX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<shiftrt>{l}\t%k0";
+      else
+	return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set_attr "type" "ishift,ishiftx")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand" "")
+	    (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
+		(const_int 0)))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "")
+			  (match_operand:QI 2 "register_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+  	(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))])
+
+(define_insn "*<shiftrt_insn><mode>3_1"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m")
+	(any_shiftrt:SWI12
+	  (match_operand:SWI12 1 "nonimmediate_operand" "0")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shiftrt>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9808,7 +9975,7 @@
 		(const_int 0)))
        (const_string "0")
        (const_string "*")))
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "<MODE>")])
 
 (define_insn "*<shiftrt_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
@@ -10060,42 +10227,153 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "<S>")))]
+  "TARGET_BMI"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotatex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<rotate_insn><mode>3_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
-	(any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
-			(match_operand:QI 2 "nonmemory_operand" "c<S>")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,Yb")
+	(any_rotate:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0,mYb")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>,<S>")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
 {
-  if (operands[2] == const1_rtx
-      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<rotate>{<imodesuffix>}\t%0";
-  else
-    return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ROTATEX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<rotate>{<imodesuffix>}\t%0";
+      else
+	return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
 }
-  [(set_attr "type" "rotate")
+  [(set_attr "type" "rotate,rotatex")
    (set (attr "length_immediate")
      (if_then_else
-       (and (match_operand 2 "const1_operand" "")
-	    (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
-		(const_int 0)))
+       (and (eq_attr "type" "rotate")
+	    (and (match_operand 2 "const1_operand" "")
+		 (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
+		     (const_int 0))))
        (const_string "0")
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<rotate_insn>si3_1_zext"
+;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+		      (match_operand:QI 2 "immediate_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+	(rotatert:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  operands[2]
+    = GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - INTVAL (operands[2]));
+})
+
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+			(match_operand:QI 2 "immediate_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+	(rotatert:SWI48 (match_dup 1) (match_dup 2)))])
+
+(define_insn "*bmi2_rorxsi3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
-	  (any_rotate:SI (match_operand:SI 1 "register_operand" "0")
-			 (match_operand:QI 2 "nonmemory_operand" "cI"))))
+	  (rotatert:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+		       (match_operand:QI 2 "immediate_operand" "I"))))]
+  "TARGET_64BIT && TARGET_BMI"
+  "rorx\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "rotatex")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<rotate_insn>si3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,Yb")
+	(zero_extend:DI
+	  (any_rotate:SI (match_operand:SI 1 "nonimmediate_operand" "0,mYb")
+			 (match_operand:QI 2 "nonmemory_operand" "cI,I"))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
 {
-    if (operands[2] == const1_rtx
-	&& (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<rotate>{l}\t%k0";
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ROTATEX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<rotate>{l}\t%k0";
+      else
+	return "<rotate>{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set_attr "type" "rotate,rotatex")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (eq_attr "type" "rotate")
+	    (and (match_operand 2 "const1_operand" "")
+		 (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
+		     (const_int 0))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (rotate:SI (match_operand:SI 1 "nonimmediate_operand" "")
+		     (match_operand:QI 2 "immediate_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+  	(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2))))]
+{
+  operands[2]
+    = GEN_INT (GET_MODE_BITSIZE (SImode) - INTVAL (operands[2]));
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (rotatert:SI (match_operand:SI 1 "nonimmediate_operand" "")
+		       (match_operand:QI 2 "immediate_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+  	(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2))))])
+
+(define_insn "*<rotate_insn><mode>3_1"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m")
+	(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0")
+			  (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<rotate>{<imodesuffix>}\t%0";
   else
-    return "<rotate>{l}\t{%2, %k0|%k0, %2}";
+    return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "rotate")
    (set (attr "length_immediate")
@@ -10105,7 +10383,7 @@
 		(const_int 0)))
        (const_string "0")
        (const_string "*")))
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "<MODE>")])
 
 (define_insn "*<rotate_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
Index: constraints.md
===================================================================
--- constraints.md	(revision 177925)
+++ constraints.md	(working copy)
@@ -92,6 +92,7 @@
 ;;  m	MMX inter-unit moves enabled
 ;;  d	Integer register when integer DFmode moves are enabled
 ;;  x	Integer register when integer XFmode moves are enabled
+;;  b	Integer register when BMI2 instructions are enabled
 
 (define_register_constraint "Yz" "TARGET_SSE ? SSE_FIRST_REG : NO_REGS"
  "First SSE register (@code{%xmm0}).")
@@ -123,6 +124,10 @@
  "optimize_function_for_speed_p (cfun) ? GENERAL_REGS : NO_REGS"
  "@internal Any integer register when integer XFmode moves are enabled.")
 
+(define_register_constraint "Yb"
+ "TARGET_BMI ? GENERAL_REGS : NO_REGS"
+ "@internal Any integer register, when BMI2 is enabled.")
+
 (define_constraint "z"
   "@internal Constant call address operand."
   (match_operand 0 "constant_call_address_operand"))
Index: i386.c
===================================================================
--- i386.c	(revision 177928)
+++ i386.c	(working copy)
@@ -13285,6 +13285,8 @@ put_condition_code (enum rtx_code code, enum machi
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'M', print the low register of a double word register pair.
+   If CODE is 'N', print the high register of a double word register pair.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */
 
@@ -13327,6 +13329,18 @@ print_reg (rtx x, int code, FILE *file)
     code = 16;
   else if (code == 't')
     code = 32;
+  else if (code == 'M')
+    {
+      gcc_assert (GET_MODE (x) == GET_MODE_WIDER_MODE (word_mode));
+      x = gen_lowpart (word_mode, x);
+      code = GET_MODE_SIZE (word_mode);
+    }
+  else if (code == 'N')
+    {
+      gcc_assert (GET_MODE (x) == GET_MODE_WIDER_MODE (word_mode));
+      x = gen_highpart (word_mode, x);
+      code = GET_MODE_SIZE (word_mode);
+    }
   else
     code = GET_MODE_SIZE (GET_MODE (x));
 
@@ -13472,6 +13486,8 @@ get_some_local_dynamic_name (void)
    t --  likewise, print the V8SFmode name of the register.
    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
    y -- print "st(0)" instead of "st" as a register.
+   M -- print the low register of a double word register pair.
+   N -- print the high register of a double word register pair.
    d -- print duplicated register operand for AVX instruction.
    D -- print condition for SSE cmp instruction.
    P -- if PIC, print an @PLT suffix.
@@ -13678,6 +13694,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
 	case 'h':
 	case 't':
 	case 'y':
+	case 'M':
+	case 'N':
 	case 'x':
 	case 'X':
 	case 'P':

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 21:52                   ` Uros Bizjak
@ 2011-08-20 22:04                     ` H.J. Lu
  2011-08-20 22:26                       ` Uros Bizjak
  2011-08-21 15:01                       ` Uros Bizjak
  2011-08-20 22:39                     ` Richard Henderson
  2011-08-20 23:50                     ` Richard Henderson
  2 siblings, 2 replies; 27+ messages in thread
From: H.J. Lu @ 2011-08-20 22:04 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 2:16 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sat, Aug 20, 2011 at 2:09 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>
>> Don't expand RORX through ix86_expand_binary_operator, generate it
>> directly from expander. You are complicating things with splitters too
>> much!
>>
>> I will rewrite this part of i386.md.
>
> So, attached RFC patch handles BMI2 mul, shift and ror stuff.
>
> Some remarks:
> - M and N register modifiers are added to print low and high register
> of a double word register pair. This is needed for mulx insn.
> - ishiftx and rotatex instruction type attributes are added.
> - "w" mode attribute is added to add register prefix for word mode.
> This is needed to output QImode count register of shift insns.
>
> - mulx is expanded directly from expander, IMO it is always a win to
> generate this insn if available.
>
> - Yb register constraint is added to conditionally enable generation
> of BMI alternatives in generic shift and rotate patterns. The BMI
> variant is generated only if RA chooses it as the most profitable
> alternative.
> - shift and rotate instructions are split post-reload from generic
> patterns to strip flags clobber.
> - zero-extended 64bit variants are also handled for shift and rotate insns.
> - rotate right AND rotate left instructions are handled through rorx.
>
> 2011-08-20  Uros Bizjak  <ubizjak@gmail.com>
>
>        * config/i386/i386.md (type): Add ishiftx and rotatex.
>        (length_immediate): Handle ishiftx and rotatex.
>        (imm_disp): Ditto.
>        (w): New mode attribute.
>
>        (mul<mode><dwi>3): Split from <u>mul<mode><dwi>3.
>        (umul<mode><dwi>3): Ditto.  Generate bmi2_umul<mode><dwi>3_1 pattern
>        for TARGET_BMI2.
>        (bmi2_umul<mode><dwi>3_1): New insn pattern.
>
>        (*bmi2_ashl<mode>3_1): New insn pattern.
>        (*ashl<mode>3_1): Add ishiftx BMI2 alternative.
>        (*ashl<mode>3_1 splitter): New splitter to avoid flags dependency.
>        (*bmi2_ashlsi3_1_zext): New insn pattern.
>        (*ashlsi3_1_zext): Add ishiftx BMI2 alternative.
>        (*ashlsi3_1_zext splitter): New splitter to avoid flags dependency.
>
>        (*bmi2_<shiftrt_insn><mode>3_1): New insn pattern.
>        (*<shiftrt_insn><mode>3_1): Add ishiftx BMI2 alternative.
>        (*<shiftrt_insn><mode>3_1 splitter): New splitter to avoid
>        flags dependency.
>        (*bmi2_<shiftrt_insn>si3_1_zext): New insn pattern.
>        (*<shiftrt_insn>si3_1_zext): Add ishiftx BMI2 alternative.
>        (*<shiftrt_insn>si3_1_zext splitter): New splitter to avoid
>        flags dependency.
>
>        (*bmi2_rorx<mode>3_1): New insn pattern.
>        (*<rotate_insn><mode>3_1): Add rotatex BMI2 alternative.
>        (*rotate<mode>3_1 splitter): New splitter to avoid flags dependency.
>        (*rotatert<mode>3_1 splitter): Ditto.
>        (*bmi2_rorxsi3_1_zext): New insn pattern.
>        (*<rotate_insn>si3_1_zext): Add rotatex BMI2 alternative.
>        (*rotatesi3_1_zext  splitter): New splitter to avoid flags dependency.
>        (*rotatertsi3_1_zext splitter): Ditto.
>
>        * config/i386/constraints.md (Yb): New register constraint.
>        * config/i386/i386.c (print_reg): Handle 'M' and 'N' modifiers.
>        (print_operand): Ditto.
>
> The patch is currently in RFC/RFT state, since I have no way to
> properly test it. The patch bootstraps OK and regression test is clean

We are using HSW emulator (SDE):

http://software.intel.com/en-us/articles/pre-release-license-agreement-for-intel-software-development-emulator-accept-end-user-license-agreement-and-download/

to test FMA, BMI/BMI2.  I have a SDE sim for dejagnu so that I can run
GCC testsuite under SDE.

> on x86_64-pc-linux-gnu {,-m32}. I tested the patch lightly on provided
> testcases, so expected patterns are generated. Oh, and all insn
> constraints should be changed from TARGET_BMI to TARGET_BMI2.
>
> Uros.
>

We can also implement MULX with split:

(define_split
  [(parallel [(set (match_operand:<DWI> 0 "register_operand" "")
                   (mult:<DWI>
                     (zero_extend:<DWI>
                       (match_operand:DWIH 1 "nonimmediate_operand" ""))
                     (zero_extend:<DWI>
                       (match_operand:DWIH 2 "nonimmediate_operand" ""))))
              (clobber (reg:CC FLAGS_REG))])]
  "TARGET_BMI2
   && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
  [(set (match_operand:<DWI> 0 "register_operand" "")
        (mult:<DWI>
          (zero_extend:<DWI>
            (match_operand:DWIH 1 "register_operand" ""))
          (zero_extend:<DWI>
            (match_operand:DWIH 2 "nonimmediate_operand" ""))))])

(define_insn "*bmi2_umul<mode><dwi>3_1"
  [(set (match_operand:<DWI> 0 "register_operand" "=r")
        (mult:<DWI>
          (zero_extend:<DWI>
            (match_operand:DWIH 1 "register_operand" "d"))
          (zero_extend:<DWI>
            (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
  "TARGET_BMI2"
{
  if (<MODE>mode == DImode)
    return "mulx\t{%2, %M0, %N0|%N0, %M0, %2}";
  else
    return "mulx\t{%2, %M0, %N0|%N0, %M0, %2}";
}
  [(set_attr "type" "imul")
   (set_attr "prefix" "vex")
   (set_attr "mode" "<MODE>")])

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 22:04                     ` H.J. Lu
@ 2011-08-20 22:26                       ` Uros Bizjak
  2011-08-20 22:56                         ` H.J. Lu
  2011-08-21 15:01                       ` Uros Bizjak
  1 sibling, 1 reply; 27+ messages in thread
From: Uros Bizjak @ 2011-08-20 22:26 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 11:31 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

> We can also implement MULX with split:
>
> (define_split
>  [(parallel [(set (match_operand:<DWI> 0 "register_operand" "")
>                   (mult:<DWI>
>                     (zero_extend:<DWI>
>                       (match_operand:DWIH 1 "nonimmediate_operand" ""))
>                     (zero_extend:<DWI>
>                       (match_operand:DWIH 2 "nonimmediate_operand" ""))))
>              (clobber (reg:CC FLAGS_REG))])]
>  "TARGET_BMI2
>   && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
>  [(set (match_operand:<DWI> 0 "register_operand" "")
>        (mult:<DWI>
>          (zero_extend:<DWI>
>            (match_operand:DWIH 1 "register_operand" ""))
>          (zero_extend:<DWI>
>            (match_operand:DWIH 2 "nonimmediate_operand" ""))))])

Well, this is unconditional splitter, no better than current approach
where the pattern is expanded directly.

If you want to squeeze out the last 0.005% of performance, you should
add BMI alternative to existing umul pattern, leave the choice of
alternative to RA and split the exact alternative (that is, you need
some true_regnum calls in splitter constraint) after reload to mulx
pattern. Please, see new patterns for how this should be done.

I'm not against this approach, but after 10 hours of hacking, I just
wanted to leave it to an interested reader ;)

Uros.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 21:52                   ` Uros Bizjak
  2011-08-20 22:04                     ` H.J. Lu
@ 2011-08-20 22:39                     ` Richard Henderson
  2011-08-21  1:37                       ` Uros Bizjak
  2011-08-20 23:50                     ` Richard Henderson
  2 siblings, 1 reply; 27+ messages in thread
From: Richard Henderson @ 2011-08-20 22:39 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Kirill Yukhin, H.J. Lu, Jakub Jelinek, gcc-patches List

On 08/20/2011 02:16 PM, Uros Bizjak wrote:
> - Yb register constraint is added to conditionally enable generation
> of BMI alternatives in generic shift and rotate patterns. The BMI
> variant is generated only if RA chooses it as the most profitable
> alternative.

We really should use the (relatively new) enabled attribute instead
of adding more and more conditional register constraints.


r~

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 22:26                       ` Uros Bizjak
@ 2011-08-20 22:56                         ` H.J. Lu
  0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2011-08-20 22:56 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 2:44 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sat, Aug 20, 2011 at 11:31 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>
>> We can also implement MULX with split:
>>
>> (define_split
>>  [(parallel [(set (match_operand:<DWI> 0 "register_operand" "")
>>                   (mult:<DWI>
>>                     (zero_extend:<DWI>
>>                       (match_operand:DWIH 1 "nonimmediate_operand" ""))
>>                     (zero_extend:<DWI>
>>                       (match_operand:DWIH 2 "nonimmediate_operand" ""))))
>>              (clobber (reg:CC FLAGS_REG))])]
>>  "TARGET_BMI2
>>   && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
>>  [(set (match_operand:<DWI> 0 "register_operand" "")
>>        (mult:<DWI>
>>          (zero_extend:<DWI>
>>            (match_operand:DWIH 1 "register_operand" ""))
>>          (zero_extend:<DWI>
>>            (match_operand:DWIH 2 "nonimmediate_operand" ""))))])
>
> Well, this is unconditional splitter, no better than current approach
> where the pattern is expanded directly.
>
> If you want to squeeze out the last 0.005% of performance, you should
> add BMI alternative to existing umul pattern, leave the choice of
> alternative to RA and split the exact alternative (that is, you need
> some true_regnum calls in splitter constraint) after reload to mulx
> pattern. Please, see new patterns for how this should be done.
>
> I'm not against this approach, but after 10 hours of hacking, I just
> wanted to leave it to an interested reader ;)

We won't use split then.

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 21:52                   ` Uros Bizjak
  2011-08-20 22:04                     ` H.J. Lu
  2011-08-20 22:39                     ` Richard Henderson
@ 2011-08-20 23:50                     ` Richard Henderson
  2011-08-20 23:58                       ` H.J. Lu
  2011-08-21 11:14                       ` Uros Bizjak
  2 siblings, 2 replies; 27+ messages in thread
From: Richard Henderson @ 2011-08-20 23:50 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Kirill Yukhin, H.J. Lu, Jakub Jelinek, gcc-patches List

On 08/20/2011 02:16 PM, Uros Bizjak wrote:
> +(define_insn "bmi2_umul<mode><dwi>3_1"
> +  [(set (match_operand:<DWI> 0 "register_operand" "=r")
> +	(mult:<DWI>
> +	  (zero_extend:<DWI>
> +	    (match_operand:DWIH 1 "nonimmediate_operand" "%d"))
> +	  (zero_extend:<DWI>
> +	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
> +  "TARGET_BMI
> +   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
> +  "mulx\t{%2, %M0, %N0|%N0, %M0, %2}"
> +  [(set_attr "type" "imul")
> +   (set_attr "prefix" "vex")
> +   (set_attr "mode" "<MODE>")])

You can do better than this, and avoid the %M %N specifiers.
The outputs are truly independent and do not need to be a pair.

See the mn10300 umulsidi3{,_internal} patterns.


r~

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 23:50                     ` Richard Henderson
@ 2011-08-20 23:58                       ` H.J. Lu
  2011-08-21  0:11                         ` H.J. Lu
  2011-08-21 11:14                       ` Uros Bizjak
  1 sibling, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2011-08-20 23:58 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Uros Bizjak, Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 2:52 PM, Richard Henderson <rth@redhat.com> wrote:
> On 08/20/2011 02:16 PM, Uros Bizjak wrote:
>> +(define_insn "bmi2_umul<mode><dwi>3_1"
>> +  [(set (match_operand:<DWI> 0 "register_operand" "=r")
>> +     (mult:<DWI>
>> +       (zero_extend:<DWI>
>> +         (match_operand:DWIH 1 "nonimmediate_operand" "%d"))
>> +       (zero_extend:<DWI>
>> +         (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
>> +  "TARGET_BMI
>> +   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
>> +  "mulx\t{%2, %M0, %N0|%N0, %M0, %2}"
>> +  [(set_attr "type" "imul")
>> +   (set_attr "prefix" "vex")
>> +   (set_attr "mode" "<MODE>")])
>
> You can do better than this, and avoid the %M %N specifiers.
> The outputs are truly independent and do not need to be a pair.
>

Since RA use register pairs for TImode/DImode, should requiring
TI/DI registers in pairs generate better does?


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 23:58                       ` H.J. Lu
@ 2011-08-21  0:11                         ` H.J. Lu
  2011-08-21  4:24                           ` Richard Henderson
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2011-08-21  0:11 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Uros Bizjak, Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 3:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Aug 20, 2011 at 2:52 PM, Richard Henderson <rth@redhat.com> wrote:
>> On 08/20/2011 02:16 PM, Uros Bizjak wrote:
>>> +(define_insn "bmi2_umul<mode><dwi>3_1"
>>> +  [(set (match_operand:<DWI> 0 "register_operand" "=r")
>>> +     (mult:<DWI>
>>> +       (zero_extend:<DWI>
>>> +         (match_operand:DWIH 1 "nonimmediate_operand" "%d"))
>>> +       (zero_extend:<DWI>
>>> +         (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
>>> +  "TARGET_BMI
>>> +   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
>>> +  "mulx\t{%2, %M0, %N0|%N0, %M0, %2}"
>>> +  [(set_attr "type" "imul")
>>> +   (set_attr "prefix" "vex")
>>> +   (set_attr "mode" "<MODE>")])
>>
>> You can do better than this, and avoid the %M %N specifiers.
>> The outputs are truly independent and do not need to be a pair.
>>
>
> Since RA use register pairs for TImode/DImode, should requiring
> TI/DI registers in pairs generate better does?
                                                          ^^^^^^ codes.

Without register pairs, we are generating very strange codes.

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 22:39                     ` Richard Henderson
@ 2011-08-21  1:37                       ` Uros Bizjak
  2011-08-21 13:23                         ` Jakub Jelinek
  0 siblings, 1 reply; 27+ messages in thread
From: Uros Bizjak @ 2011-08-21  1:37 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Kirill Yukhin, H.J. Lu, Jakub Jelinek, gcc-patches List

[-- Attachment #1: Type: text/plain, Size: 2298 bytes --]

On Sat, Aug 20, 2011 at 11:47 PM, Richard Henderson <rth@redhat.com> wrote:

>> - Yb register constraint is added to conditionally enable generation
>> of BMI alternatives in generic shift and rotate patterns. The BMI
>> variant is generated only if RA chooses it as the most profitable
>> alternative.
>
> We really should use the (relatively new) enabled attribute instead
> of adding more and more conditional register constraints.

Indeed. New version is attached - this one also implements imul
splitting to mulx.

2011-08-20  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.md (type): Add imulx, ishiftx and rotatex.
	(length_immediate): Handle imulx, ishiftx and rotatex.
	(imm_disp): Ditto.
	(isa): Add bmi2.
	(enabled): Handle bmi2.
	(w): New mode attribute.

	(*mul<mode><dwi>3): Split from *<u>mul<mode><dwi>3.
	(*umul<mode><dwi>3): Ditto.  Add imulx BMI2 alternative.
	(bmi2_umul<mode><dwi>3_1): New insn pattern.
	(*umul<mode><dwi>3 splitter): New splitter to avoid flags dependency.

	(*bmi2_ashl<mode>3_1): New insn pattern.
	(*ashl<mode>3_1): Add ishiftx BMI2 alternative.
	(*ashl<mode>3_1 splitter): New splitter to avoid flags dependency.
	(*bmi2_ashlsi3_1_zext): New insn pattern.
	(*ashlsi3_1_zext): Add ishiftx BMI2 alternative.
	(*ashlsi3_1_zext splitter): New splitter to avoid flags dependency.

	(*bmi2_<shiftrt_insn><mode>3_1): New insn pattern.
	(*<shiftrt_insn><mode>3_1): Add ishiftx BMI2 alternative.
	(*<shiftrt_insn><mode>3_1 splitter): New splitter to avoid
	flags dependency.
	(*bmi2_<shiftrt_insn>si3_1_zext): New insn pattern.
	(*<shiftrt_insn>si3_1_zext): Add ishiftx BMI2 alternative.
	(*<shiftrt_insn>si3_1_zext splitter): New splitter to avoid
	flags dependency.

	(*bmi2_rorx<mode>3_1): New insn pattern.
	(*<rotate_insn><mode>3_1): Add rotatex BMI2 alternative.
	(*rotate<mode>3_1 splitter): New splitter to avoid flags dependency.
	(*rotatert<mode>3_1 splitter): Ditto.
	(*bmi2_rorxsi3_1_zext): New insn pattern.
	(*<rotate_insn>si3_1_zext): Add rotatex BMI2 alternative.
	(*rotatesi3_1_zext  splitter): New splitter to avoid flags dependency.
	(*rotatertsi3_1_zext splitter): Ditto.

	* config/i386/i386.c (print_reg): Handle 'M' and 'N' modifiers.
	(print_operand): Ditto.

Bootstrapped on x86_64-pc-linux-gnu, regression test in progress.

Uros.

[-- Attachment #2: p2.diff.txt --]
[-- Type: text/plain, Size: 22945 bytes --]

Index: i386/i386.md
===================================================================
--- i386/i386.md	(revision 177925)
+++ i386/i386.md	(working copy)
@@ -50,6 +50,8 @@
 ;; t --  likewise, print the V8SFmode name of the register.
 ;; h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
 ;; y -- print "st(0)" instead of "st" as a register.
+;; M -- print the low register of a double word register pair.
+;; N -- print the high register of a double word register pair.
 ;; d -- print duplicated register operand for AVX instruction.
 ;; D -- print condition for SSE cmp instruction.
 ;; P -- if PIC, print an @PLT suffix.
@@ -377,7 +379,7 @@
 (define_attr "type"
   "other,multi,
    alu,alu1,negnot,imov,imovx,lea,
-   incdec,ishift,ishift1,rotate,rotate1,imul,idiv,
+   incdec,ishift,ishiftx,ishift1,rotate,rotatex,rotate1,imul,imulx,idiv,
    icmp,test,ibr,setcc,icmov,
    push,pop,call,callv,leave,
    str,bitmanip,
@@ -414,8 +416,8 @@
 	   (const_int 0)
 	 (eq_attr "unit" "i387,sse,mmx")
 	   (const_int 0)
-	 (eq_attr "type" "alu,alu1,negnot,imovx,ishift,rotate,ishift1,rotate1,
-			  imul,icmp,push,pop")
+	 (eq_attr "type" "alu,alu1,negnot,imovx,ishift,ishiftx,ishift1,
+			  rotate,rotatex,rotate1,imul,imulx,icmp,push,pop")
 	   (symbol_ref "ix86_attr_length_immediate_default (insn, true)")
 	 (eq_attr "type" "imov,test")
 	   (symbol_ref "ix86_attr_length_immediate_default (insn, false)")
@@ -675,7 +677,7 @@
 	      (and (match_operand 0 "memory_displacement_operand" "")
 		   (match_operand 1 "immediate_operand" "")))
 	   (const_string "true")
-	 (and (eq_attr "type" "alu,ishift,rotate,imul,idiv")
+	 (and (eq_attr "type" "alu,ishift,ishiftx,rotate,rotatex,imul,imulx,idiv")
 	      (and (match_operand 0 "memory_displacement_operand" "")
 		   (match_operand 2 "immediate_operand" "")))
 	   (const_string "true")
@@ -699,11 +701,12 @@
 (define_attr "movu" "0,1" (const_string "0"))
 
 ;; Used to control the "enabled" attribute on a per-instruction basis.
-(define_attr "isa" "base,noavx,avx"
+(define_attr "isa" "base,bmi2,noavx,avx"
   (const_string "base"))
 
 (define_attr "enabled" ""
-  (cond [(eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
+  (cond [(eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI")
+  	 (eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
 	 (eq_attr "isa" "avx") (symbol_ref "TARGET_AVX")
 	]
 	(const_int 1)))
@@ -947,6 +950,9 @@
 ;; Instruction suffix for REX 64bit operators.
 (define_mode_attr rex64suffix [(SI "") (DI "{q}")])
 
+;; Register prefix for word mode.
+(define_mode_attr w [(SI "k") (DI "q")])
+
 ;; This mode iterator allows :P to be used for patterns that operate on
 ;; pointer-sized quantities.  Exactly one of the two alternatives will match.
 (define_mode_iterator P [(SI "Pmode == SImode") (DI "Pmode == DImode")])
@@ -6849,16 +6855,71 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   "TARGET_QIMODE_MATH")
 
-(define_insn "*<u>mul<mode><dwi>3_1"
+(define_insn "*bmi2_umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "%d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
+  "TARGET_BMI
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "mulx\t{%2, %M0, %N0|%N0, %M0, %2}"
+  [(set_attr "type" "imulx")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=A,r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "%0,d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   imul{<imodesuffix>}\t%2
+   #"
+  [(set_attr "isa" "base,bmi2")
+   (set_attr "type" "imul,imulx")
+   (set_attr "length_immediate" "0,*")
+   (set (attr "athlon_decode")
+	(cond [(eq_attr "alternative" "0")
+		 (if_then_else (eq_attr "cpu" "athlon")
+		   (const_string "vector")
+		   (const_string "double"))]
+	      (const_string "*")))
+   (set_attr "amdfam10_decode" "double,*")
+   (set_attr "bdver1_decode" "direct,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+;; Convert umul to umulx pattern to avoid flags dependency.
+(define_split
+ [(set (match_operand:<DWI> 0 "register_operand" "")
+       (mult:<DWI>
+	 (zero_extend:<DWI>
+	   (match_operand:DWIH 1 "nonimmediate_operand" ""))
+	 (zero_extend:<DWI>
+	   (match_operand:DWIH 2 "nonimmediate_operand" ""))))
+  (clobber (reg:CC FLAGS_REG))]
+ "TARGET_BMI && reload_completed"
+ [(set (match_dup 0)
+       (mult:<DWI>
+	 (zero_extend:<DWI> (match_dup 1))
+         (zero_extend:<DWI> (match_dup 2))))])
+
+(define_insn "*mul<mode><dwi>3_1"
   [(set (match_operand:<DWI> 0 "register_operand" "=A")
 	(mult:<DWI>
-	  (any_extend:<DWI>
+	  (sign_extend:<DWI>
 	    (match_operand:DWIH 1 "nonimmediate_operand" "%0"))
-	  (any_extend:<DWI>
+	  (sign_extend:<DWI>
 	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))
    (clobber (reg:CC FLAGS_REG))]
   "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
-  "<sgnprefix>mul{<imodesuffix>}\t%2"
+  "imul{<imodesuffix>}\t%2"
   [(set_attr "type" "imul")
    (set_attr "length_immediate" "0")
    (set (attr "athlon_decode")
@@ -9056,16 +9117,26 @@
   [(set_attr "type" "ishift")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*bmi2_ashl<mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+		      (match_operand:QI 2 "register_operand" "r")))]
+  "TARGET_BMI"
+  "salx\t{%<w>2, %1, %0|%0, %1, %<w>2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*ashl<mode>3_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
-	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l")
-		      (match_operand:QI 2 "nonmemory_operand" "c<S>,M")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l,rm")
+		      (match_operand:QI 2 "nonmemory_operand" "c<S>,M,r")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)"
 {
   switch (get_attr_type (insn))
     {
     case TYPE_LEA:
+    case TYPE_ISHIFTX:
       return "#";
 
     case TYPE_ALU:
@@ -9081,9 +9152,12 @@
 	return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
     }
 }
-  [(set (attr "type")
+  [(set_attr "isa" "base,base,bmi2")
+   (set (attr "type")
      (cond [(eq_attr "alternative" "1")
 	      (const_string "lea")
+	    (eq_attr "alternative" "2")
+	      (const_string "ishiftx")
             (and (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD")
 		          (const_int 0))
 		      (match_operand 0 "register_operand" ""))
@@ -9102,17 +9176,38 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+		      (match_operand:QI 2 "register_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+	(ashift:SWI48 (match_dup 1) (match_dup 2)))])
+
+(define_insn "*bmi2_ashlsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+		     (match_operand:QI 2 "register_operand" "r"))))]
+  "TARGET_64BIT && TARGET_BMI"
+  "salx\t{%k2, %1, %k0|%k0, %1, %k2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "SI")])
+
 (define_insn "*ashlsi3_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
 	(zero_extend:DI
-	  (ashift:SI (match_operand:SI 1 "register_operand" "0,l")
-		     (match_operand:QI 2 "nonmemory_operand" "cI,M"))))
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,l,rm")
+		     (match_operand:QI 2 "nonmemory_operand" "cI,M,r"))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (ASHIFT, SImode, operands)"
 {
   switch (get_attr_type (insn))
     {
     case TYPE_LEA:
+    case TYPE_ISHIFTX:
       return "#";
 
     case TYPE_ALU:
@@ -9127,9 +9222,12 @@
 	return "sal{l}\t{%2, %k0|%k0, %2}";
     }
 }
-  [(set (attr "type")
+  [(set_attr "isa" "base,base,bmi2")
+   (set (attr "type")
      (cond [(eq_attr "alternative" "1")
 	      (const_string "lea")
+	    (eq_attr "alternative" "2")
+	      (const_string "ishiftx")
             (and (ne (symbol_ref "TARGET_DOUBLE_WITH_ADD")
 		     (const_int 0))
 		 (match_operand 2 "const1_operand" ""))
@@ -9147,6 +9245,17 @@
        (const_string "*")))
    (set_attr "mode" "SI")])
 
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "")
+		     (match_operand:QI 2 "register_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+  	(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2))))])
+
 (define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm")
 	(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0")
@@ -9763,20 +9872,38 @@
   DONE;
 })
 
+(define_insn "*bmi2_<shiftrt_insn><mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			   (match_operand:QI 2 "register_operand" "r")))]
+  "TARGET_BMI"
+  "<shiftrt>x\t{%<w>2, %1, %0|%0, %1, %<w>2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<shiftrt_insn><mode>3_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
-	(any_shiftrt:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
-			 (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+	(any_shiftrt:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>,r")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
 {
-  if (operands[2] == const1_rtx
-      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{<imodesuffix>}\t%0";
-  else
-    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ISHIFTX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<shiftrt>{<imodesuffix>}\t%0";
+      else
+	return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
 }
-  [(set_attr "type" "ishift")
+  [(set_attr "isa" "base,bmi2")
+   (set_attr "type" "ishift,ishiftx")
    (set (attr "length_immediate")
      (if_then_else
        (and (match_operand 2 "const1_operand" "")
@@ -9786,19 +9913,82 @@
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<shiftrt_insn>si3_1_zext"
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+			   (match_operand:QI 2 "register_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+	(any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))])
+
+(define_insn "*bmi2_<shiftrt_insn>si3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
-	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
-			  (match_operand:QI 2 "nonmemory_operand" "cI"))))
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+			  (match_operand:QI 2 "register_operand" "r"))))]
+  "TARGET_64BIT && TARGET_BMI"
+  "<shiftrt>x\t{%k2, %1, %k0|%k0, %1, %k2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<shiftrt_insn>si3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(zero_extend:DI
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm")
+			  (match_operand:QI 2 "nonmemory_operand" "cI,r"))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
 {
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ISHIFTX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<shiftrt>{l}\t%k0";
+      else
+	return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set_attr "isa" "base,bmi2")
+   (set_attr "type" "ishift,ishiftx")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand" "")
+	    (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
+		(const_int 0)))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "")
+			  (match_operand:QI 2 "register_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+  	(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))])
+
+(define_insn "*<shiftrt_insn><mode>3_1"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m")
+	(any_shiftrt:SWI12
+	  (match_operand:SWI12 1 "nonimmediate_operand" "0")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
   if (operands[2] == const1_rtx
       && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<shiftrt>{l}\t%k0";
+    return "<shiftrt>{<imodesuffix>}\t%0";
   else
-    return "<shiftrt>{l}\t{%2, %k0|%k0, %2}";
+    return "<shiftrt>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "ishift")
    (set (attr "length_immediate")
@@ -9808,7 +9998,7 @@
 		(const_int 0)))
        (const_string "0")
        (const_string "*")))
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "<MODE>")])
 
 (define_insn "*<shiftrt_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
@@ -10060,42 +10250,151 @@
   split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
 })
 
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "<S>")))]
+  "TARGET_BMI"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotatex")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*<rotate_insn><mode>3_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
-	(any_rotate:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
-			(match_operand:QI 2 "nonmemory_operand" "c<S>")))
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+	(any_rotate:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>,<S>")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
 {
-  if (operands[2] == const1_rtx
-      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<rotate>{<imodesuffix>}\t%0";
-  else
-    return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ROTATEX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<rotate>{<imodesuffix>}\t%0";
+      else
+	return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
 }
-  [(set_attr "type" "rotate")
+  [(set_attr "isa" "base,bmi2")
+   (set_attr "type" "rotate,rotatex")
    (set (attr "length_immediate")
      (if_then_else
-       (and (match_operand 2 "const1_operand" "")
-	    (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
-		(const_int 0)))
+       (and (eq_attr "type" "rotate")
+	    (and (match_operand 2 "const1_operand" "")
+		 (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
+		     (const_int 0))))
        (const_string "0")
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*<rotate_insn>si3_1_zext"
+;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+		      (match_operand:QI 2 "immediate_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+	(rotatert:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  operands[2]
+    = GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - INTVAL (operands[2]));
+})
+
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand" "")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "")
+			(match_operand:QI 2 "immediate_operand" "")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+	(rotatert:SWI48 (match_dup 1) (match_dup 2)))])
+
+(define_insn "*bmi2_rorxsi3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extend:DI
-	  (any_rotate:SI (match_operand:SI 1 "register_operand" "0")
-			 (match_operand:QI 2 "nonmemory_operand" "cI"))))
+	  (rotatert:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+		       (match_operand:QI 2 "immediate_operand" "I"))))]
+  "TARGET_64BIT && TARGET_BMI"
+  "rorx\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "rotatex")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<rotate_insn>si3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(zero_extend:DI
+	  (any_rotate:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm")
+			 (match_operand:QI 2 "nonmemory_operand" "cI,I"))))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
 {
-    if (operands[2] == const1_rtx
-	&& (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
-    return "<rotate>{l}\t%k0";
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ROTATEX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<rotate>{l}\t%k0";
+      else
+	return "<rotate>{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set_attr "isa" "base,bmi2")
+   (set_attr "type" "rotate,rotatex")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (eq_attr "type" "rotate")
+	    (and (match_operand 2 "const1_operand" "")
+		 (ne (symbol_ref "TARGET_SHIFT1 || optimize_function_for_size_p (cfun)")
+		     (const_int 0))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (rotate:SI (match_operand:SI 1 "nonimmediate_operand" "")
+		     (match_operand:QI 2 "immediate_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+  	(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2))))]
+{
+  operands[2]
+    = GEN_INT (GET_MODE_BITSIZE (SImode) - INTVAL (operands[2]));
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand" "")
+	(zero_extend:DI
+	  (rotatert:SI (match_operand:SI 1 "nonimmediate_operand" "")
+		       (match_operand:QI 2 "immediate_operand" ""))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+  	(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2))))])
+
+(define_insn "*<rotate_insn><mode>3_1"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m")
+	(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0")
+			  (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<rotate>{<imodesuffix>}\t%0";
   else
-    return "<rotate>{l}\t{%2, %k0|%k0, %2}";
+    return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "rotate")
    (set (attr "length_immediate")
@@ -10105,7 +10404,7 @@
 		(const_int 0)))
        (const_string "0")
        (const_string "*")))
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "<MODE>")])
 
 (define_insn "*<rotate_insn>qi3_1_slp"
   [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
Index: i386/i386.c
===================================================================
--- i386/i386.c	(revision 177928)
+++ i386/i386.c	(working copy)
@@ -13285,6 +13285,8 @@ put_condition_code (enum rtx_code code, enum machi
    If CODE is 't', pretend the mode is V8SFmode.
    If CODE is 'h', pretend the reg is the 'high' byte register.
    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'M', print the low register of a double word register pair.
+   If CODE is 'N', print the high register of a double word register pair.
    If CODE is 'd', duplicate the operand for AVX instruction.
  */
 
@@ -13327,6 +13329,18 @@ print_reg (rtx x, int code, FILE *file)
     code = 16;
   else if (code == 't')
     code = 32;
+  else if (code == 'M')
+    {
+      gcc_assert (GET_MODE (x) == GET_MODE_WIDER_MODE (word_mode));
+      x = gen_lowpart (word_mode, x);
+      code = GET_MODE_SIZE (word_mode);
+    }
+  else if (code == 'N')
+    {
+      gcc_assert (GET_MODE (x) == GET_MODE_WIDER_MODE (word_mode));
+      x = gen_highpart (word_mode, x);
+      code = GET_MODE_SIZE (word_mode);
+    }
   else
     code = GET_MODE_SIZE (GET_MODE (x));
 
@@ -13472,6 +13486,8 @@ get_some_local_dynamic_name (void)
    t --  likewise, print the V8SFmode name of the register.
    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
    y -- print "st(0)" instead of "st" as a register.
+   M -- print the low register of a double word register pair.
+   N -- print the high register of a double word register pair.
    d -- print duplicated register operand for AVX instruction.
    D -- print condition for SSE cmp instruction.
    P -- if PIC, print an @PLT suffix.
@@ -13678,6 +13694,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
 	case 'h':
 	case 't':
 	case 'y':
+	case 'M':
+	case 'N':
 	case 'x':
 	case 'X':
 	case 'P':

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-21  0:11                         ` H.J. Lu
@ 2011-08-21  4:24                           ` Richard Henderson
  2011-08-21  9:14                             ` Uros Bizjak
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Henderson @ 2011-08-21  4:24 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, Kirill Yukhin, Jakub Jelinek, gcc-patches List

On 08/20/2011 03:03 PM, H.J. Lu wrote:
> On Sat, Aug 20, 2011 at 3:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> You can do better than this, and avoid the %M %N specifiers.
>>> The outputs are truly independent and do not need to be a pair.
>>>
>>
>> Since RA use register pairs for TImode/DImode, should requiring
>> TI/DI registers in pairs generate better does?
>                                                           ^^^^^^ codes.
> 
> Without register pairs, we are generating very strange codes.
> 

We ought to be making better use of the lower-subregs pass.
Representing independent outputs when possible enables that.

Admittedly, the i386 port needs more attention to really make
this happen properly.  But we don't need to make things even
worse in the meantime.


r~

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-21  4:24                           ` Richard Henderson
@ 2011-08-21  9:14                             ` Uros Bizjak
  2011-08-21  9:39                               ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Uros Bizjak @ 2011-08-21  9:14 UTC (permalink / raw)
  To: Richard Henderson; +Cc: H.J. Lu, Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sun, Aug 21, 2011 at 1:58 AM, Richard Henderson <rth@redhat.com> wrote:
> On 08/20/2011 03:03 PM, H.J. Lu wrote:
>> On Sat, Aug 20, 2011 at 3:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> You can do better than this, and avoid the %M %N specifiers.
>>>> The outputs are truly independent and do not need to be a pair.
>>>>
>>>
>>> Since RA use register pairs for TImode/DImode, should requiring
>>> TI/DI registers in pairs generate better does?
>>                                                           ^^^^^^ codes.
>>
>> Without register pairs, we are generating very strange codes.
>>
>
> We ought to be making better use of the lower-subregs pass.
> Representing independent outputs when possible enables that.
>
> Admittedly, the i386 port needs more attention to really make
> this happen properly.  But we don't need to make things even
> worse in the meantime.

I will investigate this.

BTW: Latest patch has a small error. Insn mnemonic in following
pattern should be "mult" instead of "imult", so the correct version
reads:

+(define_insn "*umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=A,r")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "%0,d"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   mul{<imodesuffix>}\t%2
+   #"

Uros.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-21  9:14                             ` Uros Bizjak
@ 2011-08-21  9:39                               ` H.J. Lu
  2011-08-21  9:49                                 ` Richard Henderson
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2011-08-21  9:39 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Richard Henderson, Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 5:47 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Sun, Aug 21, 2011 at 1:58 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 08/20/2011 03:03 PM, H.J. Lu wrote:
>>> On Sat, Aug 20, 2011 at 3:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>> You can do better than this, and avoid the %M %N specifiers.
>>>>> The outputs are truly independent and do not need to be a pair.
>>>>>
>>>>
>>>> Since RA use register pairs for TImode/DImode, should requiring
>>>> TI/DI registers in pairs generate better does?
>>>                                                           ^^^^^^ codes.
>>>
>>> Without register pairs, we are generating very strange codes.
>>>
>>
>> We ought to be making better use of the lower-subregs pass.
>> Representing independent outputs when possible enables that.
>>
>> Admittedly, the i386 port needs more attention to really make
>> this happen properly.  But we don't need to make things even
>> worse in the meantime.
>
> I will investigate this.
>

One problem is 32bit movdi and 64bit movti.  They require
register pairs.We may need to split them before RA.


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-21  9:39                               ` H.J. Lu
@ 2011-08-21  9:49                                 ` Richard Henderson
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Henderson @ 2011-08-21  9:49 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, Kirill Yukhin, Jakub Jelinek, gcc-patches List

On 08/20/2011 05:52 PM, H.J. Lu wrote:
> One problem is 32bit movdi and 64bit movti.  They require
> register pairs.We may need to split them before RA.

lower-subreg ought to be able to look through plain moves...


r~

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 23:50                     ` Richard Henderson
  2011-08-20 23:58                       ` H.J. Lu
@ 2011-08-21 11:14                       ` Uros Bizjak
  1 sibling, 0 replies; 27+ messages in thread
From: Uros Bizjak @ 2011-08-21 11:14 UTC (permalink / raw)
  To: Richard Henderson; +Cc: Kirill Yukhin, H.J. Lu, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 11:52 PM, Richard Henderson <rth@redhat.com> wrote:
> On 08/20/2011 02:16 PM, Uros Bizjak wrote:
>> +(define_insn "bmi2_umul<mode><dwi>3_1"
>> +  [(set (match_operand:<DWI> 0 "register_operand" "=r")
>> +     (mult:<DWI>
>> +       (zero_extend:<DWI>
>> +         (match_operand:DWIH 1 "nonimmediate_operand" "%d"))
>> +       (zero_extend:<DWI>
>> +         (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))]
>> +  "TARGET_BMI
>> +   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
>> +  "mulx\t{%2, %M0, %N0|%N0, %M0, %2}"
>> +  [(set_attr "type" "imul")
>> +   (set_attr "prefix" "vex")
>> +   (set_attr "mode" "<MODE>")])
>
> You can do better than this, and avoid the %M %N specifiers.
> The outputs are truly independent and do not need to be a pair.
>
> See the mn10300 umulsidi3{,_internal} patterns.

I have tried your suggestion, using patterns like following:

(define_insn "umulsidi3_1"
  [(set (match_operand:SI 0 "register_operand" "=a,r")
	(mult:SI
	  (match_operand:SI 2 "nonimmediate_operand" "%0,d")
	  (match_operand:SI 3 "nonimmediate_operand" "rm,rm")))
   (set (match_operand:SI 1 "register_operand" "=d,r")
	(truncate:SI
	  (lshiftrt:DI
	    (mult:DI (zero_extend:DI (match_dup 2))
		     (zero_extend:DI (match_dup 3)))
	    (const_int 32))))
   (clobber (reg:CC FLAGS_REG))]
  "!TARGET_64BIT
   && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
  "@
   mull\t%3
   #"
  [(set_attr "isa" "base,bmi2")
   (set_attr "type" "imul,imulx")
   (set_attr "length_immediate" "0,*")
   (set (attr "athlon_decode")
	(cond [(eq_attr "alternative" "0")
		 (if_then_else (eq_attr "cpu" "athlon")
		   (const_string "vector")
		   (const_string "double"))]
	      (const_string "*")))
   (set_attr "amdfam10_decode" "double,*")
   (set_attr "bdver1_decode" "direct,*")
   (set_attr "prefix" "orig,vex")
   (set_attr "mode" "SI")])


The compiler works, for a couple of simple testcases it produces the
same code as with register pairs. However, there are a couple of
problems:

- various length calculations look into operand{0,1,2} to determine
instruction length. This is fixable with a little effort.

- patterns that include (const_int N) do not macroize and this leads
to pattern explosion. For this simple example, in addition to
splitting out  any_extend pattern, we have to split also DWIH
patterns.

In the past, I have tried to use match_operand with const_int INTVAL
predicates, but gcc crashed elsewhere due to additional operand.
Please see [1].

IMO, it is currently too much pain to implement splitted pairs in
existing patterns for too low gain. I will however implement split to
mulx pattern after reload to proposed pattern to avoid %M %N.

[1] http://gcc.gnu.org/ml/gcc/2010-07/msg00143.html

Uros.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-21  1:37                       ` Uros Bizjak
@ 2011-08-21 13:23                         ` Jakub Jelinek
  0 siblings, 0 replies; 27+ messages in thread
From: Jakub Jelinek @ 2011-08-21 13:23 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Richard Henderson, Kirill Yukhin, H.J. Lu, gcc-patches List

On Sun, Aug 21, 2011 at 12:55:41AM +0200, Uros Bizjak wrote:
>  (define_attr "enabled" ""
> -  (cond [(eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
> +  (cond [(eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI")

Shouldn't this be TARGET_BMI2 ?

> +  	 (eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
>  	 (eq_attr "isa" "avx") (symbol_ref "TARGET_AVX")
>  	]
>  	(const_int 1)))

	Jakub

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH, testsuite, i386] BMI2 support for GCC
  2011-08-20 22:04                     ` H.J. Lu
  2011-08-20 22:26                       ` Uros Bizjak
@ 2011-08-21 15:01                       ` Uros Bizjak
  1 sibling, 0 replies; 27+ messages in thread
From: Uros Bizjak @ 2011-08-21 15:01 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Kirill Yukhin, Jakub Jelinek, gcc-patches List

On Sat, Aug 20, 2011 at 11:31 PM, H.J. Lu <hjl.tools@gmail.com> wrote:

>> The patch is currently in RFC/RFT state, since I have no way to
>> properly test it. The patch bootstraps OK and regression test is clean
>
> We are using HSW emulator (SDE):
>
> http://software.intel.com/en-us/articles/pre-release-license-agreement-for-intel-software-development-emulator-accept-end-user-license-agreement-and-download/
>
> to test FMA, BMI/BMI2.  I have a SDE sim for dejagnu so that I can run
> GCC testsuite under SDE.

It is not simulator that is problematic. My binutils-of-the-day
doesn't support BMI2 insns.

Just an idea - is it possible to setup development environment with a
simulator on one of gcc compile-farm machines? This way, everything
will work out of the box and with detailed instructions, I won't
scratch my head on how to setup simulator every time new ISA is
introduced ;)

Uros

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2011-08-21 11:02 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-19 12:27 [PATCH, testsuite, i386] BMI2 support for GCC Kirill Yukhin
2011-08-19 13:26 ` H.J. Lu
2011-08-19 14:05   ` Kirill Yukhin
2011-08-19 14:13     ` Jakub Jelinek
2011-08-19 14:34       ` Kirill Yukhin
2011-08-19 14:38         ` H.J. Lu
2011-08-19 14:38           ` Kirill Yukhin
2011-08-19 14:47             ` H.J. Lu
2011-08-19 15:22               ` Kirill Yukhin
2011-08-19 15:36                 ` Kirill Yukhin
2011-08-20 20:05                 ` Uros Bizjak
2011-08-20 21:52                   ` Uros Bizjak
2011-08-20 22:04                     ` H.J. Lu
2011-08-20 22:26                       ` Uros Bizjak
2011-08-20 22:56                         ` H.J. Lu
2011-08-21 15:01                       ` Uros Bizjak
2011-08-20 22:39                     ` Richard Henderson
2011-08-21  1:37                       ` Uros Bizjak
2011-08-21 13:23                         ` Jakub Jelinek
2011-08-20 23:50                     ` Richard Henderson
2011-08-20 23:58                       ` H.J. Lu
2011-08-21  0:11                         ` H.J. Lu
2011-08-21  4:24                           ` Richard Henderson
2011-08-21  9:14                             ` Uros Bizjak
2011-08-21  9:39                               ` H.J. Lu
2011-08-21  9:49                                 ` Richard Henderson
2011-08-21 11:14                       ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).