public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [i386, patch, RFC] HLE support in GCC
@ 2012-03-07 11:06 Kirill Yukhin
  2012-03-07 11:10 ` Jakub Jelinek
  0 siblings, 1 reply; 49+ messages in thread
From: Kirill Yukhin @ 2012-03-07 11:06 UTC (permalink / raw)
  To: Uros Bizjak, Jakub Jelinek, Richard Guenther, Andi Kleen,
	H.J. Lu, Sergey Ostanevich
  Cc: gcc-patches List, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 359 bytes --]

Hello guys,
I am attaching an initial patch which enables TSX's HLE [1] prefixes in
GCC. Since we have no official intrinsics declarations, I want to hear
your comments about the patch.

Note, there is no option '-mhle' and no tests (I'll do that later).

[1] - http://software.intel.com/en-us/blogs/2012/02/07/transactional-synchronization-in-haswell/

Thanks, K

[-- Attachment #2: hle-rfc.gcc.patch --]
[-- Type: application/octet-stream, Size: 34981 bytes --]

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 99f0b47..a829d4c 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -360,7 +360,7 @@ i[34567]86-*-*)
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
 		       lzcntintrin.h bmiintrin.h bmi2intrin.h tbmintrin.h
-		       avx2intrin.h fmaintrin.h f16cintrin.h"
+		       avx2intrin.h fmaintrin.h f16cintrin.h hleintrin.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -373,7 +373,7 @@ x86_64-*-*)
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
 		       lzcntintrin.h bmiintrin.h tbmintrin.h bmi2intrin.h
-		       avx2intrin.h fmaintrin.h f16cintrin.h"
+		       avx2intrin.h fmaintrin.h f16cintrin.h hleintrin.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc/config/i386/hleintrin.h b/gcc/config/i386/hleintrin.h
new file mode 100644
index 0000000..0a44bf0
--- /dev/null
+++ b/gcc/config/i386/hleintrin.h
@@ -0,0 +1,288 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <hleintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _HLEINTRIN_H_INCLUDED
+#define _HLEINTRIN_H_INCLUDED
+
+#define __LOCK_NONE		0
+#define __LOCK_NORMAL		1
+#define __LOCK_XACQUIRE		2
+#define __LOCK_XRELEASE		3
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_btc_i16 (const int __L, short *__P, short __V)
+{
+  return (int) __builtin_ia32_hle_btc_i16 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_btc_i32 (const int __L, int *__P, int __V)
+{
+  return (int) __builtin_ia32_hle_btc_i32 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_btr_i16 (const int __L, short *__P, short __V)
+{
+  return (int) __builtin_ia32_hle_btr_i16 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_btr_i32 (const int __L, int *__P, int __V)
+{
+  return (int) __builtin_ia32_hle_btr_i32 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_bts_i16 (const int __L, short *__P, short __V)
+{
+  return (int) __builtin_ia32_hle_bts_i16 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_bts_i32 (const int __L, int *__P, int __V)
+{
+  return (int) __builtin_ia32_hle_bts_i32 (__L, __P, __V);
+}
+
+extern __inline char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_val_cmpxchg_i8 (const int __L, char *__P, char __O, char __N)
+{
+  return (char) __builtin_ia32_hle_val_cmpxchg_i8 (__L, __P, __O, __N);
+}
+
+extern __inline short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_val_cmpxchg_i16 (const int __L, short *__P, short __O, short __N)
+{
+  return (short) __builtin_ia32_hle_val_cmpxchg_i16 (__L, __P, __O, __N);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_val_cmpxchg_i32 (const int __L, int *__P, int __O, int __N)
+{
+  return (int) __builtin_ia32_hle_val_cmpxchg_i32 (__L, __P, __O, __N);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_bool_cmpxchg_i8 (const int __L, char *__P, char __O, char __N)
+{
+  return (int) __builtin_ia32_hle_bool_cmpxchg_i8 (__L, __P, __O, __N);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_bool_cmpxchg_i16 (const int __L, short *__P, short __O, short __N)
+{
+  return (int) __builtin_ia32_hle_bool_cmpxchg_i16 (__L, __P, __O, __N);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_bool_cmpxchg_i32 (const int __L, int *__P, int __O, int __N)
+{
+  return (int) __builtin_ia32_hle_bool_cmpxchg_i32 (__L, __P, __O, __N);
+}
+
+extern __inline char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xadd_i8 (const int __L, char *__P, char __V)
+{
+  return (char) __builtin_ia32_hle_xadd_i8 (__L, __P, __V);
+}
+
+extern __inline short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xadd_i16 (const int __L, short *__P, short __V)
+{
+  return (short) __builtin_ia32_hle_xadd_i16 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xadd_i32 (const int __L, int *__P, int __V)
+{
+  return (int) __builtin_ia32_hle_xadd_i32 (__L, __P, __V);
+}
+
+extern __inline char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xchg_i8 (const int __L, char *__P, char __V)
+{
+  return (char) __builtin_ia32_hle_xchg_i8 (__L, __P, __V);
+}
+
+extern __inline short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xchg_i16 (const int __L, short *__P, short __V)
+{
+  return (short) __builtin_ia32_hle_xchg_i16 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xchg_i32 (const int __L, int *__P, int __V)
+{
+  return (int) __builtin_ia32_hle_xchg_i32 (__L, __P, __V);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_store_i8 (const int __L, char *__P, char __V)
+{
+  __builtin_ia32_hle_store_i8 (__L, __P, __V);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_store_i16 (const int __L, short *__P, short __V)
+{
+  __builtin_ia32_hle_store_i16 (__L, __P, __V);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_store_i32 (const int __L, int *__P, int __V)
+{
+  __builtin_ia32_hle_store_i32 (__L, __P, __V);
+}
+
+#ifdef __x86_64__
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_btc_i64 (const int __L, long long *__P, long long __V)
+{
+  return (int) __builtin_ia32_hle_btc_i64 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_btr_i64 (const int __L, long long *__P, long long __V)
+{
+  return (int) __builtin_ia32_hle_btr_i64 (__L, __P, __V);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_bts_i64 (const int __L, long long *__P, long long __V)
+{
+  return (int) __builtin_ia32_hle_bts_i64 (__L, __P, __V);
+}
+
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_val_cmpxchg_i64 (const int __L, long long *__P, long long __O,
+		       long long __N)
+{
+  return (long long) __builtin_ia32_hle_val_cmpxchg_i64 (__L, __P,
+							 __O, __N);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_bool_cmpxchg_i64 (const int __L, long long *__P, long long __O,
+		       long long __N)
+{
+  return (int) __builtin_ia32_hle_bool_cmpxchg_i64 (__L, __P, __O, __N);
+}
+
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xadd_i64 (const int __L, long long *__P, long long __V)
+{
+  return (long long) __builtin_ia32_hle_xadd_i64 (__L, __P, __V);
+}
+
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_xchg_i64 (const int __L, long long *__P, long long __V)
+{
+  return (long long) __builtin_ia32_hle_xchg_i64 (__L, __P, __V);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lock_store_i64 (const int __L, long long *__P, long long __V)
+{
+  __builtin_ia32_hle_store_i64 (__L, __P, __V);
+}
+#endif
+#else
+#define _lock_btc_i16(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_btc_i16 ((LOCK), (PTR), (VAL)))
+#define _lock_btc_i32(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_btc_i32 ((LOCK), (PTR), (VAL)))
+#define _lock_btr_i16(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_btr_i16 ((LOCK), (PTR), (VAL)))
+#define _lock_btr_i32(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_btr_i32 ((LOCK), (PTR), (VAL)))
+#define _lock_bts_i16(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_bts_i16 ((LOCK), (PTR), (VAL)))
+#define _lock_bts_i32(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_bts_i32 ((LOCK), (PTR), (VAL)))
+
+#define _lock_val_cmpxchg_i8(LOCK, PTR, OLD, NEW) \
+  ((char) __builtin_ia32_hle_val_cmpxchg_i8 ((LOCK), (PTR), (OLD), (NEW)))
+#define _lock_val_cmpxchg_i16(LOCK, PTR, OLD, NEW) \
+  ((short) __builtin_ia32_hle_val_cmpxchg_i16 ((LOCK), (PTR), (OLD), (NEW)))
+#define _lock_val_cmpxchg_i32(LOCK, PTR, OLD, NEW) \
+  ((int) __builtin_ia32_hle_val_cmpxchg_i32 ((LOCK), (PTR), (OLD), (NEW)))
+
+#define _lock_bool_cmpxchg_i8(LOCK, PTR, OLD, NEW) \
+  ((int) __builtin_ia32_hle_bool_cmpxchg_i8 ((LOCK), (PTR), (OLD), (NEW)))
+#define _lock_bool_cmpxchg_i16(LOCK, PTR, OLD, NEW) \
+  ((int) __builtin_ia32_hle_bool_cmpxchg_i16 ((LOCK), (PTR), (OLD), (NEW)))
+#define _lock_bool_cmpxchg_i32(LOCK, PTR, OLD, NEW) \
+  ((int) __builtin_ia32_hle_bool_cmpxchg_i32 ((LOCK), (PTR), (OLD), (NEW)))
+
+#define _lock_xadd_i8(LOCK, PTR, VAL) \
+  ((char) __builtin_ia32_hle_xadd_i8 ((LOCK), (PTR), (VAL)))
+#define _lock_xadd_i16(LOCK, PTR, VAL) \
+  ((short) __builtin_ia32_hle_xadd_i16 ((LOCK), (PTR), (VAL)))
+#define _lock_xadd_i32(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_xadd_i32 ((LOCK), (PTR), (VAL)))
+
+#define _lock_xchg_i8(LOCK, PTR, VAL) \
+  ((char) __builtin_ia32_hle_xchg_i8 ((LOCK), (PTR), (VAL)))
+#define _lock_xchg_i16(LOCK, PTR, VAL) \
+  ((short) __builtin_ia32_hle_xchg_i16 ((LOCK), (PTR), (VAL)))
+#define _lock_xchg_i32(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_xchg_i32 ((LOCK), (PTR), (VAL)))
+
+#define _lock_store_i8(LOCK, PTR, VAL) \
+  __builtin_ia32_hle_store_i8 ((LOCK), (PTR), (VAL))
+#define _lock_store_i16(LOCK, PTR, VAL) \
+  __builtin_ia32_hle_store_i16 ((LOCK), (PTR), (VAL))
+#define _lock_store_i32(LOCK, PTR, VAL) \
+  __builtin_ia32_hle_store_i32 ((LOCK), (PTR), (VAL))
+
+#ifdef __x86_64__
+#define _lock_btc_i64(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_btc_i64 ((LOCK), (PTR), (VAL)))
+#define _lock_btr_i64(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_btr_i64 ((LOCK), (PTR), (VAL)))
+#define _lock_bts_i64(LOCK, PTR, VAL) \
+  ((int) __builtin_ia32_hle_bts_i64 ((LOCK), (PTR), (VAL)))
+
+#define _lock_val_cmpxchg_i64(LOCK, PTR, OLD, NEW) \
+  ((long long) __builtin_ia32_hle_val_cmpxchg_i64 ((LOCK), (PTR), (OLD), (NEW)))
+#define _lock_bool_cmpxchg_i64(LOCK, PTR, OLD, NEW) \
+  ((int) __builtin_ia32_hle_bool_cmpxchg_i64 ((LOCK), (PTR), (OLD), (NEW)))
+#define _lock_xadd_i64(LOCK, PTR, VAL) \
+  ((long long) __builtin_ia32_hle_xadd_i64 ((LOCK), (PTR), (VAL)))
+#define _lock_xchg_i64(LOCK, PTR, VAL) \
+  ((long long) __builtin_ia32_hle_xchg_i64 ((LOCK), (PTR), (VAL)))
+
+#define _lock_store_i64(LOCK, PTR, VAL) \
+  __builtin_ia32_hle_store_i64 ((LOCK), (PTR), (VAL))
+#endif
+#endif
+
+#endif
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index d00b053..8e70557 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -56,6 +56,7 @@ DEF_PRIMITIVE_TYPE (UHI, unsigned_intHI_type_node)
 DEF_PRIMITIVE_TYPE (USI, unsigned_intSI_type_node)
 DEF_PRIMITIVE_TYPE (UDI, long_long_unsigned_type_node)
 # ??? Some of the types below should use the mode types above.
+DEF_PRIMITIVE_TYPE (SHORT, short_integer_type_node)
 DEF_PRIMITIVE_TYPE (USHORT, short_unsigned_type_node)
 DEF_PRIMITIVE_TYPE (INT, integer_type_node)
 DEF_PRIMITIVE_TYPE (UINT, unsigned_type_node)
@@ -109,6 +110,7 @@ DEF_POINTER_TYPE (PCVOID, VOID, CONST)
 DEF_POINTER_TYPE (PVOID, VOID)
 DEF_POINTER_TYPE (PDOUBLE, DOUBLE)
 DEF_POINTER_TYPE (PFLOAT, FLOAT)
+DEF_POINTER_TYPE (PSHORT, SHORT)
 DEF_POINTER_TYPE (PUSHORT, USHORT)
 DEF_POINTER_TYPE (PINT, INT)
 DEF_POINTER_TYPE (PLONGLONG, LONGLONG)
@@ -432,9 +434,26 @@ DEF_FUNCTION_TYPE (V8UHI, V8UHI, V8UHI, V8UHI)
 DEF_FUNCTION_TYPE (V16UQI, V16UQI, V16UQI, V16UQI)
 DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI)
 DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI)
+DEF_FUNCTION_TYPE (CHAR, INT, PCHAR, CHAR)
+DEF_FUNCTION_TYPE (SHORT, INT, PSHORT, SHORT)
+DEF_FUNCTION_TYPE (INT, INT, PINT, INT)
+DEF_FUNCTION_TYPE (LONGLONG, INT, PLONGLONG, LONGLONG)
+DEF_FUNCTION_TYPE (INT, INT, PSHORT, SHORT)
+DEF_FUNCTION_TYPE (INT, INT, PLONGLONG, LONGLONG)
+DEF_FUNCTION_TYPE (VOID, INT, PCHAR, CHAR)
+DEF_FUNCTION_TYPE (VOID, INT, PSHORT, SHORT)
+DEF_FUNCTION_TYPE (VOID, INT, PINT, INT)
+DEF_FUNCTION_TYPE (VOID, INT, PLONGLONG, LONGLONG)
 
 DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, UINT, UINT)
 DEF_FUNCTION_TYPE (V4HI, HI, HI, HI, HI)
+DEF_FUNCTION_TYPE (CHAR, INT, PCHAR, CHAR, CHAR)
+DEF_FUNCTION_TYPE (INT, INT, PCHAR, CHAR, CHAR)
+DEF_FUNCTION_TYPE (SHORT, INT, PSHORT, SHORT, SHORT)
+DEF_FUNCTION_TYPE (INT, INT, PSHORT, SHORT, SHORT)
+DEF_FUNCTION_TYPE (INT, INT, PINT, INT, INT)
+DEF_FUNCTION_TYPE (LONGLONG, INT, PLONGLONG, LONGLONG, LONGLONG)
+DEF_FUNCTION_TYPE (INT, INT, PLONGLONG, LONGLONG, LONGLONG)
 
 DEF_FUNCTION_TYPE (INT, V16QI, INT, V16QI, INT, INT)
 DEF_FUNCTION_TYPE (V16QI, V16QI, INT, V16QI, INT, INT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 973bbeb..05589a1 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -25634,6 +25634,37 @@ enum ix86_builtins
   IX86_BUILTIN_CVTPS2PH,
   IX86_BUILTIN_CVTPS2PH256,
 
+  /* Instructions with HLE prefix.  */
+  IX86_BUILTIN_HLE_BTC16,
+  IX86_BUILTIN_HLE_BTC32,
+  IX86_BUILTIN_HLE_BTC64,
+  IX86_BUILTIN_HLE_BTR16,
+  IX86_BUILTIN_HLE_BTR32,
+  IX86_BUILTIN_HLE_BTR64,
+  IX86_BUILTIN_HLE_BTS16,
+  IX86_BUILTIN_HLE_BTS32,
+  IX86_BUILTIN_HLE_BTS64,
+  IX86_BUILTIN_HLE_VAL_CMPXCHG8,
+  IX86_BUILTIN_HLE_VAL_CMPXCHG16,
+  IX86_BUILTIN_HLE_VAL_CMPXCHG32,
+  IX86_BUILTIN_HLE_VAL_CMPXCHG64,
+  IX86_BUILTIN_HLE_BOOL_CMPXCHG8,
+  IX86_BUILTIN_HLE_BOOL_CMPXCHG16,
+  IX86_BUILTIN_HLE_BOOL_CMPXCHG32,
+  IX86_BUILTIN_HLE_BOOL_CMPXCHG64,
+  IX86_BUILTIN_HLE_XADD8,
+  IX86_BUILTIN_HLE_XADD16,
+  IX86_BUILTIN_HLE_XADD32,
+  IX86_BUILTIN_HLE_XADD64,
+  IX86_BUILTIN_HLE_XCHG8,
+  IX86_BUILTIN_HLE_XCHG16,
+  IX86_BUILTIN_HLE_XCHG32,
+  IX86_BUILTIN_HLE_XCHG64,
+  IX86_BUILTIN_HLE_STORE8,
+  IX86_BUILTIN_HLE_STORE16,
+  IX86_BUILTIN_HLE_STORE32,
+  IX86_BUILTIN_HLE_STORE64,
+
   /* CFString built-in for darwin */
   IX86_BUILTIN_CFSTRING,
 
@@ -25824,6 +25855,99 @@ static const struct builtin_description bdesc_pcmpistr[] =
   { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
 };
 
+static const struct builtin_description bdesc_hle[] =
+{
+  /* HLE */
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btchi,
+    "__builtin_ia32_hle_btc_i16", IX86_BUILTIN_HLE_BTC16,
+    UNKNOWN, (int) INT_FTYPE_INT_PSHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btcsi,
+    "__builtin_ia32_hle_btc_i32", IX86_BUILTIN_HLE_BTC32,
+    UNKNOWN, (int) INT_FTYPE_INT_PINT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btcdi,
+    "__builtin_ia32_hle_btc_i64", IX86_BUILTIN_HLE_BTC64,
+    UNKNOWN, (int) INT_FTYPE_INT_PLONGLONG_LONGLONG },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btrhi,
+    "__builtin_ia32_hle_btr_i16", IX86_BUILTIN_HLE_BTR16,
+    UNKNOWN, (int) INT_FTYPE_INT_PSHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btrsi,
+    "__builtin_ia32_hle_btr_i32",
+    IX86_BUILTIN_HLE_BTR32, UNKNOWN,
+    (int) INT_FTYPE_INT_PINT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btrdi,
+    "__builtin_ia32_hle_btr_i64", IX86_BUILTIN_HLE_BTR64,
+    UNKNOWN, (int) INT_FTYPE_INT_PLONGLONG_LONGLONG },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btshi,
+    "__builtin_ia32_hle_bts_i16", IX86_BUILTIN_HLE_BTS16,
+    UNKNOWN, (int) INT_FTYPE_INT_PSHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btssi,
+    "__builtin_ia32_hle_bts_i32", IX86_BUILTIN_HLE_BTS32,
+    UNKNOWN, (int) INT_FTYPE_INT_PINT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_btsdi,
+    "__builtin_ia32_hle_bts_i64", IX86_BUILTIN_HLE_BTS64,
+    UNKNOWN, (int) INT_FTYPE_INT_PLONGLONG_LONGLONG },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_cmpxchgqi,
+    "__builtin_ia32_hle_val_cmpxchg_i8", IX86_BUILTIN_HLE_VAL_CMPXCHG8,
+    UNKNOWN, (int) CHAR_FTYPE_INT_PCHAR_CHAR_CHAR },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_cmpxchghi,
+    "__builtin_ia32_hle_val_cmpxchg_i16", IX86_BUILTIN_HLE_VAL_CMPXCHG16,
+    UNKNOWN, (int) SHORT_FTYPE_INT_PSHORT_SHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_cmpxchgsi,
+    "__builtin_ia32_hle_val_cmpxchg_i32", IX86_BUILTIN_HLE_VAL_CMPXCHG32,
+    UNKNOWN, (int) INT_FTYPE_INT_PINT_INT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_cmpxchgdi,
+    "__builtin_ia32_hle_val_cmpxchg_i64", IX86_BUILTIN_HLE_VAL_CMPXCHG64,
+    UNKNOWN, (int) LONGLONG_FTYPE_INT_PLONGLONG_LONGLONG_LONGLONG },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_bool_cmpxchgqi,
+    "__builtin_ia32_hle_bool_cmpxchg_i8", IX86_BUILTIN_HLE_BOOL_CMPXCHG8,
+    UNKNOWN, (int) INT_FTYPE_INT_PCHAR_CHAR_CHAR },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_bool_cmpxchghi,
+    "__builtin_ia32_hle_bool_cmpxchg_i16", IX86_BUILTIN_HLE_BOOL_CMPXCHG16,
+    UNKNOWN, (int) INT_FTYPE_INT_PSHORT_SHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_bool_cmpxchgsi,
+    "__builtin_ia32_hle_bool_cmpxchg_i32", IX86_BUILTIN_HLE_BOOL_CMPXCHG32,
+    UNKNOWN, (int) INT_FTYPE_INT_PINT_INT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_bool_cmpxchgdi,
+    "__builtin_ia32_hle_bool_cmpxchg_i64", IX86_BUILTIN_HLE_BOOL_CMPXCHG64,
+    UNKNOWN, (int) INT_FTYPE_INT_PLONGLONG_LONGLONG_LONGLONG },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xaddqi,
+    "__builtin_ia32_hle_xadd_i8", IX86_BUILTIN_HLE_XADD8,
+    UNKNOWN, (int) CHAR_FTYPE_INT_PCHAR_CHAR },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xaddhi,
+    "__builtin_ia32_hle_xadd_i16", IX86_BUILTIN_HLE_XADD16,
+    UNKNOWN, (int) SHORT_FTYPE_INT_PSHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xaddsi,
+    "__builtin_ia32_hle_xadd_i32", IX86_BUILTIN_HLE_XADD32,
+    UNKNOWN, (int) INT_FTYPE_INT_PINT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xadddi,
+    "__builtin_ia32_hle_xadd_i64", IX86_BUILTIN_HLE_XADD64,
+    UNKNOWN, (int) LONGLONG_FTYPE_INT_PLONGLONG_LONGLONG },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xchgqi,
+    "__builtin_ia32_hle_xchg_i8", IX86_BUILTIN_HLE_XCHG8,
+    UNKNOWN, (int) CHAR_FTYPE_INT_PCHAR_CHAR },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xchghi,
+    "__builtin_ia32_hle_xchg_i16", IX86_BUILTIN_HLE_XCHG16,
+    UNKNOWN, (int) SHORT_FTYPE_INT_PSHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xchgsi,
+    "__builtin_ia32_hle_xchg_i32", IX86_BUILTIN_HLE_XCHG32,
+    UNKNOWN, (int) INT_FTYPE_INT_PINT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_xchgdi, 
+    "__builtin_ia32_hle_xchg_i64", IX86_BUILTIN_HLE_XCHG64,
+    UNKNOWN, (int) LONGLONG_FTYPE_INT_PLONGLONG_LONGLONG },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_storeqi,
+    "__builtin_ia32_hle_store_i8", IX86_BUILTIN_HLE_STORE8,
+    UNKNOWN, (int) VOID_FTYPE_INT_PCHAR_CHAR },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_storehi,
+    "__builtin_ia32_hle_store_i16", IX86_BUILTIN_HLE_STORE16,
+    UNKNOWN, (int) VOID_FTYPE_INT_PSHORT_SHORT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_hle_storesi,
+    "__builtin_ia32_hle_store_i32", IX86_BUILTIN_HLE_STORE32,
+    UNKNOWN, (int) VOID_FTYPE_INT_PINT_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_hle_storedi, 
+    "__builtin_ia32_hle_store_i64", IX86_BUILTIN_HLE_STORE64,
+    UNKNOWN, (int) VOID_FTYPE_INT_PLONGLONG_LONGLONG },
+};
+
 /* Special builtins with variable number of arguments.  */
 static const struct builtin_description bdesc_special_args[] =
 {
@@ -27246,6 +27370,13 @@ ix86_init_mmx_sse_builtins (void)
       def_builtin_const (d->mask, d->name, ftype, d->code);
     }
 
+  /* Instructions with HLE prefix.  */
+  for (i = 0, d = bdesc_hle; i < ARRAY_SIZE (bdesc_hle); i++, d++)
+    {
+      ftype = (enum ix86_builtin_func_type) d->flag;
+      def_builtin (d->mask, d->name, ftype, d->code);
+    }
+
   /* SSE */
   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
 	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
@@ -29003,6 +29134,128 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
   return klass == store ? 0 : target;
 }
 
+/* Subroutine of ix86_expand_builtin to take care of insns with HLE
+   prefix.  */
+
+static rtx
+ix86_expand_hle (const struct builtin_description *d, tree exp,
+		 rtx target)
+{
+  tree arg;
+  rtx pat, op;
+  unsigned int i, nargs;
+  rtx args[4];
+  enum insn_code icode = d->icode;
+  const struct insn_data_d *insn_p = &insn_data[icode];
+  enum machine_mode tmode = insn_p->operand[0].mode;
+  bool store = false;
+
+  switch ((enum ix86_builtin_func_type) d->flag)
+    {
+    case VOID_FTYPE_INT_PCHAR_CHAR:
+    case VOID_FTYPE_INT_PSHORT_SHORT:
+    case VOID_FTYPE_INT_PINT_INT:
+    case VOID_FTYPE_INT_PLONGLONG_LONGLONG:
+      store = true;
+      /* FALLTHRU */
+    case CHAR_FTYPE_INT_PCHAR_CHAR:
+    case SHORT_FTYPE_INT_PSHORT_SHORT:
+    case INT_FTYPE_INT_PINT_INT:
+    case LONGLONG_FTYPE_INT_PLONGLONG_LONGLONG:
+    case INT_FTYPE_INT_PSHORT_SHORT:
+    case INT_FTYPE_INT_PLONGLONG_LONGLONG:
+      nargs = 3;
+      break;
+    case CHAR_FTYPE_INT_PCHAR_CHAR_CHAR:
+    case SHORT_FTYPE_INT_PSHORT_SHORT_SHORT:
+    case INT_FTYPE_INT_PINT_INT_INT:
+    case LONGLONG_FTYPE_INT_PLONGLONG_LONGLONG_LONGLONG:
+    case INT_FTYPE_INT_PCHAR_CHAR_CHAR:
+    case INT_FTYPE_INT_PSHORT_SHORT_SHORT:
+    case INT_FTYPE_INT_PLONGLONG_LONGLONG_LONGLONG:
+      nargs = 4;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  gcc_assert (nargs <= ARRAY_SIZE (args));
+
+   /* Check Lock type.  */
+  arg = CALL_EXPR_ARG (exp, 0);
+  op = expand_normal (arg);
+
+  if (store)
+    {
+      if (!CONST_INT_P (op) || (INTVAL (op) != 0 && INTVAL (op) != 3))
+	error ("the first argument must be 0 or 3");
+
+      gcc_assert (target == 0);
+    }
+  else
+    {
+      if (!CONST_INT_P (op) || INTVAL (op) < 0 || INTVAL (op) > 3)
+	error ("the first argument must be 0, 1, 2 or 3");
+
+      if (optimize
+	  || target == 0
+	  || GET_MODE (target) != tmode
+	  || !insn_p->operand[0].predicate (target, tmode))
+	target = gen_reg_rtx (tmode);
+    }
+
+  args[0] = op;
+
+  for (i = 1; i < nargs; i++)
+    {
+      enum machine_mode mode = insn_p->operand[i].mode;
+
+      arg = CALL_EXPR_ARG (exp, i);
+      op = expand_normal (arg);
+
+      switch (i)
+	{
+	case 1:
+	  /* Memory operand.  */
+	  if (GET_MODE (op) != Pmode)
+	    op = convert_to_mode (Pmode, op, 1);
+	  op = gen_rtx_MEM (mode, force_reg (Pmode, op));
+	  gcc_assert (GET_MODE (op) == mode
+		      || GET_MODE (op) == VOIDmode);
+	  break;
+	default:
+	  if (!insn_p->operand[i].predicate (op, mode))
+	    {
+	      if ((GET_MODE (op) != mode && GET_MODE (op) != VOIDmode))
+		op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
+	      op = copy_to_mode_reg (mode, op);
+	    }
+	  break;
+	}
+
+      args[i] = op;
+    }
+
+  switch (nargs)
+    {
+    case 3:
+      if (store)
+	pat = GEN_FCN (icode) (args[1], args[2], args[0]);
+      else
+	pat = GEN_FCN (icode) (target, args[1], args[2], args[0]);
+      break;
+    case 4:
+      pat = GEN_FCN (icode) (target, args[1], args[2], args[3], args[0]);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return store ? 0 : target;
+}
+
 /* Return the integer constant in ARG.  Constrain it to be in the range
    of the subparts of VEC_TYPE; issue an error if not.  */
 
@@ -29657,6 +29910,10 @@ rdrand_step:
     if (d->code == fcode)
       return ix86_expand_sse_pcmpistr (d, exp, target);
 
+  for (i = 0, d = bdesc_hle; i < ARRAY_SIZE (bdesc_hle); i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_hle (d, exp, target);
+
   for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
     if (d->code == fcode)
       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bfbf5bf..e4fd6ce 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -201,6 +201,11 @@
   UNSPECV_RDGSBASE
   UNSPECV_WRFSBASE
   UNSPECV_WRGSBASE
+
+  UNSPECV_HLE_CMPXCHG
+  UNSPECV_HLE_XCHG
+  UNSPECV_BT_CARRY
+  UNSPECV_STORE
 ])
 
 ;; Constants to represent rounding modes in the ROUND instruction
@@ -18115,6 +18120,338 @@
   [(set_attr "length" "2")
    (set_attr "memory" "unknown")])
 
+;; Patterns with HLE prefixes.
+(define_insn "hle_cmpxchg<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=a")
+	(match_operand:SWI 1 "memory_operand" "+m"))
+   (set (match_dup 1)
+	(unspec_volatile:SWI
+	  [(match_dup 1)
+	   (match_operand:SWI 2 "register_operand" "a")
+	   (match_operand:SWI 3 "register_operand" "<r>")
+	   (match_operand:SWI 4 "const_int_operand" "n")]
+	  UNSPECV_HLE_CMPXCHG))
+   (set (reg:CCZ FLAGS_REG)
+        (compare:CCZ
+          (unspec_volatile:SWI
+            [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPECV_HLE_CMPXCHG)
+          (match_dup 2)))]
+  "TARGET_CMPXCHG"
+{
+    switch (INTVAL (operands[4]))
+      {
+      case 0:
+	return "cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}";
+      case 1:
+	return "lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}";
+      case 2:
+	return ASM_BYTE "0xf2; lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}";
+      case 3:
+	return ASM_BYTE "0xf3; lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}";
+      default:
+	gcc_unreachable ();
+      }
+})
+
+(define_expand "hle_bool_cmpxchg<mode>"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "")
+	   (match_operand:SWI 2 "register_operand" "")
+	   (match_operand:SWI 3 "register_operand" "")
+	   (match_operand:SWI 4 "const_int_operand" "")]
+	  UNSPECV_HLE_CMPXCHG))]
+  "TARGET_CMPXCHG"
+{
+  rtx insn, op0, op1, tmp;
+  tmp = gen_reg_rtx (<MODE>mode);
+  op0 = gen_reg_rtx (SImode);
+  emit_move_insn (op0, const0_rtx);
+  op1 = gen_reg_rtx (SImode);
+  emit_move_insn (op1, const1_rtx);
+  emit_insn (gen_hle_cmpxchg<mode> (tmp, operands[1],
+				    operands[2], operands[3],
+				    operands[4]));
+  insn = gen_rtx_EQ (VOIDmode, gen_rtx_REG (CCZmode, FLAGS_REG),
+		     const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_IF_THEN_ELSE (SImode, insn,
+						op1, op0)));
+  DONE;
+})
+
+(define_insn "hle_xadd<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=<r>")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "+m")
+	   (match_operand:SWI 3 "const_int_operand" "n")] UNSPECV_HLE_XCHG))
+   (set (match_dup 1)
+	(plus:SWI (match_dup 1)
+		  (match_operand:SWI 2 "nonmemory_operand" "0")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_XADD"
+{
+    switch (INTVAL (operands[3]))
+      {
+      case 0:
+	return "xadd{<imodesuffix>}\t{%0, %1|%1, %0}"; 
+      case 1:
+	return "lock{%;} xadd{<imodesuffix>}\t{%0, %1|%1, %0}"; 
+      case 2:
+	return ASM_BYTE "0xf2; lock{%;} xadd{<imodesuffix>}\t{%0, %1|%1, %0}"; 
+      case 3:
+	return ASM_BYTE "0xf3; lock{%;} xadd{<imodesuffix>}\t{%0, %1|%1, %0}"; 
+      default:
+	gcc_unreachable ();
+      }
+})
+
+(define_insn "hle_xchg<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=<r>")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "+m")
+	   (match_operand:SWI 3 "const_int_operand" "n")] 
+	  UNSPECV_HLE_XCHG))
+   (set (match_dup 1)
+	(match_operand:SWI 2 "register_operand" "0"))]
+  ""
+{
+    switch (INTVAL (operands[3]))
+      {
+      case 0:
+      case 1:
+	return "xchg{<imodesuffix>}\t{%1, %0|%0, %1}";
+      case 2:
+	return ASM_BYTE "0xf2; xchg{<imodesuffix>}\t{%1, %0|%0, %1}";
+      case 3:
+	return ASM_BYTE "0xf3; xchg{<imodesuffix>}\t{%1, %0|%0, %1}";
+      default:
+	gcc_unreachable ();
+      }
+})
+
+(define_expand "hle_bts<mode>"
+  [(parallel
+    [(set (match_operand:SI 0 "register_operand" "")
+	  (unspec_volatile:SI
+	    [(match_operand:SWI248 1 "memory_operand" "")
+	     (match_operand:SWI248 2 "x86_64_nonmemory_operand" "")
+	     (match_operand:SWI248 3 "const_int_operand" "")] 
+	    UNSPECV_BT_CARRY))
+     (set (zero_extract:SWI248
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 1))])]
+  ""
+{
+  rtx insn, op0, op1;
+  op0 = gen_reg_rtx (SImode);
+  emit_move_insn (op0, const0_rtx);
+  op1 = gen_reg_rtx (SImode);
+  emit_move_insn (op1, const1_rtx);
+  emit_insn (gen_hle_bts<mode>_1 (operands[1], operands[2], operands[3]));
+  insn = gen_rtx_EQ (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
+		     const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_IF_THEN_ELSE (SImode, insn,
+						op1, op0)));
+  DONE;
+})
+
+(define_insn "hle_bts<mode>_1"
+  [(set (zero_extract:SWI248
+	   (match_operand:SWI248 0 "memory_operand" "+m")
+	   (const_int 1)
+	   (match_operand:SWI248 1 "x86_64_nonmemory_operand" "rN"))
+	(const_int 1))
+   (set (reg:CCC FLAGS_REG)
+	(unspec_volatile:CCC
+	  [(match_dup 0)
+	   (match_dup 1)
+	   (match_operand:SWI248 2 "const_int_operand" "n")] 
+	  UNSPECV_BT_CARRY))]
+  ""
+{
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      return "bts{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 1:
+      return "lock{%;} bts{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 2:
+      return ASM_BYTE "0xf2; lock{%;} bts{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 3:
+      return ASM_BYTE "0xf3; lock{%;} bts{<imodesuffix>}\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "<MODE>")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 1 "register_operand" "")
+       (const_string "0")
+       (const_string "1")))])
+
+(define_expand "hle_btr<mode>"
+  [(parallel
+    [(set (match_operand:SI 0 "register_operand" "")
+	  (unspec_volatile:SI
+	    [(match_operand:SWI248 1 "memory_operand" "")
+	     (match_operand:SWI248 2 "x86_64_nonmemory_operand" "")
+	     (match_operand:SWI248 3 "const_int_operand" "")] 
+	    UNSPECV_BT_CARRY))
+     (set (zero_extract:SWI248
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0))])]
+  ""
+{
+  rtx insn, op0, op1;
+  op0 = gen_reg_rtx (SImode);
+  emit_move_insn (op0, const0_rtx);
+  op1 = gen_reg_rtx (SImode);
+  emit_move_insn (op1, const1_rtx);
+  emit_insn (gen_hle_btr<mode>_1 (operands[1], operands[2], operands[3]));
+  insn = gen_rtx_EQ (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
+		     const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_IF_THEN_ELSE (SImode, insn,
+						op1, op0)));
+  DONE;
+})
+
+(define_insn "hle_btr<mode>_1"
+  [(set (zero_extract:SWI248
+	   (match_operand:SWI248 0 "memory_operand" "+m")
+	   (const_int 1)
+	   (match_operand:SWI248 1 "x86_64_nonmemory_operand" "rN"))
+	(const_int 0))
+   (set (reg:CCC FLAGS_REG)
+	(unspec_volatile:CCC
+	  [(match_dup 0)
+	   (match_dup 1)
+	   (match_operand:SWI248 2 "const_int_operand" "n")] 
+	  UNSPECV_BT_CARRY))]
+  ""
+{
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      return "btr{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 1:
+      return "lock{%;} btr{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 2:
+      return ASM_BYTE "0xf2; lock{%;} btr{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 3:
+      return ASM_BYTE "0xf3; lock{%;} btr{<imodesuffix>}\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "<MODE>")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 1 "register_operand" "")
+       (const_string "0")
+       (const_string "1")))])
+
+(define_expand "hle_btc<mode>"
+  [(parallel
+    [(set (match_operand:SI 0 "register_operand" "")
+	  (unspec_volatile:SI
+	    [(match_operand:SWI248 1 "memory_operand" "")
+	     (match_operand:SWI248 2 "x86_64_nonmemory_operand" "")
+	     (match_operand:SWI248 3 "const_int_operand" "")] 
+	    UNSPECV_BT_CARRY))
+     (set (zero_extract:SWI248
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (not:SWI248
+	    (zero_extract:SWI248
+	      (match_dup 1)
+	      (const_int 1)
+	      (match_dup 2))))])]
+  ""
+{
+  rtx insn, op0, op1;
+  op0 = gen_reg_rtx (SImode);
+  emit_move_insn (op0, const0_rtx);
+  op1 = gen_reg_rtx (SImode);
+  emit_move_insn (op1, const1_rtx);
+  emit_insn (gen_hle_btc<mode>_1 (operands[1], operands[2], operands[3]));
+  insn = gen_rtx_EQ (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
+		     const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_IF_THEN_ELSE (SImode, insn,
+						op1, op0)));
+  DONE;
+})
+
+(define_insn "hle_btc<mode>_1"
+  [(set (zero_extract:SWI248
+	   (match_operand:SWI248 0 "memory_operand" "+m")
+	   (const_int 1)
+	   (match_operand:SWI248 1 "x86_64_nonmemory_operand" "rN"))
+	(not:SWI248
+	  (zero_extract:SWI248
+	    (match_dup 0)
+	    (const_int 1)
+	    (match_dup 1))))
+   (set (reg:CCC FLAGS_REG)
+	(unspec_volatile:CCC
+	  [(match_dup 0)
+	   (match_dup 1)
+	   (match_operand:SWI248 2 "const_int_operand" "n")] 
+	  UNSPECV_BT_CARRY))]
+  ""
+{
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      return "btc{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 1:
+      return "lock{%;} btc{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 2:
+      return ASM_BYTE "0xf2; lock{%;} btc{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 3:
+      return ASM_BYTE "0xf3; lock{%;} btc{<imodesuffix>}\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "<MODE>")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 1 "register_operand" "")
+       (const_string "0")
+       (const_string "1")))])
+
+(define_insn "hle_store<mode>"
+  [(set (match_operand:SWI 0 "memory_operand" "=m")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "nonmemory_operand" "rn")
+	   (match_operand:SWI 2 "const_int_operand" "n")] 
+	  UNSPECV_STORE))]
+  ""
+{
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      return "mov{<imodesuffix>}\t{%1, %0|%0, %1}";
+    case 3:
+      return ASM_BYTE "0xf3; mov{<imodesuffix>}\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+})
+
 (include "mmx.md")
 (include "sse.md")
 (include "sync.md")
diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h
index 986a573d..c76ddd3 100644
--- a/gcc/config/i386/immintrin.h
+++ b/gcc/config/i386/immintrin.h
@@ -80,6 +80,8 @@
 #include <f16cintrin.h>
 #endif
 
+#include <hleintrin.h>
+
 #ifdef __RDRND__
 extern __inline int
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-03-07 11:06 [i386, patch, RFC] HLE support in GCC Kirill Yukhin
@ 2012-03-07 11:10 ` Jakub Jelinek
  2012-03-08  1:57   ` H.J. Lu
  0 siblings, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-03-07 11:10 UTC (permalink / raw)
  To: Kirill Yukhin
  Cc: Uros Bizjak, Richard Henderson, Andrew MacLeod, Richard Guenther,
	Andi Kleen, H.J. Lu, Sergey Ostanevich, gcc-patches List

On Wed, Mar 07, 2012 at 03:05:58PM +0400, Kirill Yukhin wrote:
> Hello guys,
> I am attaching initial patch which enables TSX's HLE [1] prefixes in
> GCC. Since we have no official intrinsics declarations, I want to hear
> your comments about the patch

I think this is a wrong approach.  Instead we should use for this a flag
on the __atomic_* builtins (some higher bit of the memmodel) that would
say we want to emit an XACQUIRE or XRELEASE insn prefix.

> Note, there is no option '-mhle' and no tests (I'll do that after)
> 
> [1] - http://software.intel.com/en-us/blogs/2012/02/07/transactional-synchronization-in-haswell/

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-03-07 11:10 ` Jakub Jelinek
@ 2012-03-08  1:57   ` H.J. Lu
       [not found]     ` <CAGs3Rft_0Cs6v3AP_LH2tbbqrym5rf1A4MJ+6fn1T37=BEzrbw@mail.gmail.com>
  0 siblings, 1 reply; 49+ messages in thread
From: H.J. Lu @ 2012-03-08  1:57 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, Andrew MacLeod,
	Richard Guenther, Andi Kleen, Sergey Ostanevich,
	gcc-patches List

On Wed, Mar 7, 2012 at 3:10 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Wed, Mar 07, 2012 at 03:05:58PM +0400, Kirill Yukhin wrote:
>> Hello guys,
>> I am attaching initial patch which enables TSX's HLE [1] prefixes in
>> GCC. Since we have no official intrinsics declarations, I want to hear
>> your comments about the patch
>
> I think this is a wrong approach.  Instead we should use for this a flag
> on the __atomic_* builtins (some higher bit of the memmodel) that would
> say we want to emit an XACQUIRE or XRELEASE insn prefix.
>

That sounds a good idea. Will it require front-end and middle-end changes?
I assume they will be no-op for other backends.

-- 
H.J.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
       [not found]     ` <CAGs3Rft_0Cs6v3AP_LH2tbbqrym5rf1A4MJ+6fn1T37=BEzrbw@mail.gmail.com>
@ 2012-03-08 15:04       ` H.J. Lu
  2012-03-08 15:09         ` Jakub Jelinek
  0 siblings, 1 reply; 49+ messages in thread
From: H.J. Lu @ 2012-03-08 15:04 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: GCC Patches

On Thu, Mar 8, 2012 at 12:44 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
> Hi HJ,
> I am working on that. Here's some clarification from Jakub:
>> I meant that e.g. instead of:
>> int
>> foo (int *p, int oldv, int newv)
>> {
>>  __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE);
>>  return oldv;
>> }
>> if you want to generate xacquire lock cmpxchgl instead of just lock cmpxchgl
>> you could write
>>  __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE | __ATOMIC_HLE_XACQUIRE, __ATOMIC_ACQUIRE);
>> (dunno if we'd want the flag to be listed in both success and fail memmodel
>> variants, or just one, for most of other __atomic_* builtins there is just
>> one).
>> __ATOMIC_HLE_XACQUIRE and __ATOMIC_HLE_XRELEASE would be just predefined
>> macros like __ATOMIC_ACQUIRE etc. is.
>>
>>        Jakub
>
> I don't think, we need to change FE for that...

Please note that __ATOMIC_HLE_XACQUIRE has nothing to do with
__ATOMIC_ACQUIRE.  You can have

__ATOMIC_ACQUIRE | __ATOMIC_HLE_XRELEASE

> K
>
>>
>> That sounds a good idea. Will it require front-end and middle-end changes?
>> I assume they will be no-op for other backends.
>>
>> --
>> H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-03-08 15:04       ` H.J. Lu
@ 2012-03-08 15:09         ` Jakub Jelinek
  2012-04-10 14:12           ` Kirill Yukhin
  0 siblings, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-03-08 15:09 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Kirill Yukhin, GCC Patches

On Thu, Mar 08, 2012 at 07:04:03AM -0800, H.J. Lu wrote:
> > I don't think, we need to change FE for that...
> 
> Please note that __ATOMIC_HLE_XACQUIRE has nothing to do with
> __ATOMIC_ACQUIRE.  You can have
> 
> __ATOMIC_ACQUIRE | __ATOMIC_HLE_XRELEASE

Yeah.  And you don't need to change the FEs in any way, all that is needed
is to change the middle-end/expansion (builtins.c - e.g. get_memmodel)
and the backend (plus predefine the macros in the backend).

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-03-08 15:09         ` Jakub Jelinek
@ 2012-04-10 14:12           ` Kirill Yukhin
  2012-04-10 14:21             ` Jakub Jelinek
  2012-04-10 14:43             ` H.J. Lu
  0 siblings, 2 replies; 49+ messages in thread
From: Kirill Yukhin @ 2012-04-10 14:12 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: H.J. Lu, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 661 bytes --]

>
> Yeah.  And you don't need to change the FEs in any way, all that is needed
> is to change the middle-end/expansion (builtins.c - e.g. get_memmodel)
> and the backend (plus predefine the macros in the backend).
>
>        Jakub

Hi Jakub,
Attached patch implements HLE support for __atomic_compare_exchange_n.

So, to emit HLE prefix, it is possible to do:
int
foo2 (int *p, int oldv, int newv)
{
  __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE |
__ATOMIC_USE_HLE, __ATOMIC_ACQUIRE);
  return oldv;
}
Which will generate:
...
        lock xacquire cmpxchgl  %esi, (%rcx)
...

Comments?

PS: No tests and TARGET_HLE defined yet.

Thanks, K

[-- Attachment #2: hle-rfc-2.gcc.patch --]
[-- Type: application/octet-stream, Size: 6737 bytes --]

diff --git a/gcc/builtins.c b/gcc/builtins.c
index b937d3d..bf5eb03 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -5310,7 +5310,7 @@ get_memmodel (tree exp)
     return MEMMODEL_SEQ_CST;
 
   op = expand_normal (exp);
-  if (INTVAL (op) < 0 || INTVAL (op) >= MEMMODEL_LAST)
+  if (INTVAL (op) < 0 || (INTVAL (op) & ((MEMMODEL_LAST-1 | MEMMODEL_LAST))) >= MEMMODEL_LAST)
     {
       warning (OPT_Winvalid_memory_model,
 	       "invalid memory model argument to builtin");
@@ -5363,11 +5363,13 @@ expand_builtin_atomic_compare_exchange (enum machine_mode mode, tree exp,
   enum memmodel success, failure;
   tree weak;
   bool is_weak;
+  unsigned memmodel_mask = (MEMMODEL_LAST-1) | MEMMODEL_LAST;
 
   success = get_memmodel (CALL_EXPR_ARG (exp, 4));
   failure = get_memmodel (CALL_EXPR_ARG (exp, 5));
 
-  if (failure == MEMMODEL_RELEASE || failure == MEMMODEL_ACQ_REL)
+  if ( (failure & memmodel_mask) == MEMMODEL_RELEASE
+       || (failure & memmodel_mask) == MEMMODEL_ACQ_REL)
     {
       error ("invalid failure memory model for %<__atomic_compare_exchange%>");
       return NULL_RTX;
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index fad5e66..9171ef4 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -30240,6 +30240,12 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
   return ix86_builtins[code];
 }
 
+static bool
+x86_have_hle (void)
+{
+  return true;
+}
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 
@@ -38714,6 +38720,9 @@ ix86_autovectorize_vector_sizes (void)
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
 
+#undef TARGET_HAVE_HLE
+#define TARGET_HAVE_HLE x86_have_hle
+
 #if TARGET_MACHO
 #undef TARGET_INIT_LIBFUNCS
 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 9f91344..052ce7d 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -32,6 +32,7 @@
   UNSPECV_CMPXCHG_2
   UNSPECV_CMPXCHG_3
   UNSPECV_CMPXCHG_4
+  UNSPECV_CMPXCHG_HLE_1
   UNSPECV_XCHG
   UNSPECV_LOCK
 ])
@@ -315,8 +316,16 @@
    (match_operand:SI 7 "const_int_operand" "")]		;; failure model
   "TARGET_CMPXCHG"
 {
-  emit_insn (gen_atomic_compare_and_swap_single<mode>
-	     (operands[1], operands[2], operands[3], operands[4]));
+  int need_hle = INTVAL(operands[6]) & 8;
+
+  if (need_hle) {
+    emit_insn (gen_atomic_hle_compare_and_swap_single<mode>
+	       (operands[1], operands[2], operands[3], operands[4]));
+	       }
+  else
+    emit_insn (gen_atomic_compare_and_swap_single<mode>
+	       (operands[1], operands[2], operands[3], operands[4]));
+
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
   DONE;
@@ -344,8 +353,13 @@
 {
   if (<MODE>mode == DImode && TARGET_64BIT)
     {
-      emit_insn (gen_atomic_compare_and_swap_singledi
-		 (operands[1], operands[2], operands[3], operands[4]));
+      
+      if ( INTVAL (operands[6]) & 8 )
+            emit_insn (gen_atomic_compare_and_swap_singledi
+	    	       (operands[1], operands[2], operands[3], operands[4]));
+       else
+            emit_insn (gen_atomic_hle_compare_and_swap_singledi
+	    	       (operands[1], operands[2], operands[3], operands[4]));
     }
   else
     {
@@ -389,7 +403,23 @@
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_3))]
   "TARGET_CMPXCHG"
-  "lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}")
+  "lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}"
+)
+
+(define_insn "atomic_hle_compare_and_swap_single<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=a")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "+m")
+	   (match_operand:SWI 2 "register_operand" "0")
+	   (match_operand:SWI 3 "register_operand" "<r>")]
+	  UNSPECV_CMPXCHG_HLE_1))
+   (set (match_dup 1)
+	(unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG_2))
+   (set (reg:CCZ FLAGS_REG)
+        (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_3))]
+  "TARGET_CMPXCHG"
+  "lock{%;} xacquire{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}"
+)
 
 ;; For double-word compare and swap, we are obliged to play tricks with
 ;; the input newval (op5:op6) because the Intel register numbering does
diff --git a/gcc/cppbuiltin.c b/gcc/cppbuiltin.c
index 05d82f5..119a0dd 100644
--- a/gcc/cppbuiltin.c
+++ b/gcc/cppbuiltin.c
@@ -72,6 +72,13 @@ define__GNUC__ (cpp_reader *pfile)
   cpp_define_formatted (pfile, "__ATOMIC_RELEASE=%d", MEMMODEL_RELEASE);
   cpp_define_formatted (pfile, "__ATOMIC_ACQ_REL=%d", MEMMODEL_ACQ_REL);
   cpp_define_formatted (pfile, "__ATOMIC_CONSUME=%d", MEMMODEL_CONSUME);
+
+  /* kyukhin  */
+  if (targetm.have_hle ()) {
+    cpp_define_formatted (pfile, "__ATOMIC_USE_HLE=%d",
+			  1 << (sizeof(MEMMODEL_LAST) * 8
+				- __builtin_clz (MEMMODEL_LAST)));
+  }
 }
 
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 91e4b04..2a15304 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11156,6 +11156,11 @@ This target hook is required only when the target has several different
 modes and they have different conditional execution capability, such as ARM.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_HAVE_HLE (void)
+This target hook returns true if the target supports Hardware Lock Elision
+prefixes.
+@end deftypefn
+
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, struct loop *@var{loop})
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 0ebc15d..efe0e59 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -11034,6 +11034,11 @@ This target hook is required only when the target has several different
 modes and they have different conditional execution capability, such as ARM.
 @end deftypefn
 
+@hook TARGET_HAVE_HLE
+This target hook returns true if the target supports Hardware Lock Elision
+prefixes.
+@end deftypefn
+
 @hook TARGET_LOOP_UNROLL_ADJUST
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/target.def b/gcc/target.def
index 6084b21..a64a183 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1326,6 +1326,13 @@ DEFHOOK
  bool, (void),
  default_have_conditional_execution)
 
+/* Return true if the target supports HLE.  */
+DEFHOOK
+(have_hle,
+ "",
+ bool, (void),
+ hook_bool_void_false)
+
 /* Return a new value for loop unroll size.  */
 DEFHOOK
 (loop_unroll_adjust,

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 14:12           ` Kirill Yukhin
@ 2012-04-10 14:21             ` Jakub Jelinek
  2012-04-10 17:05               ` Uros Bizjak
  2012-04-10 14:43             ` H.J. Lu
  1 sibling, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-10 14:21 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: H.J. Lu, GCC Patches, Richard Henderson, Uros Bizjak

On Tue, Apr 10, 2012 at 06:12:08PM +0400, Kirill Yukhin wrote:
> Attached patch implements HLE support for __atomic_compare_exchange_n.

The target hook is definitely not appropriate, just define it in
ix86_target_macros in i386-c.c instead or so.

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 14:12           ` Kirill Yukhin
  2012-04-10 14:21             ` Jakub Jelinek
@ 2012-04-10 14:43             ` H.J. Lu
  2012-04-10 14:44               ` Jakub Jelinek
  2012-04-10 16:35               ` Andi Kleen
  1 sibling, 2 replies; 49+ messages in thread
From: H.J. Lu @ 2012-04-10 14:43 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: Jakub Jelinek, GCC Patches

On Tue, Apr 10, 2012 at 7:12 AM, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
>>
>> Yeah.  And you don't need to change the FEs in any way, all that is needed
>> is to change the middle-end/expansion (builtins.c - e.g. get_memmodel)
>> and the backend (plus predefine the macros in the backend).
>>
>>        Jakub
>
> Hi Jakub,
> Attached patch implements HLE support for __atomic_compare_exchange_n.
>
> So, to emit HLE prefix, it is possible to do:
> int
> foo2 (int *p, int oldv, int newv)
> {
>  __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE |
> __ATOMIC_USE_HLE, __ATOMIC_ACQUIRE);
>  return oldv;
> }

This is wrong since HLE ACQUIRE/RELEASE has nothing to do with
C++ atomic acquire/release. You can have HLE RELEASE with C++
atomic acquire.


-- 
H.J.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 14:43             ` H.J. Lu
@ 2012-04-10 14:44               ` Jakub Jelinek
  2012-04-10 16:35               ` Andi Kleen
  1 sibling, 0 replies; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-10 14:44 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Kirill Yukhin, GCC Patches

On Tue, Apr 10, 2012 at 07:42:53AM -0700, H.J. Lu wrote:
> > Attached patch implements HLE support for __atomic_compare_exchange_n.
> >
> > So, to emit HLE prefix, it is possible to do:
> > int
> > foo2 (int *p, int oldv, int newv)
> > {
> >  __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE |
> > __ATOMIC_USE_HLE, __ATOMIC_ACQUIRE);
> >  return oldv;
> > }
> 
> This is wrong since HLE ACQUIRE/RELEASE has nothing to do with
> C++ atomic acquire/release. You can have HLE RELEASE with C++
> atomic acquire.

Yes, of course, there should be two bits for HLE rather than one.

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 14:43             ` H.J. Lu
  2012-04-10 14:44               ` Jakub Jelinek
@ 2012-04-10 16:35               ` Andi Kleen
  2012-04-10 20:00                 ` H.J. Lu
  1 sibling, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2012-04-10 16:35 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Kirill Yukhin, Jakub Jelinek, GCC Patches

"H.J. Lu" <hjl.tools@gmail.com> writes:

>> So, to emit HLE prefix, it is possible to do:
>> int
>> foo2 (int *p, int oldv, int newv)
>> {
>>  __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE |
>> __ATOMIC_USE_HLE, __ATOMIC_ACQUIRE);
>>  return oldv;
>> }
>
> This is wrong since HLE ACQUIRE/RELEASE has nothing to do with
> C++ atomic acquire/release. You can have HLE RELEASE with C++
> atomic acquire.

It makes sense to combine the two. On x86 C++ atomic acquire/release
means the compiler cannot move references outside. For HLE
we really want the same, otherwise some of the memory references
inside the transaction may not be transactional.

So I think HLE_ACQUIRE should imply C++ acquire
and HLE_RELEASE imply C++ release.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 14:21             ` Jakub Jelinek
@ 2012-04-10 17:05               ` Uros Bizjak
  0 siblings, 0 replies; 49+ messages in thread
From: Uros Bizjak @ 2012-04-10 17:05 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Kirill Yukhin, H.J. Lu, GCC Patches, Richard Henderson

[-- Attachment #1: Type: text/plain, Size: 586 bytes --]

On Tue, Apr 10, 2012 at 4:20 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Apr 10, 2012 at 06:12:08PM +0400, Kirill Yukhin wrote:
>> Attached patch implements HLE support for __atomic_compare_exchange_n.
>
> The target hook is definitely not appropriate, just define it in
> ix86_target_macros in i386-c.c instead or so.

Also, I think it is better to pass operand that holds the model
constant to the final insn and conditionally output xacquire/xrelease
based on INTVAL of this operand.

Something like in attached patch, but probably with a helper function in i386.c.

Uros.

[-- Attachment #2: h.diff.txt --]
[-- Type: text/plain, Size: 3452 bytes --]

Index: config/i386/sync.md
===================================================================
--- config/i386/sync.md	(revision 186282)
+++ config/i386/sync.md	(working copy)
@@ -315,8 +315,9 @@
    (match_operand:SI 7 "const_int_operand")]	;; failure model
   "TARGET_CMPXCHG"
 {
-  emit_insn (gen_atomic_compare_and_swap_single<mode>
-	     (operands[1], operands[2], operands[3], operands[4]));
+  emit_insn
+   (gen_atomic_compare_and_swap_single<mode>
+    (operands[1], operands[2], operands[3], operands[4], operands[6]));
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
   DONE;
@@ -344,8 +345,9 @@
 {
   if (<MODE>mode == DImode && TARGET_64BIT)
     {
-      emit_insn (gen_atomic_compare_and_swap_singledi
-		 (operands[1], operands[2], operands[3], operands[4]));
+      emit_insn
+       (gen_atomic_compare_and_swap_singledi
+	(operands[1], operands[2], operands[3], operands[4], operands[6]));
     }
   else
     {
@@ -370,7 +372,7 @@
 	mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
 
       emit_insn (gen_atomic_compare_and_swap_double<mode>
-		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n));
+		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n, operands[6]));
     }
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
@@ -382,15 +384,24 @@
 	(unspec_volatile:SWI
 	  [(match_operand:SWI 1 "memory_operand" "+m")
 	   (match_operand:SWI 2 "register_operand" "0")
-	   (match_operand:SWI 3 "register_operand" "<r>")]
+	   (match_operand:SWI 3 "register_operand" "<r>")
+	   (match_operand:SI 4 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_dup 1)
 	(unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG_2))
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_3))]
   "TARGET_CMPXCHG"
-  "lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}")
+{
+  static char buf[128];
+  const char *hle
+    = (INTVAL (operands[4]) & 8) ? "xacquire " : "";
 
+  snprintf (buf, sizeof (buf),
+  	    "lock{%;} %scmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}", hle);
+  return buf;
+})
+
 ;; For double-word compare and swap, we are obliged to play tricks with
 ;; the input newval (op5:op6) because the Intel register numbering does
 ;; not match the gcc register numbering, so the pair must be CX:BX.
@@ -403,7 +414,8 @@
 	   (match_operand:<DCASHMODE> 3 "register_operand" "0")
 	   (match_operand:<DCASHMODE> 4 "register_operand" "1")
 	   (match_operand:<DCASHMODE> 5 "register_operand" "b")
-	   (match_operand:<DCASHMODE> 6 "register_operand" "c")]
+	   (match_operand:<DCASHMODE> 6 "register_operand" "c")
+	   (match_operand:SI 7 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_operand:<DCASHMODE> 1 "register_operand" "=d")
 	(unspec_volatile:<DCASHMODE> [(const_int 0)] UNSPECV_CMPXCHG_2))
@@ -412,8 +424,16 @@
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_4))]
   ""
-  "lock{%;} cmpxchg<doublemodesuffix>b\t%2")
+{
+  static char buf[128];
+  const char *hle
+    = (INTVAL (operands[7]) & 8) ? "xacquire " : "";
 
+  snprintf (buf, sizeof (buf),
+  	    "lock{%;} %scmpxchg<doublemodesuffix>b\t%2", hle);
+  return buf;
+})
+
 ;; Theoretically we'd like to use constraint "r" (any reg) for op5,
 ;; but that includes ecx.  If op5 and op6 are the same (like when
 ;; the input is -1LL) GCC might chose to allocate op5 to ecx, like

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 16:35               ` Andi Kleen
@ 2012-04-10 20:00                 ` H.J. Lu
  2012-04-10 20:20                   ` Andi Kleen
  0 siblings, 1 reply; 49+ messages in thread
From: H.J. Lu @ 2012-04-10 20:00 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Kirill Yukhin, Jakub Jelinek, GCC Patches

On Tue, Apr 10, 2012 at 9:34 AM, Andi Kleen <andi@firstfloor.org> wrote:
> "H.J. Lu" <hjl.tools@gmail.com> writes:
>
>>> So, to emit HLE prefix, it is possible to do:
>>> int
>>> foo2 (int *p, int oldv, int newv)
>>> {
>>>  __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE |
>>> __ATOMIC_USE_HLE, __ATOMIC_ACQUIRE);
>>>  return oldv;
>>> }
>>
>> This is wrong since HLE ACQUIRE/RELEASE has nothing to do with
>> C++ atomic acquire/release. You can have HLE RELEASE with C++
>> atomic acquire.
>
> It makes sense to combine the two. On x86 C++ atomic acquire/release
> means the compiler cannot move references outside. For HLE
> we really want the same, otherwise some of the memory references
> inside the transaction may not be transactional.
>
> So I think HLE_ACQUIRE should imply C++ acquire
> and HLE_RELEASE imply C++ release.
>

If it is the case, can we generate HLE RELEASE/ACQUIRE
prefix automatically for C++ atomics via -mhle command line option.
Then you don't need to modify the source codes to enable HLE support.

-- 
H.J.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 20:00                 ` H.J. Lu
@ 2012-04-10 20:20                   ` Andi Kleen
  2012-04-11 10:35                     ` Kirill Yukhin
  0 siblings, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2012-04-10 20:20 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Andi Kleen, Kirill Yukhin, Jakub Jelinek, GCC Patches

> If it is the case, can we generate HLE RELEASE/ACQUIRE
> prefix automatically for C++ atomics via -mhle command line option.
> Then you don't need to modify the source codes to enable HLE support.

No, for HLE someone needs to decide whether HLE is beneficial for a given
lock.  There are cases where it is not or even wrong (e.g. if there is no
matching unlock)

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 20:20                   ` Andi Kleen
@ 2012-04-11 10:35                     ` Kirill Yukhin
  2012-04-11 10:51                       ` Jakub Jelinek
  2012-04-11 13:06                       ` Andi Kleen
  0 siblings, 2 replies; 49+ messages in thread
From: Kirill Yukhin @ 2012-04-11 10:35 UTC (permalink / raw)
  To: Uros Bizjak, H.J. Lu, Jakub Jelinek, Andi Kleen; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1441 bytes --]

Folks,
Thanks a lot for inputs and suggestions!
Here is updated version of patch.

ChangeLog entry:
2012-04-11  Kirill Yukhin  <kirill.yukhin@intel.com>

        * builtins.c (get_memmodel): Remove check of upper bound.
        (expand_builtin_atomic_compare_exchange): Mask memmodel values.
        * config/i386/cpuid.h (bit_HLE): New.
        * config/i386/driver-i386.c (host_detect_local_cpu): Detect
        HLE support.
        * config/i386/i386-c.c (ix86_target_macros_internal): Set
        HLE defines.
        (ix86_target_string)<-mhle>: New.
        (ix86_option_override_internal)<PTA_HLE>: Ditto.
        (ix86_valid_target_attribute_inner_p)<OPT_mhle>: Ditto.
        * config/i386/i386.h (OPTION_ISA_HLE): Ditto.
        (IX86_HLE_ACQUIRE): Ditto.
        (IX86_HLE_RELEASE): Ditto.
        * config/i386/i386.opt (mhle): Ditto.
        * config/i386/sync.md(atomic_compare_and_swap<mode>): Pass
        success model to instruction emitter.
        (atomic_compare_and_swap_single<mode>): Define and use argument
        for success model.
        (atomic_compare_and_swap_double<mode>): Ditto.

testsuite/ChangeLog entry:
2012-04-11  Kirill Yukhin  <kirill.yukhin@intel.com>

        * gcc.target/i386/hle-cmpxchg-acq-1.c: New.
        * gcc.target/i386/hle-cmpxchg-rel-1.c: Ditto.

Tests passing, bootstrap in progress.

Comments?
If it is ok, I'll proceed with implementation of rest builtins in the same way

Thanks, K

[-- Attachment #2: hle-rfc-3.gcc.patch --]
[-- Type: application/octet-stream, Size: 11904 bytes --]

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 7afe61d..cbef9ed 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -5339,7 +5339,7 @@ get_memmodel (tree exp)
     return MEMMODEL_SEQ_CST;
 
   op = expand_normal (exp);
-  if (INTVAL (op) < 0 || INTVAL (op) >= MEMMODEL_LAST)
+  if (INTVAL (op) < 0)
     {
       warning (OPT_Winvalid_memory_model,
 	       "invalid memory model argument to builtin");
@@ -5392,11 +5392,15 @@ expand_builtin_atomic_compare_exchange (enum machine_mode mode, tree exp,
   enum memmodel success, failure;
   tree weak;
   bool is_weak;
+  unsigned memmodel_last_mask = (1 << (sizeof(MEMMODEL_LAST) * 8 - 1
+				       - __builtin_clz (MEMMODEL_LAST)));
+  unsigned memmodel_mask = (memmodel_last_mask - 1) | memmodel_last_mask;
 
   success = get_memmodel (CALL_EXPR_ARG (exp, 4));
   failure = get_memmodel (CALL_EXPR_ARG (exp, 5));
 
-  if (failure == MEMMODEL_RELEASE || failure == MEMMODEL_ACQ_REL)
+  if ( (failure & memmodel_mask) == MEMMODEL_RELEASE
+       || (failure & memmodel_mask) == MEMMODEL_ACQ_REL)
     {
       error ("invalid failure memory model for %<__atomic_compare_exchange%>");
       return NULL_RTX;
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 6696b7a..a9d25c5 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -66,6 +66,7 @@
 /* Extended Features (%eax == 7) */
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
+#define bit_HLE		(1 << 4)
 #define bit_AVX2	(1 << 5)
 #define bit_BMI2	(1 << 8)
 #define bit_RTM		(1 << 11)
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index 09de555..34cd096 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -397,6 +397,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
   unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_hle = 0;
 
   bool arch;
 
@@ -456,6 +457,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       __cpuid_count (7, 0, eax, ebx, ecx, edx);
 
       has_bmi = ebx & bit_BMI;
+      has_hle = ebx & bit_HLE;
       has_avx2 = ebx & bit_AVX2;
       has_bmi2 = ebx & bit_BMI2;
     }
@@ -726,10 +728,12 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *sse4_2 = has_sse4_2 ? " -msse4.2" : " -mno-sse4.2";
       const char *sse4_1 = has_sse4_1 ? " -msse4.1" : " -mno-sse4.1";
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
+      const char *hle = has_hle ? " -mhle" : "-mno-hle";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
 			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
-			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt,
+			hle, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 8adb3b4..d26c998 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -279,6 +279,17 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__BMI2__");
   if (isa_flag & OPTION_MASK_ISA_LZCNT)
     def_or_undef (parse_in, "__LZCNT__");
+  if (isa_flag & OPTION_MASK_ISA_HLE) {
+    char buf[64];
+
+    sprintf (buf, "__ATOMIC_HLE_ACQUIRE=%d", IX86_HLE_ACQUIRE);
+    def_or_undef (parse_in, buf);
+
+    sprintf (buf, "__ATOMIC_HLE_RELEASE=%d", IX86_HLE_RELEASE);
+    def_or_undef (parse_in, buf);
+
+    def_or_undef (parse_in, "__HLE__");
+  }
   if (isa_flag & OPTION_MASK_ISA_TBM)
     def_or_undef (parse_in, "__TBM__");
   if (isa_flag & OPTION_MASK_ISA_POPCNT)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 8974ddc..6d68525 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2679,6 +2679,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mbmi",		OPTION_MASK_ISA_BMI },
     { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
+    { "-mhle",		OPTION_MASK_ISA_HLE },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
     { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
@@ -2954,6 +2955,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
 #define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 #define PTA_RTM		 	(HOST_WIDE_INT_1 << 32)
+#define PTA_HLE	 		(HOST_WIDE_INT_1 << 33)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -3012,7 +3014,7 @@ ix86_option_override_internal (bool main_args_p)
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
 	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
-        | PTA_FMA | PTA_MOVBE | PTA_RTM},
+        | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3430,6 +3432,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_RTM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_RTM;
+	if (processor_alias_table[i].flags & PTA_HLE
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
+	  ix86_isa_flags |= OPTION_MASK_ISA_HLE;
 	if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
 	  x86_prefetch_sse = true;
 
@@ -4251,6 +4256,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
     IX86_ATTR_ISA ("f16c",	OPT_mf16c),
     IX86_ATTR_ISA ("rtm",	OPT_mrtm),
+    IX86_ATTR_ISA ("hle",	OPT_mhle),
 
     /* enum options */
     IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8942ea8..22c437e 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -75,6 +75,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_RDRND	OPTION_ISA_RDRND
 #define TARGET_F16C	OPTION_ISA_F16C
 #define TARGET_RTM      OPTION_ISA_RTM
+#define TARGET_HLE	OPTION_ISA_HLE
 
 #define TARGET_LP64	OPTION_ABI_64
 #define TARGET_X32	OPTION_ABI_X32
@@ -2344,6 +2345,12 @@ extern void debug_dispatch_window (int);
 #define TARGET_RECIP_VEC_DIV	((recip_mask & RECIP_MASK_VEC_DIV) != 0)
 #define TARGET_RECIP_VEC_SQRT	((recip_mask & RECIP_MASK_VEC_SQRT) != 0)
 
+#define IX86_HLE_ACQUIRE (1 << (sizeof(MEMMODEL_LAST) * 8 -	\
+				__builtin_clz (MEMMODEL_LAST)))
+
+#define IX86_HLE_RELEASE (1 << (sizeof(MEMMODEL_LAST) * 8 - \
+				__builtin_clz (MEMMODEL_LAST) - 1))
+
 /*
 Local variables:
 version-control: t
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index bf50aed..1d16149 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -528,6 +528,10 @@ mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
 
+mhle
+Target Report Mask(ISA_HLE) Var(ix86_isa_flags) Save
+Support Hardware Lock Elision prefixies
+
 mtbm
 Target Report Mask(ISA_TBM) Var(ix86_isa_flags) Save
 Support TBM built-in functions and code generation
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 18ccabf..f2b132a 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -315,8 +315,9 @@
    (match_operand:SI 7 "const_int_operand")]	;; failure model
   "TARGET_CMPXCHG"
 {
-  emit_insn (gen_atomic_compare_and_swap_single<mode>
-	     (operands[1], operands[2], operands[3], operands[4]));
+  emit_insn
+   (gen_atomic_compare_and_swap_single<mode>
+    (operands[1], operands[2], operands[3], operands[4], operands[6]));
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
   DONE;
@@ -344,8 +345,9 @@
 {
   if (<MODE>mode == DImode && TARGET_64BIT)
     {
-      emit_insn (gen_atomic_compare_and_swap_singledi
-		 (operands[1], operands[2], operands[3], operands[4]));
+      emit_insn
+       (gen_atomic_compare_and_swap_singledi
+	(operands[1], operands[2], operands[3], operands[4], operands[6]));
     }
   else
     {
@@ -370,7 +372,7 @@
 	mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
 
       emit_insn (gen_atomic_compare_and_swap_double<mode>
-		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n));
+		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n, operands[6]));
     }
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
@@ -382,14 +384,25 @@
 	(unspec_volatile:SWI
 	  [(match_operand:SWI 1 "memory_operand" "+m")
 	   (match_operand:SWI 2 "register_operand" "0")
-	   (match_operand:SWI 3 "register_operand" "<r>")]
+	   (match_operand:SWI 3 "register_operand" "<r>")
+	   (match_operand:SI 4 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_dup 1)
 	(unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG_2))
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_3))]
   "TARGET_CMPXCHG"
-  "lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}")
+{
+  static char buf[128], hle[16]="";
+  if (INTVAL (operands[4]) & IX86_HLE_ACQUIRE)
+    snprintf (hle, sizeof (hle), "xacquire ");
+  else if (INTVAL (operands[4]) & IX86_HLE_RELEASE)
+    snprintf (hle, sizeof (hle), "release ");
+
+  snprintf (buf, sizeof (buf),
+	    "lock{%%;} %scmpxchg{<imodesuffix>}\t{%%3, %%1|%%1, %%3}", hle);
+  return buf;
+})
 
 ;; For double-word compare and swap, we are obliged to play tricks with
 ;; the input newval (op5:op6) because the Intel register numbering does
@@ -403,7 +416,8 @@
 	   (match_operand:<DCASHMODE> 3 "register_operand" "0")
 	   (match_operand:<DCASHMODE> 4 "register_operand" "1")
 	   (match_operand:<DCASHMODE> 5 "register_operand" "b")
-	   (match_operand:<DCASHMODE> 6 "register_operand" "c")]
+	   (match_operand:<DCASHMODE> 6 "register_operand" "c")
+	   (match_operand:SI 7 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_operand:<DCASHMODE> 1 "register_operand" "=d")
 	(unspec_volatile:<DCASHMODE> [(const_int 0)] UNSPECV_CMPXCHG_2))
@@ -412,7 +426,18 @@
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_4))]
   ""
-  "lock{%;} cmpxchg<doublemodesuffix>b\t%2")
+{
+  static char buf[128], hle[16];
+
+  if (INTVAL (operands[4]) & IX86_HLE_ACQUIRE)
+    snprintf (hle, sizeof (hle), "xacquire ");
+  else if (INTVAL (operands[4]) & IX86_HLE_RELEASE)
+    snprintf (hle, sizeof (hle), "release ");
+
+  snprintf (buf, sizeof (buf),
+	    "lock{%%;} %scmpxchg<doublemodesuffix>b\t%%2", hle);
+  return buf;
+})
 
 ;; Theoretically we'd like to use constraint "r" (any reg) for op5,
 ;; but that includes ecx.  If op5 and op6 are the same (like when
diff --git a/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c
new file mode 100644
index 0000000..e508a76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mhle -dp" } */
+/* { dg-final { scan-assembler "lock xacquire cmpxchg" } } */
+
+int
+hle_cmpxchg (int *p, int oldv, int newv)
+{
+  return __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE, __ATOMIC_ACQUIRE);
+}
diff --git a/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c
new file mode 100644
index 0000000..f48646b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mhle -dp" } */
+/* { dg-final { scan-assembler "lock release cmpxchg" } } */
+
+int
+hle_cmpxchg (int *p, int oldv, int newv)
+{
+  return __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE, __ATOMIC_ACQUIRE);
+}

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 10:35                     ` Kirill Yukhin
@ 2012-04-11 10:51                       ` Jakub Jelinek
  2012-04-11 11:40                         ` Uros Bizjak
  2012-04-11 13:06                       ` Andi Kleen
  1 sibling, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-11 10:51 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: Uros Bizjak, H.J. Lu, Andi Kleen, GCC Patches

Hi!

On Wed, Apr 11, 2012 at 02:35:38PM +0400, Kirill Yukhin wrote:

What is TARGET_HLE good for?  I thought the point of HLE prefixes
is that they are silently ignored on older CPUs.  So, HLE should be
always enabled IMHO.  If you don't use __ATOMIC_HLE_* bits in __atomic_*
in your source, it won't be emitted, if you use them, you are supposedly
intending to compile code that will use normal locking on older CPUs
and HLE TM on new CPUs.

+  if (isa_flag & OPTION_MASK_ISA_HLE) {
+    char buf[64];
+
+    sprintf (buf, "__ATOMIC_HLE_ACQUIRE=%d", IX86_HLE_ACQUIRE);
+    def_or_undef (parse_in, buf);
+
+    sprintf (buf, "__ATOMIC_HLE_RELEASE=%d", IX86_HLE_RELEASE);
+    def_or_undef (parse_in, buf);

So IMHO the above two macros should be defined always.

+    def_or_undef (parse_in, "__HLE__");

And I don't see a point for this macro (nor the -m*=native stuff in the
patch).

@@ -2344,6 +2345,12 @@ extern void debug_dispatch_window (int);
 #define TARGET_RECIP_VEC_DIV	((recip_mask & RECIP_MASK_VEC_DIV) != 0)
 #define TARGET_RECIP_VEC_SQRT	((recip_mask & RECIP_MASK_VEC_SQRT) != 0)
 
+#define IX86_HLE_ACQUIRE (1 << (sizeof(MEMMODEL_LAST) * 8 -	\
+				__builtin_clz (MEMMODEL_LAST)))
+
+#define IX86_HLE_RELEASE (1 << (sizeof(MEMMODEL_LAST) * 8 - \
+				__builtin_clz (MEMMODEL_LAST) - 1))

I don't think you can use __builtin_clz in GCC source, at least
not conditionally on the host compiler.  If you use
clz_hwi instead, it will DTRT even for compilers that don't support
__builtin_clz.

--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mhle -dp" } */
+/* { dg-final { scan-assembler "lock release cmpxchg" } } */

Isn't the prefix called xrelease?  At least in my binutils version
it is...  And, why the -dp switch?

+
+int
+hle_cmpxchg (int *p, int oldv, int newv)
+{
+  return __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE, __ATOMIC_ACQUIRE);
+}

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 10:51                       ` Jakub Jelinek
@ 2012-04-11 11:40                         ` Uros Bizjak
  0 siblings, 0 replies; 49+ messages in thread
From: Uros Bizjak @ 2012-04-11 11:40 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Kirill Yukhin, H.J. Lu, Andi Kleen, GCC Patches

On Wed, Apr 11, 2012 at 12:51 PM, Jakub Jelinek <jakub@redhat.com> wrote:

> What is TARGET_HLE good for?  I thought the point of HLE prefixes
> is that they are silently ignored on older CPUs.  So, HLE should be
> always enabled IMHO.  If you don't use __ATOMIC_HLE_* bits in __atomic_*
> in your source, it won't be emitted, if you use them, you are supposedly
> intending to compile code that will use normal locking on older CPUs
> and HLE TM on new CPUs.

I think that we should keep -mhle, since it controls if we want HLE
prefixes or not, saving a byte per lock prefix if we know that binary
won't run on HLE enabled processor.

You will also need to check assembler support for new prefixes and
emit ASM_BYTE "0xXX" if not supported. Please see how
HAVE_AS_IX86_SAHF is handled.

+{
+  static char buf[128], hle[16]="";
+  if (INTVAL (operands[4]) & IX86_HLE_ACQUIRE)
+    snprintf (hle, sizeof (hle), "xacquire ");
+  else if (INTVAL (operands[4]) & IX86_HLE_RELEASE)
+    snprintf (hle, sizeof (hle), "release ");
+

Ouch...

const char *hle;

if (INTVAL (...)
  hle = "xacquire ";
else if (INTVAL (...)
  hle = "xrelease ";
else
  hle = "";

I assume that all this will be moved to a helper function that will
also handle HAVE_AS_IX86_HLE.

Uros.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 10:35                     ` Kirill Yukhin
  2012-04-11 10:51                       ` Jakub Jelinek
@ 2012-04-11 13:06                       ` Andi Kleen
  2012-04-11 13:13                         ` Jakub Jelinek
  2012-04-11 22:39                         ` Torvald Riegel
  1 sibling, 2 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-11 13:06 UTC (permalink / raw)
  To: Kirill Yukhin
  Cc: Uros Bizjak, H.J. Lu, Jakub Jelinek, Andi Kleen, GCC Patches

> Tests passing, bootstrap in progress.
> 
> Comments?

Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
see that in the code. I think that's really required, otherwise the optimizer
will do the wrong thing and move memory references outside the region.

I second Jakub in disliking -mhle.

release is spelled xrelease.

-Andi

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 13:06                       ` Andi Kleen
@ 2012-04-11 13:13                         ` Jakub Jelinek
  2012-04-11 13:21                           ` Andi Kleen
  2012-04-11 22:39                         ` Torvald Riegel
  1 sibling, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-11 13:13 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

On Wed, Apr 11, 2012 at 03:06:35PM +0200, Andi Kleen wrote:
> Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
> see that in the code. I think that's really required, otherwise the optimizer
> will do the wrong thing and move memory references outside the region.

IMHO the separate bits for HLE_ACQUIRE/RELEASE are desirable, you could very
well use __ATOMIC_ACQ_REL or __ATOMIC_SEQ_CST if you want stronger memory
barrier, still coupled with either __ATOMIC_HLE_ACQUIRE or
__ATOMIC_HLE_RELEASE...

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 13:13                         ` Jakub Jelinek
@ 2012-04-11 13:21                           ` Andi Kleen
  2012-04-11 15:53                             ` Kirill Yukhin
  0 siblings, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2012-04-11 13:21 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

On Wed, Apr 11, 2012 at 03:12:44PM +0200, Jakub Jelinek wrote:
> On Wed, Apr 11, 2012 at 03:06:35PM +0200, Andi Kleen wrote:
> > Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
> > see that in the code. I think that's really required, otherwise the optimizer
> > will do the wrong thing and move memory references outside the region.
> 
> IMHO the separate bits for HLE_ACQUIRE/RELEASE are desirable, you could very
> well use __ATOMIC_ACQ_REL or __ATOMIC_SEQ_CST if you want stronger memory

That would still work, right?  If you have multiple, the stronger one wins.

> barrier, still coupled with either __ATOMIC_HLE_ACQUIRE or
> __ATOMIC_HLE_RELEASE...

If the compiler barrier is not implied, I can just see lots of buggy code that
subtly breaks with higher optimization levels, compiler updates, etc.  Of course
it will all be GCC's fault.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 13:21                           ` Andi Kleen
@ 2012-04-11 15:53                             ` Kirill Yukhin
  2012-04-11 16:07                               ` Andi Kleen
  0 siblings, 1 reply; 49+ messages in thread
From: Kirill Yukhin @ 2012-04-11 15:53 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Jakub Jelinek, Uros Bizjak, H.J. Lu, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2456 bytes --]

Yet another iteration :)

>> > Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
Sorry, Andi.  Added.  So, at the moment you can do something like
  __atomic_compare_exchange_n (p, &oldv, newv, 0,
__ATOMIC_HLE_ACQUIRE, __ATOMIC_ACQUIRE);
and you will get the __ATOMIC_ACQUIRE model as well for the success model.

I've also removed a few defines (like __HLE__), made the HLE defines
unconditional, extended autoconf to check whether the assembler can generate
HLE prefixes, and added a hook to make HLE acquire/release imply the standard
memory models.  I also made the bits above the 16th bit of the memmodel enum
target-dependent.

ChangeLog entry:
2012-04-11  Kirill Yukhin  <kirill.yukhin@intel.com>

        * builtins.c (get_memmodel): Remove check of upper bound,
        imply HLE to use standard ACQUIRE/RELEASE.
        (expand_builtin_atomic_compare_exchange): Mask memmodel values.
        * config/i386/cpuid.h (bit_HLE): New.
        * config/i386/driver-i386.c (host_detect_local_cpu): Detect
        HLE support.
        * config/i386/i386-protos.h (ix86_generate_hle_prefix): New.
        * config/i386/i386-c.c (ix86_target_macros_internal): Set
        HLE defines.
        (ix86_target_string)<-mhle>: New.
        (ix86_option_override_internal)<PTA_HLE>: Ditto.
        (ix86_valid_target_attribute_inner_p)<OPT_mhle>: Ditto.
        * config/i386/i386.c (ix86_target_string)<OPTION_MASK_ISA_HLE>:
        New.
        (ix86_valid_target_attribute_inner_p)<OPT_mhle>: Ditto.
        (ix86_generate_hle_prefix): Ditto.
        (ix86_extend_hle_macro): Ditto.
        (TARGET_EXTEND_HLE_MACRO): Ditto.
        * config/i386/i386.h (OPTION_ISA_HLE): Ditto.
        (IX86_HLE_ACQUIRE): Ditto.
        (IX86_HLE_RELEASE): Ditto.
        * config/i386/i386.h (ix86_generate_hle_prefix): Ditto.
        * config/i386/i386.opt (mhle): Ditto.
        * config/i386/sync.md(atomic_compare_and_swap<mode>): Pass
        success model to instruction emitter.
        (atomic_compare_and_swap_single<mode>): Define and use argument
        for success model.
        (atomic_compare_and_swap_double<mode>): Ditto.
        * doc/tm.texi.in (TARGET_EXTEND_HLE_MACRO): Describe.
        * hooks.h (hook_uhwi_uhwi_unchange): Declare.
        * hooks.c (hook_uhwi_uhwi_unchange): ... and define.
        * target.def (extend_hle_macro): New hook.
        * configure.ac: Check if assembler supports HLE prefixes.
        * configure: Regenerate.
        * config.in: Ditto.

Patch attached.

Thanks, K

[-- Attachment #2: hle-rfc-4.gcc.patch --]
[-- Type: application/octet-stream, Size: 19904 bytes --]

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 7afe61d..86bd4f8 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -5332,6 +5332,7 @@ static enum memmodel
 get_memmodel (tree exp)
 {
   rtx op;
+  unsigned model;
 
   /* If the parameter is not a constant, it's a run time value so we'll just
      convert it to MEMMODEL_SEQ_CST to avoid annoying runtime checking.  */
@@ -5339,13 +5340,16 @@ get_memmodel (tree exp)
     return MEMMODEL_SEQ_CST;
 
   op = expand_normal (exp);
-  if (INTVAL (op) < 0 || INTVAL (op) >= MEMMODEL_LAST)
+  if (INTVAL (op) < 0)
     {
       warning (OPT_Winvalid_memory_model,
 	       "invalid memory model argument to builtin");
       return MEMMODEL_SEQ_CST;
     }
-  return (enum memmodel) INTVAL (op);
+
+  model = targetm.extend_hle_macro (INTVAL (op));
+
+  return (enum memmodel) model;
 }
 
 /* Expand the __atomic_exchange intrinsic:
@@ -5392,11 +5396,14 @@ expand_builtin_atomic_compare_exchange (enum machine_mode mode, tree exp,
   enum memmodel success, failure;
   tree weak;
   bool is_weak;
+  /* Suppose that higher bits are target dependant.  */
+  unsigned memmodel_mask = (1<<16) - 1;
 
   success = get_memmodel (CALL_EXPR_ARG (exp, 4));
   failure = get_memmodel (CALL_EXPR_ARG (exp, 5));
 
-  if (failure == MEMMODEL_RELEASE || failure == MEMMODEL_ACQ_REL)
+  if ( (failure & memmodel_mask) == MEMMODEL_RELEASE
+       || (failure & memmodel_mask) == MEMMODEL_ACQ_REL)
     {
       error ("invalid failure memory model for %<__atomic_compare_exchange%>");
       return NULL_RTX;
diff --git a/gcc/config.in b/gcc/config.in
index 8806012..4560047 100644
--- a/gcc/config.in
+++ b/gcc/config.in
@@ -350,6 +350,11 @@
 #undef HAVE_AS_IX86_SAHF
 #endif
 
+/* Define if your assembler supports HLE prefixies. */
+#ifndef USED_FOR_TARGET
+#undef HAVE_AS_IX86_HLE
+#endif
+
 
 /* Define if your assembler supports the swap suffix. */
 #ifndef USED_FOR_TARGET
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 6696b7a..a9d25c5 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -66,6 +66,7 @@
 /* Extended Features (%eax == 7) */
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
+#define bit_HLE		(1 << 4)
 #define bit_AVX2	(1 << 5)
 #define bit_BMI2	(1 << 8)
 #define bit_RTM		(1 << 11)
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index 09de555..34cd096 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -397,6 +397,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
   unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_hle = 0;
 
   bool arch;
 
@@ -456,6 +457,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       __cpuid_count (7, 0, eax, ebx, ecx, edx);
 
       has_bmi = ebx & bit_BMI;
+      has_hle = ebx & bit_HLE;
       has_avx2 = ebx & bit_AVX2;
       has_bmi2 = ebx & bit_BMI2;
     }
@@ -726,10 +728,12 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *sse4_2 = has_sse4_2 ? " -msse4.2" : " -mno-sse4.2";
       const char *sse4_1 = has_sse4_1 ? " -msse4.1" : " -mno-sse4.1";
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
+      const char *hle = has_hle ? " -mhle" : "-mno-hle";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
 			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
-			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt,
+			hle, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 8adb3b4..3a5b08f 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -54,6 +54,7 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
   size_t tune_len = strlen (ix86_tune_string);
   int last_arch_char = ix86_arch_string[arch_len - 1];
   int last_tune_char = ix86_tune_string[tune_len - 1];
+  char hle_macro[64];
 
   /* Built-ins based on -march=.  */
   switch (arch)
@@ -293,6 +294,12 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__SSE_MATH__");
   if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2))
     def_or_undef (parse_in, "__SSE2_MATH__");
+
+  sprintf (hle_macro, "__ATOMIC_HLE_ACQUIRE=%d", IX86_HLE_ACQUIRE);
+  def_or_undef (parse_in, hle_macro);
+
+  sprintf (hle_macro, "__ATOMIC_HLE_RELEASE=%d", IX86_HLE_RELEASE);
+  def_or_undef (parse_in, hle_macro);
 }
 
 \f
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index f300a56..5832ab2 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -104,6 +104,7 @@ extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
 extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
 extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
 					rtx[]);
+extern const char* ix86_generate_hle_prefix (rtx memmodel);
 extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
 extern rtx ix86_build_signbit_mask (enum machine_mode, bool, bool);
 extern void ix86_split_convert_uns_si_sse (rtx[]);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 8974ddc..64802a4 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2679,6 +2679,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mbmi",		OPTION_MASK_ISA_BMI },
     { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
+    { "-mhle",		OPTION_MASK_ISA_HLE },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
     { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
@@ -2954,6 +2955,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
 #define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 #define PTA_RTM		 	(HOST_WIDE_INT_1 << 32)
+#define PTA_HLE	 		(HOST_WIDE_INT_1 << 33)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -3012,7 +3014,7 @@ ix86_option_override_internal (bool main_args_p)
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
 	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
-        | PTA_FMA | PTA_MOVBE | PTA_RTM},
+        | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3430,6 +3432,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_RTM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_RTM;
+	if (processor_alias_table[i].flags & PTA_HLE
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
+	  ix86_isa_flags |= OPTION_MASK_ISA_HLE;
 	if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
 	  x86_prefetch_sse = true;
 
@@ -4251,6 +4256,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
     IX86_ATTR_ISA ("f16c",	OPT_mf16c),
     IX86_ATTR_ISA ("rtm",	OPT_mrtm),
+    IX86_ATTR_ISA ("hle",	OPT_mhle),
 
     /* enum options */
     IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
@@ -16349,6 +16355,26 @@ ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
     emit_move_insn (operands[0], dst);
 }
 
+/* Emit HLE lock prefix depending if specified by memmodel value.  */
+const char*
+ix86_generate_hle_prefix (rtx memmodel)
+{
+  if (INTVAL (memmodel) & IX86_HLE_ACQUIRE)
+#ifdef HAVE_AS_IX86_HLE
+    return "xacquire ";
+#else
+  return "\n" ASM_BYTE "0xf2\n\t";
+#endif
+  else if (INTVAL (memmodel) & IX86_HLE_RELEASE)
+#ifdef HAVE_AS_IX86_HLE
+    return "xrelease ";
+#else
+  return "\n" ASM_BYTE "0xf3\n\t";
+#endif
+  else
+    return "";
+}
+
 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
    divisor are within the range [0-255].  */
 
@@ -38943,6 +38969,20 @@ ix86_autovectorize_vector_sizes (void)
   return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
 }
 
+static unsigned HOST_WIDE_INT
+ix86_extend_hle_macro (unsigned HOST_WIDE_INT memmodel)
+{
+  unsigned HOST_WIDE_INT result = memmodel;
+
+  if (memmodel & IX86_HLE_ACQUIRE)
+    result |= MEMMODEL_ACQUIRE;
+
+  if (memmodel & IX86_HLE_RELEASE)
+    result |= MEMMODEL_RELEASE;
+
+  return result;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_RETURN_IN_MEMORY
 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
@@ -39253,6 +39293,10 @@ ix86_autovectorize_vector_sizes (void)
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
 
+#undef TARGET_EXTEND_HLE_MACRO
+#define TARGET_EXTEND_HLE_MACRO ix86_extend_hle_macro
+
+
 #if TARGET_MACHO
 #undef TARGET_INIT_LIBFUNCS
 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8942ea8..0944260 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -75,6 +75,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_RDRND	OPTION_ISA_RDRND
 #define TARGET_F16C	OPTION_ISA_F16C
 #define TARGET_RTM      OPTION_ISA_RTM
+#define TARGET_HLE	OPTION_ISA_HLE
 
 #define TARGET_LP64	OPTION_ABI_64
 #define TARGET_X32	OPTION_ABI_X32
@@ -2344,6 +2345,9 @@ extern void debug_dispatch_window (int);
 #define TARGET_RECIP_VEC_DIV	((recip_mask & RECIP_MASK_VEC_DIV) != 0)
 #define TARGET_RECIP_VEC_SQRT	((recip_mask & RECIP_MASK_VEC_SQRT) != 0)
 
+#define IX86_HLE_ACQUIRE (1 << 16)
+#define IX86_HLE_RELEASE (1 << 17)
+
 /*
 Local variables:
 version-control: t
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index bf50aed..1d16149 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -528,6 +528,10 @@ mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
 
+mhle
+Target Report Mask(ISA_HLE) Var(ix86_isa_flags) Save
+Support Hardware Lock Elision prefixies
+
 mtbm
 Target Report Mask(ISA_TBM) Var(ix86_isa_flags) Save
 Support TBM built-in functions and code generation
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 18ccabf..f3673d7 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -315,8 +315,9 @@
    (match_operand:SI 7 "const_int_operand")]	;; failure model
   "TARGET_CMPXCHG"
 {
-  emit_insn (gen_atomic_compare_and_swap_single<mode>
-	     (operands[1], operands[2], operands[3], operands[4]));
+  emit_insn
+   (gen_atomic_compare_and_swap_single<mode>
+    (operands[1], operands[2], operands[3], operands[4], operands[6]));
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
   DONE;
@@ -344,8 +345,9 @@
 {
   if (<MODE>mode == DImode && TARGET_64BIT)
     {
-      emit_insn (gen_atomic_compare_and_swap_singledi
-		 (operands[1], operands[2], operands[3], operands[4]));
+      emit_insn
+       (gen_atomic_compare_and_swap_singledi
+	(operands[1], operands[2], operands[3], operands[4], operands[6]));
     }
   else
     {
@@ -370,7 +372,7 @@
 	mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
 
       emit_insn (gen_atomic_compare_and_swap_double<mode>
-		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n));
+		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n, operands[6]));
     }
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
@@ -382,14 +384,22 @@
 	(unspec_volatile:SWI
 	  [(match_operand:SWI 1 "memory_operand" "+m")
 	   (match_operand:SWI 2 "register_operand" "0")
-	   (match_operand:SWI 3 "register_operand" "<r>")]
+	   (match_operand:SWI 3 "register_operand" "<r>")
+	   (match_operand:SI 4 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_dup 1)
 	(unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG_2))
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_3))]
   "TARGET_CMPXCHG"
-  "lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}")
+{
+  static char buf[128];
+
+  snprintf (buf, sizeof (buf),
+	    "lock{%%;} %scmpxchg{<imodesuffix>}\t{%%3, %%1|%%1, %%3}",
+	    ix86_generate_hle_prefix(operands[4]));
+  return buf;
+})
 
 ;; For double-word compare and swap, we are obliged to play tricks with
 ;; the input newval (op5:op6) because the Intel register numbering does
@@ -403,7 +413,8 @@
 	   (match_operand:<DCASHMODE> 3 "register_operand" "0")
 	   (match_operand:<DCASHMODE> 4 "register_operand" "1")
 	   (match_operand:<DCASHMODE> 5 "register_operand" "b")
-	   (match_operand:<DCASHMODE> 6 "register_operand" "c")]
+	   (match_operand:<DCASHMODE> 6 "register_operand" "c")
+	   (match_operand:SI 7 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_operand:<DCASHMODE> 1 "register_operand" "=d")
 	(unspec_volatile:<DCASHMODE> [(const_int 0)] UNSPECV_CMPXCHG_2))
@@ -412,7 +423,18 @@
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_4))]
   ""
-  "lock{%;} cmpxchg<doublemodesuffix>b\t%2")
+{
+  static char buf[128], hle[16];
+
+  if (INTVAL (operands[4]) & IX86_HLE_ACQUIRE)
+    snprintf (hle, sizeof (hle), "xacquire ");
+  else if (INTVAL (operands[4]) & IX86_HLE_RELEASE)
+    snprintf (hle, sizeof (hle), "release ");
+
+  snprintf (buf, sizeof (buf),
+	    "lock{%%;} %scmpxchg<doublemodesuffix>b\t%%2", hle);
+  return buf;
+})
 
 ;; Theoretically we'd like to use constraint "r" (any reg) for op5,
 ;; but that includes ecx.  If op5 and op6 are the same (like when
diff --git a/gcc/configure b/gcc/configure
index c1b0e46..55c3230 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -24628,6 +24628,39 @@ $as_echo "#define HAVE_AS_IX86_SAHF 1" >>confdefs.h
 
 fi
 
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for hle prefixies" >&5
+$as_echo_n "checking assembler for hle prefixies... " >&6; }
+if test "${gcc_cv_as_ix86_hle+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  gcc_cv_as_ix86_hle=no
+  if test x$gcc_cv_as != x; then
+    $as_echo '.code64
+       lock xacquire cmpxchg %esi, (%rcx)
+       ' > conftest.s
+    if { ac_try='$gcc_cv_as $gcc_cv_as_flags  -o conftest.o conftest.s >&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+    then
+	gcc_cv_as_ix86_hle=yes
+    else
+      echo "configure: failed program was" >&5
+      cat conftest.s >&5
+    fi
+    rm -f conftest.o conftest.s
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_as_ix86_hle" >&5
+$as_echo "$gcc_cv_as_ix86_hle" >&6; }
+if test $gcc_cv_as_ix86_hle = yes; then
+
+$as_echo "#define HAVE_AS_IX86_HLE 1" >>confdefs.h
+
+fi
+
     { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for swap suffix" >&5
 $as_echo_n "checking assembler for swap suffix... " >&6; }
 if test "${gcc_cv_as_ix86_swap+set}" = set; then :
diff --git a/gcc/configure.ac b/gcc/configure.ac
index 8869121..a1c2c67 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -3597,6 +3597,14 @@ foo:	nop
       [AC_DEFINE(HAVE_AS_IX86_SAHF, 1,
         [Define if your assembler supports the sahf mnemonic in 64bit mode.])])
 
+    gcc_GAS_CHECK_FEATURE([hle prefixies],
+      gcc_cv_as_ix86_hle,,,
+      [.code64
+       lock xacquire cmpxchg %esi, (%rcx)
+       ],,
+      [AC_DEFINE(HAVE_AS_IX86_HLE, 1,
+        [Define if your assembler supports HLE prefixies.])])
+
     gcc_GAS_CHECK_FEATURE([swap suffix],
       gcc_cv_as_ix86_swap,,,
       [movl.s %esp, %ebp],,
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 2891bb6..3fc844c 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11158,6 +11158,12 @@ This target hook is required only when the target has several different
 modes and they have different conditional execution capability, such as ARM.
 @end deftypefn
 
+@deftypefn {Target Hook} {unsigned HOST_WIDE_INT} TARGET_EXTEND_HLE_MACRO (unsigned @var{HOST_WIDE_INT})
+This target hook helps to make HLE-related memmodel values to imply
+standard __ATOMIC_ACQUIRE and __ATOMIC_RELEASE.
+@end deftypefn
+
+
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, struct loop *@var{loop})
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index a222654..df3848d 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -11038,6 +11038,12 @@ This target hook is required only when the target has several different
 modes and they have different conditional execution capability, such as ARM.
 @end deftypefn
 
+@hook TARGET_EXTEND_HLE_MACRO
+This target hook helps to make HLE-related memmodel values to imply
+standard __ATOMIC_ACQUIRE and __ATOMIC_RELEASE.
+@end deftypefn
+
+
 @hook TARGET_LOOP_UNROLL_ADJUST
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/hooks.c b/gcc/hooks.c
index ae59c33..a8e3f5d 100644
--- a/gcc/hooks.c
+++ b/gcc/hooks.c
@@ -159,6 +159,12 @@ hook_bool_const_tree_hwi_hwi_const_tree_true (const_tree a ATTRIBUTE_UNUSED,
   return true;
 }
 
+unsigned HOST_WIDE_INT
+hook_uhwi_uhwi_unchange (unsigned HOST_WIDE_INT memmodel)
+{
+  return memmodel;
+}
+
 bool
 default_can_output_mi_thunk_no_vcall (const_tree a ATTRIBUTE_UNUSED,
 				      HOST_WIDE_INT b ATTRIBUTE_UNUSED,
diff --git a/gcc/hooks.h b/gcc/hooks.h
index 2e10d1f..69a0b02 100644
--- a/gcc/hooks.h
+++ b/gcc/hooks.h
@@ -50,6 +50,9 @@ extern bool hook_bool_const_tree_hwi_hwi_const_tree_true (const_tree,
 							  HOST_WIDE_INT,
 							  HOST_WIDE_INT,
 							  const_tree);
+extern unsigned HOST_WIDE_INT
+hook_uhwi_uhwi_unchange (unsigned HOST_WIDE_INT);
+
 extern bool hook_bool_rtx_false (rtx);
 extern bool hook_bool_rtx_int_false (rtx, int);
 extern bool hook_bool_uintp_uintp_false (unsigned int *, unsigned int *);
diff --git a/gcc/target.def b/gcc/target.def
index d658b11..df7300b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1326,6 +1326,14 @@ DEFHOOK
  bool, (void),
  default_have_conditional_execution)
 
+/* Return true if the target supports conditional execution.  */
+DEFHOOK
+(extend_hle_macro,
+ "",
+ unsigned HOST_WIDE_INT, (unsigned HOST_WIDE_INT),
+ hook_uhwi_uhwi_unchange)
+
+
 /* Return a new value for loop unroll size.  */
 DEFHOOK
 (loop_unroll_adjust,
diff --git a/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c
new file mode 100644
index 0000000..8b43e54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mhle" } */
+/* { dg-final { scan-assembler "lock\[ \n\t\]+\(xacquire\|\.byte\[ \t\]+0xf2\)\[ \t\n\]+cmpxchg" } } */
+
+int
+hle_cmpxchg (int *p, int oldv, int newv)
+{
+  return __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE, __ATOMIC_ACQUIRE);
+}
diff --git a/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c
new file mode 100644
index 0000000..8549542
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mhle" } */
+/* { dg-final { scan-assembler "lock\[ \n\t\]+\(xrelease\|\.byte\[ \t\]+0xf3\)\[ \t\n\]+cmpxchg" } } */
+
+int
+hle_cmpxchg (int *p, int oldv, int newv)
+{
+  return __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE, __ATOMIC_ACQUIRE);
+}

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 15:53                             ` Kirill Yukhin
@ 2012-04-11 16:07                               ` Andi Kleen
  2012-04-11 16:11                                 ` Jakub Jelinek
  2012-04-11 19:03                                 ` Uros Bizjak
  0 siblings, 2 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-11 16:07 UTC (permalink / raw)
  To: Kirill Yukhin
  Cc: Andi Kleen, Jakub Jelinek, Uros Bizjak, H.J. Lu, GCC Patches

On Wed, Apr 11, 2012 at 07:52:59PM +0400, Kirill Yukhin wrote:
> Yet another iteration :)
> 
> >> > Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
> Sorry, Andi. Added. So, at the moment you can do smth like
>   __atomic_compare_exchange_n (p, &oldv, newv, 0,
> __ATOMIC_HLE_ACQUIRE, __ATOMIC_ACQUIRE);
> And will get __ATOMIC_ACQUIRE model as well for success model.
> 
> I've also removed a few defines (like __HLE__), made the HLE defines
> unconditional, extended autoconf to check if the assembler can generate
> HLE, and added a hook to make HLE acquire/release imply the standard models.
> I also made the bits above the 16th bit of the memmodel enum target dependent.

+static unsigned HOST_WIDE_INT
+ix86_extend_hle_macro (unsigned HOST_WIDE_INT memmodel)
+{
+  unsigned HOST_WIDE_INT result = memmodel;
+
+  if (memmodel & IX86_HLE_ACQUIRE)
+    result |= MEMMODEL_ACQUIRE;
+
+  if (memmodel & IX86_HLE_RELEASE)
+    result |= MEMMODEL_RELEASE;
+           
+  return result;
+}

This needs to check whether an existing memory model is already set;
multiple MEMMODEL_* values do not OR together correctly.

Also I would not call the target hook "hle", it could be used
for other things too.

+mhle
+Target Report Mask(ISA_HLE) Var(ix86_isa_flags) Save
+Support Hardware Lock Elision prefixies

Typo

Also the new flags (plus the implied barrier semantics) 
need to be described in the manual too.

+  static char buf[128], hle[16];

The hle buffer does not need to be static.
BTW I'm surprised there is no better way to do this in machine descriptions
than to use static buffers.

Other than that it looks good to me. Thanks.

-Andi

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 16:07                               ` Andi Kleen
@ 2012-04-11 16:11                                 ` Jakub Jelinek
  2012-04-11 16:19                                   ` Andi Kleen
  2012-04-11 19:03                                 ` Uros Bizjak
  1 sibling, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-11 16:11 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

On Wed, Apr 11, 2012 at 06:06:58PM +0200, Andi Kleen wrote:
> +static unsigned HOST_WIDE_INT
> +ix86_extend_hle_macro (unsigned HOST_WIDE_INT memmodel)
> +{
> +  unsigned HOST_WIDE_INT result = memmodel;
> +
> +  if (memmodel & IX86_HLE_ACQUIRE)
> +    result |= MEMMODEL_ACQUIRE;
> +
> +  if (memmodel & IX86_HLE_RELEASE)
> +    result |= MEMMODEL_RELEASE;
> +           
> +  return result;
> +}

I actually think it is a bad idea to imply any memory model
from the HLE bits.  If anything, we should warn for memmodel
+ hle bit combinations that are unlikely to DTRT.
Let the developers really use the model they want.

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 16:11                                 ` Jakub Jelinek
@ 2012-04-11 16:19                                   ` Andi Kleen
  2012-04-11 16:24                                     ` Jakub Jelinek
  0 siblings, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2012-04-11 16:19 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

> I actually think it is a bad idea to imply any memory model
> from the HLE bits.  If anything, we should warn for memmodel
> + hle bit combinations that are unlikely to DTRT.

This would be a warning with _RELAXED/_CONSUME, but there may be very 
obscure situations where someone really wants that (but then there
would be no way to get rid of the warning). So if you do that you would
need another flag to disable the warning.

The "default to ACQUIRE/RELEASE but allow override with no warning" model seems
more user friendly to me.

> Let the developers really use the model they want.

You assume they really want one :-)

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 16:19                                   ` Andi Kleen
@ 2012-04-11 16:24                                     ` Jakub Jelinek
  2012-04-11 16:40                                       ` Andi Kleen
  0 siblings, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-11 16:24 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

On Wed, Apr 11, 2012 at 06:18:56PM +0200, Andi Kleen wrote:
> > I actually think it is a bad idea to imply any memory model
> > from the HLE bits.  If anything, we should warn for memmodel
> > + hle bit combinations that are unlikely to DTRT.
> 
> This would be a warning with _RELAXED/_CONSUME, but there may be very 
> obscure situations where someone really wants that (but then there
> would be no way to get rid of the warning). So if you do that you would
> need another flag to disable the warning.
> 
> The "default to ACQUIRE/RELEASE but allow override with no warning" model seems
> more user friendly to me.

But such a model isn't possible.  The HLE bits are just some high bits
ored into the memory model enum.  So, if you use
__ATOMIC_HLE_ACQUIRE, it is the same thing as
__ATOMIC_HLE_ACQUIRE | __ATOMIC_RELAXED and thus it is a relaxed xacquire,
not xacquire with default memory model.
__atomic_* builtins were introduced already in GCC 4.7, so we need to do
a compatible extension...

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 16:24                                     ` Jakub Jelinek
@ 2012-04-11 16:40                                       ` Andi Kleen
  2012-04-11 18:52                                         ` Jakub Jelinek
  0 siblings, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2012-04-11 16:40 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

> But such a model isn't possible.  The HLE bits are just some high bits
> ored into the memory model enum.  So, if you use
> __ATOMIC_HLE_ACQUIRE, it is the same thing as
> __ATOMIC_HLE_ACQUIRE | __ATOMIC_RELAXED and thus it is a relaxed xacquire,
> not xacquire with default memory model.
> __atomic_* builtins were introduced already in GCC 4.7, so we need to do
> a compatible extension...

That's true. Actually I see the values are defined by the compiler
at compile time, so it would be possible to move all one up?

For compatibility just would need to make sure that 0 still means relaxed
without HLE.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 16:40                                       ` Andi Kleen
@ 2012-04-11 18:52                                         ` Jakub Jelinek
  2012-04-11 21:04                                           ` Andi Kleen
  0 siblings, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-11 18:52 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

On Wed, Apr 11, 2012 at 06:40:03PM +0200, Andi Kleen wrote:
> > But such a model isn't possible.  The HLE bits are just some high bits
> > ored into the memory model enum.  So, if you use
> > __ATOMIC_HLE_ACQUIRE, it is the same thing as
> > __ATOMIC_HLE_ACQUIRE | __ATOMIC_RELAXED and thus it is a relaxed xacquire,
> > not xacquire with default memory model.
> > __atomic_* builtins were introduced already in GCC 4.7, so we need to do
> > a compatible extension...
> 
> That's true. Actually I see the values are defined by the compiler
> at compile time, so it would be possible to move all one up?

No, that is IMHO not possible.  They need to match the enum values that are
part of libstdc++ ABI already, and not everybody is going to use the
__ATOMIC_* macros anyway.

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 16:07                               ` Andi Kleen
  2012-04-11 16:11                                 ` Jakub Jelinek
@ 2012-04-11 19:03                                 ` Uros Bizjak
  1 sibling, 0 replies; 49+ messages in thread
From: Uros Bizjak @ 2012-04-11 19:03 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Kirill Yukhin, Jakub Jelinek, H.J. Lu, GCC Patches

On Wed, Apr 11, 2012 at 6:06 PM, Andi Kleen <andi@firstfloor.org> wrote:

> +  static char buf[128], hle[16];
>
> The hle buffer does not need to be static.
> BTW I'm surprised there is no better way to do this in machine descriptions
> than to use static buffers.

Oh, there is. Since we are looking at the operands, we can process
this operand with operand modifier in the asm template.

Basically, "lock{%;} %K4 cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}",
where K modifier would print correct string from x86_print_operand,
depending on the INTVAL value of operand4.

Uros.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 18:52                                         ` Jakub Jelinek
@ 2012-04-11 21:04                                           ` Andi Kleen
  0 siblings, 0 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-11 21:04 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, GCC Patches

> > That's true. Actually I see the values are defined by the compiler
> > at compile time, so it would be possible to move all one up?
> 
> No, that is IMHO not possible.  They need to match the enum values that are
> part of libstdc++ ABI already, and not everybody is going to use the
> __ATOMIC_* macros anyway.

I see. Maybe it's best to just forbid HLE_ACQUIRE + RELAXED 
and HLE_RELEASE + RELAXED, because they are very very likely to be wrong. 
Since this is all implementation specific anyway, we could just force them to 
HLE_ACQUIRE+ACQUIRE and HLE_RELEASE+RELEASE.

Essentially Kirill's patch did that except it missed the check
to only do it when RELAXED is in the lower bits.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 13:06                       ` Andi Kleen
  2012-04-11 13:13                         ` Jakub Jelinek
@ 2012-04-11 22:39                         ` Torvald Riegel
  2012-04-12  8:46                           ` Kirill Yukhin
                                             ` (2 more replies)
  1 sibling, 3 replies; 49+ messages in thread
From: Torvald Riegel @ 2012-04-11 22:39 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Kirill Yukhin, Uros Bizjak, H.J. Lu, Jakub Jelinek, GCC Patches

On Wed, 2012-04-11 at 15:06 +0200, Andi Kleen wrote:
> > Tests passing, bootstrap in progress.
> > 
> > Comments?
> 
> Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
> see that in the code. I think that's really required, otherwise the optimizer
> will do the wrong thing and move memory references outside the region.

Perhaps HLE_ACQUIRE / HLE_RELEASE should be something like HLE_START /
HLE_END instead?  Not particularly great names, but at least it avoids
overloading ACQUIRE/RELEASE and thus should make it clearer that you
still need to specify a memory order.

I agree with Jakub that users really should specify memory order bits,
if they want ordering.  Andi, I also see your point regarding catching
bugs, but this is really expert stuff, and my hope is that we can make
HLE really transparent or at least provide better abstractions around it
(than to just offer the plain GCC builtins).


Torvald

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 22:39                         ` Torvald Riegel
@ 2012-04-12  8:46                           ` Kirill Yukhin
  2012-04-12  9:37                             ` Kirill Yukhin
  2012-04-12 11:36                           ` Andi Kleen
  2012-04-12 12:22                           ` Andrew MacLeod
  2 siblings, 1 reply; 49+ messages in thread
From: Kirill Yukhin @ 2012-04-12  8:46 UTC (permalink / raw)
  To: Torvald Riegel
  Cc: Andi Kleen, Uros Bizjak, H.J. Lu, Jakub Jelinek, GCC Patches

> Perhaps HLE_ACQUIRE / HLE_RELEASE should be something like HLE_START /
> HLE_END instead?  Not particularly great names, but at least it avoids
> overloading ACQUIRE/RELEASE and thus should make it clearer that you
> still need to specify a memory order.
>
IMHO, this is also not as good, since ACQUIRE/RELEASE reflect actual
prefixies names.
HLE_START/END may confuse user even more.

I am also, agree with Jakub, such things shouldn't be implied. Going
to remove it from patch

K

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12  8:46                           ` Kirill Yukhin
@ 2012-04-12  9:37                             ` Kirill Yukhin
  2012-04-12  9:47                               ` Jakub Jelinek
  0 siblings, 1 reply; 49+ messages in thread
From: Kirill Yukhin @ 2012-04-12  9:37 UTC (permalink / raw)
  To: Torvald Riegel, Andi Kleen, Uros Bizjak, H.J. Lu, Jakub Jelinek
  Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1591 bytes --]

Folks,
Here is patch with removed implied atomic ACQUIRE/RELEASE. Could you
please have a look?

ChangeLog entry:
2012-04-12  Kirill Yukhin  <kirill.yukhin@intel.com>

        * builtins.c (get_memmodel): Remove check of upper bound.
        (expand_builtin_atomic_compare_exchange): Mask memmodel values.
        * config/i386/cpuid.h (bit_HLE): New.
        * config/i386/driver-i386.c (host_detect_local_cpu): Detect
        HLE support.
        * config/i386/i386-protos.h (ix86_generate_hle_prefix): New.
        * config/i386/i386-c.c (ix86_target_macros_internal): Set
        HLE defines.
        (ix86_target_string)<-mhle>: New.
        (ix86_option_override_internal)<PTA_HLE>: Ditto.
        (ix86_valid_target_attribute_inner_p)<OPT_mhle>: Ditto.
        * config/i386/i386.c (ix86_target_string)<OPTION_MASK_ISA_HLE>:
        New.
        (ix86_valid_target_attribute_inner_p)<OPT_mhle>: Ditto.
        (ix86_generate_hle_prefix): Ditto.
        * config/i386/i386.h (OPTION_ISA_HLE): Ditto.
        (IX86_HLE_ACQUIRE): Ditto.
        (IX86_HLE_RELEASE): Ditto.
        * config/i386/i386.h (ix86_generate_hle_prefix): Ditto.
        * config/i386/i386.opt (mhle): Ditto.
        * config/i386/sync.md(atomic_compare_and_swap<mode>): Pass
        success model to instruction emitter.
        (atomic_compare_and_swap_single<mode>): Define and use argument
        for success model.
        (atomic_compare_and_swap_double<mode>): Ditto.
        * configure.ac: Check if assembler supports HLE prefixes.
        * configure: Regenerate.
        * config.in: Ditto.

Thanks, K

[-- Attachment #2: hle-rfc-5.gcc.patch --]
[-- Type: application/octet-stream, Size: 15650 bytes --]

diff --git a/gcc/builtins.c b/gcc/builtins.c
index b47f218..427bd6b 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -5345,12 +5345,13 @@ get_memmodel (tree exp)
     return MEMMODEL_SEQ_CST;
 
   op = expand_normal (exp);
-  if (INTVAL (op) < 0 || INTVAL (op) >= MEMMODEL_LAST)
+  if (INTVAL (op) < 0)
     {
       warning (OPT_Winvalid_memory_model,
 	       "invalid memory model argument to builtin");
       return MEMMODEL_SEQ_CST;
     }
+
   return (enum memmodel) INTVAL (op);
 }
 
@@ -5398,11 +5399,14 @@ expand_builtin_atomic_compare_exchange (enum machine_mode mode, tree exp,
   enum memmodel success, failure;
   tree weak;
   bool is_weak;
+  /* Suppose that the higher bits are target dependent.  */
+  unsigned memmodel_mask = (1<<16) - 1;
 
   success = get_memmodel (CALL_EXPR_ARG (exp, 4));
   failure = get_memmodel (CALL_EXPR_ARG (exp, 5));
 
-  if (failure == MEMMODEL_RELEASE || failure == MEMMODEL_ACQ_REL)
+  if ( (failure & memmodel_mask) == MEMMODEL_RELEASE
+       || (failure & memmodel_mask) == MEMMODEL_ACQ_REL)
     {
       error ("invalid failure memory model for %<__atomic_compare_exchange%>");
       return NULL_RTX;
diff --git a/gcc/config.in b/gcc/config.in
index 8806012..4560047 100644
--- a/gcc/config.in
+++ b/gcc/config.in
@@ -350,6 +350,11 @@
 #undef HAVE_AS_IX86_SAHF
 #endif
 
+/* Define if your assembler supports HLE prefixies. */
+#ifndef USED_FOR_TARGET
+#undef HAVE_AS_IX86_HLE
+#endif
+
 
 /* Define if your assembler supports the swap suffix. */
 #ifndef USED_FOR_TARGET
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index 6696b7a..a9d25c5 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -66,6 +66,7 @@
 /* Extended Features (%eax == 7) */
 #define bit_FSGSBASE	(1 << 0)
 #define bit_BMI		(1 << 3)
+#define bit_HLE		(1 << 4)
 #define bit_AVX2	(1 << 5)
 #define bit_BMI2	(1 << 8)
 #define bit_RTM		(1 << 11)
diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c
index 09de555..34cd096 100644
--- a/gcc/config/i386/driver-i386.c
+++ b/gcc/config/i386/driver-i386.c
@@ -397,6 +397,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
   unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
   unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
   unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_hle = 0;
 
   bool arch;
 
@@ -456,6 +457,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       __cpuid_count (7, 0, eax, ebx, ecx, edx);
 
       has_bmi = ebx & bit_BMI;
+      has_hle = ebx & bit_HLE;
       has_avx2 = ebx & bit_AVX2;
       has_bmi2 = ebx & bit_BMI2;
     }
@@ -726,10 +728,12 @@ const char *host_detect_local_cpu (int argc, const char **argv)
       const char *sse4_2 = has_sse4_2 ? " -msse4.2" : " -mno-sse4.2";
       const char *sse4_1 = has_sse4_1 ? " -msse4.1" : " -mno-sse4.1";
       const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
+      const char *hle = has_hle ? " -mhle" : " -mno-hle";
 
       options = concat (options, cx16, sahf, movbe, ase, pclmul,
 			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
-			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, NULL);
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt,
+			hle, NULL);
     }
 
 done:
diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index 8adb3b4..3a5b08f 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -54,6 +54,7 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
   size_t tune_len = strlen (ix86_tune_string);
   int last_arch_char = ix86_arch_string[arch_len - 1];
   int last_tune_char = ix86_tune_string[tune_len - 1];
+  char hle_macro[64];
 
   /* Built-ins based on -march=.  */
   switch (arch)
@@ -293,6 +294,12 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     def_or_undef (parse_in, "__SSE_MATH__");
   if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2))
     def_or_undef (parse_in, "__SSE2_MATH__");
+
+  sprintf (hle_macro, "__ATOMIC_HLE_ACQUIRE=%d", IX86_HLE_ACQUIRE);
+  def_or_undef (parse_in, hle_macro);
+
+  sprintf (hle_macro, "__ATOMIC_HLE_RELEASE=%d", IX86_HLE_RELEASE);
+  def_or_undef (parse_in, hle_macro);
 }
 
 \f
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index f300a56..5832ab2 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -104,6 +104,7 @@ extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
 extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
 extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
 					rtx[]);
+extern const char* ix86_generate_hle_prefix (rtx memmodel);
 extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
 extern rtx ix86_build_signbit_mask (enum machine_mode, bool, bool);
 extern void ix86_split_convert_uns_si_sse (rtx[]);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index af4af7c..2283cd7 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2679,6 +2679,7 @@ ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
     { "-mbmi",		OPTION_MASK_ISA_BMI },
     { "-mbmi2", 	OPTION_MASK_ISA_BMI2 },
     { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
+    { "-mhle",		OPTION_MASK_ISA_HLE },
     { "-mtbm",		OPTION_MASK_ISA_TBM },
     { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
     { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
@@ -2954,6 +2955,7 @@ ix86_option_override_internal (bool main_args_p)
 #define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
 #define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
 #define PTA_RTM		 	(HOST_WIDE_INT_1 << 32)
+#define PTA_HLE	 		(HOST_WIDE_INT_1 << 33)
 /* if this reaches 64, need to widen struct pta flags below */
 
   static struct pta
@@ -3012,7 +3014,7 @@ ix86_option_override_internal (bool main_args_p)
 	| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 	| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
 	| PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
-        | PTA_FMA | PTA_MOVBE | PTA_RTM},
+        | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE},
       {"atom", PROCESSOR_ATOM, CPU_ATOM,
 	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 	| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3430,6 +3432,9 @@ ix86_option_override_internal (bool main_args_p)
 	if (processor_alias_table[i].flags & PTA_RTM
 	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
 	  ix86_isa_flags |= OPTION_MASK_ISA_RTM;
+	if (processor_alias_table[i].flags & PTA_HLE
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
+	  ix86_isa_flags |= OPTION_MASK_ISA_HLE;
 	if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
 	  x86_prefetch_sse = true;
 
@@ -4251,6 +4256,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
     IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
     IX86_ATTR_ISA ("f16c",	OPT_mf16c),
     IX86_ATTR_ISA ("rtm",	OPT_mrtm),
+    IX86_ATTR_ISA ("hle",	OPT_mhle),
 
     /* enum options */
     IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
@@ -16349,6 +16355,26 @@ ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
     emit_move_insn (operands[0], dst);
 }
 
+/* Emit HLE lock prefix depending if specified by memmodel value.  */
+const char*
+ix86_generate_hle_prefix (rtx memmodel)
+{
+  if (INTVAL (memmodel) & IX86_HLE_ACQUIRE)
+#ifdef HAVE_AS_IX86_HLE
+    return "xacquire ";
+#else
+  return "\n" ASM_BYTE "0xf2\n\t";
+#endif
+  else if (INTVAL (memmodel) & IX86_HLE_RELEASE)
+#ifdef HAVE_AS_IX86_HLE
+    return "xrelease ";
+#else
+  return "\n" ASM_BYTE "0xf3\n\t";
+#endif
+  else
+    return "";
+}
+
 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
    divisor are within the range [0-255].  */
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8942ea8..0944260 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -75,6 +75,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_RDRND	OPTION_ISA_RDRND
 #define TARGET_F16C	OPTION_ISA_F16C
 #define TARGET_RTM      OPTION_ISA_RTM
+#define TARGET_HLE	OPTION_ISA_HLE
 
 #define TARGET_LP64	OPTION_ABI_64
 #define TARGET_X32	OPTION_ABI_X32
@@ -2344,6 +2345,9 @@ extern void debug_dispatch_window (int);
 #define TARGET_RECIP_VEC_DIV	((recip_mask & RECIP_MASK_VEC_DIV) != 0)
 #define TARGET_RECIP_VEC_SQRT	((recip_mask & RECIP_MASK_VEC_SQRT) != 0)
 
+#define IX86_HLE_ACQUIRE (1 << 16)
+#define IX86_HLE_RELEASE (1 << 17)
+
 /*
 Local variables:
 version-control: t
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index bf50aed..1d16149 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -528,6 +528,10 @@ mlzcnt
 Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
 Support LZCNT built-in function and code generation
 
+mhle
+Target Report Mask(ISA_HLE) Var(ix86_isa_flags) Save
+Support Hardware Lock Elision prefixies
+
 mtbm
 Target Report Mask(ISA_TBM) Var(ix86_isa_flags) Save
 Support TBM built-in functions and code generation
diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index 18ccabf..f3673d7 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -315,8 +315,9 @@
    (match_operand:SI 7 "const_int_operand")]	;; failure model
   "TARGET_CMPXCHG"
 {
-  emit_insn (gen_atomic_compare_and_swap_single<mode>
-	     (operands[1], operands[2], operands[3], operands[4]));
+  emit_insn
+   (gen_atomic_compare_and_swap_single<mode>
+    (operands[1], operands[2], operands[3], operands[4], operands[6]));
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
   DONE;
@@ -344,8 +345,9 @@
 {
   if (<MODE>mode == DImode && TARGET_64BIT)
     {
-      emit_insn (gen_atomic_compare_and_swap_singledi
-		 (operands[1], operands[2], operands[3], operands[4]));
+      emit_insn
+       (gen_atomic_compare_and_swap_singledi
+	(operands[1], operands[2], operands[3], operands[4], operands[6]));
     }
   else
     {
@@ -370,7 +372,7 @@
 	mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
 
       emit_insn (gen_atomic_compare_and_swap_double<mode>
-		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n));
+		 (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n, operands[6]));
     }
   ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
 		     const0_rtx);
@@ -382,14 +384,22 @@
 	(unspec_volatile:SWI
 	  [(match_operand:SWI 1 "memory_operand" "+m")
 	   (match_operand:SWI 2 "register_operand" "0")
-	   (match_operand:SWI 3 "register_operand" "<r>")]
+	   (match_operand:SWI 3 "register_operand" "<r>")
+	   (match_operand:SI 4 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_dup 1)
 	(unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG_2))
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_3))]
   "TARGET_CMPXCHG"
-  "lock{%;} cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}")
+{
+  static char buf[128];
+
+  snprintf (buf, sizeof (buf),
+	    "lock{%%;} %scmpxchg{<imodesuffix>}\t{%%3, %%1|%%1, %%3}",
+	    ix86_generate_hle_prefix(operands[4]));
+  return buf;
+})
 
 ;; For double-word compare and swap, we are obliged to play tricks with
 ;; the input newval (op5:op6) because the Intel register numbering does
@@ -403,7 +413,8 @@
 	   (match_operand:<DCASHMODE> 3 "register_operand" "0")
 	   (match_operand:<DCASHMODE> 4 "register_operand" "1")
 	   (match_operand:<DCASHMODE> 5 "register_operand" "b")
-	   (match_operand:<DCASHMODE> 6 "register_operand" "c")]
+	   (match_operand:<DCASHMODE> 6 "register_operand" "c")
+	   (match_operand:SI 7 "const_int_operand")]
 	  UNSPECV_CMPXCHG_1))
    (set (match_operand:<DCASHMODE> 1 "register_operand" "=d")
 	(unspec_volatile:<DCASHMODE> [(const_int 0)] UNSPECV_CMPXCHG_2))
@@ -412,7 +423,18 @@
    (set (reg:CCZ FLAGS_REG)
         (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG_4))]
   ""
-  "lock{%;} cmpxchg<doublemodesuffix>b\t%2")
+{
+  static char buf[128], hle[16];
+
+  if (INTVAL (operands[4]) & IX86_HLE_ACQUIRE)
+    snprintf (hle, sizeof (hle), "xacquire ");
+  else if (INTVAL (operands[4]) & IX86_HLE_RELEASE)
+    snprintf (hle, sizeof (hle), "release ");
+
+  snprintf (buf, sizeof (buf),
+	    "lock{%%;} %scmpxchg<doublemodesuffix>b\t%%2", hle);
+  return buf;
+})
 
 ;; Theoretically we'd like to use constraint "r" (any reg) for op5,
 ;; but that includes ecx.  If op5 and op6 are the same (like when
diff --git a/gcc/configure b/gcc/configure
index c1b0e46..55c3230 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -24628,6 +24628,39 @@ $as_echo "#define HAVE_AS_IX86_SAHF 1" >>confdefs.h
 
 fi
 
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for hle prefixies" >&5
+$as_echo_n "checking assembler for hle prefixies... " >&6; }
+if test "${gcc_cv_as_ix86_hle+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  gcc_cv_as_ix86_hle=no
+  if test x$gcc_cv_as != x; then
+    $as_echo '.code64
+       lock xacquire cmpxchg %esi, (%rcx)
+       ' > conftest.s
+    if { ac_try='$gcc_cv_as $gcc_cv_as_flags  -o conftest.o conftest.s >&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+    then
+	gcc_cv_as_ix86_hle=yes
+    else
+      echo "configure: failed program was" >&5
+      cat conftest.s >&5
+    fi
+    rm -f conftest.o conftest.s
+  fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_as_ix86_hle" >&5
+$as_echo "$gcc_cv_as_ix86_hle" >&6; }
+if test $gcc_cv_as_ix86_hle = yes; then
+
+$as_echo "#define HAVE_AS_IX86_HLE 1" >>confdefs.h
+
+fi
+
     { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for swap suffix" >&5
 $as_echo_n "checking assembler for swap suffix... " >&6; }
 if test "${gcc_cv_as_ix86_swap+set}" = set; then :
diff --git a/gcc/configure.ac b/gcc/configure.ac
index 8869121..a1c2c67 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -3597,6 +3597,14 @@ foo:	nop
       [AC_DEFINE(HAVE_AS_IX86_SAHF, 1,
         [Define if your assembler supports the sahf mnemonic in 64bit mode.])])
 
+    gcc_GAS_CHECK_FEATURE([hle prefixies],
+      gcc_cv_as_ix86_hle,,,
+      [.code64
+       lock xacquire cmpxchg %esi, (%rcx)
+       ],,
+      [AC_DEFINE(HAVE_AS_IX86_HLE, 1,
+        [Define if your assembler supports HLE prefixies.])])
+
     gcc_GAS_CHECK_FEATURE([swap suffix],
       gcc_cv_as_ix86_swap,,,
       [movl.s %esp, %ebp],,
diff --git a/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c
new file mode 100644
index 0000000..8b43e54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-acq-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mhle" } */
+/* { dg-final { scan-assembler "lock\[ \n\t\]+\(xacquire\|\.byte\[ \t\]+0xf2\)\[ \t\n\]+cmpxchg" } } */
+
+int
+hle_cmpxchg (int *p, int oldv, int newv)
+{
+  return __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE, __ATOMIC_ACQUIRE);
+}
diff --git a/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c
new file mode 100644
index 0000000..8549542
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/hle-cmpxchg-rel-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-mhle" } */
+/* { dg-final { scan-assembler "lock\[ \n\t\]+\(xrelease\|\.byte\[ \t\]+0xf3\)\[ \t\n\]+cmpxchg" } } */
+
+int
+hle_cmpxchg (int *p, int oldv, int newv)
+{
+  return __atomic_compare_exchange_n (p, &oldv, newv, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE, __ATOMIC_ACQUIRE);
+}

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12  9:37                             ` Kirill Yukhin
@ 2012-04-12  9:47                               ` Jakub Jelinek
  0 siblings, 0 replies; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-12  9:47 UTC (permalink / raw)
  To: Kirill Yukhin, Richard Henderson
  Cc: Torvald Riegel, Andi Kleen, Uros Bizjak, H.J. Lu, GCC Patches

On Thu, Apr 12, 2012 at 01:37:24PM +0400, Kirill Yukhin wrote:
> Folks,
> Here is patch with removed implied atomic ACQUIRE/RELEASE. Could you
> please have a look?

+                                                                                                                                                  
+  sprintf (hle_macro, "__ATOMIC_HLE_ACQUIRE=%d", IX86_HLE_ACQUIRE);                                                                               
+  def_or_undef (parse_in, hle_macro);                                                                                                             
+                                                                                                                                                  
+  sprintf (hle_macro, "__ATOMIC_HLE_RELEASE=%d", IX86_HLE_RELEASE);                                                                               
+  def_or_undef (parse_in, hle_macro);                                                                                                             

This doesn't belong to ix86_target_macros_internal, but to
ix86_target_macros.  It is defined unconditionally, so you don't want to
undef and define it again on each target pragma, and furthermore
cpp_undef with __ATOMIC_HLE_ACQUIRE=something wouldn't work (for
undef you'd need __ATOMIC_HLE_ACQUIRE).  And in ix86_target_macros
you should be able to use cpp_define_formatted and avoid the temporary
buffer.

As for the rest of the patch, I'd like Richard to chime in...

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 22:39                         ` Torvald Riegel
  2012-04-12  8:46                           ` Kirill Yukhin
@ 2012-04-12 11:36                           ` Andi Kleen
  2012-04-12 12:35                             ` Torvald Riegel
  2012-04-12 12:22                           ` Andrew MacLeod
  2 siblings, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2012-04-12 11:36 UTC (permalink / raw)
  To: Torvald Riegel
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, Jakub Jelinek,
	GCC Patches

On Thu, Apr 12, 2012 at 12:38:44AM +0200, Torvald Riegel wrote:
> On Wed, 2012-04-11 at 15:06 +0200, Andi Kleen wrote:
> > > Tests passing, bootstrap in progress.
> > > 
> > > Comments?
> > 
> > Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
> > see that in the code. I think that's really required, otherwise the optimizer
> > will do the wrong thing and move memory references outside the region.
> 
> Perhaps HLE_ACQUIRE / HLE_RELEASE should be something like HLE_START /
> HLE_END instead?  Not particularly great names, but at least it avoids
> overloading ACQUIRE/RELEASE and thus should make it clearer that you
> still need to specify a memory order.

It still seems wrong to me. HLE is an x86 construct, so weaker
memory orders on the compiler level than what the instruction implements
does not really make sense to me. And the instruction just has LOCK
semantics.

Currently it's highly error prone -- on Rusty Russell's hard-to-misuse scale it
rates no higher than 1 as is [1]

At the minimum it would need a warning with RELAXED as suggested
by Jakub earlier.


> I agree with Jakub that users really should specify memory order bits,
> if they want ordering.  Andi, I also see your point regarding catching
> bugs, but this is really expert stuff, and my hope is that we can make
> HLE really transparent or at least provide better abstractions around it

At least this form of HLE cannot be transparent, it has to be annotated by the
programmer.

-Andi

[1] http://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-11 22:39                         ` Torvald Riegel
  2012-04-12  8:46                           ` Kirill Yukhin
  2012-04-12 11:36                           ` Andi Kleen
@ 2012-04-12 12:22                           ` Andrew MacLeod
  2012-04-12 12:29                             ` Jakub Jelinek
                                               ` (2 more replies)
  2 siblings, 3 replies; 49+ messages in thread
From: Andrew MacLeod @ 2012-04-12 12:22 UTC (permalink / raw)
  To: Torvald Riegel
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, Jakub Jelinek,
	GCC Patches

On 04/11/2012 06:38 PM, Torvald Riegel wrote:
> On Wed, 2012-04-11 at 15:06 +0200, Andi Kleen wrote:
>>
> Perhaps HLE_ACQUIRE / HLE_RELEASE should be something like HLE_START /
> HLE_END instead?  Not particularly great names, but at least it avoids
> overloading ACQUIRE/RELEASE and thus should make it clearer that you
> still need to specify a memory order.
>
>

Does it make any sense to simply predefine the possible valid 
combinations with the HLE bit already set?  it at least removes any 
possible invalid combinations and forces the programmer to consciously 
choose their memory model.

ie,
__ATOMIC_HLE_XACQ_CONSUME
__ATOMIC_HLE_XACQ_ACQUIRE
__ATOMIC_HLE_XACQ_ACQ_REL
__ATOMIC_HLE_XACQ_SEQ_CST

__ATOMIC_HLE_XREL_RELEASE
__ATOMIC_HLE_XREL_ACQ_REL
__ATOMIC_HLE_XREL_SEQ_CST

or whatever happens to be valid...   Doesn't really scale to adding more 
new bits later, but perhaps that doesn't matter.

Just a thought.

Andrew

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 12:22                           ` Andrew MacLeod
@ 2012-04-12 12:29                             ` Jakub Jelinek
  2012-04-12 12:59                               ` Andi Kleen
  2012-04-12 12:47                             ` Andi Kleen
  2012-04-12 13:23                             ` Torvald Riegel
  2 siblings, 1 reply; 49+ messages in thread
From: Jakub Jelinek @ 2012-04-12 12:29 UTC (permalink / raw)
  To: Andrew MacLeod
  Cc: Torvald Riegel, Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu,
	GCC Patches

On Thu, Apr 12, 2012 at 08:21:47AM -0400, Andrew MacLeod wrote:
> On 04/11/2012 06:38 PM, Torvald Riegel wrote:
> >On Wed, 2012-04-11 at 15:06 +0200, Andi Kleen wrote:
> >Perhaps HLE_ACQUIRE / HLE_RELEASE should be something like HLE_START /
> >HLE_END instead?  Not particularly great names, but at least it avoids
> >overloading ACQUIRE/RELEASE and thus should make it clearer that you
> >still need to specify a memory order.
> 
> Does it make any sense to simply predefine the possible valid
> combinations with the HLE bit already set?  it at least removes any
> possible invalid combinations and forces the programmer to
> consciously choose their memory model.
> 
> ie,
> __ATOMIC_HLE_XACQ_CONSUME
> __ATOMIC_HLE_XACQ_ACQUIRE
> __ATOMIC_HLE_XACQ_ACQ_REL
> __ATOMIC_HLE_XACQ_SEQ_CST
> 
> __ATOMIC_HLE_XREL_RELEASE
> __ATOMIC_HLE_XREL_ACQ_REL
> __ATOMIC_HLE_XREL_SEQ_CST
> 
> or whatever happens to be valid...   Doesn't really scale to adding
> more new bits later, but perhaps that doesn't matter.

I'd prefer not to predefine these.  They can be surely defined in some intrinsic
header, but the number of predefined macros is already huge and is quite
costly (it appears in all -g3 macro info, increases compiler initialization
time even for empty sources, etc.).

	Jakub

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 11:36                           ` Andi Kleen
@ 2012-04-12 12:35                             ` Torvald Riegel
  2012-04-12 12:57                               ` Andi Kleen
  0 siblings, 1 reply; 49+ messages in thread
From: Torvald Riegel @ 2012-04-12 12:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Kirill Yukhin, Uros Bizjak, H.J. Lu, Jakub Jelinek, GCC Patches

On Thu, 2012-04-12 at 13:36 +0200, Andi Kleen wrote:
> On Thu, Apr 12, 2012 at 12:38:44AM +0200, Torvald Riegel wrote:
> > On Wed, 2012-04-11 at 15:06 +0200, Andi Kleen wrote:
> > > > Tests passing, bootstrap in progress.
> > > > 
> > > > Comments?
> > > 
> > > Do you really imply ACQUIRE/RELEASE with HLE_ACQUIRE/RELEASE now? I don't
> > > see that in the code. I think that's really required, otherwise the optimizer
> > > will do the wrong thing and move memory references outside the region.
> > 
> > Perhaps HLE_ACQUIRE / HLE_RELEASE should be something like HLE_START /
> > HLE_END instead?  Not particularly great names, but at least it avoids
> > overloading ACQUIRE/RELEASE and thus should make it clearer that you
> > still need to specify a memory order.
> 
> It still seems wrong to me. HLE is an x86 construct, so weaker
> memory orders on the compiler level than what the instruction implements
> does not really make sense to me. And the instruction just has LOCK
> semantics.

What if another vendor shows up, perhaps on another architecture?

> 
> Currently it's highly error prone -- on the Russel hard to misuse scale not higher 
> than 1 as is [1]

It would be a three, if the patch would contain documentation of the
additional bit.  I guess that's a hint :)

It could even be a four, depending on the point of view.  Not from the
POV of the Intel HLE feature.  But from a conceptual POV, HLE is pretty
independent of memory orders.

> 
> At the minimum it would need a warning with RELAXED as suggested
> by Jakub earlier.

That could be helpful.

However, I'm not 100% sure that HLE is only useful paired with
acquire/release memory orders in general (ie, possibly on other archs).
For example, if you only care about having acq_rel atomics being
protected by your possibly-elided lock, then they won't get moved out of
the critical section (unless I read the C++11 memory model too
conservatively).
Therefore, given that I don't see the atomic builtins being used by lots
of programmers, I'd rather make them more general.

> > I agree with Jakub that users really should specify memory order bits,
> > if they want ordering.  Andi, I also see your point regarding catching
> > bugs, but this is really expert stuff, and my hope is that we can make
> > HLE really transparent or at least provide better abstractions around it
> 
> At least this form of HLE cannot be transparent, it has to be annotated by the
> programmer.

Let me elaborate.  The point I was trying to make is that it should be
transparent for the non-concurrency-expert programmer.  Or at least make
this specific detail we're discussing here transparent (ie, whether
HLE_ACQUIRE should imply a certain memory order).  That is, if
non-experts only see a default lock implementation, or see a
lock-implementation with a use-HLE-here-if-possible flag, then they
don't have to deal with memory orders anyway.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 12:22                           ` Andrew MacLeod
  2012-04-12 12:29                             ` Jakub Jelinek
@ 2012-04-12 12:47                             ` Andi Kleen
  2012-04-12 13:23                             ` Torvald Riegel
  2 siblings, 0 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-12 12:47 UTC (permalink / raw)
  To: Andrew MacLeod
  Cc: Torvald Riegel, Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu,
	Jakub Jelinek, GCC Patches

Hi Andrew,

> Does it make any sense to simply predefine the possible valid 
> combinations with the HLE bit already set?  it at least removes any 
> possible invalid combinations and forces the programmer to consciously 
> choose their memory model.
> 
> ie,
> __ATOMIC_HLE_XACQ_CONSUME
> __ATOMIC_HLE_XACQ_ACQUIRE
> __ATOMIC_HLE_XACQ_ACQ_REL
> __ATOMIC_HLE_XACQ_SEQ_CST
> 
> __ATOMIC_HLE_XREL_RELEASE
> __ATOMIC_HLE_XREL_ACQ_REL
> __ATOMIC_HLE_XREL_SEQ_CST
> 
> or whatever happens to be valid...   Doesn't really scale to adding more 
> new bits later, but perhaps that doesn't matter.

Idea sounds good to me. Certainly would be much harder to misuse,
so generally be a better interface.

As to what combinations make sense:

An HLE region ACQUIRE has somewhat interesting ordering semantics.
It's a fairly strong barrier (LOCK prefix) for reads and writes.
The HLE RELEASE is either LOCK too, or MOV without LOCK. If it's running
transactionally the whole block acts like a LOCK too. But we have
to use the weakest.

I suppose that would map to always _SEQ_CST just for most instructions,
except for mov release which can be _RELEASE too (and would need 
an additional MFENCE generated for anything stronger)

Probably there is not a lot of value in allowing the optimizer
weaker models than what the CPU does.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 12:35                             ` Torvald Riegel
@ 2012-04-12 12:57                               ` Andi Kleen
  0 siblings, 0 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-12 12:57 UTC (permalink / raw)
  To: Torvald Riegel
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, Jakub Jelinek,
	GCC Patches

> What if another vendor shows up, perhaps on another architecture?

HLE code is currently usually x86 specific. e.g. for practical spin locks
you have to include a __builtin_ia32_pause() on lock locked to stop speculation,
otherwise the lock path will speculate too, which is very inefficient.

So if you wanted abstracted HLE, you would need more abstracted builtins first.

> > Currently it's highly error prone -- on the Russel hard to misuse scale not higher 
> > than 1 as is [1]
> 
> It would be a three, if the patch would contain documentation of the
> additional bit.  I guess that's a hint :)

Yes :)

> However, I'm not 100% sure that HLE is only useful paired with
> acquire/release memory orders in general (ie, possibly on other archs).

It's actually stronger on the instruction level (see other mail),
except for the MOV release.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 12:29                             ` Jakub Jelinek
@ 2012-04-12 12:59                               ` Andi Kleen
  0 siblings, 0 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-12 12:59 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Andrew MacLeod, Torvald Riegel, Andi Kleen, Kirill Yukhin,
	Uros Bizjak, H.J. Lu, GCC Patches

> > __ATOMIC_HLE_XACQ_CONSUME
> > __ATOMIC_HLE_XACQ_ACQUIRE
> > __ATOMIC_HLE_XACQ_ACQ_REL
> > __ATOMIC_HLE_XACQ_SEQ_CST
> > 
> > __ATOMIC_HLE_XREL_RELEASE
> > __ATOMIC_HLE_XREL_ACQ_REL
> > __ATOMIC_HLE_XREL_SEQ_CST
> > 
> > or whatever happens to be valid...   Doesn't really scale to adding
> > more new bits later, but perhaps that doesn't matter.
> 
> I'd prefer not to predefine these.  They can be surely defined in some intrinsic

I think only a very small number make sense for x86 HLE, two, if you include weak MOV RELEASE
three. That's only one more than currently.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 12:22                           ` Andrew MacLeod
  2012-04-12 12:29                             ` Jakub Jelinek
  2012-04-12 12:47                             ` Andi Kleen
@ 2012-04-12 13:23                             ` Torvald Riegel
  2012-04-12 13:26                               ` Andi Kleen
  2012-04-12 14:13                               ` Kirill Yukhin
  2 siblings, 2 replies; 49+ messages in thread
From: Torvald Riegel @ 2012-04-12 13:23 UTC (permalink / raw)
  To: Andrew MacLeod
  Cc: Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu, Jakub Jelinek,
	GCC Patches

On Thu, 2012-04-12 at 08:21 -0400, Andrew MacLeod wrote:
> On 04/11/2012 06:38 PM, Torvald Riegel wrote:
> > On Wed, 2012-04-11 at 15:06 +0200, Andi Kleen wrote:
> >>
> > Perhaps HLE_ACQUIRE / HLE_RELEASE should be something like HLE_START /
> > HLE_END instead?  Not particularly great names, but at least it avoids
> > overloading ACQUIRE/RELEASE and thus should make it clearer that you
> > still need to specify a memory order.
> >
> >
> 
> Does it make any sense to simply predefine the possible valid 
> combinations with the HLE bit already set?  it at least removes any 
> possible invalid combinations and forces the programmer to consciously 
> choose their memory model.
> 
> ie,
> __ATOMIC_HLE_XACQ_CONSUME
> __ATOMIC_HLE_XACQ_ACQUIRE
> __ATOMIC_HLE_XACQ_ACQ_REL
> __ATOMIC_HLE_XACQ_SEQ_CST
> 
> __ATOMIC_HLE_XREL_RELEASE
> __ATOMIC_HLE_XREL_ACQ_REL
> __ATOMIC_HLE_XREL_SEQ_CST
> 
> or whatever happens to be valid...   Doesn't really scale to adding more 
> new bits later, but perhaps that doesn't matter.

I would suggest that we keep the HLE acq/rel bits independent of the
memory order bits.  Both are independent on a conceptual level.  And we
should add documentation that tells programmers that memory orders need
always be specified.

This way, we:
- keep it generic and don't add any arch-specific assumptions,
- remain closer to a true extension of the C++11/C11 atomics instead of
  a more strongly coupled mix-up,
- can still do arch-specific selection of memory orders (e.g., even just
  the memory orders are more fine-granular than you'd need for just
  x86),
- can still do arch-specific warnings if some combinations really don't
  make sense,
- keep the number of macros small as preferred by Jakub.

Would that be an acceptable option for everyone?  Andi, would proper
documentation resolve your ease-of-use concerns?

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 13:23                             ` Torvald Riegel
@ 2012-04-12 13:26                               ` Andi Kleen
  2012-04-12 14:13                               ` Kirill Yukhin
  1 sibling, 0 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-12 13:26 UTC (permalink / raw)
  To: Torvald Riegel
  Cc: Andrew MacLeod, Andi Kleen, Kirill Yukhin, Uros Bizjak, H.J. Lu,
	Jakub Jelinek, GCC Patches

> Would that be an acceptable option for everyone?  Andi, would proper
> documentation resolve your ease-of-use concerns?

Proper documentation is needed in any case, but I would strongly prefer an
inherently hard-to-misuse interface. I think Andrew's proposal is the best 
for that so far.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 13:23                             ` Torvald Riegel
  2012-04-12 13:26                               ` Andi Kleen
@ 2012-04-12 14:13                               ` Kirill Yukhin
  2012-04-12 18:05                                 ` Torvald Riegel
  1 sibling, 1 reply; 49+ messages in thread
From: Kirill Yukhin @ 2012-04-12 14:13 UTC (permalink / raw)
  To: Torvald Riegel
  Cc: Andrew MacLeod, Andi Kleen, Uros Bizjak, H.J. Lu, Jakub Jelinek,
	GCC Patches

> I would suggest that we keep the HLE acq/rel bits independent of the
> memory order bits.  Both are independent on a conceptual level.  And we
> should add documentation that tells programmers that memory orders need
> always be specified.
>
Sorry, I didn't get your point. You propose to separate HLE-capable
builtins at all?

How will we make it independent without new param introduction?
New param introduction will make new builtin.
We already had that: see initial patch - it contains HLE-featured
intrinsics (and builtins),
independent of __atomic*

K

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 14:13                               ` Kirill Yukhin
@ 2012-04-12 18:05                                 ` Torvald Riegel
  2012-04-13 11:40                                   ` Kirill Yukhin
  0 siblings, 1 reply; 49+ messages in thread
From: Torvald Riegel @ 2012-04-12 18:05 UTC (permalink / raw)
  To: Kirill Yukhin
  Cc: Andrew MacLeod, Andi Kleen, Uros Bizjak, H.J. Lu, Jakub Jelinek,
	GCC Patches

On Thu, 2012-04-12 at 18:13 +0400, Kirill Yukhin wrote:
> > I would suggest that we keep the HLE acq/rel bits independent of the
> > memory order bits.  Both are independent on a conceptual level.  And we
> > should add documentation that tells programmers that memory orders need
> > always be specified.
> >
> Sorry, I didn't get your point. You propose to separate HLE-capable
> builtins at all?

No, just the bits; programmers would need to do
  __atomic_...(..., __ATOMIC_RELEASE | HLE_RELEASE);
I believe this is what you had in one of your versions of the patch.  My
suggestions was not about doing something new but instead a
suggestions/poll for a resolution of the discussion.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-12 18:05                                 ` Torvald Riegel
@ 2012-04-13 11:40                                   ` Kirill Yukhin
  2012-04-17 14:37                                     ` Sergey Ostanevich
  0 siblings, 1 reply; 49+ messages in thread
From: Kirill Yukhin @ 2012-04-13 11:40 UTC (permalink / raw)
  To: Torvald Riegel
  Cc: Andrew MacLeod, Andi Kleen, Uros Bizjak, H.J. Lu, Jakub Jelinek,
	GCC Patches

> No, just the bits; programmers would need to do
>  __atomic_...(..., __ATOMIC_RELEASE | HLE_RELEASE);
> I believe this is what you had in one of your versions of the patch.  My
> suggestions was not about doing something new but instead a
> suggestions/poll for a resolution of the discussion.

Oh, okay, got it.
So, seems it all covered by my recent patch (hle-rfc-5.gcc.patch).

Any other inputs?

Thanks, K

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-13 11:40                                   ` Kirill Yukhin
@ 2012-04-17 14:37                                     ` Sergey Ostanevich
  2012-04-17 14:42                                       ` Andi Kleen
  0 siblings, 1 reply; 49+ messages in thread
From: Sergey Ostanevich @ 2012-04-17 14:37 UTC (permalink / raw)
  To: Kirill Yukhin
  Cc: Torvald Riegel, Andrew MacLeod, Andi Kleen, Uros Bizjak, H.J. Lu,
	Jakub Jelinek, GCC Patches

>
> Any other inputs?
>

I would suggest using "snprintf" in gcc/config/i386/i386-c.c to avoid
a possible buffer overrun.

I also have a question regarding AS compatibility. In case one built
GCC using AS with support of HLE then using this GCC on a machine with
old AS will cause fail because of usupported prefix. Can we support it
compile time rather configure time?

regards,
Sergos

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-17 14:37                                     ` Sergey Ostanevich
@ 2012-04-17 14:42                                       ` Andi Kleen
  2012-04-18  8:22                                         ` Sergey Ostanevich
  0 siblings, 1 reply; 49+ messages in thread
From: Andi Kleen @ 2012-04-17 14:42 UTC (permalink / raw)
  To: Sergey Ostanevich
  Cc: Kirill Yukhin, Torvald Riegel, Andrew MacLeod, Andi Kleen,
	Uros Bizjak, H.J. Lu, Jakub Jelinek, GCC Patches

> I also have a question regarding AS compatibility. In case one built
> GCC using AS with support of HLE then using this GCC on a machine with
old AS will cause fail because of unsupported prefix. Can we support it

I don't think that's a supported use case for gcc.
It also doesn't work with .cfi* intrinsics and some other things.

> compile time rather configure time?

The only way to do that would be to always generate .byte,
but the people who read the assembler output would hate you 
for it.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-17 14:42                                       ` Andi Kleen
@ 2012-04-18  8:22                                         ` Sergey Ostanevich
  0 siblings, 0 replies; 49+ messages in thread
From: Sergey Ostanevich @ 2012-04-18  8:22 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Kirill Yukhin, Torvald Riegel, Andrew MacLeod, Uros Bizjak,
	H.J. Lu, Jakub Jelinek, GCC Patches

On Tue, Apr 17, 2012 at 6:41 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> I also have a question regarding AS compatibility. In case one built
>> GCC using AS with support of HLE then using this GCC on a machine with
>> old AS will cause fail because of usupported prefix. Can we support it
>
> I don't think that's a supported use case for gcc.
> It also doesn't work with .cfi* intrinsics and some other things.

Well, it's hard to speculate here. What I rely upon is the fact that
GCC I have on my Fedora is from gcc-4.6.0-10.fc15.x86_64.rpm and the
latter contains no AS within.
There should be dependencies so that AS will be updated alongside with
GCC? Otherwise upon update to new GCC I can see fails in my project
build.

>> compile time rather configure time?
>
> The only way to do that would be to always generate .byte,
> but the people who read the assembler output would hate you
> for it.

Totally agree, it's the best way to hurt your karma. :)
Although detection of AS capabilities is available at compile time and
can be used to evade compfail - at least in case assembler is involved
(no -S provided)

Sergos

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
  2012-04-10 17:43 Uros Bizjak
@ 2012-04-10 19:47 ` Andi Kleen
  0 siblings, 0 replies; 49+ messages in thread
From: Andi Kleen @ 2012-04-10 19:47 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: gcc-patches, Andi Kleen, H.J. Lu, Kirill Yukhin, Jakub Jelinek

> In this case, can we reverse this sentence and just emit "lock
> xacquire" for MEMMODEL_ACQUIRE and "lock xrelease" for
> MEMMODEL_RELEASE ? Do we need separate HLE_* defines or can we somehow
> recycle existing C++11 memmodel defines?

No you absolutely can't. Transactions are quite different from a normal
lock. There can be good reasons to have locks that never speculates
(e.g. if they do some operation that always aborts)

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [i386, patch, RFC] HLE support in GCC
@ 2012-04-10 17:43 Uros Bizjak
  2012-04-10 19:47 ` Andi Kleen
  0 siblings, 1 reply; 49+ messages in thread
From: Uros Bizjak @ 2012-04-10 17:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andi Kleen, H.J. Lu, Kirill Yukhin, Jakub Jelinek

Hello!

> > This is wrong since HLE ACQUIRE/RELEASE has nothing to do with
> > C++ atomic acquire/release. You can have HLE RELEASE with C++
> > atomic acquire.
>
> It makes sense to combine the two. On x86 C++ atomic acquire/release
> means the compiler cannot move references outside. For HLE
> we really want the same, otherwise some of the memory references
> inside the transaction may not be transactional.
>
> So I think HLE_ACQUIRE should imply C++ acquire
> and HLE_RELEASE imply C++ release.

In this case, can we reverse this sentence and just emit "lock
xacquire" for MEMMODEL_ACQUIRE and "lock xrelease" for
MEMMODEL_RELEASE ? Do we need separate HLE_* defines or can we somehow
recycle existing C++11 memmodel defines?

Uros.

^ permalink raw reply	[flat|nested] 49+ messages in thread

end of thread, other threads:[~2012-04-18  8:22 UTC | newest]

Thread overview: 49+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-07 11:06 [i386, patch, RFC] HLE support in GCC Kirill Yukhin
2012-03-07 11:10 ` Jakub Jelinek
2012-03-08  1:57   ` H.J. Lu
     [not found]     ` <CAGs3Rft_0Cs6v3AP_LH2tbbqrym5rf1A4MJ+6fn1T37=BEzrbw@mail.gmail.com>
2012-03-08 15:04       ` H.J. Lu
2012-03-08 15:09         ` Jakub Jelinek
2012-04-10 14:12           ` Kirill Yukhin
2012-04-10 14:21             ` Jakub Jelinek
2012-04-10 17:05               ` Uros Bizjak
2012-04-10 14:43             ` H.J. Lu
2012-04-10 14:44               ` Jakub Jelinek
2012-04-10 16:35               ` Andi Kleen
2012-04-10 20:00                 ` H.J. Lu
2012-04-10 20:20                   ` Andi Kleen
2012-04-11 10:35                     ` Kirill Yukhin
2012-04-11 10:51                       ` Jakub Jelinek
2012-04-11 11:40                         ` Uros Bizjak
2012-04-11 13:06                       ` Andi Kleen
2012-04-11 13:13                         ` Jakub Jelinek
2012-04-11 13:21                           ` Andi Kleen
2012-04-11 15:53                             ` Kirill Yukhin
2012-04-11 16:07                               ` Andi Kleen
2012-04-11 16:11                                 ` Jakub Jelinek
2012-04-11 16:19                                   ` Andi Kleen
2012-04-11 16:24                                     ` Jakub Jelinek
2012-04-11 16:40                                       ` Andi Kleen
2012-04-11 18:52                                         ` Jakub Jelinek
2012-04-11 21:04                                           ` Andi Kleen
2012-04-11 19:03                                 ` Uros Bizjak
2012-04-11 22:39                         ` Torvald Riegel
2012-04-12  8:46                           ` Kirill Yukhin
2012-04-12  9:37                             ` Kirill Yukhin
2012-04-12  9:47                               ` Jakub Jelinek
2012-04-12 11:36                           ` Andi Kleen
2012-04-12 12:35                             ` Torvald Riegel
2012-04-12 12:57                               ` Andi Kleen
2012-04-12 12:22                           ` Andrew MacLeod
2012-04-12 12:29                             ` Jakub Jelinek
2012-04-12 12:59                               ` Andi Kleen
2012-04-12 12:47                             ` Andi Kleen
2012-04-12 13:23                             ` Torvald Riegel
2012-04-12 13:26                               ` Andi Kleen
2012-04-12 14:13                               ` Kirill Yukhin
2012-04-12 18:05                                 ` Torvald Riegel
2012-04-13 11:40                                   ` Kirill Yukhin
2012-04-17 14:37                                     ` Sergey Ostanevich
2012-04-17 14:42                                       ` Andi Kleen
2012-04-18  8:22                                         ` Sergey Ostanevich
2012-04-10 17:43 Uros Bizjak
2012-04-10 19:47 ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).