public inbox for gcc-patches@gcc.gnu.org
* [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
@ 2011-07-06 10:15 Xinyu Qi
  2011-08-18  2:35 ` Ramana Radhakrishnan
  0 siblings, 1 reply; 11+ messages in thread
From: Xinyu Qi @ 2011-07-06 10:15 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 189 bytes --]

Hi,

It is the second part of iWMMXt maintenance.

*config/arm/mmintrin.h:
 Revise the iWMMXt intrinsics head file. Fix some intrinsics and add some new intrinsics.

Thanks,
Xinyu

[-- Attachment #2: 2_mmintrin.diff --]
[-- Type: application/octet-stream, Size: 17935 bytes --]

Index: gcc/config/arm/mmintrin.h
===================================================================
--- gcc/config/arm/mmintrin.h	(revision 175285)
+++ gcc/config/arm/mmintrin.h	(working copy)
@@ -24,16 +24,25 @@
 #ifndef _MMINTRIN_H_INCLUDED
 #define _MMINTRIN_H_INCLUDED
 
+#if defined __cplusplus
+extern "C" { /* Begin "C" */
+/* Intrinsics use C name-mangling.  */
+#endif /* __cplusplus */
+
 /* The data type intended for user use.  */
-typedef unsigned long long __m64, __int64;
+
+/*  We will treat __int64 as a long long type
+    and __m64 as an unsigned long long type to conform to VSC++.  */
+typedef unsigned long long __m64;
+typedef long long __int64;
 
 /* Internal data types for implementing the intrinsics.  */
 typedef int __v2si __attribute__ ((vector_size (8)));
 typedef short __v4hi __attribute__ ((vector_size (8)));
-typedef char __v8qi __attribute__ ((vector_size (8)));
+typedef signed char __v8qi __attribute__ ((vector_size (8)));
 
 /* "Convert" __m64 and __int64 into each other.  */
-static __inline __m64 
+static __inline __m64
 _mm_cvtsi64_m64 (__int64 __i)
 {
   return __i;
@@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i)
 static __inline __int64
 _mm_cvtsi32_si64 (int __i)
 {
-  return __i;
+  return (__i & 0xffffffff);
 }
 
 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
@@ -603,7 +612,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
 static __inline __m64
 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 {
-  return __builtin_arm_wandn (__m1, __m2);
+  return __builtin_arm_wandn (__m2, __m1);
 }
 
 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
@@ -935,7 +944,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu8 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
+  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
+}
+
+static __inline __m64
+_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
 }
 
 /* Compute the sum of the absolute differences of the unsigned 16-bit
@@ -944,9 +959,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu16 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
+  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
+static __inline __m64
+_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
+}
+
+
 /* Compute the sum of the absolute differences of the unsigned 8-bit
    values in A and B.  Return the value in the lower 16-bit word; the
    upper words are cleared.  */
@@ -965,11 +987,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B)
   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
-static __inline __m64
-_mm_align_si64 (__m64 __A, __m64 __B, int __C)
-{
-  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
-}
+#define _mm_align_si64(__A,__B, N) \
+  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
 
 /* Creates a 64-bit zero.  */
 static __inline __m64
@@ -985,44 +1004,83 @@ _mm_setzero_si64 (void)
 static __inline void
 _mm_setwcx (const int __value, const int __regno)
 {
+  /*Since gcc has the imformation of all wcgr regs
+    in arm backend, use builtin to access them instead
+    of throw asm directly.  Thus, gcc could do some
+    optimization on them.  */
+
   switch (__regno)
     {
-    case 0:  __builtin_arm_setwcx (__value, 0); break;
-    case 1:  __builtin_arm_setwcx (__value, 1); break;
-    case 2:  __builtin_arm_setwcx (__value, 2); break;
-    case 3:  __builtin_arm_setwcx (__value, 3); break;
-    case 8:  __builtin_arm_setwcx (__value, 8); break;
-    case 9:  __builtin_arm_setwcx (__value, 9); break;
-    case 10: __builtin_arm_setwcx (__value, 10); break;
-    case 11: __builtin_arm_setwcx (__value, 11); break;
-    default: break;
+    case 0:
+      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
+      break;
+    case 8:
+      __builtin_arm_setwcgr0 (__value);
+      break;
+    case 9:
+      __builtin_arm_setwcgr1 (__value);
+      break;
+    case 10:
+      __builtin_arm_setwcgr2 (__value);
+      break;
+    case 11:
+      __builtin_arm_setwcgr3 (__value);
+      break;
+    default:
+      break;
     }
 }
 
 static __inline int
 _mm_getwcx (const int __regno)
 {
+  int __value;
   switch (__regno)
     {
-    case 0:  return __builtin_arm_getwcx (0);
-    case 1:  return __builtin_arm_getwcx (1);
-    case 2:  return __builtin_arm_getwcx (2);
-    case 3:  return __builtin_arm_getwcx (3);
-    case 8:  return __builtin_arm_getwcx (8);
-    case 9:  return __builtin_arm_getwcx (9);
-    case 10: return __builtin_arm_getwcx (10);
-    case 11: return __builtin_arm_getwcx (11);
-    default: return 0;
+    case 0:
+      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
+      break;
+    case 8:
+      return __builtin_arm_getwcgr0 ();
+    case 9:
+      return __builtin_arm_getwcgr1 ();
+    case 10:
+      return __builtin_arm_getwcgr2 ();
+    case 11:
+      return __builtin_arm_getwcgr3 ();
+    default:
+      break;
     }
+  return __value;
 }
 
 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 static __inline __m64
 _mm_set_pi32 (int __i1, int __i0)
 {
-  union {
+  union
+  {
     __m64 __q;
-    struct {
+    struct
+    {
       unsigned int __i0;
       unsigned int __i1;
     } __s;
@@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, sh
   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
   return _mm_set_pi32 (__i1, __i0);
-		       
+
 }
 
 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
@@ -1110,9 +1168,521 @@ _mm_set1_pi8 (char __b)
 
 /* Convert an integer to a __m64 object.  */
 static __inline __m64
-_m_from_int (int __a)
+_mm_abs_pi8 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
+}
+
+static __inline __m64
+_mm_abs_pi16 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
+
+}
+
+static __inline __m64
+_mm_abs_pi32 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsw ((__v2si)m1);
+
+}
+
+static __inline __m64
+_mm_addsubhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_addc_pu16 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddhc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_addc_pu32 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddwc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_avg4_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_avg4r_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_maddx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_maddx_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhi_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhi_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mullo_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_subaddhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_addbhusl_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_addbhusm_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
+}
+
+#define _mm_qmiabb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabtn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc=acc;\
+   __m64 _m1=m1;\
+   __m64 _m2=m2;\
+   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiattn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiattn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawttn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+/* The third argument should be an immediate.  */
+#define _mm_merge_si64(a, b, n) \
+  ({\
+   __m64 result;\
+   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
+   result;\
+   })
+
+static __inline __m64
+_mm_alignr0_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr1_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr2_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr3_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline void
+_mm_tandcb ()
+{
+  __asm __volatile ("tandcb r15");
+}
+
+static __inline void
+_mm_tandch ()
+{
+  __asm __volatile ("tandch r15");
+}
+
+static __inline void
+_mm_tandcw ()
+{
+  __asm __volatile ("tandcw r15");
+}
+
+#define _mm_textrcb(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcb r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrch(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrch r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrcw(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcw r15, %0" : : "i" (n));\
+   })
+
+static __inline void
+_mm_torcb ()
+{
+  __asm __volatile ("torcb r15");
+}
+
+static __inline void
+_mm_torch ()
+{
+  __asm __volatile ("torch r15");
+}
+
+static __inline void
+_mm_torcw ()
+{
+  __asm __volatile ("torcw r15");
+}
+
+static __inline void
+_mm_torvscb ()
+{
+  __asm __volatile ("torvscb r15");
+}
+
+static __inline void
+_mm_torvsch ()
+{
+  __asm __volatile ("torvsch r15");
+}
+
+static __inline void
+_mm_torvscw ()
+{
+  __asm __volatile ("torvscw r15");
+}
+
+static __inline __m64
+_mm_tbcst_pi8 (int value)
+{
+  return (__m64) __builtin_arm_tbcstb ((signed char) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi16 (int value)
+{
+  return (__m64) __builtin_arm_tbcsth ((short) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi32 (int value)
 {
-  return (__m64)__a;
+  return (__m64) __builtin_arm_tbcstw (value);
 }
 
 #define _m_packsswb _mm_packs_pi16
@@ -1250,5 +1820,10 @@ _m_from_int (int __a)
 #define _m_paligniq _mm_align_si64
 #define _m_cvt_si2pi _mm_cvtsi64_m64
 #define _m_cvt_pi2si _mm_cvtm64_si64
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
 
+#if defined __cplusplus
+}; /* End "C" */
+#endif /* __cplusplus */
 #endif /* _MMINTRIN_H_INCLUDED */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
  2011-07-06 10:15 [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change Xinyu Qi
@ 2011-08-18  2:35 ` Ramana Radhakrishnan
  2011-08-24  9:07   ` Xinyu Qi
                     ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Ramana Radhakrishnan @ 2011-08-18  2:35 UTC (permalink / raw)
  To: Xinyu Qi; +Cc: gcc-patches

On 6 July 2011 11:11, Xinyu Qi <xyqi@marvell.com> wrote:
> Hi,
>
> It is the second part of iWMMXt maintenance.
>
> *config/arm/mmintrin.h:
>  Revise the iWMMXt intrinsics head file. Fix some intrinsics and add some new intrinsics

Is there a document somewhere that lists these intrinsics and what
each of these is supposed to be doing? Missing details again. We
seem to be changing quite a few things.


> +
> +/*  We will treat __int64 as a long long type
> +    and __m64 as an unsigned long long type to conform to VSC++.  */
> +typedef unsigned long long __m64;
> +typedef long long __int64;

Interesting, this sort of a change with these cases where you are
changing the type to conform to VSC++? This just means old code that
uses this is pretty much broken. Not that I have much hope of that
happening by default; -flax-conversions appears to be needed even
with a trunk compiler.

> @@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i)
>  static __inline __int64
>  _mm_cvtsi32_si64 (int __i)
>  {
> -  return __i;
> +  return (__i & 0xffffffff);
>  }

Eh? Why the & 0xffffffff before promotion rules?  Is this set of
intrinsics documented some place?  What is missing, and could be the
subject of a follow-up patch, is a set of tests for the wMMX intrinsics
....

What's the behaviour of wandn supposed to be? Does wandn x, y, z
imply x = y & ~z or x = ~y & z? If the former then your intrinsic
expansion is wrong, unless the meaning of this has changed. What's the
behaviour of the intrinsic _mm_andnot_si64?

@@ -985,44 +1004,83 @@ _mm_setzero_si64 (void)
 static __inline void
 _mm_setwcx (const int __value, const int __regno)
 {
> +  /*Since gcc has the imformation of all wcgr regs
> +    in arm backend, use builtin to access them instead
> +    of throw asm directly.  Thus, gcc could do some
> +    optimization on them.  */
> +

Also this comment is contradictory to what follows in the patch.
You've prima-facie replaced them with bits of inline assembler. I'm
not sure this comment makes a lot of sense on its own.


Ramana

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
  2011-08-18  2:35 ` Ramana Radhakrishnan
@ 2011-08-24  9:07   ` Xinyu Qi
  2011-09-26  4:31   ` PING: " Xinyu Qi
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Xinyu Qi @ 2011-08-24  9:07 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3310 bytes --]

At 2011-08-18 09:33:27,"Ramana Radhakrishnan" <ramana.radhakrishnan@linaro.org> wrote:
> On 6 July 2011 11:11, Xinyu Qi <xyqi@marvell.com> wrote:
> > Hi,
> >
> > It is the second part of iWMMXt maintenance.
> >
> > *config/arm/mmintrin.h:
> >  Revise the iWMMXt intrinsics head file. Fix some intrinsics and add some
> new intrinsics
> 
> Is there a document somewhere that lists these intrinsics and what
> each of these is supposed to be doing? Missing details again. We
> seem to be changing quite a few things.

Hi,
The intrinsic_doc.txt is attached. It is the piece of the iWMMXt intrinsic documentation extracted from "Intel Wireless MMX Technology Intrinsic Support", with some modifications.

> > +
> > +/*  We will treat __int64 as a long long type
> > +    and __m64 as an unsigned long long type to conform to VSC++.  */
> > +typedef unsigned long long __m64;
> > +typedef long long __int64;
> 
> Interesting, this sort of a change with these cases where you are
> changing the type to conform to VSC++? This just means old code that
> uses this is pretty much broken. Not that I have much hope of that
> happening by default; -flax-conversions appears to be needed even
> with a trunk compiler.

I couldn't find any material showing why __int64 needs to be redefined, and all the tests pass without this change, so I have decided to discard it.

> 
> > @@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i)
> >  static __inline __int64
> >  _mm_cvtsi32_si64 (int __i)
> >  {
> > -  return __i;
> > +  return (__i & 0xffffffff);
> >  }
> 
> Eh? Why the & 0xffffffff before promotion rules?  Is this set of
> intrinsics documented some place?  What is missing, and could be the
> subject of a follow-up patch, is a set of tests for the wMMX intrinsics
> ....

See the intrinsics doc. It says the description of _mm_cvtsi32_si64 is "The integer value is zero-extended to 64 bits.
If r = _mm_cvtsi32_si64(i), then the action is
r [0:31] = i;
r[32:63] = 0;"

> 
> What's the behaviour of wandn supposed to be? Does wandn x, y, z
> imply x = y & ~z or x = ~y & z? If the former then your intrinsic
> expansion is wrong, unless the meaning of this has changed. What's the
> behaviour of the intrinsic _mm_andnot_si64?

The description of _mm_andnot_si64 is "Performs a logical NOT on the 64-bit value in m1 and use the result in a bitwise AND with the 64-bit value in m2."
And, "wandn wRd, wRn, wRm" means "wRd = wRn & ~wRm"
I think __builtin_arm_wandn had better directly match the behavior of wandn.
Therefore, match _mm_andnot_si64 (m1, m2) to __builtin_arm_wandn (m2, m1).
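
A plain-C sketch of the intended semantics, for illustration only (the function name is just illustrative):

  /* _mm_andnot_si64 (m1, m2): NOT m1, then AND with m2, i.e. (~m1) & m2.
     Since wandn computes wRn & ~wRm, the builtin is called as
     __builtin_arm_wandn (m2, m1).  */
  unsigned long long
  andnot_ref (unsigned long long m1, unsigned long long m2)
  {
    return (~m1) & m2;
  }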



> @@ -985,44 +1004,83 @@ _mm_setzero_si64 (void)
>  static __inline void
>  _mm_setwcx (const int __value, const int __regno)
>  {
> > +  /*Since gcc has the imformation of all wcgr regs
> > +    in arm backend, use builtin to access them instead
> > +    of throw asm directly.  Thus, gcc could do some
> > +    optimization on them.  */
> > +
> 
> Also this comment is contradictory to what follows in the patch.
> You've prima-facie replaced them with bits of inline assembler. I'm
> not sure this comment makes a lot of sense on its own. 

Sorry. This comment should be removed.

The modified diff is attached.

Thanks,
Xinyu



[-- Attachment #2: intrinsic_doc.txt --]
[-- Type: text/plain, Size: 74561 bytes --]

20.9.1 	_mm_abs_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_abs_pi8 (__m64 m1)
Description 
Changes the eight 8-bit values in m1 to their absolute values and returns the result.
This function uses the assembler instruction WABSB.
 

20.9.2 	_mm_abs_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_abs_pi16 (__m64 m1)
Description 
Changes the four 16-bit values in m1 to their absolute values and returns the result.
This function uses the assembler instruction WABSH.
 
 
20.9.3 	_mm_abs_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_abs_pi32 (__m64 m1)
Description 
Changes the two 32-bit values in m1 to their absolute values and returns the result.
This function uses the assembler instruction WABSW.
 
 
20.9.4 	_mm_absdiff_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_absdiff_pu8 (__m64 a, __m64 b)
Description 
Subtracts the unsigned eight 8-bit values of a from their counterparts in b and returns the absolute values of the results.
This function uses the assembler instruction WABSDIFFB.
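A plain-C reference sketch of the per-element behaviour (illustration only, not the intrinsic itself):

#include <stdint.h>

/* Reference sketch: per-byte |a - b| on unsigned 8-bit elements.  */
static uint64_t
absdiff_pu8_ref (uint64_t a, uint64_t b)
{
  uint64_t r = 0;
  int i;
  for (i = 0; i < 8; i++)
    {
      uint8_t ai = (a >> (i * 8)) & 0xff;
      uint8_t bi = (b >> (i * 8)) & 0xff;
      uint8_t d = ai > bi ? ai - bi : bi - ai;
      r |= (uint64_t) d << (i * 8);
    }
  return r;
}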
 
 
20.9.5 	_mm_absdiff_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_absdiff_pu16 (__m64 a, __m64 b)
Description 
Subtracts the four unsigned 16-bit values of a from their counterparts in b and returns the absolute values of the results.
This function uses the assembler instruction WABSDIFFH.
 
 
20.9.6 	_mm_absdiff_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_absdiff_pu32 (__m64 a, __m64 b)
Description 
Subtracts the two unsigned 32-bit values of a from their counterparts in b and returns the absolute values of the results.
This function uses the assembler instruction WABSDIFFW.
 
 
20.9.7 	_mm_acc_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_acc_pu8 (__m64 m1)
Description 
Unsigned accumulate across eight 8-bit values in m1. 
This function uses the assembler instruction WACCB.
 

20.9.8 	_mm_acc_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_acc_pu16 (__m64 m1)
Description 
Unsigned accumulate across four 16-bit values in m1.
This function uses the assembler instruction WACCH.
 

20.9.9 	_mm_acc_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_acc_pu32 (__m64 m1)
Description 
Unsigned accumulate across two 32-bit values in m1.
This function uses the assembler instruction WACCW.
 

20.9.10 	_mm_add_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_add_pi8 (__m64 m1, __m64 m2)
Description 
Adds the eight 8-bit values in m1 to the eight 8-bit values in m2.
This function uses the assembler instruction WADDB.
 

20.9.11 	_mm_add_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_add_pi16 (__m64 m1, __m64 m2)
Description 
Adds the four 16-bit values in m1 to the four 16-bit values in m2.
This function uses the assembler instruction WADDH.
 

20.9.12 	_mm_add_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_add_pi32 (__m64 m1, __m64 m2)
Description 
Adds the two 32-bit values in m1 to the two 32-bit values in m2.
This function uses the assembler instruction WADDW.
 

20.9.13 	_mm_addc_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_addc_pu16 (__m64 m1, __m64 m2)
Description 
Adds the four unsigned 16-bit values in m1 to the four unsigned 16-bit values in m2 using carry flags from the wCASF register as the Carry-in to the addition operation.
This function uses the assembler instruction WADDHC.
 

20.9.14 	_mm_addc_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_addc_pu32 (__m64 m1, __m64 m2)
Description 
Adds the two unsigned 32-bit values in m1 to the two unsigned 32-bit values in m2 using carry flags from the wCASF register as the Carry-in to the addition operation.
This function uses the assembler instruction WADDWC.
 

20.9.15 	_mm_addbhusl_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_addbhusl_pu8 (__m64 a, __m64 b)
Description 
Performs a vector mixed mode addition of four 16-bit values of parameter a and four 8-bit zero-extended values from the lower half of parameter b and returns the result in the lower half of the return value.
This function uses the assembler instruction WADDBHUSL.
 
 
20.9.16 	_mm_addbhusm_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_addbhusm_pu8 (__m64 a, __m64 b)
Description 
Performs a vector mixed mode addition of four 16-bit values of parameter a and four 8-bit zero-extended values from the upper half of parameter b and returns the result in the upper half of the return value.
This function uses the assembler instruction WADDBHUSM.
 
 
20.9.17 	_mm_adds_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_adds_pi8 (__m64 m1, __m64 m2)
Description 
Adds the eight signed 8-bit values in m1 to the eight signed 8-bit values in m2 using saturating arithmetic.
This function uses the assembler instruction WADDBSS.
 

20.9.18 	_mm_adds_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_adds_pi16 (__m64 m1, __m64 m2)
Description 
Adds the four signed 16-bit values in m1 to the four signed 16-bit values in m2 using saturating arithmetic.
This function uses the assembler instruction WADDHSS.
 

20.9.19 	_mm_adds_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_adds_pi32 (__m64 m1, __m64 m2)
Description 
Adds the two signed 32-bit values in m1 to the two signed 32-bit values in m2 using saturating arithmetic.
This function uses the assembler instruction WADDWSS.
 

20.9.20 	_mm_adds_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_adds_pu8 (__m64 m1, __m64 m2)
Description 
Adds the eight unsigned 8-bit values in m1 to the eight unsigned 8-bit values in m2 using saturating arithmetic.
This function uses the assembler instruction WADDBUS.
 

20.9.21 	_mm_adds_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_adds_pu16 (__m64 m1, __m64 m2)
Description 
Adds the four unsigned 16-bit values in m1 to the four unsigned 16-bit values in m2 using saturating arithmetic.
This function uses the assembler instruction WADDHUS.
 

20.9.22 	_mm_adds_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_adds_pu32 (__m64 m1, __m64 m2)
Description 
Adds the two unsigned 32-bit values in m1 to the two unsigned 32-bit values in m2 using saturating arithmetic.
This function uses the assembler instruction WADDWUS.
 

20.9.23 	_mm_addsubhx_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_addsubhx_pi16 (__m64 a, __m64 b)
Description 
Performs complex vector addition/subtraction of its parameters a and b for vectors of 16-bit data. The four operands from each of the parameters are alternately added and subtracted using a cross selection in each of the parallel operations. The result of the operation is saturated to the signed limits and returned.
This function uses the assembler instruction WADDSUBHX.
 
 
20.9.24 	_mm_align_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_align_si64(__m64 m1, __m64 m2, int count)
Description 
Extracts a 64-bit value from the two 64-bit input values m1, m2 with count byte offset.
If r = _mm_align_si64(m1, m2, count), the action is
r = Low_DB_word((m1, m2) >> (count * 8));
 This function uses the assembler instruction WALIGNI.
Note:  	The parameter count has to be a numeric value or expression that can be evaluated at compile-time; it cannot be a variable. The range of count is 0 to 7.
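A minimal usage sketch (assuming an iWMMXt-enabled target; the function name is illustrative only):

#include <mmintrin.h>

/* The byte offset must be a compile-time constant in the range 0 to 7.  */
static __m64
skip_three_bytes (__m64 m1, __m64 m2)
{
  return _mm_align_si64 (m1, m2, 3);   /* literal offset: OK                 */
  /* _mm_align_si64 (m1, m2, n) with a variable n is not supported.          */
}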
 

20.9.25 	_mm_alignr0_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_alignr0_si64(__m64 m1, __m64 m2)
Description 
Extracts a 64-bit value from the two 64-bit input values m1, m2 with 3-bit offset stored in the specified general-purpose register 0 (wCGR0).
This function uses the assembler instruction WALIGNR0.
 

20.9.26 	_mm_alignr1_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_alignr1_si64(__m64 m1, __m64 m2)
Description 
Extracts a 64-bit value from the two 64-bit input values m1, m2 with 3-bit offset stored in the specified general-purpose register 1 (wCGR1).
This function uses the assembler instruction WALIGNR1.
 

20.9.27 	_mm_alignr2_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_alignr2_si64(__m64 m1, __m64 m2)
Description 
Extracts a 64-bit value from the two 64-bit input values m1, m2 with 3-bit offset stored in the specified general-purpose register 2 (wCGR2).
This function uses the assembler instruction WALIGNR2. 
 

20.9.28 	_mm_alignr3_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_alignr3_si64(__m64 m1, __m64 m2)
Description 
Extracts a 64-bit value from the two 64-bit input values m1, m2 with 3-bit offset stored in the specified general-purpose register 3 (wCGR3).
This function uses the assembler instruction WALIGNR3.
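A usage sketch for these run-time-offset variants (illustration only; wCGR0 is control register number 8, see _mm_setwcx/_mm_getwcx):

#include <mmintrin.h>

/* Sketch: load a 3-bit byte offset into wCGR0 (control register 8),
   then extract through _mm_alignr0_si64.  */
static __m64
align_runtime (__m64 m1, __m64 m2, int offset)
{
  _mm_setwcx (offset & 7, 8);
  return _mm_alignr0_si64 (m1, m2);
}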
 

20.9.29 	_mm_and_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_and_si64 (__m64 m1, __m64 m2)
Description 
Performs a bitwise AND of the 64-bit value in m1 with the 64-bit value in m2.
This function uses the assembler instruction WAND.
 

20.9.30 	_mm_andnot_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_andnot_si64 (__m64 m1, __m64 m2)
Description 
Performs a logical NOT on the 64-bit value in m1 and use the result in a bitwise AND with the 64-bit value in m2.
This function uses the assembler instruction WANDN.
 

20.9.31 	_mm_avg_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_avg_pu8(__m64 a, __m64 b)
Description 
Computes the (rounded) averages of the unsigned bytes in a and b.
If r = _mm_avg_pu8(a, b), the action is
t = (unsigned short)a0 + (unsigned short)b0;
r0 = (t >> 1) | (t & 0x01);
...
t = (unsigned short)a7 + (unsigned short)b7;
r7 = (unsigned char)((t >> 1) | (t & 0x01));
This function uses the assembler instruction WAVG2BR.
 

20.9.32 	_mm_avg_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_avg_pu16(__m64 a, __m64 b)
Description 
Computes the (rounded) averages of the unsigned words in a and b.
If r = _mm_avg_pu16(a, b), the action is 
t = (unsigned int)a0 + (unsigned int)b0;
r0 = (t >> 1) | (t & 0x01);
...
t = (unsigned int)a3 + (unsigned int)b3;
r3 = (unsigned short)((t >> 1) | (t & 0x01));
This function uses the assembler instruction WAVG2HR.
 

20.9.33 	_mm_avg2_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_avg2_pu8(__m64 a, __m64 b)
Description 
Computes the unrounded averages of the unsigned bytes in a and b.
If r = _mm_avg2_pu8(a, b), the action is
t = (unsigned byte)a0 + (unsigned byte)b0;
r0 = (t >> 1);
...
t = (unsigned byte)a7 + (unsigned byte)b7;
r7 = (unsigned char)(t >> 1);
This function uses the assembler instruction WAVG2B.
 

20.9.34 	_mm_avg2_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_avg2_pu16(__m64 a, __m64 b)
Description 
Computes the unrounded averages of the unsigned words in a and b.
If r = _mm_avg2_pu16(a, b), the action is
t = (unsigned half word)a0 + (unsigned half word)b0;
r0 = (t >> 1);
...
t = (unsigned half word)a3 + (unsigned half word)b3;
r3 = (unsigned short)(t >> 1);
This function uses the assembler instruction WAVG2H.
 

20.9.35 	_mm_avg4_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_avg4_pu8 (__m64 a, __m64 b)
Description 
Performs seven 4-pixel averages of unsigned 8-bit operands obtained from the bytes of the parameters a and b, and returns the result.
This function uses the assembler instruction WAVG4.
 
 
20.9.36 	_mm_avg4r_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_avg4r_pu8 (__m64 a, __m64 b)
Description 
Performs seven 4-pixel averages of unsigned 8-bit operands obtained from the bytes of the parameters a and b, and returns the result. Biased rounding is performed by adding +2 or +1 to the intermediate result before the divide-by-2.
This function uses the assembler instruction WAVG4R.
 
 
20.9.37 	_mm_cmpeq_pi8
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpeq_pi8 (__m64 m1, __m64 m2)
Description 
If the respective 8-bit values in m1 are equal to the respective 8-bit values in m2, the function sets the respective 8-bit resulting values to all ones, otherwise it sets them to all zeros.
This function uses the assembler instruction WCMPEQB.
 

20.9.38 	_mm_cmpeq_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpeq_pi16 (__m64 m1, __m64 m2)
Description 
If the respective 16-bit values in m1 are equal to the respective 16-bit values in m2, the function sets the respective 16-bit resulting values to all ones, otherwise it sets them to all zeros.
This function uses the assembler instruction WCMPEQH.
 

20.9.39 	_mm_cmpeq_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpeq_pi32 (__m64 m1, __m64 m2)
Description 
If the respective 32-bit values in m1 are equal to the respective 32-bit values in m2, the function sets the respective 32-bit resulting values to all ones, otherwise it sets them to all zeros.
This function uses the assembler instruction WCMPEQW.
 

20.9.40 	_mm_cmpgt_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpgt_pi8 (__m64 m1, __m64 m2)
Description 
If the respective 8-bit values in m1 are greater than the respective 8-bit values in m2, the function sets the respective 8-bit resulting values to all ones, otherwise it sets them to all zeros.
This function uses the assembler instruction WCMPGTSB.
 

20.9.41 	_mm_cmpgt_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpgt_pi16 (__m64 m1, __m64 m2)
Description 
If the respective 16-bit values in m1 are greater than the respective 16-bit values in m2, the function sets the respective 16-bit resulting values to all ones, otherwise it sets them to all zeros.
This function uses the assembler instruction WCMPGTSH.
 

20.9.42 	_mm_cmpgt_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpgt_pi32 (__m64 m1, __m64 m2)
Description 
If the respective 32-bit values in m1 are greater than the respective 32-bit values in m2, the function sets the respective 32-bit resulting values to all ones, otherwise it sets them all to zeros. 
This function uses the assembler instruction WCMPGTSW.
 

20.9.43 	_mm_cmpgt_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpgt_pu8 (__m64 m1, __m64 m2)
Description 
If the respective 8-bit values in m1 are unsigned greater than the respective 8-bit values in m2, the function sets the respective 8-bit resulting values to all ones, otherwise it sets them to all zeros.
This function uses the assembler instruction WCMPGTUB.
 

20.9.44 	_mm_cmpgt_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpgt_pu16 (__m64 m1, __m64 m2)
Description 
If the respective 16-bit values in m1 are unsigned greater than the respective 16-bit values in m2, the function sets the respective 16-bit resulting values to all ones, otherwise it sets them to all zeros.
This function uses the assembler instruction WCMPGTUH.
 

20.9.45 	_mm_cmpgt_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cmpgt_pu32 (__m64 m1, __m64 m2)
Description 
If the respective 32-bit values in m1 are unsigned greater than the respective 32-bit values in m2, the function sets the respective 32-bit resulting values to all ones, otherwise it sets them all to zeros.
This function uses the assembler instruction WCMPGTUW.
 

20.9.46 	_mm_cvtm64_si64
 
Syntax 
#include <mmintrin.h>
__int64
_mm_cvtm64_si64 (__m64 m)
Description 
Converts the 64-bit __m64 object m to __int64 bit integer.
If r = _mm_cvtm64_si64(a), then the action is
r0 = a[31:0]; (lower word)
r1 = a[63:32]; (upper word)
 
20.9.47 	_mm_cvtsi32_si64
 
Syntax 
#include <mmintrin.h>
__int64
_mm_cvtsi32_si64 (int i)
Description 
Converts the integer object i to a 64-bit __m64 object. The integer value is zero-extended to 64 bits.
If r = _mm_cvtsi32_si64(i), then the action is
r [0:31] = i;
r[32:63] = 0;
 
20.9.48 	_mm_cvtsi64_m64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_cvtsi64_m64 (__int64 i)
Description 
Converts the __int64 integer i to a 64-bit __m64 object.
If r = _mm_cvtsi64_m64(a), then the action is
r [31:0] = a0; (lower word)
r[63:32] = a1; (upper word)
 
20.9.49 	_mm_cvtsi64_si32
 
Syntax 
#include <mmintrin.h>
int
_mm_cvtsi64_si32 (__int64 m)
Description 
Converts the lower 32 bits of the __m64 object m to an integer.
If i = _mm_cvtsi64_si32(m), then the action is
i = a[31:0]; (lower word)
 
20.9.50 	_mm_extract_pi8
 
Syntax 
#include <mmintrin.h>
int
_mm_extract_pi8(__m64 a, const int n)
Description 
Extracts one of the eight bytes of a. The selector n must be an immediate and its range must be 0 to 7. The n variable selects the byte that should be extracted.
If r = _mm_extract_pi8(a, n), the action is 
r[7:0] = a[Byte n[2:0]];
r[31:8] = SignReplicate(a[Byte n[2:0]], 24);
This function uses the assembler instruction TEXTRMSB.
 

20.9.51 	_mm_extract_pi16
 
Syntax 
#include <mmintrin.h>
int
_mm_extract_pi16(__m64 a, const int n)
Description 
Extracts one of the four half words of a. The selector n must be an immediate and its range must be 0 to 3.
If r = _mm_extract_pi16(a, n), the action is 
r[15:0] = a[Halfword n[1:0]];
r[31:16] = SignReplicate(a[Halfword n[1:0]], 16);
This function uses the assembler instruction TEXTRMSH.
 

20.9.52 	_mm_extract_pi32
 
Syntax 
#include <mmintrin.h>
int
_mm_extract_pi32(__m64 a, const int n)
Description 
Extracts one of the two words of a. The selector n must be an immediate and its range must be 0 to 1.
If r = _mm_extract_pi32(a, n), the action is 
r[31:0] = a[Word n[0]];
This function uses the assembler instruction TEXTRMSW.
 

20.9.53 	_mm_extract_pu8
 
Syntax 
#include <mmintrin.h>
int
_mm_extract_pu8(__m64 a, const int n)
Description 
Extracts one of the eight bytes of a. The selector n must be an immediate and its range must be 0 to 7.
If r = _mm_extract_pu8(a, n), the action is 
r[7:0] = a[Byte n[2:0]];
r[31:8] = 0;
This function uses the assembler instruction TEXTRMUB.
 

20.9.54 	_mm_extract_pu16
 
Syntax 
#include <mmintrin.h>
int
_mm_extract_pu16(__m64 a, const int n)
Description 
Extracts one of the four half words of a. The selector n must be an immediate and its range must be 0 to 3.
If r = _mm_extract_pu16(a, n), the action is 
r [15:0] = a[Halfword n[1:0]];
r[31:16] = 0;
This function uses the assembler instruction TEXTRMUH.
 

20.9.55 	_mm_extract_pu32
 
Syntax 
#include <mmintrin.h>
int
_mm_extract_pu32(__m64 a, const int n)
Description 
This provides same functionality as _mm_extract_pi32.
 
20.9.56 	_mm_getwcx
 
Syntax 
#include <mmintrin.h>
int
_mm_getwcx(int number)
Description 
Returns contents of Intel Wireless MMX technology control register, which is specified with number, where number is the coprocessor register number. 
This function uses the assembler pseudo-instruction TMRC.
Note:  	The valid range for parameter number is [0, 3] and [8, 11]. The valid control registers are: wCID(0), wCon(1), wCSSF(2), wCASF(3), wCGR0(8), wCGR1(9), wCGR2(10), wCGR3(11).
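For example, a minimal sketch that writes and reads back one of the general-purpose control registers (illustration only):

#include <mmintrin.h>

/* Sketch: round-trip a value through wCGR1 (control register number 9).  */
static int
roundtrip_wcgr1 (int value)
{
  _mm_setwcx (value, 9);
  return _mm_getwcx (9);
}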
 

20.9.57 	_mm_insert_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_insert_pi8(__m64 a, int d, int n)
Description 
Inserts byte d into one of eight bytes of a. The selector n must be an immediate and its range must be 0 to 7.
If r = _mm_insert_pi8(a, d, n), the action is
r0 = (n==0) ? d[7:0] : a0;
r1 = (n==1) ? d[7:0] : a1;
r2 = (n==2) ? d[7:0] : a2;
r3 = (n==3) ? d[7:0] : a3;
r4 = (n==4) ? d[7:0] : a4;
r5 = (n==5) ? d[7:0] : a5;
r6 = (n==6) ? d[7:0] : a6;
r7 = (n==7) ? d[7:0] : a7;
This function uses the assembler instruction TINSRB.
 

20.9.58 	_mm_insert_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_insert_pi16(__m64 a, int d, int n)
Description 
Inserts half word d into one of four half words of a. The selector n must be an immediate and its range must be 0 to 3.
If r = _mm_insert_pi16(a, d, n), the action is
r0 = (n==0) ? d[15:0] : a0;
r1 = (n==1) ? d[15:0] : a1;
r2 = (n==2) ? d[15:0] : a2;
r3 = (n==3) ? d[15:0] : a3;
This function uses the assembler instruction TINSRH.
 

20.9.59 	_mm_insert_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_insert_pi32(__m64 a, int d, int n)
Description 
Inserts word d into one of the two words of a. The selector n must be an immediate and its range must be 0 to 1.
If r = _mm_insert_pi32(a, d, n), the action is
r0 = (n==0) ? d[31:0] : a0;
r1 = (n==1) ? d[31:0] : a1;
This function uses the assembler instruction TINSRW.
 

20.9.60 	_mm_mac_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mac_pi16 (__m64 m1, __m64 m2, __m64 m3)
Description 
Multiplies the four signed 16-bit values in m2 by the four 16-bit values in m3 and accumulates the result with the value in m1.
This function uses the assembler instruction WMACS.
 

20.9.61 	_mm_mac_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mac_pu16 (__m64 m1, __m64 m2, __m64 m3)
Description 
Multiplies the four unsigned 16-bit values in m2 by the four 16-bit values in m3 and accumulates the result with the value in m1.
This function uses the assembler instruction WMACU.
 

20.9.62 	_mm_macz_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_macz_pi16 (__m64 m1, __m64 m2)
Description 
Multiplies the four signed 16-bit values in m1 by the four signed 16-bit values in m2 and returns the result.
This function uses the assembler instruction WMACSZ.
 

20.9.63 	_mm_macz_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_macz_pu16 (__m64 m1, __m64 m2)
Description 
Multiplies the four unsigned 16-bit values in m1 by the four 16-bit values in m2 and accumulates the result with zero.
This function uses the assembler instruction WMACUZ.
 

20.9.64 	_mm_madd_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_madd_pi16 (__m64 m1, __m64 m2)
Description 
Multiplies four 16-bit values in m1 by four 16-bit values in m2, producing four 32-bit intermediate results, which are then summed up: the sum of the lower two products yields the lower word and the sum of the upper two products yields the upper word of the result.
This function uses the assembler instruction WMADDS.
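A plain-C reference sketch of this pairing (illustration only; assumes wrap-around, non-saturating 32-bit sums):

#include <stdint.h>

static uint64_t
madd_pi16_ref (uint64_t m1, uint64_t m2)
{
  int32_t p[4];
  int64_t lo, hi;
  int i;
  for (i = 0; i < 4; i++)
    {
      int16_t a = (int16_t) (m1 >> (i * 16));
      int16_t b = (int16_t) (m2 >> (i * 16));
      p[i] = (int32_t) a * b;                  /* four 32-bit products  */
    }
  lo = (int64_t) p[0] + p[1];                  /* sum of the lower two  */
  hi = (int64_t) p[2] + p[3];                  /* sum of the upper two  */
  return ((uint64_t) (uint32_t) hi << 32) | (uint32_t) lo;
}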
 

20.9.65 	_mm_madd_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_madd_pu16 (__m64 m1, __m64 m2)
Description 
Multiplies four unsigned 16-bit values in m1 by four unsigned 16-bit values in m2, producing four 32-bit intermediate results, which are then summed: the lower two products into the bottom word and the upper two products into the upper word of the result.
This function uses the assembler instruction WMADDU.
 

20.9.66 	_mm_maddx_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_maddx_pi16 (__m64 m1, __m64 m2)
Description 
Cross multiplies four signed 16-bit values in m1 by four signed 16-bit values in m2, producing four 32-bit intermediate results, which are then summed: the lower two products into the bottom word and the upper two products into the upper word of the result.
This function uses the assembler instruction WMADDSX.
 

20.9.67 	_mm_maddx_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_maddx_pu16 (__m64 m1, __m64 m2)
Description 
Cross multiplies four unsigned 16-bit values in m1 by four unsigned 16-bit values in m2, producing four 32-bit intermediate results, which are then summed: the lower two products into the bottom word and the upper two products into the upper word of the result.
This function uses the assembler instruction WMADDUX.
 

20.9.68 	_mm_msub_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_msub_pi16 (__m64 m1, __m64 m2)
Description 
Multiplies four signed 16-bit values in m1 by four signed 16-bit values in m2, producing four 32-bit intermediate results, and then does subtraction: the difference of the lower two products yields the lower word and the difference of the upper two products yields the upper word of the result.
This function uses the assembler instruction WMADDSN.
 

20.9.69 	_mm_msub_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_msub_pu16 (__m64 m1, __m64 m2)
Description 
Multiplies four unsigned 16-bit values in m1 by four unsigned 16-bit values in m2, producing four 32-bit intermediate results, and then does subtraction: the difference of the lower two products yields the lower word and the difference of the upper two products yields the upper word of the result.
This function uses the assembler instruction WMADDUN.
 

20.9.70 	_mm_max_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_max_pi8(__m64 a, __m64 b)
Description 
Computes the element-wise maximum of the bytes in a and b.
If r = _mm_max_pi8(a, b), the action is
r0 = max(a0, b0);
r1 = max(a1, b1);
...
r7 = max(a7, b7);
This function uses the assembler instruction WMAXSB.
 

20.9.71 	_mm_max_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_max_pi16(__m64 a, __m64 b)
Description 
Computes the element-wise maximum of the half words in a and b.
If r = _mm_max_pi16(a, b), the action is
r0 = max(a0, b0);
r1 = max(a1, b1);
r2 = max(a2, b2);
r3 = max(a3, b3);
This function uses the assembler instruction WMAXSH.
 

20.9.72 	_mm_max_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_max_pi32(__m64 a, __m64 b)
Description 
Computes the element-wise maximum of the words in a and b.
If r = _mm_max_pi32(a, b), the action is
r0 = max(a0, b0);
r1 = max(a1, b1);
This function uses the assembler instruction WMAXSW.
 

20.9.73 	_mm_max_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_max_pu8(__m64 a, __m64 b)
Description 
Computes the element-wise maximum of the unsigned bytes in a and b.
If r = _mm_max_pu8(a, b), the action is
r0 = max(a0, b0);
r1 = max(a1, b1);
...
r7 = max(a7, b7);
This function uses the assembler instruction WMAXUB.
 

20.9.74 	_mm_max_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_max_pu16(__m64 a, __m64 b)
Description 
Computes the element-wise maximum of the unsigned half words in a and b.
If r = _mm_max_pu16(a, b), the action is
r0 = max(a0, b0);
r1 = max(a1, b1);
r2 = max(a2, b2);
r3 = max(a3, b3);
This function uses the assembler instruction WMAXUH.
 

20.9.75 	_mm_max_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_max_pu32(__m64 a, __m64 b)
Description 
Computes the element-wise maximum of the unsigned words in a and b.
If r = _mm_max_pu32(a, b), the action is
r0 = max(a0, b0);
r1 = max(a1, b1);
This function uses the assembler instruction WMAXUW.
 

20.9.76 	_mm_merge_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_merge_si64 (__m64 a, __m64 b, const int n)
Description 
Extracts a 64-bit value that contains elements from the parameters a and b, and returns a merged 64-bit result. The number of elements, in bytes, to be taken from b is represented by the constant parameter n.
This function uses the assembler instruction WMERGE.
 

20.9.77 	_mm_mia_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mia_si64 (__m64 m1, int a, int b)
Description 
Multiplies two signed 32-bit values in a & b and accumulates the result with 64-bit value in m1.
This function uses the assembler instruction TMIA.
 

20.9.78 	_mm_miabb_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_miabb_si64(__m64 m1, int a, int b)
Description 
Multiplies bottom half of signed 16-bit values in a and bottom half of signed 16-bit value in b and accumulates the result with 64-bit values in m1.
Result = sign_extend(a[15:0] * b[15:0]) + m1;
This function uses the assembler instruction TMIABB.
 

20.9.79 	_mm_miabt_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_miabt_si64(__m64 m1, int a, int b)
Description 
Multiplies bottom half of signed 16-bit values in a and top half of signed 16-bit value in b and accumulates the result with 64-bit values in m1.
Result = sign_extend(a[15:0] * b[31:16]) + m1;
This function uses the assembler instruction TMIABT.
 

20.9.80 	_mm_miaph_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_miaph_si64 (__m64 m1, int a, int b)
Description 
Multiplies accumulate signed 16-bit values in a & b and accumulates the result with 64-bit values in m1.
Result = sign_extend((a[31:16] * b[31:16]) + (a[15:0] * b[15:0])) + m1;
This function uses the assembler instruction TMIAPH.
 

20.9.81 	_mm_miatb_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_miatb_si64(__m64 m1, int a, int b)
Description 
Multiplies top half of signed 16-bit values in a and bottom half of signed 16-bit value in b and accumulates the result with 64-bit values in m1.
Result = sign_extend(a[31:16] * b[15:0]) + m1;
This function uses the assembler instruction TMIATB.
 

20.9.82 	_mm_miatt_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_miatt_si64(__m64 m1, int a, int b)
Description 
Multiplies top half of signed 16-bit values in a and top half of signed 16-bit value in b and accumulates the result with 64-bit values in m1.
Result = sign_extend(a[31:16] * b[31:16]) + m1;
This function uses the assembler instruction TMIATT.
 

20.9.83 	_mm_min_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_min_pi8(__m64 a, __m64 b)
Description 
Computes the element-wise minimum of the bytes in a and b.
If r = _mm_min_pi8(a, b), the action is
r0 = min(a0, b0);
r1 = min(a1, b1);
...
r7 = min(a7, b7);
This function uses the assembler instruction WMINSB.
 

20.9.84 	_mm_min_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_min_pi16(__m64 a, __m64 b)
Description 
Computes the element-wise minimum of the half words in a and b.
If r = _mm_min_pi16(a, b), the action is
r0 = min(a0, b0);
r1 = min(a1, b1);
r2 = min(a2, b2);
r3 = min(a3, b3);
This function uses the assembler instruction WMINSH.
 

20.9.85 	_mm_min_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_min_pi32(__m64 a, __m64 b)
Description 
Computes the element-wise minimum of the words in a and b.
If r = _mm_min_pi32(a, b), the action is
r0 = min(a0, b0);
r1 = min(a1, b1);
This function uses the assembler instruction WMINSW.
 

20.9.86 	_mm_min_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_min_pu8(__m64 a, __m64 b)
Description 
Computes the element-wise minimum of the unsigned bytes in a and b.
If r = _mm_min_pu8(a, b), the action is
r0 = min(a0, b0);
r1 = min(a1, b1);
...
r7 = min(a7, b7);
This function uses the assembler instruction WMINUB.
 

20.9.87 	_mm_min_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_min_pu16(__m64 a, __m64 b)
Description 
Computes the element-wise minimum of the unsigned half words in a and b.
If r = _mm_min_pu16(a, b), the action is
r0 = min(a0, b0);
r1 = min(a1, b1);
r2 = min(a2, b2);
r3 = min(a3, b3);
This function uses the assembler instruction WMINUH.
 

20.9.88 	_mm_min_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_min_pu32(__m64 a, __m64 b)
Description 
Computes the element-wise minimum of the unsigned words in a and b.
If r = _mm_min_pu32(a, b), the action is
r0 = min(a0, b0);
r1 = min(a1, b1);
This function uses the assembler instruction WMINUW.
 

20.9.89 	_mm_movemask_pi8
 
Syntax 
#include <mmintrin.h>
int
_mm_movemask_pi8(__m64 a)
Description 
Creates an 8-bit mask from the most significant bits of the bytes in a.
If r = _mm_movemask_pi8(a), the action is
r = 0;
r = sign(a7)<<7 | sign(a6)<<6 |... | sign(a0);
This function uses the assembler instruction TMOVMSKB.
 

20.9.90 	_mm_movemask_pi16
 
Syntax 
#include <mmintrin.h>
int
_mm_movemask_pi16(__m64 a)
Description 
Creates a 4-bit mask from the most significant bits of the half words in a.
If r = _mm_movemask_pi16(a), the action is
r = 0;
r = sign(a3)<<3 | sign(a2)<<2 |... | sign(a0);
This function uses the assembler instruction TMOVMSKH.
 

20.9.91 	_mm_movemask_pi32
 
Syntax 
#include <mmintrin.h>
int
_mm_movemask_pi32(__m64 a)
Description 
Creates a 2-bit mask from the most significant bits of the words in a.
If r = _mm_movemask_pi32(a), the action is
r = 0;
r = sign(a1)<<1 | sign(a0);
This function uses the assembler instruction TMOVMSKW.
 

20.9.92 	_mm_mulhi_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhi_pi16(__m64 a, __m64 b)
Description 
Multiplies four signed 16-bit values in a by four signed 16-bit values in b and produces the upper 16 bits of the four results.
If r = _mm_mulhi_pi16(a, b), the action is
r0 = hiword(a0 * b0);
r1 = hiword(a1 * b1);
r2 = hiword(a2 * b2);
r3 = hiword(a3 * b3);
This function uses the assembler instruction WMULSM.
 

20.9.93 	_mm_mulhi_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhi_pi32 (__m64 a, __m64 b)
Description 
Performs a signed vector multiplication on the 32-bit words of parameters a and b, to produce 64-bit intermediate results. Only the higher 32 bits of the results are returned.
This function uses the assembler instruction WMULWSM.
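A plain-C reference sketch of the per-element behaviour (illustration only):

#include <stdint.h>

static uint64_t
mulhi_pi32_ref (uint64_t a, uint64_t b)
{
  uint64_t r = 0;
  int i;
  for (i = 0; i < 2; i++)
    {
      int32_t ai = (int32_t) (a >> (i * 32));
      int32_t bi = (int32_t) (b >> (i * 32));
      int64_t p = (int64_t) ai * bi;                         /* full 64-bit product */
      r |= (uint64_t) (uint32_t) (p >> 32) << (i * 32);      /* keep the high half  */
    }
  return r;
}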
 
 
20.9.94 	_mm_mulhi_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhi_pu16(__m64 a, __m64 b)
Description 
Multiplies four unsigned 16-bit values in a by four unsigned 16-bit values in b and produces the upper 16 bits of the four results.
If r = _mm_mulhi_pu16(a, b), the action is
r0 = hiword(a0 * b0);
r1 = hiword(a1 * b1);
r2 = hiword(a2 * b2);
r3 = hiword(a3 * b3);
This function uses the assembler instruction WMULUM.
 

20.9.95 	_mm_mulhi_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhi_pu32 (__m64 a, __m64 b)
Description 
Performs an unsigned vector multiplication on the 32-bit words of parameters a and b, to produce 64-bit intermediate results. Only the higher 32 bits of the results are returned.
This function uses the assembler instruction WMULWUM.
 
 
20.9.96 	_mm_mulhir_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhir_pi16 (__m64 a, __m64 b)
Description 
Performs a signed vector multiplication on the 16-bit words of parameters a and b, to produce 32-bit intermediate results. Then the function rounds the least significant 16 bits into the most significant 16 bits. Only the higher 16 bits of the results are returned.
This function uses the assembler instruction WMULSMR.
 
 
20.9.97 	_mm_mulhir_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhir_pu16 (__m64 a, __m64 b)
Description 
Performs an unsigned vector multiplication on the 16-bit words of parameters a and b, to produce 32-bit intermediate results. Then the function rounds the least significant 16 bits into the most significant 16 bits. Only the higher 16 bits of the results are returned.
This function uses the assembler instruction WMULUMR.
 
 
20.9.98 	_mm_mulhir_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhir_pi32 (__m64 a, __m64 b)
Description 
Performs a signed vector multiplication on the 32-bit words of parameters a and b, to produce 64-bit intermediate results. Then the function rounds the least significant 32 bits into the most significant 32 bits. Only the higher 32 bits of the results are returned.
This function uses the assembler instruction WMULWSMR.
 
 
20.9.99 	_mm_mulhir_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mulhir_pu32 (__m64 a, __m64 b)
Description 
Performs an unsigned vector multiplication on the 32-bit words of parameters a and b, to produce 64-bit intermediate results. Then the function rounds the least significant 32 bits into the most significant 32 bits. Only the higher 32 bits of the results are returned.
This function uses the assembler instruction WMULWUMR.
 
 
20.9.100 	_mm_mullo_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mullo_pi16 (__m64 a, __m64 b)
Description 
Multiplies the four 16-bit values in a by the four 16-bit values in b and produces the lower 16 bits of the four results.
If r = _mm_mullo_pi16(a, b), the action is
r0 = lowword(a0 * b0);
r1 = lowword(a1 * b1);
r2 = lowword(a2 * b2);
r3 = lowword(a3 * b3);
This function uses the assembler instruction WMULUL.
 

20.9.101 	_mm_mullo_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_mullo_pi32 (__m64 a, __m64 b)
Description 
Performs a vector multiplication on the 32-bit words of parameters a and b, to produce 64-bit intermediate results. Only the lower 32 bits of the results are returned.
This function uses the assembler instruction WMULWL.
 
 
20.9.102 	_mm_or_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_or_si64 (__m64 m1, __m64 m2)
Description 
Performs a bitwise OR of the 64-bit value in m1 with the 64-bit value in m2.
This function uses the assembler instruction WOR.
 

20.9.103 	_mm_packs_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_packs_pi16 (__m64 m1, __m64 m2)
Description 
Packs the four 16-bit values from m1 into the lower four 8-bit values of the result with signed saturation, and packs the four 16-bit values from m2 into the upper four 8-bit values of the result with signed saturation.
This function uses the assembler instruction WPACKHSS.
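A hedged sketch of the usual narrowing use, turning eight signed 16-bit samples back into saturated signed bytes (the variable names are illustrative):

  /* lo holds samples 0-3, hi holds samples 4-7; each result byte is
     clamped to [-128, 127].  */
  static __m64 narrow_to_bytes (__m64 lo, __m64 hi)
  {
    return _mm_packs_pi16 (lo, hi);
  }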
 

20.9.104 	_mm_packs_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_packs_pi32 (__m64 m1, __m64 m2)
Description 
Packs the two 32-bit values from m1 into the lower two 16-bit values of the result with signed saturation, and packs the two 32-bit values from m2 into the upper two 16-bit values of the result with signed saturation.
This function uses the assembler instruction WPACKWSS.
 

20.9.105 	_mm_packs_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_packs_pu16 (__m64 m1, __m64 m2)
Description 
Packs the four 16-bit values from m1 into the lower four 8-bit values of the result with unsigned saturation, and packs the four 16-bit values from m2 into the upper four 8-bit values of the result with unsigned saturation.
This function uses the assembler instruction WPACKHUS.
 

20.9.106 	_mm_packs_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_packs_pu32 (__m64 m1, __m64 m2)
Description 
Packs the two 32-bit values from m1 into the lower two 16-bit values of the result with unsigned saturation, and packs the two 32-bit values from m2 into the upper two 16-bit values of the result with unsigned saturation.
This function uses the assembler instruction WPACKWUS.
 

20.9.107 	_mm_packs_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_packs_si64 (__m64 m1, __m64 m2)
Description 
Packs the 64-bit value from m1 into the lower 32-bit value of the result with signed saturation, and packs the 64-bit value from m2 into the upper 32-bit value of the result with signed saturation.
This function uses the assembler instruction WPACKDSS.
 

20.9.108 	_mm_packs_su64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_packs_su64 (__m64 m1, __m64 m2)
Description 
Packs the 64-bit value from m1 into the lower 32-bit value of the result with unsigned saturation, and packs the 64-bit value from m2 into the upper 32-bit value of the result with unsigned saturation.
This function uses the assembler instruction WPACKDUS.
 

20.9.109 	_mm_qmiabb_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiabb_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the lower 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then adds the two 32-bit results to the two words of the parameter acc and returns the result of this addition.
This function uses the assembler instruction WQMIABB.
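As a sketch of the lane behaviour described above (the helper name is illustrative):

  /* Sketch: word 0 of acc += h0(m1) * h0(m2); word 1 of acc += h2(m1) * h2(m2),
     where h0..h3 are the four 16-bit halfwords of a __m64 value.  */
  static __m64 mac_bottom_halfwords (__m64 acc, __m64 m1, __m64 m2)
  {
    return _mm_qmiabb_pi32 (acc, m1, m2);
  }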
 
 
20.9.110 	_mm_qmiabbn_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiabbn_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the lower 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WQMIABBN.
 
 
20.9.111 	_mm_qmiabt_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiabt_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the lower 16 bits (halfwords) of each of the two words of parameter m1 and the higher 16 bits (halfwords) of each of the two words of the parameter m2, then adds the two 32-bit results to the two words of the parameter acc and returns the result of this addition.
This function uses the assembler instruction WQMIABT.
 
 
20.9.112 	_mm_qmiabtn_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiabtn_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the lower 16 bits (halfwords) of each of the two words of parameter m1 and the higher 16 bits (halfwords) of each of the two words of the parameter m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WQMIABTN.
 
 
20.9.113 	_mm_qmiatb_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiatb_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the higher 16 bits (halfwords) of each of the two words of parameter m1 and the lower 16 bits (halfwords) of each of the two words of the parameter m2, then adds the two 32-bit results to the two words of the parameter acc and returns the result of this addition.
This function uses the assembler instruction WQMIATB.
 
 
20.9.114 	_mm_qmiatbn_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiatbn_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the higher 16 bits (halfwords) of each of the two words of parameter m1 and the lower 16 bits (halfwords) of each of the two words of the parameter m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WQMIATBN.
 
 
20.9.115 	_mm_qmiatt_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiatt_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the higher 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then adds the two 32-bit results to the two words of the parameter acc and returns the result of this addition.
This function uses the assembler instruction WQMIATT.
 
 
20.9.116 	_mm_qmiattn_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmiattn_pi32 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit multiplication with the higher 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WQMIATTN.
 
 
20.9.117 	_mm_qmulm_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmulm_pi16 (__m64 a, __m64 b)
Description 
Performs parallel vector multiplication on the four 16-bit halfwords of the parameters a and b. The higher order 16 bits of the four 32-bit intermediate results are returned.
This function uses the assembler instruction WQMULM.
 
 
20.9.118 	_mm_qmulm_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmulm_pi32 (__m64 a, __m64 b)
Description 
Performs parallel vector multiplication on the two 32-bit words of the parameters a and b. The higher order 32 bits of the two 64-bit results are returned.
This function uses the assembler instruction WQMULWM.
 
 
20.9.119 	_mm_qmulmr_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmulmr_pi16 (__m64 a, __m64 b)
Description 
Performs parallel vector multiplication on the four 16-bit halfwords of the parameters a and b. The higher order 16 bits of the four 32-bit intermediate results are returned, with the least significant 16 bits rounded into the most significant 16 bits.
This function uses the assembler instruction WQMULMR.
 
 
20.9.120 	_mm_qmulmr_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_qmulmr_pi32 (__m64 a, __m64 b)
Description 
Performs parallel vector multiplication on the two 32-bit words of the parameters a and b. The higher order 32 bits of the two 64-bit results are returned, with the least significant 32 bits rounded into the most significant 32 bits.
This function uses the assembler instruction WQMULWMR.
 
 
20.9.121 	_mm_ror_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_ror_pi16 (__m64 m, __m64 count)
Description 
Rotates four 16-bit values in m right the amount specified by count.
 
20.9.122 	_mm_ror_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_ror_pi32 (__m64 m, __m64 count)
Description 
Rotates two 32-bit values in m right the amount specified by count.
 
20.9.123 	_mm_ror_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_ror_si64 (__m64 m, __m64 count)
Description 
Rotates 64-bit value in m right the amount specified by count.
 
20.9.124 	_mm_rori_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_rori_pi16 (__m64 m, int count)
Description 
Rotates four 16-bit values in m right the amount specified by count. The count must be in the range 0 to 32.
 
20.9.125 	_mm_rori_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_rori_pi32 (__m64 m, int count)
Description 
Rotates two 32-bit values in m right the amount specified by count. The count must be in the range 0 to 32.
 
20.9.126 	_mm_rori_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_rori_si64 (__m64 m, int count)
Description 
Rotates the 64-bit value in m right the amount specified by count. The count must be in the range 0 to 64.
 
 
20.9.127 	_mm_sad_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sad_pu8(__m64 a, __m64 b)
Description 
Computes the sum of the absolute differences of the unsigned bytes in a and b, returning the value in the lower word. The upper word of the result is cleared.
If r = _mm_sad_pu8(a, b), the action is
r [0:31] = abs(a0-b0) +... + abs(a7-b7);
r[32:63] = 0;
This function uses the assembler instruction WSADBZ.
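A small usage sketch (the helper name is illustrative; the conversions through __int64 are assumptions about typical use, not part of this entry):

  /* Sketch: SAD of two 8-byte blocks as a plain int; the upper word of the
     intrinsic result is already zero, so only the low 32 bits matter.  */
  static int block_sad_8 (__m64 a, __m64 b)
  {
    return _mm_cvtsi64_si32 (_mm_cvtm64_si64 (_mm_sad_pu8 (a, b)));
  }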
 

20.9.128 	_mm_sad_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sad_pu16(__m64 a, __m64 b)
Description 
Computes the sum of the absolute differences of the unsigned half words in a and b, returning the value in the lower word. The upper word of the result is cleared.
If r = _mm_sad_pu16(a, b), the action is
r [0:31] = abs(a0-b0) +... + abs(a3-b3);
r[32:63] = 0;
This function uses the assembler instruction WSADHZ.
 

20.9.129 	_mm_sada_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sada_pu8(__m64 a, __m64 b, __m64 c)
Description 
Computes the sum of the absolute differences of the bytes in b and c, and accumulates the result with the lower word of a. The upper word of the result is cleared.
If r = _mm_sada_pu8(a, b, c), the action is
r [0:31] = a[0:31] + abs(b0-c0) +... + abs(b7-c7);
r[32:63] = 0;
This function uses the assembler instruction WSADB.
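Building on the previous entry, a hedged sketch of accumulating the SAD of two 8-byte block pairs without leaving the SIMD registers (names are illustrative):

  static int two_block_sad (__m64 a0, __m64 b0, __m64 a1, __m64 b1)
  {
    __m64 acc = _mm_setzero_si64 ();
    acc = _mm_sada_pu8 (acc, a0, b0);   /* low word  = SAD of first pair   */
    acc = _mm_sada_pu8 (acc, a1, b1);   /* low word += SAD of second pair  */
    return _mm_cvtsi64_si32 (_mm_cvtm64_si64 (acc));
  }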
 

20.9.130 	_mm_sada_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sada_pu16(__m64 a, __m64 b, __m64 c)
Description 
Computes the sum of the absolute differences of the half words in b and c, and accumulates the result with the lower word of a. The upper word of the result is cleared.
If r = _mm_sada_pu16(a, b, c), the action is
r [0:31] = a[0:31] + abs(b0-c0) +... + abs(b3-c3);
r[32:63] = 0;
This function uses the assembler instruction WSADH.
 

20.9.131 	_mm_set_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_set_pi16 (short w3, short w2, short w1, short w0)
Description 
Sets the 4 signed 16-bit integer values.
If r = _mm_set_pi16 (w3, w2, w1, w0), the action is
r0 = w0;
r1 = w1;
r2 = w2;
r3 = w3;
 
20.9.132 	_mm_set_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_set_pi32 (int i1, int i0)
Description 
Sets the 2 signed 32-bit integer values.
If r = _mm_set_pi32(i1, i0), the action is
r0 = i0;
r1 = i1;
 
20.9.133 	_mm_set_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_set_pi8 (char b7, char b6,
             char b5, char b4,
             char b3, char b2,
             char b1, char b0)
Description 
Sets the 8 signed 8-bit integer values.
If r = _mm_set_pi8 (b7, b6, b5, b4, b3, b2, b1, b0), the action is
r0 = b0;
r1 = b1;
...
r7 = b7;
 
20.9.134 	_mm_set1_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_set1_pi16 (short w)
Description 
Sets the 4 signed 16-bit integer values to w.
If r = _mm_set1_pi16 (w), action is
r0 = w;
r1 = w;
r2 = w;
r3 = w;
 
20.9.135 	_mm_set1_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_set1_pi32 (int i)
Description 
Sets the 2 signed 32-bit integer values to i.
If r = _mm_set1_pi32 (i), the action is
r0 = i;
r1 = i;
 
20.9.136 	_mm_set1_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_set1_pi8 (char b)
Description 
Sets the 8 signed 8-bit integer values to b.
If r = _mm_set1_pi8 (b), the action is
r0 = b;
r1 = b;
...
r7 = b;
 
20.9.137 	_mm_setr_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_setr_pi16 (short w0, short w1, short w2, short w3)
Description 
Sets the 4 signed 16-bit integer values in reverse order.
If r = _mm_setr_pi16 (w0, w1, w2, w3), the action is
r0 = w0;
r1 = w1;
r2 = w2;
r3 = w3;
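To make the argument-order difference from _mm_set_pi16 concrete, a small sketch (the values are arbitrary); both calls build the same vector, with 1 in lane 0 and 4 in lane 3:

  static void set_order_example (void)
  {
    __m64 v1 = _mm_set_pi16  (4, 3, 2, 1);   /* most-significant lane first  */
    __m64 v2 = _mm_setr_pi16 (1, 2, 3, 4);   /* least-significant lane first */
    (void) v1; (void) v2;
  }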
 
20.9.138 	_mm_setr_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_setr_pi32 (int i0, int i1)
Description 
Sets the 2 signed 32-bit integer values in reverse order.
If r = _mm_setr_pi32 (i0, i1), the action is
r0 = i0;
r1 = i1;
 
20.9.139 	_mm_setr_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_setr_pi8 (char b0, char b1, char b2, char b3,
                    char b4, char b5,
                    char b6, char b7)
Description 
Sets the 8 signed 8-bit integer values in reverse order.
If r = _mm_setr_pi8 (b0, b1, b2, b3, b4, b5, b6, b7), the action is
r0 = b0;
r1 = b1;
...
r7 = b7;
 
20.9.140 	_mm_setwcx
 
Syntax 
#include <mmintrin.h>
void
_mm_setwcx(int value, int number)
Description 
Sets the Intel Wireless MMX technology control register specified by number to the contents of value.
This function uses the assembler pseudo-instruction TMCR.
Note:  	The valid range for parameter number is [0, 3] and [8, 11]. The valid control registers are: wCID(0), wCon(1), wCSSF(2), wCASF(3), wCGR0(8), wCGR1(9), wCGR2(10), wCGR3(11).
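As a hedged sketch of a typical use, loading a shift count into the general-purpose control register wCGR0 (register number 8); the helper name is illustrative:

  /* Sketch: store n into wCGR0 so that later register-specified shift or
     rotate operations can pick it up.  */
  static void load_wcgr0 (int n)
  {
    _mm_setwcx (n, 8);
  }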

 

20.9.141 	_mm_setzero_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_setzero_si64 ()
Description 
Sets the 64-bit value to zero.
If r = _mm_setzero_si64(), the action is 
r = 0x0;
 
20.9.142 	_mm_shuffle_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_shuffle_pi16(__m64 a, int n)
Description 
Returns a combination of the four half words of a. The selector n must be an immediate and its range must be 0 to 255.
If r = _mm_shuffle_pi16(a, n), the action is 
r0 = Half word (n&0x3) of a
r1 = Half word ((n>>2)&0x3) of a
r2 = Half word ((n>>4)&0x3) of a
r3 = Half word ((n>>6)&0x3) of a
This function uses the assembler instruction WSHUFH.
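For instance, a sketch that reverses the four halfwords; since n must be an immediate, a macro is used here (the macro name is illustrative). The selector 0x1B is 0b00011011: lane 0 takes a3, lane 1 takes a2, lane 2 takes a1, lane 3 takes a0.

  #define REVERSE_HALFWORDS(a)  _mm_shuffle_pi16 ((a), 0x1B)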
 

20.9.143 	_mm_sll_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sll_pi16 (__m64 m, __m64 count)
Description 
Shifts four 16-bit values in m left the amount specified by count while shifting in zeros.
 
20.9.144 	_mm_sll_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sll_pi32 (__m64 m, __m64 count)
Description 
Shifts two 32-bit values in m left the amount specified by count while shifting in zeros.
 
20.9.145 	_mm_sll_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sll_si64 (__m64 m, __m64 count)
Description 
Shifts the 64-bit value in m left the amount specified by count while shifting in zeros.
 
20.9.146 	_mm_slli_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_slli_pi16 (__m64 m, int count)
Description 
Shifts four 16-bit values in m left the amount specified by count while shifting in zeros. The count must be no less than 0.
 
20.9.147 	_mm_slli_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_slli_pi32 (__m64 m, int count)
Description 
Shifts two 32-bit values in m left the amount specified by count while shifting in zeros. The count must be no less than 0.
 
20.9.148 	_mm_slli_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_slli_si64 (__m64 m, int count)
Description 
Shifts the 64-bit value in m left the amount specified by count while shifting in zeros. The count must be no less than 0.
 
20.9.149 	_mm_sra_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sra_pi16 (__m64 m, __m64 count)
Description 
Shifts four 16-bit values in m right the amount specified by count while shifting in the sign bit.
 
20.9.150 	_mm_sra_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sra_pi32 (__m64 m, __m64 count)
Description 
Shifts two 32-bit values in m right the amount specified by count while shifting in the sign bit.
 
20.9.151 	_mm_sra_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sra_si64 (__m64 m, __m64 count)
Description 
Shifts 64-bit value in m right the amount specified by count while shifting in the sign bit.
 
20.9.152 	_mm_srai_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srai_pi16 (__m64 m, int count)
Description 
Shifts four 16-bit values in m right the amount specified by count while shifting in the sign bit. The count must be no less than 0.
 
20.9.153 	_mm_srai_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srai_pi32 (__m64 m, int count)
Description 
Shifts two 32-bit values in m right the amount specified by count while shifting in the sign bit. The count must be no less than 0.
 
20.9.154 	_mm_srai_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srai_si64 (__m64 m, int count)
Description 
Shifts 64-bit value in m right the amount specified by count while shifting in the sign bit. The count must be no less than 0.
 
20.9.155 	_mm_srl_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srl_pi16 (__m64 m, __m64 count)
Description 
Shifts four 16-bit values in m right the amount specified by count while shifting in zeros.
 
20.9.156 	_mm_srl_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srl_pi32 (__m64 m, __m64 count)
Description 
Shifts two 32-bit values in m right the amount specified by count while shifting in zeros.
 
20.9.157 	_mm_srl_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srl_si64 (__m64 m, __m64 count)
Description 
Shifts the 64-bit value in m right the amount specified by count while shifting in zeros.
 
20.9.158 	_mm_srli_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srli_pi16 (__m64 m, int count)
Description 
Shifts four 16-bit values in m right the amount specified by count while shifting in zeros. The count must be no less than 0.
 
20.9.159 	_mm_srli_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srli_pi32 (__m64 m, int count)
Description 
Shifts two 32-bit values in m right the amount specified by count while shifting in zeros. The count must be no less than 0.
 
20.9.160 	_mm_srli_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_srli_si64 (__m64 m, int count)
Description 
Shifts the 64-bit value in m right the amount specified by count while shifting in zeros. The count must be no less than 0.
 
20.9.161 	_mm_sub_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sub_pi8 (__m64 m1, __m64 m2)
Description 
Subtracts the eight 8-bit values in m2 from the eight 8-bit values in m1.
This function uses the assembler instruction WSUBB.
 

20.9.162 	_mm_sub_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sub_pi16 (__m64 m1, __m64 m2)
Description 
Subtracts the four 16-bit values in m2 from the four 16-bit values in m1.
This function uses the assembler instruction WSUBH.
 

20.9.163 	_mm_sub_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_sub_pi32 (__m64 m1, __m64 m2)
Description 
Subtracts the two 32-bit values in m2 from the two 32-bit values in m1.
This function uses the assembler instruction WSUBW.
 

20.9.164 	_mm_subaddhx_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_subaddhx_pi16 (__m64 a, __m64 b)
Description 
The four halfwords of parameter b are alternately added to and subtracted from the halfwords of parameter a, using a cross selection in each of the parallel operations. The result of the operation is saturated to the signed limits and returned.
This function uses the assembler instruction WSUBADDHX.
 
 
20.9.165 	_mm_subs_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_subs_pi8 (__m64 m1, __m64 m2)
Description 
Subtracts the eight signed 8-bit values in m2 from the eight signed 8-bit values in m1 using saturating arithmetic.
This function uses the assembler instruction WSUBBSS.
 

20.9.166 	_mm_subs_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_subs_pi16 (__m64 m1, __m64 m2)
Description 
Subtracts the four signed 16-bit values in m2 from the four signed 16-bit values in m1 using saturating arithmetic.
This function uses the assembler instruction WSUBHSS.
 

20.9.167 	_mm_subs_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_subs_pi32 (__m64 m1, __m64 m2)
Description 
Subtracts the two signed 32-bit values in m2 from the two signed 32-bit values in m1 using saturating arithmetic.
This function uses the assembler instruction WSUBWSS.
 

20.9.168 	_mm_subs_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_subs_pu8 (__m64 m1, __m64 m2)
Description 
Subtracts the eight unsigned 8-bit values in m2 from the eight unsigned 8-bit values in m1 using saturating arithmetic.
This function uses the assembler instruction WSUBBUS.
 

20.9.169 	_mm_subs_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_subs_pu16 (__m64 m1, __m64 m2)
Description 
Subtracts the four unsigned 16-bit values in m2 from the four unsigned 16-bit values in m1 using saturating arithmetic.
This function uses the assembler instruction WSUBHUS.
 

20.9.170 	_mm_subs_pu32 
 
Syntax 
#include <mmintrin.h>
__m64
_mm_subs_pu32 (__m64 m1, __m64 m2)
Description 
Subtracts the two unsigned 32-bit values in m2 from the two unsigned 32-bit values in m1 using saturating arithmetic.
This function uses the assembler instruction WSUBWUS.
 

20.9.171 	_mm_tandcb 
 
Syntax 
#include <mmintrin.h>
void
_mm_tandcb ()
Description 
Performs "AND" across the fields of the SIMD processor status register (PSR) (wCASF) and sends the result to the ARM* CPSR; performed after a byte operation that sets the flags.
This function uses the assembler instruction TANDCB.
 

20.9.172 	_mm_tandch 
 
Syntax 
#include <mmintrin.h>
void
_mm_tandch ()
Description 
Performs "AND" across the fields of the SIMD processor status register (PSR) (wCASF) and sends the result to the ARM* CPSR; performed after a half-word operation that sets the flags.
This function uses the assembler instruction TANDCH.
 

20.9.173 	_mm_tandcw 
 
Syntax 
#include <mmintrin.h>
void
_mm_tandcw ()
Description 
Performs "AND" across the fields of the SIMD processor status register (PSR) (wCASF) and sends the result to the ARM* CPSR; performed after a word operation that sets the flags.
This function uses the assembler instruction TANDCW.
 

20.9.174 	_mm_tbcst_pi8 
 
Syntax 
#include <mmintrin.h>
__m64
_mm_tbcst_pi8 ( int value)
Description 
Broadcasts a value from the ARM* source register, Rn, to every SIMD position in the Intel Wireless MMX 2 coprocessor destination register, wRd; operates on 8-bit data values.
This function uses the assembler instruction TBCSTB.
 

20.9.175 	_mm_tbcst_pi16 
 
Syntax 
#include <mmintrin.h>
__m64
_mm_tbcst_pi16 ( int value)
Description 
Broadcasts a value from the ARM* source register, Rn, to every SIMD position in the Intel Wireless MMX 2 coprocessor destination register, wRd; operates on 16-bit data values.
This function uses the assembler instruction TBCSTH.
 

20.9.176 	_mm_tbcst_pi32 
 
Syntax 
#include <mmintrin.h>
__m64
_mm_tbcst_pi32 ( int value)
Description 
Broadcasts a value from the ARM* source register, Rn, to every SIMD position in the Intel Wireless MMX 2 coprocessor destination register, wRd; operates on 32-bit data values.
This function uses the assembler instruction TBCSTW.
 

20.9.177 	_mm_textrcb 
 
Syntax 
#include <mmintrin.h>
void
_mm_textrcb(n)
Description 
Extracts 4-bit field specified by the 3-bit immediate n from the SIMD PSR (wCASF), and transfers to the ARM* CPSR. The range of n is 0 to 7.
This function uses the assembler instruction TEXTRCB.
 

20.9.178 	_mm_textrch 
 
Syntax 
#include <mmintrin.h>
void
_mm_textrch(n)
Description 
Extracts 8-bit field specified by the 3-bit immediate n from the SIMD PSR (wCASF), and transfers to the ARM* CPSR. The range of n is 0 to 3.
This function uses the assembler instruction TEXTRCH.
 

20.9.179 	_mm_textrcw 
 
Syntax 
#include <mmintrin.h>
void
_mm_textrcw(n)
Description 
Extracts 16-bit field specified by the 3-bit immediate n from the SIMD PSR (wCASF), and transfers to the ARM* CPSR. The range of n is 0 to 1.
This function uses the assembler instruction TEXTRCW.
 

20.9.180 	_mm_torcb 
 
Syntax 
#include <mmintrin.h>
void
_mm_torcb()
Description 
Performs "OR" across the fields of the SIMD PSR (wCASF) and sends the result to the ARM* CPSR; operation is performed after a byte operation that sets the flags.
This function uses the assembler instruction TORCB.
 

20.9.181 	_mm_torch 
 
Syntax 
#include <mmintrin.h>
void
_mm_torch()
Description 
Performs "OR" across the fields of the SIMD PSR (wCASF) and sends the result to the ARM* CPSR; operation is performed after a half-word operation that sets the flags.
This function uses the assembler instruction TORCH.
 

20.9.182 	_mm_torcw 
 
Syntax 
#include <mmintrin.h>
void
_mm_torcw()
Description 
Performs "OR" across the fields of the SIMD PSR (wCASF) and sends the result to the ARM* CPSR; operation is performed after a word operation that sets the flags.
This function uses the assembler instruction TORCW.
 

20.9.183 	_mm_torvscb 
 
Syntax 
#include <mmintrin.h>
void
_mm_torvscb()
Description 
Performs "OR" across the fields of the SIMD saturation flags (wCSSF) and sends the result to the ARM* CPSR Overflow (V) flag; operation is performed after a byte operation that sets the flags.
This function uses the assembler instruction TORVSCB.
 

20.9.184 	_mm_torvsch 
 
Syntax 
#include <mmintrin.h>
void
_mm_torvsch()
Description 
Performs "OR" across the fields of the SIMD saturation flags (wCSSF) and sends the result to the ARM* CPSR Overflow (V) flag; operation can be performed after a half-word operation that sets the flags.
This function uses the assembler instruction TORVSCH.
 

20.9.185 	_mm_torvscw 
 
Syntax 
#include <mmintrin.h>
void
_mm_torvscw()
Description 
Performs "OR" across the fields of the SIMD saturation flags (wCSSF) and sends the result to the ARM* CPSR Overflow (V) flag; operation can be performed after a word operation that sets the flags.
This function uses the assembler instruction TORVSCW.
 

20.9.186 	_mm_unpackeh_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackeh_pi8 (__m64 m1)
Description 
Unpacks the four 8-bit values from the upper half of m1 and sign-extends each value.
This function uses the assembler instruction WUNPCKEHSB.
 

20.9.187 	_mm_unpackeh_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackeh_pi16 (__m64 m1)
Description 
Unpacks the two 16-bit values from the upper half of m1 and sign-extends each value.
This function uses the assembler instruction WUNPCKEHSH.
 

20.9.188 	_mm_unpackeh_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackeh_pi32 (__m64 m1)
Description 
Unpacks the 32-bit value from the upper half of m1 and sign-extends each value.
This function uses the assembler instruction WUNPCKEHSW.
 

20.9.189 	_mm_unpackeh_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackeh_pu8 (__m64 m1)
Description 
Unpacks the four 8-bit values from the upper half of m1 and zero-extends each value.
This function uses the assembler instruction WUNPCKEHUB.
 

20.9.190 	_mm_unpackeh_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackeh_pu16 (__m64 m1)
Description 
Unpacks the two 16-bit values from the upper half of m1 and zero-extends each value.
This function uses the assembler instruction WUNPCKEHUH.
 

20.9.191 	_mm_unpackeh_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackeh_pu32 (__m64 m1)
Description 
Unpacks the 32-bit value from the upper half of m1 and zero-extends each value.
This function uses the assembler instruction WUNPCKEHUW.
 

20.9.192 	_mm_unpackel_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackel_pi8 (__m64 m1)
Description 
Unpacks the four 8-bit values from the lower half of m1 and sign-extends each value.
This function uses the assembler instruction WUNPCKELSB.
 

20.9.193 	_mm_unpackel_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackel_pi16 (__m64 m1)
Description 
Unpacks the two 16-bit values from the lower half of m1 and sign-extends each value.
This function uses the assembler instruction WUNPCKELSH.
 

20.9.194 	_mm_unpackel_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackel_pi32 (__m64 m1)
Description 
Unpacks the 32-bit value from the lower half of m1 and sign-extends each value.
This function uses the assembler instruction WUNPCKELSW.
 

20.9.195 	_mm_unpackel_pu8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackel_pu8 (__m64 m1)
Description 
Unpacks the four 8-bit values from the lower half of m1 and zero-extends each value.
This function uses the assembler instruction WUNPCKELUB.
 

20.9.196 	_mm_unpackel_pu16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackel_pu16 (__m64 m1)
Description 
Unpacks the two 16-bit values from the lower half of m1 and zero-extends each value.
This function uses the assembler instruction WUNPCKELUH.
 

20.9.197 	_mm_unpackel_pu32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackel_pu32 (__m64 m1)
Description 
Unpacks the 32-bit value from the lower half of m1 and zero-extends each value.
This function uses the assembler instruction WUNPCKELUW.
 

 
20.9.198 	_mm_unpackhi_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackhi_pi8 (__m64 m1, __m64 m2)
Description 
Interleaves the four 8-bit values from the upper half of m1 with the four values from the upper half of m2. The interleaving begins with the data from m1.
This function uses the assembler instruction WUNPCKIHB.
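A short sketch of a typical interleaving use (the parameter names are illustrative): combining the upper four bytes of two separate planes into byte pairs.

  /* Sketch: result bytes are r4 g4 r5 g5 r6 g6 r7 g7 (m1 data first).  */
  static __m64 interleave_hi (__m64 red, __m64 green)
  {
    return _mm_unpackhi_pi8 (red, green);
  }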
 

20.9.199 	_mm_unpackhi_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackhi_pi16 (__m64 m1, __m64 m2)
Description 
Interleaves the two 16-bit values from the upper half of m1 with the two values from the upper half of m2. The interleaving begins with the data from m1.
This function uses the assembler instruction WUNPCKIHH.
 

20.9.200 	_mm_unpackhi_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpackhi_pi32 (__m64 m1, __m64 m2)
Description 
Interleaves the 32-bit value from the upper half of m1 with the 32-bit value from the upper half of m2. The interleaving begins with the data from m1.
This function uses the assembler instruction WUNPCKIHW.
 

20.9.201 	_mm_unpacklo_pi8
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpacklo_pi8 (__m64 m1, __m64 m2)
Description 
Interleaves the four 8-bit values from the lower half of m1 with the four values from the lower half of m2. The interleaving begins with the data from m1.
This function uses the assembler instruction WUNPCKILB.
 

20.9.202 	_mm_unpacklo_pi16
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpacklo_pi16 (__m64 m1, __m64 m2)
Description 
Interleaves the two 16-bit values from the lower half of m1 with the two values from the lower half of m2. The interleaving begins with the data from m1.
This function uses the assembler instruction WUNPCKILH.
 

20.9.203 	_mm_unpacklo_pi32
 
Syntax 
#include <mmintrin.h>
__m64
_mm_unpacklo_pi32 (__m64 m1, __m64 m2)
Description 
Interleaves the 32-bit value from the lower half of m1 with the 32-bit value from the lower half of m2. The interleaving begins with the data from m1. 
This function uses the assembler instruction WUNPCKILW.
 

20.9.204 	_mm_wmiabb_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiabb_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the lower 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIABB.
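As a sketch, the bottom-halfword part of a dot product accumulated into a single 64-bit sum (the array layout, names and the use of _mm_cvtm64_si64 are assumptions for illustration); pairing this with _mm_wmiatt_si64 on the same data would cover the top halfwords as well.

  /* Sketch: acc += x[i].h0 * y[i].h0 + x[i].h2 * y[i].h2 for every element;
     only the bottom halfword of each 32-bit word takes part.  */
  static long long dot_bottom_halfwords (const __m64 *x, const __m64 *y, int n)
  {
    __m64 acc = _mm_setzero_si64 ();
    for (int i = 0; i < n; i++)
      acc = _mm_wmiabb_si64 (acc, x[i], y[i]);
    return _mm_cvtm64_si64 (acc);
  }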
 
 
20.9.205 	_mm_wmiabbn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiabbn_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the lower 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIABBN.
 
 
20.9.206 	_mm_wmiabt_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiabt_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the lower 16 bits (halfwords) of each of the two words of the parameter m1 and the upper 16 bits (halfwords) of each of the two words of the parameter m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIABT.
 
 
20.9.207 	_mm_wmiabtn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiabtn_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the lower 16 bits (halfwords) of each of the two words of the parameter m1 and the upper 16 bits (halfwords) of each of the two words of the parameter m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIABTN.
 
 
20.9.208 	_mm_wmiatb_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiatb_si64 (__m64 acc,__m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the upper 16 bits (halfwords) of each of the two words of the parameter m1 and the lower 16 bits (halfwords) of each of the two words of the parameter m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIATB.
 
 
20.9.209 	_mm_wmiatbn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiatbn_si64 (__m64 acc,__m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the upper 16 bits (halfwords) of each of the two words of the parameter m1 and the lower 16 bits (halfwords) of each of the two words of the parameter m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIATBN.
 
 
20.9.210 	_mm_wmiatt_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiatt_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the upper 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIATT.
 
 
20.9.211 	_mm_wmiattn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiattn_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 16-bit parallel multiply-accumulate with the upper 16 bits (halfwords) of each of the two words of the parameters m1 and m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIATTN.
 
 
20.9.212 	_mm_wmiawbb_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawbb_si64 (__m64 acc,__m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the lower 32 bits (bottom word) of each of the parameters m1 and m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIAWBB.
 
 
20.9.213 	_mm_wmiawbbn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawbbn_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the lower 32 bits (bottom word) of each of the parameters m1 and m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIAWBBN.
 
 
20.9.214 	_mm_wmiawbt_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawbt_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the lower 32 bits (bottom word) of the parameter m1 and the upper 32 bits (top word) of the parameter m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIAWBT.
 
 
20.9.215 	_mm_wmiawbtn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawbtn_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the lower 32 bits (bottom word) of the parameter m1 and the upper 32 bits (top word) of the parameter m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIAWBTN.
 
 
20.9.216 	_mm_wmiawtb_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawtb_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the upper 32 bits (top word) of the parameter m1 and the lower 32 bits (bottom word) of the parameter m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIAWTB.
 
 
20.9.217 	_mm_wmiawtbn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawtbn_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the upper 32 bits (top word) of the parameter m1 and the lower 32 bits (bottom word) of the parameter m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIAWTBN.
 
 
20.9.218 	_mm_wmiawtt_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawtt_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the upper 32 bits (top word) of the parameters m1 and m2, then adds the result to the parameter acc and returns the result of this addition.
This function uses the assembler instruction WMIAWTT.
 
 
20.9.219 	_mm_wmiawttn_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_wmiawttn_si64 (__m64 acc, __m64 m1, __m64 m2)
Description 
Performs a 32-bit parallel multiply-accumulate with the upper 32 bits (top word) of the parameters m1 and m2, then subtracts the result from the parameter acc and returns the result of this subtraction.
This function uses the assembler instruction WMIAWTTN.
 
 
20.9.220 	_mm_xor_si64
 
Syntax 
#include <mmintrin.h>
__m64
_mm_xor_si64 (__m64 m1, __m64 m2)
Description 
Performs a bitwise XOR of the 64-bit value in m1 with the 64-bit value in m2.
This function uses the assembler instruction WXOR.
 


[-- Attachment #3: 2_mmintrin.diff --]
[-- Type: application/octet-stream, Size: 17503 bytes --]

Index: gcc/config/arm/mmintrin.h
===================================================================
--- gcc/config/arm/mmintrin.h	(revision 178025)
+++ gcc/config/arm/mmintrin.h	(working copy)
@@ -24,16 +24,21 @@
 #ifndef _MMINTRIN_H_INCLUDED
 #define _MMINTRIN_H_INCLUDED
 
+#if defined __cplusplus
+extern "C" { /* Begin "C" */
+/* Intrinsics use C name-mangling.  */
+#endif /* __cplusplus */
+
 /* The data type intended for user use.  */
 typedef unsigned long long __m64, __int64;
 
 /* Internal data types for implementing the intrinsics.  */
 typedef int __v2si __attribute__ ((vector_size (8)));
 typedef short __v4hi __attribute__ ((vector_size (8)));
-typedef char __v8qi __attribute__ ((vector_size (8)));
+typedef signed char __v8qi __attribute__ ((vector_size (8)));
 
 /* "Convert" __m64 and __int64 into each other.  */
-static __inline __m64 
+static __inline __m64
 _mm_cvtsi64_m64 (__int64 __i)
 {
   return __i;
@@ -54,7 +59,7 @@ _mm_cvtsi64_si32 (__int64 __i)
 static __inline __int64
 _mm_cvtsi32_si64 (int __i)
 {
-  return __i;
+  return (__i & 0xffffffff);
 }
 
 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
@@ -603,7 +608,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
 static __inline __m64
 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 {
-  return __builtin_arm_wandn (__m1, __m2);
+  return __builtin_arm_wandn (__m2, __m1);
 }
 
 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
@@ -935,7 +940,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu8 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
+  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
+}
+
+static __inline __m64
+_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
 }
 
 /* Compute the sum of the absolute differences of the unsigned 16-bit
@@ -944,9 +955,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu16 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
+  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
+static __inline __m64
+_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
+}
+
+
 /* Compute the sum of the absolute differences of the unsigned 8-bit
    values in A and B.  Return the value in the lower 16-bit word; the
    upper words are cleared.  */
@@ -965,11 +983,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B)
   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
-static __inline __m64
-_mm_align_si64 (__m64 __A, __m64 __B, int __C)
-{
-  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
-}
+#define _mm_align_si64(__A,__B, N) \
+  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
 
 /* Creates a 64-bit zero.  */
 static __inline __m64
@@ -987,42 +1002,76 @@ _mm_setwcx (const int __value, const int
 {
   switch (__regno)
     {
-    case 0:  __builtin_arm_setwcx (__value, 0); break;
-    case 1:  __builtin_arm_setwcx (__value, 1); break;
-    case 2:  __builtin_arm_setwcx (__value, 2); break;
-    case 3:  __builtin_arm_setwcx (__value, 3); break;
-    case 8:  __builtin_arm_setwcx (__value, 8); break;
-    case 9:  __builtin_arm_setwcx (__value, 9); break;
-    case 10: __builtin_arm_setwcx (__value, 10); break;
-    case 11: __builtin_arm_setwcx (__value, 11); break;
-    default: break;
+    case 0:
+      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
+      break;
+    case 8:
+      __builtin_arm_setwcgr0 (__value);
+      break;
+    case 9:
+      __builtin_arm_setwcgr1 (__value);
+      break;
+    case 10:
+      __builtin_arm_setwcgr2 (__value);
+      break;
+    case 11:
+      __builtin_arm_setwcgr3 (__value);
+      break;
+    default:
+      break;
     }
 }
 
 static __inline int
 _mm_getwcx (const int __regno)
 {
+  int __value;
   switch (__regno)
     {
-    case 0:  return __builtin_arm_getwcx (0);
-    case 1:  return __builtin_arm_getwcx (1);
-    case 2:  return __builtin_arm_getwcx (2);
-    case 3:  return __builtin_arm_getwcx (3);
-    case 8:  return __builtin_arm_getwcx (8);
-    case 9:  return __builtin_arm_getwcx (9);
-    case 10: return __builtin_arm_getwcx (10);
-    case 11: return __builtin_arm_getwcx (11);
-    default: return 0;
+    case 0:
+      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
+      break;
+    case 8:
+      return __builtin_arm_getwcgr0 ();
+    case 9:
+      return __builtin_arm_getwcgr1 ();
+    case 10:
+      return __builtin_arm_getwcgr2 ();
+    case 11:
+      return __builtin_arm_getwcgr3 ();
+    default:
+      break;
     }
+  return __value;
 }
 
 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 static __inline __m64
 _mm_set_pi32 (int __i1, int __i0)
 {
-  union {
+  union
+  {
     __m64 __q;
-    struct {
+    struct
+    {
       unsigned int __i0;
       unsigned int __i1;
     } __s;
@@ -1041,7 +1090,7 @@ _mm_set_pi16 (short __w3, short __w2, sh
   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
   return _mm_set_pi32 (__i1, __i0);
-		       
+
 }
 
 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
@@ -1110,9 +1159,521 @@ _mm_set1_pi8 (char __b)
 
 /* Convert an integer to a __m64 object.  */
 static __inline __m64
-_m_from_int (int __a)
+_mm_abs_pi8 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
+}
+
+static __inline __m64
+_mm_abs_pi16 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
+
+}
+
+static __inline __m64
+_mm_abs_pi32 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsw ((__v2si)m1);
+
+}
+
+static __inline __m64
+_mm_addsubhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_addc_pu16 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddhc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_addc_pu32 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddwc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_avg4_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_avg4r_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_maddx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_maddx_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhi_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhi_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mullo_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_subaddhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_addbhusl_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_addbhusm_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
+}
+
+#define _mm_qmiabb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabtn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc=acc;\
+   __m64 _m1=m1;\
+   __m64 _m2=m2;\
+   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiattn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiattn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawttn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+/* The third arguments should be an immediate.  */
+#define _mm_merge_si64(a, b, n) \
+  ({\
+   __m64 result;\
+   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
+   result;\
+   })
+
+static __inline __m64
+_mm_alignr0_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr1_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr2_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr3_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline void
+_mm_tandcb ()
+{
+  __asm __volatile ("tandcb r15");
+}
+
+static __inline void
+_mm_tandch ()
+{
+  __asm __volatile ("tandch r15");
+}
+
+static __inline void
+_mm_tandcw ()
+{
+  __asm __volatile ("tandcw r15");
+}
+
+#define _mm_textrcb(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcb r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrch(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrch r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrcw(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcw r15, %0" : : "i" (n));\
+   })
+
+static __inline void
+_mm_torcb ()
+{
+  __asm __volatile ("torcb r15");
+}
+
+static __inline void
+_mm_torch ()
+{
+  __asm __volatile ("torch r15");
+}
+
+static __inline void
+_mm_torcw ()
+{
+  __asm __volatile ("torcw r15");
+}
+
+static __inline void
+_mm_torvscb ()
+{
+  __asm __volatile ("torvscb r15");
+}
+
+static __inline void
+_mm_torvsch ()
+{
+  __asm __volatile ("torvsch r15");
+}
+
+static __inline void
+_mm_torvscw ()
+{
+  __asm __volatile ("torvscw r15");
+}
+
+static __inline __m64
+_mm_tbcst_pi8 (int value)
+{
+  return (__m64) __builtin_arm_tbcstb ((signed char) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi16 (int value)
+{
+  return (__m64) __builtin_arm_tbcsth ((short) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi32 (int value)
 {
-  return (__m64)__a;
+  return (__m64) __builtin_arm_tbcstw (value);
 }
 
 #define _m_packsswb _mm_packs_pi16
@@ -1250,5 +1811,10 @@ _m_from_int (int __a)
 #define _m_paligniq _mm_align_si64
 #define _m_cvt_si2pi _mm_cvtsi64_m64
 #define _m_cvt_pi2si _mm_cvtm64_si64
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
 
+#if defined __cplusplus
+}; /* End "C" */
+#endif /* __cplusplus */
 #endif /* _MMINTRIN_H_INCLUDED */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: PING:  [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
  2011-08-18  2:35 ` Ramana Radhakrishnan
  2011-08-24  9:07   ` Xinyu Qi
@ 2011-09-26  4:31   ` Xinyu Qi
  2011-10-20  8:05   ` Xinyu Qi
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Xinyu Qi @ 2011-09-26  4:31 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc-patches

Ping

http://gcc.gnu.org/ml/gcc-patches/2011-08/msg01963.html

	* config/arm/mmintrin.h: Revise.


At 2011-08-24 16:14:30,"Xinyu Qi" <xyqi@marvell.com> wrote:
> At 2011-08-18 09:33:27,"Ramana Radhakrishnan"
> <ramana.radhakrishnan@linaro.org> wrote:
> > On 6 July 2011 11:11, Xinyu Qi <xyqi@marvell.com> wrote:
> > > Hi,
> > >
> > > It is the second part of iWMMXt maintenance.
> > >
> > > *config/arm/mmintrin.h:
> > >  Revise the iWMMXt intrinsics head file. Fix some intrinsics and add some
> > new intrinsics
> >
> > Is there a document somewhere that lists these intrinsics and what
> > each of these is supposed to be doing? Missing details again. We
> > seem to be changing quite a few things.
> 
> Hi,
> The intrinsic_doc.txt is attached. It is the portion of the iWMMXt intrinsic
> documentation excerpted from "Intel Wireless MMX Technology Intrinsic Support",
> with some modifications.
> 
> > > +
> > > +/*  We will treat __int64 as a long long type
> > > +    and __m64 as an unsigned long long type to conform to VSC++.  */
> > > +typedef unsigned long long __m64;
> > > +typedef long long __int64;
> >
> > Interesting this sort of a change with these cases where you are
> > changing the type to conform to VSC++ ? This just means old code that
> > uses this is pretty much broken. Not that I have much hope of that
> > happening by default - -flax-conversions appears to be needed even
> > with a trunk compiler.
> 
> I couldn't find any material to show why __int64 needs to be redefined, and
> all the tests pass without this change, so I have decided to discard it.
> 
> >
> > > @@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i)
> > >  static __inline __int64
> > >  _mm_cvtsi32_si64 (int __i)
> > >  {
> > > -  return __i;
> > > +  return (__i & 0xffffffff);
> > >  }
> >
> > Eh ? why the & 0xffffffff before promotion rules.  Is this set of
> > intrinsics documented some place ?  What is missing and could be the
> > subject of a follow-up patch is a set of tests for the wMMX intrinsics
> > ....
> 
> See the intrinsics doc. It says the description of _mm_cvtsi32_si64 is "The
> integer value is zero-extended to 64 bits.
> If r = _mm_cvtsi32_si64(i), then the action is
> r [0:31] = i;
> r[32:63] = 0;"
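> 
> For illustration, a minimal plain-C sketch of that documented behaviour
> (the function name is made up here; it is not part of the patch):
> 
>   /* Zero-extend the 32-bit argument into a 64-bit result, as the doc
>      describes for _mm_cvtsi32_si64.  */
>   unsigned long long
>   cvtsi32_si64_sketch (int i)
>   {
>     return (unsigned long long) (unsigned int) i;  /* bits 0-31 = i, bits 32-63 = 0 */
>   }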
> 
> >
> > What's the behaviour of wandn supposed to be ? Does wandn x, y, z
> > imply x = y & ~z or x = ~y & z ? If the former then your intrinsic
> > expansion is wrong unless the meaning of this has changed ? Whats the
> > behaviour of the intrinsic __mm_and_not_si64 . ?
> 
> The description of _mm_andnot_si64 is "Performs a logical NOT on the 64-bit
> value in m1 and use the result in a bitwise AND with the 64-bit value in m2."
> And, "wandn wRd, wRn, wRm" means "wRd = wRn & ~wRm"
> I think __builtin_arm_wandn had better directly match the behavior of wandn.
> Therefore, match _mm_andnot_si64 (m1, m2) to __builtin_arm_wandn (m2, m1).
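> 
> For illustration, a rough plain-C sketch of that operand ordering (the
> helper names below are made up; they are not part of the patch):
> 
>   typedef unsigned long long m64_t;   /* stand-in for __m64 */
> 
>   /* wandn wRd, wRn, wRm computes wRd = wRn & ~wRm.  */
>   static m64_t wandn_sketch (m64_t rn, m64_t rm) { return rn & ~rm; }
> 
>   /* _mm_andnot_si64 (m1, m2) must yield ~m1 & m2, which is m2 & ~m1,
>      so the operands are passed to wandn swapped.  */
>   static m64_t andnot_si64_sketch (m64_t m1, m64_t m2) { return wandn_sketch (m2, m1); }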
> 
> 
> 
> > @@ -985,44 +1004,83 @@ _mm_setzero_si64 (void)
> >  static __inline void
> >  _mm_setwcx (const int __value, const int __regno)
> >  {
> > > +  /*Since gcc has the imformation of all wcgr regs
> > > +    in arm backend, use builtin to access them instead
> > > +    of throw asm directly.  Thus, gcc could do some
> > > +    optimization on them.  */
> > > +
> >
> > Also this comment is contradictory to what follows in the patch .
> > You've prima-facie replaced them with bits of inline assembler. I'm
> > not sure this comment makes a lot of sense on its own.
> 
> Sorry. This comment should be removed.
> 
> The modified diff is attached.
> 
> Thanks,
> Xinyu
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: PING:  [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
  2011-08-18  2:35 ` Ramana Radhakrishnan
  2011-08-24  9:07   ` Xinyu Qi
  2011-09-26  4:31   ` PING: " Xinyu Qi
@ 2011-10-20  8:05   ` Xinyu Qi
  2011-12-29  6:26   ` Xinyu Qi
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Xinyu Qi @ 2011-10-20  8:05 UTC (permalink / raw)
  To: Ramana Radhakrishnan, gcc-patches

Ping

http://gcc.gnu.org/ml/gcc-patches/2011-08/msg01963.html

	* config/arm/mmintrin.h: Revise.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: PING:  [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
  2011-08-18  2:35 ` Ramana Radhakrishnan
                     ` (2 preceding siblings ...)
  2011-10-20  8:05   ` Xinyu Qi
@ 2011-12-29  6:26   ` Xinyu Qi
  2012-02-03  2:08   ` Xinyu Qi
  2012-03-13  8:55   ` Xinyu Qi
  5 siblings, 0 replies; 11+ messages in thread
From: Xinyu Qi @ 2011-12-29  6:26 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: Ramana Radhakrishnan, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1957 bytes --]

	* config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt intrinsics.
	Use __IWMMXT2__ to enable iWMMXt2 intrinsics.
	Use C name-mangling for intrinsics.
	(__v8qi): Redefine.
	(_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise.
	(_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx): Likewise.
	(_m_from_int): Likewise.
	(_mm_sada_pu8, _mm_sada_pu16): New intrinsic.
	(_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise.
	(_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise.
	(_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise.
	(_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise.
	(_mm_tbcst_pi32): Likewise.
	(_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2 intrinsic.
	(_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise.
	(_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise.
	(_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16, _mm_maddx_pu16): Likewise.
	(_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise.
	(_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise.
	(_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise.
	(_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise.
	(_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8): Likewise.
	(_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32): Likewise.
	(_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise.
	(_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise.
	(_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise.
	(_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64): Likewise.
	(_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64): Likewise.
	(_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64): Likewise.
	(_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64): Likewise.
	(_mm_wmiawttn_si64, _mm_merge_si64): Likewise.
	(_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise.
	(_m_to_int): New define.

Thanks,
Xinyu

[-- Attachment #2: 2_mmintrin.diff --]
[-- Type: application/octet-stream, Size: 17960 bytes --]

Index: gcc/config/arm/mmintrin.h
===================================================================
--- gcc/config/arm/mmintrin.h	(revision 182684)
+++ gcc/config/arm/mmintrin.h	(working copy)
@@ -24,16 +24,30 @@
 #ifndef _MMINTRIN_H_INCLUDED
 #define _MMINTRIN_H_INCLUDED
 
+#ifndef __IWMMXT__
+#error You must enable WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2) to use iWMMXt/iWMMXt2 intrinsics
+#else
+
+#ifndef __IWMMXT2__
+#warning You only enable iWMMXt intrinsics. Extended iWMMXt2 intrinsics available only if WMMX2 instructions enabled (e.g. -march=iwmmxt2)
+#endif
+
+
+#if defined __cplusplus
+extern "C" { /* Begin "C" */
+/* Intrinsics use C name-mangling.  */
+#endif /* __cplusplus */
+
 /* The data type intended for user use.  */
 typedef unsigned long long __m64, __int64;
 
 /* Internal data types for implementing the intrinsics.  */
 typedef int __v2si __attribute__ ((vector_size (8)));
 typedef short __v4hi __attribute__ ((vector_size (8)));
-typedef char __v8qi __attribute__ ((vector_size (8)));
+typedef signed char __v8qi __attribute__ ((vector_size (8)));
 
 /* "Convert" __m64 and __int64 into each other.  */
-static __inline __m64 
+static __inline __m64
 _mm_cvtsi64_m64 (__int64 __i)
 {
   return __i;
@@ -54,7 +68,7 @@ _mm_cvtsi64_si32 (__int64 __i)
 static __inline __int64
 _mm_cvtsi32_si64 (int __i)
 {
-  return __i;
+  return (__i & 0xffffffff);
 }
 
 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
@@ -603,7 +617,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
 static __inline __m64
 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 {
-  return __builtin_arm_wandn (__m1, __m2);
+  return __builtin_arm_wandn (__m2, __m1);
 }
 
 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
@@ -935,7 +949,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu8 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
+  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
+}
+
+static __inline __m64
+_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
 }
 
 /* Compute the sum of the absolute differences of the unsigned 16-bit
@@ -944,9 +964,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu16 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
+  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
+}
+
+static __inline __m64
+_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
 }
 
+
 /* Compute the sum of the absolute differences of the unsigned 8-bit
    values in A and B.  Return the value in the lower 16-bit word; the
    upper words are cleared.  */
@@ -965,11 +992,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B)
   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
-static __inline __m64
-_mm_align_si64 (__m64 __A, __m64 __B, int __C)
-{
-  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
-}
+#define _mm_align_si64(__A,__B, N) \
+  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
 
 /* Creates a 64-bit zero.  */
 static __inline __m64
@@ -987,42 +1011,76 @@ _mm_setwcx (const int __value, const int
 {
   switch (__regno)
     {
-    case 0:  __builtin_arm_setwcx (__value, 0); break;
-    case 1:  __builtin_arm_setwcx (__value, 1); break;
-    case 2:  __builtin_arm_setwcx (__value, 2); break;
-    case 3:  __builtin_arm_setwcx (__value, 3); break;
-    case 8:  __builtin_arm_setwcx (__value, 8); break;
-    case 9:  __builtin_arm_setwcx (__value, 9); break;
-    case 10: __builtin_arm_setwcx (__value, 10); break;
-    case 11: __builtin_arm_setwcx (__value, 11); break;
-    default: break;
+    case 0:
+      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
+      break;
+    case 8:
+      __builtin_arm_setwcgr0 (__value);
+      break;
+    case 9:
+      __builtin_arm_setwcgr1 (__value);
+      break;
+    case 10:
+      __builtin_arm_setwcgr2 (__value);
+      break;
+    case 11:
+      __builtin_arm_setwcgr3 (__value);
+      break;
+    default:
+      break;
     }
 }
 
 static __inline int
 _mm_getwcx (const int __regno)
 {
+  int __value;
   switch (__regno)
     {
-    case 0:  return __builtin_arm_getwcx (0);
-    case 1:  return __builtin_arm_getwcx (1);
-    case 2:  return __builtin_arm_getwcx (2);
-    case 3:  return __builtin_arm_getwcx (3);
-    case 8:  return __builtin_arm_getwcx (8);
-    case 9:  return __builtin_arm_getwcx (9);
-    case 10: return __builtin_arm_getwcx (10);
-    case 11: return __builtin_arm_getwcx (11);
-    default: return 0;
+    case 0:
+      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
+      break;
+    case 8:
+      return __builtin_arm_getwcgr0 ();
+    case 9:
+      return __builtin_arm_getwcgr1 ();
+    case 10:
+      return __builtin_arm_getwcgr2 ();
+    case 11:
+      return __builtin_arm_getwcgr3 ();
+    default:
+      break;
     }
+  return __value;
 }
 
 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 static __inline __m64
 _mm_set_pi32 (int __i1, int __i0)
 {
-  union {
+  union
+  {
     __m64 __q;
-    struct {
+    struct
+    {
       unsigned int __i0;
       unsigned int __i1;
     } __s;
@@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, sh
   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
   return _mm_set_pi32 (__i1, __i0);
-		       
+
 }
 
 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
@@ -1108,11 +1166,526 @@ _mm_set1_pi8 (char __b)
   return _mm_set1_pi32 (__i);
 }
 
-/* Convert an integer to a __m64 object.  */
+#ifdef __IWMMXT2__
+static __inline __m64
+_mm_abs_pi8 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
+}
+
+static __inline __m64
+_mm_abs_pi16 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
+
+}
+
+static __inline __m64
+_mm_abs_pi32 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsw ((__v2si)m1);
+
+}
+
+static __inline __m64
+_mm_addsubhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_addc_pu16 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddhc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_addc_pu32 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddwc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_avg4_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_avg4r_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_maddx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_maddx_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhi_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhi_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mullo_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_subaddhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_addbhusl_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_addbhusm_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
+}
+
+#define _mm_qmiabb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabtn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc=acc;\
+   __m64 _m1=m1;\
+   __m64 _m2=m2;\
+   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiattn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiattn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawttn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+/* The third arguments should be an immediate.  */
+#define _mm_merge_si64(a, b, n) \
+  ({\
+   __m64 result;\
+   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
+   result;\
+   })
+#endif  /* __IWMMXT2__ */
+
+static __inline __m64
+_mm_alignr0_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr1_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr2_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr3_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline void
+_mm_tandcb ()
+{
+  __asm __volatile ("tandcb r15");
+}
+
+static __inline void
+_mm_tandch ()
+{
+  __asm __volatile ("tandch r15");
+}
+
+static __inline void
+_mm_tandcw ()
+{
+  __asm __volatile ("tandcw r15");
+}
+
+#define _mm_textrcb(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcb r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrch(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrch r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrcw(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcw r15, %0" : : "i" (n));\
+   })
+
+static __inline void
+_mm_torcb ()
+{
+  __asm __volatile ("torcb r15");
+}
+
+static __inline void
+_mm_torch ()
+{
+  __asm __volatile ("torch r15");
+}
+
+static __inline void
+_mm_torcw ()
+{
+  __asm __volatile ("torcw r15");
+}
+
+#ifdef __IWMMXT2__
+static __inline void
+_mm_torvscb ()
+{
+  __asm __volatile ("torvscb r15");
+}
+
+static __inline void
+_mm_torvsch ()
+{
+  __asm __volatile ("torvsch r15");
+}
+
+static __inline void
+_mm_torvscw ()
+{
+  __asm __volatile ("torvscw r15");
+}
+#endif
+
+static __inline __m64
+_mm_tbcst_pi8 (int value)
+{
+  return (__m64) __builtin_arm_tbcstb ((signed char) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi16 (int value)
+{
+  return (__m64) __builtin_arm_tbcsth ((short) value);
+}
+
 static __inline __m64
-_m_from_int (int __a)
+_mm_tbcst_pi32 (int value)
 {
-  return (__m64)__a;
+  return (__m64) __builtin_arm_tbcstw (value);
 }
 
 #define _m_packsswb _mm_packs_pi16
@@ -1250,5 +1823,11 @@ _m_from_int (int __a)
 #define _m_paligniq _mm_align_si64
 #define _m_cvt_si2pi _mm_cvtsi64_m64
 #define _m_cvt_pi2si _mm_cvtm64_si64
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
 
+#if defined __cplusplus
+}; /* End "C" */
+#endif /* __cplusplus */
+#endif /* __IWMMXT__ */
 #endif /* _MMINTRIN_H_INCLUDED */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: PING:  [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
  2011-08-18  2:35 ` Ramana Radhakrishnan
                     ` (3 preceding siblings ...)
  2011-12-29  6:26   ` Xinyu Qi
@ 2012-02-03  2:08   ` Xinyu Qi
  2012-03-13  8:55   ` Xinyu Qi
  5 siblings, 0 replies; 11+ messages in thread
From: Xinyu Qi @ 2012-02-03  2:08 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: Ramana Radhakrishnan, gcc-patches

PING

http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01788.html

At 2011-12-29 14:22:50,"Xinyu Qi" <xyqi@marvell.com> wrote:
> 	* config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt
> intrinsics.
> 	Use __IWMMXT2__ to enable iWMMXt2 intrinsics.
> 	Use C name-mangling for intrinsics.
> 	(__v8qi): Redefine.
> 	(_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise.
> 	(_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx):
> Likewise.
> 	(_m_from_int): Likewise.
> 	(_mm_sada_pu8, _mm_sada_pu16): New intrinsic.
> 	(_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise.
> 	(_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise.
> 	(_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise.
> 	(_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise.
> 	(_mm_tbcst_pi32): Likewise.
> 	(_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2
> intrinsic.
> 	(_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise.
> 	(_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise.
> 	(_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16,
> _mm_maddx_pu16): Likewise.
> 	(_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise.
> 	(_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise.
> 	(_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise.
> 	(_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise.
> 	(_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8):
> Likewise.
> 	(_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32):
> Likewise.
> 	(_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise.
> 	(_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise.
> 	(_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise.
> 	(_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64):
> Likewise.
> 	(_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64):
> Likewise.
> 	(_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64):
> Likewise.
> 	(_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64):
> Likewise.
> 	(_mm_wmiawttn_si64, _mm_merge_si64): Likewise.
> 	(_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise.
> 	(_m_to_int): New define.
> 
> Thanks,
> Xinyu

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: PING:  [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
  2011-08-18  2:35 ` Ramana Radhakrishnan
                     ` (4 preceding siblings ...)
  2012-02-03  2:08   ` Xinyu Qi
@ 2012-03-13  8:55   ` Xinyu Qi
  5 siblings, 0 replies; 11+ messages in thread
From: Xinyu Qi @ 2012-03-13  8:55 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: Ramana Radhakrishnan, gcc-patches

PING

At 2012-02-03 10:05:22,"Xinyu Qi" <xyqi@marvell.com> wrote:
> PING
> 
> http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01788.html
> 
> At 2011-12-29 14:22:50,"Xinyu Qi" <xyqi@marvell.com> wrote:
> > 	* config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt
> > intrinsics.
> > 	Use __IWMMXT2__ to enable iWMMXt2 intrinsics.
> > 	Use C name-mangling for intrinsics.
> > 	(__v8qi): Redefine.
> > 	(_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise.
> > 	(_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx):
> > Likewise.
> > 	(_m_from_int): Likewise.
> > 	(_mm_sada_pu8, _mm_sada_pu16): New intrinsic.
> > 	(_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise.
> > 	(_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise.
> > 	(_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise.
> > 	(_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise.
> > 	(_mm_tbcst_pi32): Likewise.
> > 	(_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2
> > intrinsic.
> > 	(_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise.
> > 	(_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise.
> > 	(_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16,
> > _mm_maddx_pu16): Likewise.
> > 	(_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise.
> > 	(_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise.
> > 	(_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise.
> > 	(_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise.
> > 	(_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8):
> > Likewise.
> > 	(_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32):
> > Likewise.
> > 	(_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise.
> > 	(_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise.
> > 	(_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise.
> > 	(_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64):
> > Likewise.
> > 	(_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64):
> > Likewise.
> > 	(_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64):
> > Likewise.
> > 	(_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64):
> > Likewise.
> > 	(_mm_wmiawttn_si64, _mm_merge_si64): Likewise.
> > 	(_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise.
> > 	(_m_to_int): New define.
> >
> > Thanks,
> > Xinyu

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH ARM iWMMXt 2/5] intrinsic head file change
  2012-05-29  4:15 ` [PATCH ARM iWMMXt 2/5] intrinsic head file change Matt Turner
@ 2012-06-06 12:22   ` Ramana Radhakrishnan
  0 siblings, 0 replies; 11+ messages in thread
From: Ramana Radhakrishnan @ 2012-06-06 12:22 UTC (permalink / raw)
  To: Matt Turner
  Cc: gcc-patches, Ramana Radhakrishnan, Richard Earnshaw,
	Nick Clifton, Paul Brook, Xinyu Qi

I've only had a brief look at this, so I'll just point out certain stylistic
issues that I noticed; I would like another set of eyes on this and
the next patch.


On 29 May 2012 05:13, Matt Turner <mattst88@gmail.com> wrote:
> From: Xinyu Qi <xyqi@marvell.com>
>
>        gcc/
>        * config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt intrinsics.
>        Use __IWMMXT2__ to enable iWMMXt2 intrinsics.
>        Use C name-mangling for intrinsics.
>        (__v8qi): Redefine.
>        (_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise.
>        (_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx): Likewise.
>        (_m_from_int): Likewise.
>        (_mm_sada_pu8, _mm_sada_pu16): New intrinsic.
>        (_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise.
>        (_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise.
>        (_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise.
>        (_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise.
>        (_mm_tbcst_pi32): Likewise.
>        (_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2 intrinsic.
>        (_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise.
>        (_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise.
>        (_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16, _mm_maddx_pu16): Likewise.
>        (_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise.
>        (_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise.
>        (_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise.
>        (_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise.
>        (_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8): Likewise.
>        (_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32): Likewise.
>        (_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise.
>        (_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise.
>        (_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise.
>        (_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64): Likewise.
>        (_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64): Likewise.
>        (_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64): Likewise.
>        (_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64): Likewise.
>        (_mm_wmiawttn_si64, _mm_merge_si64): Likewise.
>        (_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise.
>        (_m_to_int): New define.
> ---
>  gcc/config/arm/mmintrin.h |  649 ++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 614 insertions(+), 35 deletions(-)
>
> diff --git a/gcc/config/arm/mmintrin.h b/gcc/config/arm/mmintrin.h
> index 2cc500d..0fe551d 100644
> --- a/gcc/config/arm/mmintrin.h
> +++ b/gcc/config/arm/mmintrin.h
> @@ -24,16 +24,30 @@
>  #ifndef _MMINTRIN_H_INCLUDED
>  #define _MMINTRIN_H_INCLUDED
>
> +#ifndef __IWMMXT__
> +#error You must enable WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2) to use iWMMXt/iWMMXt2 intrinsics
> +#else
> +
> +#ifndef __IWMMXT2__
> +#warning You only enable iWMMXt intrinsics. Extended iWMMXt2 intrinsics available only if WMMX2 instructions enabled (e.g. -march=iwmmxt2)
> +#endif
> +

Extra newline.

> +
> +#if defined __cplusplus
> +extern "C" { /* Begin "C" */
> +/* Intrinsics use C name-mangling.  */
> +#endif /* __cplusplus */
> +
>  /* The data type intended for user use.  */
>  typedef unsigned long long __m64, __int64;
>
>  /* Internal data types for implementing the intrinsics.  */
>  typedef int __v2si __attribute__ ((vector_size (8)));
>  typedef short __v4hi __attribute__ ((vector_size (8)));
> -typedef char __v8qi __attribute__ ((vector_size (8)));
> +typedef signed char __v8qi __attribute__ ((vector_size (8)));
>
>  /* "Convert" __m64 and __int64 into each other.  */
> -static __inline __m64
> +static __inline __m64
>  _mm_cvtsi64_m64 (__int64 __i)
>  {
>   return __i;
> @@ -54,7 +68,7 @@ _mm_cvtsi64_si32 (__int64 __i)
>  static __inline __int64
>  _mm_cvtsi32_si64 (int __i)
>  {
> -  return __i;
> +  return (__i & 0xffffffff);
>  }
>
>  /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
> @@ -603,7 +617,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
>  static __inline __m64
>  _mm_andnot_si64 (__m64 __m1, __m64 __m2)
>  {
> -  return __builtin_arm_wandn (__m1, __m2);
> +  return __builtin_arm_wandn (__m2, __m1);
>  }
>
>  /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
> @@ -935,7 +949,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B)
>  static __inline __m64
>  _mm_sad_pu8 (__m64 __A, __m64 __B)
>  {
> -  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
> +  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
> +}
> +
> +static __inline __m64
> +_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
> +{
> +  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
>  }
>
>  /* Compute the sum of the absolute differences of the unsigned 16-bit
> @@ -944,9 +964,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
>  static __inline __m64
>  _mm_sad_pu16 (__m64 __A, __m64 __B)
>  {
> -  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
> +  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
>  }
>
> +static __inline __m64
> +_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
> +{
> +  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
> +}
> +
> +
>  /* Compute the sum of the absolute differences of the unsigned 8-bit
>    values in A and B.  Return the value in the lower 16-bit word; the
>    upper words are cleared.  */
> @@ -965,11 +992,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B)
>   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
>  }
>
> -static __inline __m64
> -_mm_align_si64 (__m64 __A, __m64 __B, int __C)
> -{
> -  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
> -}
> +#define _mm_align_si64(__A,__B, N) \
> +  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
>
>  /* Creates a 64-bit zero.  */
>  static __inline __m64
> @@ -987,42 +1011,76 @@ _mm_setwcx (const int __value, const int __regno)
>  {
>   switch (__regno)
>     {
> -    case 0:  __builtin_arm_setwcx (__value, 0); break;
> -    case 1:  __builtin_arm_setwcx (__value, 1); break;
> -    case 2:  __builtin_arm_setwcx (__value, 2); break;
> -    case 3:  __builtin_arm_setwcx (__value, 3); break;
> -    case 8:  __builtin_arm_setwcx (__value, 8); break;
> -    case 9:  __builtin_arm_setwcx (__value, 9); break;
> -    case 10: __builtin_arm_setwcx (__value, 10); break;
> -    case 11: __builtin_arm_setwcx (__value, 11); break;
> -    default: break;
> +    case 0:
> +      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
> +      break;
> +    case 1:
> +      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
> +      break;
> +    case 2:
> +      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
> +      break;
> +    case 3:
> +      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
> +      break;
> +    case 8:
> +      __builtin_arm_setwcgr0 (__value);
> +      break;
> +    case 9:
> +      __builtin_arm_setwcgr1 (__value);
> +      break;
> +    case 10:
> +      __builtin_arm_setwcgr2 (__value);
> +      break;
> +    case 11:
> +      __builtin_arm_setwcgr3 (__value);
> +      break;
> +    default:
> +      break;
>     }
>  }
>
>  static __inline int
>  _mm_getwcx (const int __regno)
>  {
> +  int __value;
>   switch (__regno)
>     {
> -    case 0:  return __builtin_arm_getwcx (0);
> -    case 1:  return __builtin_arm_getwcx (1);
> -    case 2:  return __builtin_arm_getwcx (2);
> -    case 3:  return __builtin_arm_getwcx (3);
> -    case 8:  return __builtin_arm_getwcx (8);
> -    case 9:  return __builtin_arm_getwcx (9);
> -    case 10: return __builtin_arm_getwcx (10);
> -    case 11: return __builtin_arm_getwcx (11);
> -    default: return 0;
> +    case 0:
> +      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
> +      break;
> +    case 1:
> +      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
> +      break;
> +    case 2:
> +      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
> +      break;
> +    case 3:
> +      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
> +      break;
> +    case 8:
> +      return __builtin_arm_getwcgr0 ();
> +    case 9:
> +      return __builtin_arm_getwcgr1 ();
> +    case 10:
> +      return __builtin_arm_getwcgr2 ();
> +    case 11:
> +      return __builtin_arm_getwcgr3 ();
> +    default:
> +      break;
>     }
> +  return __value;
>  }
>
>  /* Creates a vector of two 32-bit values; I0 is least significant.  */
>  static __inline __m64
>  _mm_set_pi32 (int __i1, int __i0)
>  {
> -  union {
> +  union
> +  {
>     __m64 __q;
> -    struct {
> +    struct
> +    {
>       unsigned int __i0;
>       unsigned int __i1;
>     } __s;
> @@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
>   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
>   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
>   return _mm_set_pi32 (__i1, __i0);
> -
> +
Extra newline again here.
>  }
>
>  /* Creates a vector of eight 8-bit values; B0 is least significant.  */
> @@ -1108,11 +1166,526 @@ _mm_set1_pi8 (char __b)
>   return _mm_set1_pi32 (__i);
>  }
>
> -/* Convert an integer to a __m64 object.  */
> +#ifdef __IWMMXT2__
> +static __inline __m64
> +_mm_abs_pi8 (__m64 m1)
> +{
> +  return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
> +}
> +
> +static __inline __m64
> +_mm_abs_pi16 (__m64 m1)
> +{
> +  return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
> +

And here.

> +}
> +
> +static __inline __m64
> +_mm_abs_pi32 (__m64 m1)
> +{
> +  return (__m64) __builtin_arm_wabsw ((__v2si)m1);
> +
and here.

<large part snipped.>

> +
> +#define _mm_qmiabb_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_qmiabbn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_qmiabt_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_qmiabtn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc=acc;\
> +   __m64 _m1=m1;\
> +   __m64 _m2=m2;\
> +   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_qmiatb_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_qmiatbn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_qmiatt_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_qmiattn_pi32(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiabb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiabbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiabt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiabtn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiatb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiatbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiatt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiattn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawbb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawbbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawbt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawbtn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawtb_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawtbn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawtt_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })
> +
> +#define _mm_wmiawttn_si64(acc, m1, m2) \
> +  ({\
> +   __m64 _acc = acc;\
> +   __m64 _m1 = m1;\
> +   __m64 _m2 = m2;\
> +   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
> +   _acc;\
> +   })

I assume someone knows why these are macros and not inline functions
like the others?


> +
> +/* The third arguments should be an immediate.  */

s/arguments/argument

> +#define _mm_merge_si64(a, b, n) \
> +  ({\
> +   __m64 result;\
> +   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
> +   result;\
> +   })
> +#endif  /* __IWMMXT2__ */
> +

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH ARM iWMMXt 2/5] intrinsic head file change
  2012-05-29  4:13 [PATCH ARM iWMMXt 0/5] Improve iWMMXt support Matt Turner
@ 2012-05-29  4:15 ` Matt Turner
  2012-06-06 12:22   ` Ramana Radhakrishnan
  0 siblings, 1 reply; 11+ messages in thread
From: Matt Turner @ 2012-05-29  4:15 UTC (permalink / raw)
  To: gcc-patches
  Cc: Ramana Radhakrishnan, Richard Earnshaw, Nick Clifton, Paul Brook,
	Xinyu Qi

From: Xinyu Qi <xyqi@marvell.com>

	gcc/
	* config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt intrinsics.
	Use __IWMMXT2__ to enable iWMMXt2 intrinsics.
	Use C name-mangling for intrinsics.
	(__v8qi): Redefine.
	(_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise.
	(_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx): Likewise.
	(_m_from_int): Likewise.
	(_mm_sada_pu8, _mm_sada_pu16): New intrinsic.
	(_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise.
	(_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise.
	(_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise.
	(_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise.
	(_mm_tbcst_pi32): Likewise.
	(_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2 intrinsic.
	(_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise.
	(_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise.
	(_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16, _mm_maddx_pu16): Likewise.
	(_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise.
	(_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise.
	(_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise.
	(_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise.
	(_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8): Likewise.
	(_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32): Likewise.
	(_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise.
	(_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise.
	(_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise.
	(_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64): Likewise.
	(_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64): Likewise.
	(_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64): Likewise.
	(_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64): Likewise.
	(_mm_wmiawttn_si64, _mm_merge_si64): Likewise.
	(_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise.
	(_m_to_int): New define.
---
 gcc/config/arm/mmintrin.h |  649 ++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 614 insertions(+), 35 deletions(-)

diff --git a/gcc/config/arm/mmintrin.h b/gcc/config/arm/mmintrin.h
index 2cc500d..0fe551d 100644
--- a/gcc/config/arm/mmintrin.h
+++ b/gcc/config/arm/mmintrin.h
@@ -24,16 +24,30 @@
 #ifndef _MMINTRIN_H_INCLUDED
 #define _MMINTRIN_H_INCLUDED
 
+#ifndef __IWMMXT__
+#error You must enable WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2) to use iWMMXt/iWMMXt2 intrinsics
+#else
+
+#ifndef __IWMMXT2__
+#warning You only enable iWMMXt intrinsics. Extended iWMMXt2 intrinsics available only if WMMX2 instructions enabled (e.g. -march=iwmmxt2)
+#endif
+
+
+#if defined __cplusplus
+extern "C" { /* Begin "C" */
+/* Intrinsics use C name-mangling.  */
+#endif /* __cplusplus */
+
 /* The data type intended for user use.  */
 typedef unsigned long long __m64, __int64;
 
 /* Internal data types for implementing the intrinsics.  */
 typedef int __v2si __attribute__ ((vector_size (8)));
 typedef short __v4hi __attribute__ ((vector_size (8)));
-typedef char __v8qi __attribute__ ((vector_size (8)));
+typedef signed char __v8qi __attribute__ ((vector_size (8)));
 
 /* "Convert" __m64 and __int64 into each other.  */
-static __inline __m64 
+static __inline __m64
 _mm_cvtsi64_m64 (__int64 __i)
 {
   return __i;
@@ -54,7 +68,7 @@ _mm_cvtsi64_si32 (__int64 __i)
 static __inline __int64
 _mm_cvtsi32_si64 (int __i)
 {
-  return __i;
+  return (__i & 0xffffffff);
 }
 
 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
@@ -603,7 +617,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
 static __inline __m64
 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 {
-  return __builtin_arm_wandn (__m1, __m2);
+  return __builtin_arm_wandn (__m2, __m1);
 }
 
 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
@@ -935,7 +949,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu8 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
+  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
+}
+
+static __inline __m64
+_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
 }
 
 /* Compute the sum of the absolute differences of the unsigned 16-bit
@@ -944,9 +964,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu16 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
+  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
+static __inline __m64
+_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
+}
+
+
 /* Compute the sum of the absolute differences of the unsigned 8-bit
    values in A and B.  Return the value in the lower 16-bit word; the
    upper words are cleared.  */
@@ -965,11 +992,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B)
   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
-static __inline __m64
-_mm_align_si64 (__m64 __A, __m64 __B, int __C)
-{
-  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
-}
+#define _mm_align_si64(__A,__B, N) \
+  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
 
 /* Creates a 64-bit zero.  */
 static __inline __m64
@@ -987,42 +1011,76 @@ _mm_setwcx (const int __value, const int __regno)
 {
   switch (__regno)
     {
-    case 0:  __builtin_arm_setwcx (__value, 0); break;
-    case 1:  __builtin_arm_setwcx (__value, 1); break;
-    case 2:  __builtin_arm_setwcx (__value, 2); break;
-    case 3:  __builtin_arm_setwcx (__value, 3); break;
-    case 8:  __builtin_arm_setwcx (__value, 8); break;
-    case 9:  __builtin_arm_setwcx (__value, 9); break;
-    case 10: __builtin_arm_setwcx (__value, 10); break;
-    case 11: __builtin_arm_setwcx (__value, 11); break;
-    default: break;
+    case 0:
+      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
+      break;
+    case 8:
+      __builtin_arm_setwcgr0 (__value);
+      break;
+    case 9:
+      __builtin_arm_setwcgr1 (__value);
+      break;
+    case 10:
+      __builtin_arm_setwcgr2 (__value);
+      break;
+    case 11:
+      __builtin_arm_setwcgr3 (__value);
+      break;
+    default:
+      break;
     }
 }
 
 static __inline int
 _mm_getwcx (const int __regno)
 {
+  int __value;
   switch (__regno)
     {
-    case 0:  return __builtin_arm_getwcx (0);
-    case 1:  return __builtin_arm_getwcx (1);
-    case 2:  return __builtin_arm_getwcx (2);
-    case 3:  return __builtin_arm_getwcx (3);
-    case 8:  return __builtin_arm_getwcx (8);
-    case 9:  return __builtin_arm_getwcx (9);
-    case 10: return __builtin_arm_getwcx (10);
-    case 11: return __builtin_arm_getwcx (11);
-    default: return 0;
+    case 0:
+      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
+      break;
+    case 8:
+      return __builtin_arm_getwcgr0 ();
+    case 9:
+      return __builtin_arm_getwcgr1 ();
+    case 10:
+      return __builtin_arm_getwcgr2 ();
+    case 11:
+      return __builtin_arm_getwcgr3 ();
+    default:
+      break;
     }
+  return __value;
 }
 
 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 static __inline __m64
 _mm_set_pi32 (int __i1, int __i0)
 {
-  union {
+  union
+  {
     __m64 __q;
-    struct {
+    struct
+    {
       unsigned int __i0;
       unsigned int __i1;
     } __s;
@@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
   return _mm_set_pi32 (__i1, __i0);
-		       
+
 }
 
 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
@@ -1108,11 +1166,526 @@ _mm_set1_pi8 (char __b)
   return _mm_set1_pi32 (__i);
 }
 
-/* Convert an integer to a __m64 object.  */
+#ifdef __IWMMXT2__
+static __inline __m64
+_mm_abs_pi8 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
+}
+
+static __inline __m64
+_mm_abs_pi16 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
+
+}
+
+static __inline __m64
+_mm_abs_pi32 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsw ((__v2si)m1);
+
+}
+
+static __inline __m64
+_mm_addsubhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_addc_pu16 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddhc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_addc_pu32 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddwc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_avg4_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_avg4r_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_maddx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_maddx_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhi_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhi_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mullo_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_subaddhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_addbhusl_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_addbhusm_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
+}
+
+#define _mm_qmiabb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabtn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiattn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiattn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawttn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+/* The third argument must be an immediate.  */
+#define _mm_merge_si64(a, b, n) \
+  ({\
+   __m64 result;\
+   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
+   result;\
+   })
+#endif  /* __IWMMXT2__ */
+
+static __inline __m64
+_mm_alignr0_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr1_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr2_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr3_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline void
+_mm_tandcb ()
+{
+  __asm __volatile ("tandcb r15");
+}
+
+static __inline void
+_mm_tandch ()
+{
+  __asm __volatile ("tandch r15");
+}
+
+static __inline void
+_mm_tandcw ()
+{
+  __asm __volatile ("tandcw r15");
+}
+
+#define _mm_textrcb(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcb r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrch(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrch r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrcw(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcw r15, %0" : : "i" (n));\
+   })
+
+static __inline void
+_mm_torcb ()
+{
+  __asm __volatile ("torcb r15");
+}
+
+static __inline void
+_mm_torch ()
+{
+  __asm __volatile ("torch r15");
+}
+
+static __inline void
+_mm_torcw ()
+{
+  __asm __volatile ("torcw r15");
+}
+
+#ifdef __IWMMXT2__
+static __inline void
+_mm_torvscb ()
+{
+  __asm __volatile ("torvscb r15");
+}
+
+static __inline void
+_mm_torvsch ()
+{
+  __asm __volatile ("torvsch r15");
+}
+
+static __inline void
+_mm_torvscw ()
+{
+  __asm __volatile ("torvscw r15");
+}
+#endif
+
+static __inline __m64
+_mm_tbcst_pi8 (int value)
+{
+  return (__m64) __builtin_arm_tbcstb ((signed char) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi16 (int value)
+{
+  return (__m64) __builtin_arm_tbcsth ((short) value);
+}
+
 static __inline __m64
-_m_from_int (int __a)
+_mm_tbcst_pi32 (int value)
 {
-  return (__m64)__a;
+  return (__m64) __builtin_arm_tbcstw (value);
 }
 
 #define _m_packsswb _mm_packs_pi16
@@ -1250,5 +1823,11 @@ _m_from_int (int __a)
 #define _m_paligniq _mm_align_si64
 #define _m_cvt_si2pi _mm_cvtsi64_m64
 #define _m_cvt_pi2si _mm_cvtm64_si64
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
 
+#if defined __cplusplus
+}; /* End "C" */
+#endif /* __cplusplus */
+#endif /* __IWMMXT__ */
 #endif /* _MMINTRIN_H_INCLUDED */
-- 
1.7.3.4
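
A minimal usage sketch (not part of the patch) of a few of the intrinsics
touched by this revision, assuming the revised mmintrin.h and an
iWMMXt2-capable toolchain (e.g. -mcpu=iwmmxt2); the function name and the
input values are illustrative only:

/* Illustrative example only, not part of the patch.  */
#include <mmintrin.h>

int
sad8_example (void)
{
  __m64 a = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
  __m64 b = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
  __m64 acc = _mm_setzero_si64 ();

  /* Accumulate the sum of absolute byte differences of a and b.  */
  acc = _mm_sada_pu8 (acc, a, b);

  /* The third argument of _mm_merge_si64 must be a compile-time constant.  */
  acc = _mm_merge_si64 (acc, a, 3);

  /* Return the low 32 bits of the 64-bit result.  */
  return _mm_cvtsi64_si32 (_mm_cvtm64_si64 (acc));
}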

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
@ 2011-07-14  7:39 Xinyu Qi
  0 siblings, 0 replies; 11+ messages in thread
From: Xinyu Qi @ 2011-07-14  7:39 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 118 bytes --]

> 
> Hi,
> 
> It is the second part of iWMMXt maintenance.


*config/arm/mmintrin.h: Revise.

Thanks,
Xinyu

[-- Attachment #2: 2_mmintrin.diff --]
[-- Type: application/octet-stream, Size: 17935 bytes --]

Index: gcc/config/arm/mmintrin.h
===================================================================
--- gcc/config/arm/mmintrin.h	(revision 175285)
+++ gcc/config/arm/mmintrin.h	(working copy)
@@ -24,16 +24,25 @@
 #ifndef _MMINTRIN_H_INCLUDED
 #define _MMINTRIN_H_INCLUDED
 
+#if defined __cplusplus
+extern "C" { /* Begin "C" */
+/* Intrinsics use C name-mangling.  */
+#endif /* __cplusplus */
+
 /* The data type intended for user use.  */
-typedef unsigned long long __m64, __int64;
+
+/*  We will treat __int64 as a long long type
+    and __m64 as an unsigned long long type to conform to VSC++.  */
+typedef unsigned long long __m64;
+typedef long long __int64;
 
 /* Internal data types for implementing the intrinsics.  */
 typedef int __v2si __attribute__ ((vector_size (8)));
 typedef short __v4hi __attribute__ ((vector_size (8)));
-typedef char __v8qi __attribute__ ((vector_size (8)));
+typedef signed char __v8qi __attribute__ ((vector_size (8)));
 
 /* "Convert" __m64 and __int64 into each other.  */
-static __inline __m64 
+static __inline __m64
 _mm_cvtsi64_m64 (__int64 __i)
 {
   return __i;
@@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i)
 static __inline __int64
 _mm_cvtsi32_si64 (int __i)
 {
-  return __i;
+  return (__i & 0xffffffff);
 }
 
 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
@@ -603,7 +612,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
 static __inline __m64
 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
 {
-  return __builtin_arm_wandn (__m1, __m2);
+  return __builtin_arm_wandn (__m2, __m1);
 }
 
 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
@@ -935,7 +944,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu8 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B);
+  return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
+}
+
+static __inline __m64
+_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
 }
 
 /* Compute the sum of the absolute differences of the unsigned 16-bit
@@ -944,9 +959,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
 static __inline __m64
 _mm_sad_pu16 (__m64 __A, __m64 __B)
 {
-  return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B);
+  return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
+static __inline __m64
+_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
+{
+  return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
+}
+
 /* Compute the sum of the absolute differences of the unsigned 8-bit
    values in A and B.  Return the value in the lower 16-bit word; the
    upper words are cleared.  */
@@ -965,11 +987,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B)
   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
 }
 
-static __inline __m64
-_mm_align_si64 (__m64 __A, __m64 __B, int __C)
-{
-  return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C);
-}
+#define _mm_align_si64(__A,__B, N) \
+  (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
 
 /* Creates a 64-bit zero.  */
 static __inline __m64
@@ -985,44 +1004,83 @@ _mm_setzero_si64 (void)
 static __inline void
 _mm_setwcx (const int __value, const int __regno)
 {
+  /* Since gcc knows about all the wCGR registers in the arm
+     back end, access them through builtins rather than inline
+     asm, so that gcc can optimize these accesses.  */
+
   switch (__regno)
     {
-    case 0:  __builtin_arm_setwcx (__value, 0); break;
-    case 1:  __builtin_arm_setwcx (__value, 1); break;
-    case 2:  __builtin_arm_setwcx (__value, 2); break;
-    case 3:  __builtin_arm_setwcx (__value, 3); break;
-    case 8:  __builtin_arm_setwcx (__value, 8); break;
-    case 9:  __builtin_arm_setwcx (__value, 9); break;
-    case 10: __builtin_arm_setwcx (__value, 10); break;
-    case 11: __builtin_arm_setwcx (__value, 11); break;
-    default: break;
+    case 0:
+      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
+      break;
+    case 8:
+      __builtin_arm_setwcgr0 (__value);
+      break;
+    case 9:
+      __builtin_arm_setwcgr1 (__value);
+      break;
+    case 10:
+      __builtin_arm_setwcgr2 (__value);
+      break;
+    case 11:
+      __builtin_arm_setwcgr3 (__value);
+      break;
+    default:
+      break;
     }
 }
 
 static __inline int
 _mm_getwcx (const int __regno)
 {
+  int __value;
   switch (__regno)
     {
-    case 0:  return __builtin_arm_getwcx (0);
-    case 1:  return __builtin_arm_getwcx (1);
-    case 2:  return __builtin_arm_getwcx (2);
-    case 3:  return __builtin_arm_getwcx (3);
-    case 8:  return __builtin_arm_getwcx (8);
-    case 9:  return __builtin_arm_getwcx (9);
-    case 10: return __builtin_arm_getwcx (10);
-    case 11: return __builtin_arm_getwcx (11);
-    default: return 0;
+    case 0:
+      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
+      break;
+    case 1:
+      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
+      break;
+    case 2:
+      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
+      break;
+    case 3:
+      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
+      break;
+    case 8:
+      return __builtin_arm_getwcgr0 ();
+    case 9:
+      return __builtin_arm_getwcgr1 ();
+    case 10:
+      return __builtin_arm_getwcgr2 ();
+    case 11:
+      return __builtin_arm_getwcgr3 ();
+    default:
+      break;
     }
+  return __value;
 }
 
 /* Creates a vector of two 32-bit values; I0 is least significant.  */
 static __inline __m64
 _mm_set_pi32 (int __i1, int __i0)
 {
-  union {
+  union
+  {
     __m64 __q;
-    struct {
+    struct
+    {
       unsigned int __i0;
       unsigned int __i1;
     } __s;
@@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, sh
   unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2;
   unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0;
   return _mm_set_pi32 (__i1, __i0);
-		       
+
 }
 
 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
@@ -1110,9 +1168,521 @@ _mm_set1_pi8 (char __b)
 
 /* Convert an integer to a __m64 object.  */
 static __inline __m64
-_m_from_int (int __a)
+_mm_abs_pi8 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
+}
+
+static __inline __m64
+_mm_abs_pi16 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
+}
+
+static __inline __m64
+_mm_abs_pi32 (__m64 m1)
+{
+  return (__m64) __builtin_arm_wabsw ((__v2si)m1);
+}
+
+static __inline __m64
+_mm_addsubhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_absdiff_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_addc_pu16 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddhc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_addc_pu32 (__m64 a, __m64 b)
+{
+  __m64 result;
+  __asm__ __volatile__ ("waddwc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
+  return result;
+}
+
+static __inline __m64
+_mm_avg4_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_avg4r_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_maddx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_maddx_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_msub_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhi_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhi_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_mulhir_pu32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_mullo_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulm_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_qmulmr_pi32 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
+}
+
+static __inline __m64
+_mm_subaddhx_pi16 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
+}
+
+static __inline __m64
+_mm_addbhusl_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
+}
+
+static __inline __m64
+_mm_addbhusm_pu8 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
+}
+
+#define _mm_qmiabb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiabtn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatb_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatbn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiatt_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_qmiattn_pi32(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiabtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiatt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiattn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawbtn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtb_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtbn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawtt_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+#define _mm_wmiawttn_si64(acc, m1, m2) \
+  ({\
+   __m64 _acc = acc;\
+   __m64 _m1 = m1;\
+   __m64 _m2 = m2;\
+   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
+   _acc;\
+   })
+
+/* The third argument must be an immediate.  */
+#define _mm_merge_si64(a, b, n) \
+  ({\
+   __m64 result;\
+   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
+   result;\
+   })
+
+static __inline __m64
+_mm_alignr0_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr1_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr2_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline __m64
+_mm_alignr3_si64 (__m64 a, __m64 b)
+{
+  return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
+}
+
+static __inline void
+_mm_tandcb ()
+{
+  __asm __volatile ("tandcb r15");
+}
+
+static __inline void
+_mm_tandch ()
+{
+  __asm __volatile ("tandch r15");
+}
+
+static __inline void
+_mm_tandcw ()
+{
+  __asm __volatile ("tandcw r15");
+}
+
+#define _mm_textrcb(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcb r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrch(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrch r15, %0" : : "i" (n));\
+   })
+
+#define _mm_textrcw(n) \
+  ({\
+   __asm__ __volatile__ (\
+     "textrcw r15, %0" : : "i" (n));\
+   })
+
+static __inline void
+_mm_torcb ()
+{
+  __asm __volatile ("torcb r15");
+}
+
+static __inline void
+_mm_torch ()
+{
+  __asm __volatile ("torch r15");
+}
+
+static __inline void
+_mm_torcw ()
+{
+  __asm __volatile ("torcw r15");
+}
+
+static __inline void
+_mm_torvscb ()
+{
+  __asm __volatile ("torvscb r15");
+}
+
+static __inline void
+_mm_torvsch ()
+{
+  __asm __volatile ("torvsch r15");
+}
+
+static __inline void
+_mm_torvscw ()
+{
+  __asm __volatile ("torvscw r15");
+}
+
+static __inline __m64
+_mm_tbcst_pi8 (int value)
+{
+  return (__m64) __builtin_arm_tbcstb ((signed char) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi16 (int value)
+{
+  return (__m64) __builtin_arm_tbcsth ((short) value);
+}
+
+static __inline __m64
+_mm_tbcst_pi32 (int value)
 {
-  return (__m64)__a;
+  return (__m64) __builtin_arm_tbcstw (value);
 }
 
 #define _m_packsswb _mm_packs_pi16
@@ -1250,5 +1820,10 @@ _m_from_int (int __a)
 #define _m_paligniq _mm_align_si64
 #define _m_cvt_si2pi _mm_cvtsi64_m64
 #define _m_cvt_pi2si _mm_cvtm64_si64
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_to_int _mm_cvtsi64_si32
 
+#if defined __cplusplus
+}; /* End "C" */
+#endif /* __cplusplus */
 #endif /* _MMINTRIN_H_INCLUDED */
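
A minimal sketch (not part of the patch) of how the _mm_setwcx/_mm_getwcx
helpers above are meant to be used: the register number must be a
compile-time constant so the switch folds down to a single builtin or
tmcr/tmrc instruction; the function name and the assumption of an
iWMMXt-enabled toolchain (e.g. -mcpu=iwmmxt) are illustrative only:

/* Illustrative example only, not part of the patch.  */
#include <mmintrin.h>

int
wcgr0_roundtrip (int value)
{
  /* Write general-purpose control register wCGR0 (register number 8),
     then read it back.  */
  _mm_setwcx (value, 8);
  return _mm_getwcx (8);
}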

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2012-06-06 12:11 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-06 10:15 [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change Xinyu Qi
2011-08-18  2:35 ` Ramana Radhakrishnan
2011-08-24  9:07   ` Xinyu Qi
2011-09-26  4:31   ` PING: " Xinyu Qi
2011-10-20  8:05   ` Xinyu Qi
2011-12-29  6:26   ` Xinyu Qi
2012-02-03  2:08   ` Xinyu Qi
2012-03-13  8:55   ` Xinyu Qi
2011-07-14  7:39 Xinyu Qi
2012-05-29  4:13 [PATCH ARM iWMMXt 0/5] Improve iWMMXt support Matt Turner
2012-05-29  4:15 ` [PATCH ARM iWMMXt 2/5] intrinsic head file change Matt Turner
2012-06-06 12:22   ` Ramana Radhakrishnan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).