* RE: PING: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change
@ 2011-07-29 4:04 Xinyu Qi
0 siblings, 0 replies; 6+ messages in thread
From: Xinyu Qi @ 2011-07-29 4:04 UTC (permalink / raw)
To: gcc-patches
Ping.
http://gcc.gnu.org/ml/gcc-patches/2011-07/msg01101.html
At 2011-07-14 15:38:04,"Xinyu Qi" <xyqi@marvell.com> wrote:
> > Hi,
> >
> > It is the second part of iWMMXt maintenance.
>
>
> *config/arm/mmintrin.h: Revise.
>
> Thanks,
> Xinyu
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change @ 2011-07-06 10:15 Xinyu Qi 2011-08-18 2:35 ` Ramana Radhakrishnan 0 siblings, 1 reply; 6+ messages in thread From: Xinyu Qi @ 2011-07-06 10:15 UTC (permalink / raw) To: gcc-patches [-- Attachment #1: Type: text/plain, Size: 189 bytes --] Hi, It is the second part of iWMMXt maintenance. *config/arm/mmintrin.h: Revise the iWMMXt intrinsics head file. Fix some intrinsics and add some new intrinsics. Thanks, Xinyu [-- Attachment #2: 2_mmintrin.diff --] [-- Type: application/octet-stream, Size: 17935 bytes --] Index: gcc/config/arm/mmintrin.h =================================================================== --- gcc/config/arm/mmintrin.h (revision 175285) +++ gcc/config/arm/mmintrin.h (working copy) @@ -24,16 +24,25 @@ #ifndef _MMINTRIN_H_INCLUDED #define _MMINTRIN_H_INCLUDED +#if defined __cplusplus +extern "C" { /* Begin "C" */ +/* Intrinsics use C name-mangling. */ +#endif /* __cplusplus */ + /* The data type intended for user use. */ -typedef unsigned long long __m64, __int64; + +/* We will treat __int64 as a long long type + and __m64 as an unsigned long long type to conform to VSC++. */ +typedef unsigned long long __m64; +typedef long long __int64; /* Internal data types for implementing the intrinsics. */ typedef int __v2si __attribute__ ((vector_size (8))); typedef short __v4hi __attribute__ ((vector_size (8))); -typedef char __v8qi __attribute__ ((vector_size (8))); +typedef signed char __v8qi __attribute__ ((vector_size (8))); /* "Convert" __m64 and __int64 into each other. 
*/ -static __inline __m64 +static __inline __m64 _mm_cvtsi64_m64 (__int64 __i) { return __i; @@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i) static __inline __int64 _mm_cvtsi32_si64 (int __i) { - return __i; + return (__i & 0xffffffff); } /* Pack the four 16-bit values from M1 into the lower four 8-bit values of @@ -603,7 +612,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2) static __inline __m64 _mm_andnot_si64 (__m64 __m1, __m64 __m2) { - return __builtin_arm_wandn (__m1, __m2); + return __builtin_arm_wandn (__m2, __m1); } /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ @@ -935,7 +944,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B) static __inline __m64 _mm_sad_pu8 (__m64 __A, __m64 __B) { - return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B); + return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B); +} + +static __inline __m64 +_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C) +{ + return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C); } /* Compute the sum of the absolute differences of the unsigned 16-bit @@ -944,9 +959,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B) static __inline __m64 _mm_sad_pu16 (__m64 __A, __m64 __B) { - return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B); + return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); } +static __inline __m64 +_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C) +{ + return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C); +} + + /* Compute the sum of the absolute differences of the unsigned 8-bit values in A and B. Return the value in the lower 16-bit word; the upper words are cleared. 
*/ @@ -965,11 +987,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B) return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); } -static __inline __m64 -_mm_align_si64 (__m64 __A, __m64 __B, int __C) -{ - return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C); -} +#define _mm_align_si64(__A,__B, N) \ + (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N)) /* Creates a 64-bit zero. */ static __inline __m64 @@ -985,44 +1004,83 @@ _mm_setzero_si64 (void) static __inline void _mm_setwcx (const int __value, const int __regno) { + /*Since gcc has the imformation of all wcgr regs + in arm backend, use builtin to access them instead + of throw asm directly. Thus, gcc could do some + optimization on them. */ + switch (__regno) { - case 0: __builtin_arm_setwcx (__value, 0); break; - case 1: __builtin_arm_setwcx (__value, 1); break; - case 2: __builtin_arm_setwcx (__value, 2); break; - case 3: __builtin_arm_setwcx (__value, 3); break; - case 8: __builtin_arm_setwcx (__value, 8); break; - case 9: __builtin_arm_setwcx (__value, 9); break; - case 10: __builtin_arm_setwcx (__value, 10); break; - case 11: __builtin_arm_setwcx (__value, 11); break; - default: break; + case 0: + __asm __volatile ("tmcr wcid, %0" :: "r"(__value)); + break; + case 1: + __asm __volatile ("tmcr wcon, %0" :: "r"(__value)); + break; + case 2: + __asm __volatile ("tmcr wcssf, %0" :: "r"(__value)); + break; + case 3: + __asm __volatile ("tmcr wcasf, %0" :: "r"(__value)); + break; + case 8: + __builtin_arm_setwcgr0 (__value); + break; + case 9: + __builtin_arm_setwcgr1 (__value); + break; + case 10: + __builtin_arm_setwcgr2 (__value); + break; + case 11: + __builtin_arm_setwcgr3 (__value); + break; + default: + break; } } static __inline int _mm_getwcx (const int __regno) { + int __value; switch (__regno) { - case 0: return __builtin_arm_getwcx (0); - case 1: return __builtin_arm_getwcx (1); - case 2: return __builtin_arm_getwcx (2); - case 3: return __builtin_arm_getwcx (3); - case 8: 
return __builtin_arm_getwcx (8); - case 9: return __builtin_arm_getwcx (9); - case 10: return __builtin_arm_getwcx (10); - case 11: return __builtin_arm_getwcx (11); - default: return 0; + case 0: + __asm __volatile ("tmrc %0, wcid" : "=r"(__value)); + break; + case 1: + __asm __volatile ("tmrc %0, wcon" : "=r"(__value)); + break; + case 2: + __asm __volatile ("tmrc %0, wcssf" : "=r"(__value)); + break; + case 3: + __asm __volatile ("tmrc %0, wcasf" : "=r"(__value)); + break; + case 8: + return __builtin_arm_getwcgr0 (); + case 9: + return __builtin_arm_getwcgr1 (); + case 10: + return __builtin_arm_getwcgr2 (); + case 11: + return __builtin_arm_getwcgr3 (); + default: + break; } + return __value; } /* Creates a vector of two 32-bit values; I0 is least significant. */ static __inline __m64 _mm_set_pi32 (int __i1, int __i0) { - union { + union + { __m64 __q; - struct { + struct + { unsigned int __i0; unsigned int __i1; } __s; @@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, sh unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2; unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0; return _mm_set_pi32 (__i1, __i0); - + } /* Creates a vector of eight 8-bit values; B0 is least significant. */ @@ -1110,9 +1168,521 @@ _mm_set1_pi8 (char __b) /* Convert an integer to a __m64 object. 
*/ static __inline __m64 -_m_from_int (int __a) +_mm_abs_pi8 (__m64 m1) +{ + return (__m64) __builtin_arm_wabsb ((__v8qi)m1); +} + +static __inline __m64 +_mm_abs_pi16 (__m64 m1) +{ + return (__m64) __builtin_arm_wabsh ((__v4hi)m1); + +} + +static __inline __m64 +_mm_abs_pi32 (__m64 m1) +{ + return (__m64) __builtin_arm_wabsw ((__v2si)m1); + +} + +static __inline __m64 +_mm_addsubhx_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_absdiff_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_absdiff_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_absdiff_pu32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_addc_pu16 (__m64 a, __m64 b) +{ + __m64 result; + __asm__ __volatile__ ("waddhc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); + return result; +} + +static __inline __m64 +_mm_addc_pu32 (__m64 a, __m64 b) +{ + __m64 result; + __asm__ __volatile__ ("waddwc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); + return result; +} + +static __inline __m64 +_mm_avg4_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_avg4r_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_maddx_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_maddx_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_msub_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_msub_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b); 
+} + +static __inline __m64 +_mm_mulhi_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mulhi_pu32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mulhir_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_mulhir_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mulhir_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_mulhir_pu32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mullo_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_qmulm_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_qmulm_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_qmulmr_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_qmulmr_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_subaddhx_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_addbhusl_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_addbhusm_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b); +} + +#define _mm_qmiabb_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, 
(__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiabbn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiabt_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiabtn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc=acc;\ + __m64 _m1=m1;\ + __m64 _m2=m2;\ + _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiatb_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiatbn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiatt_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiattn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabt 
(_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabtn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiatb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiatbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiatt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiattn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbtn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawtb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\ + 
_acc;\ + }) + +#define _mm_wmiawtbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawtt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawttn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +/* The third arguments should be an immediate. */ +#define _mm_merge_si64(a, b, n) \ + ({\ + __m64 result;\ + result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\ + result;\ + }) + +static __inline __m64 +_mm_alignr0_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b); +} + +static __inline __m64 +_mm_alignr1_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b); +} + +static __inline __m64 +_mm_alignr2_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b); +} + +static __inline __m64 +_mm_alignr3_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b); +} + +static __inline void +_mm_tandcb () +{ + __asm __volatile ("tandcb r15"); +} + +static __inline void +_mm_tandch () +{ + __asm __volatile ("tandch r15"); +} + +static __inline void +_mm_tandcw () +{ + __asm __volatile ("tandcw r15"); +} + +#define _mm_textrcb(n) \ + ({\ + __asm__ __volatile__ (\ + "textrcb r15, %0" : : "i" (n));\ + }) + +#define _mm_textrch(n) \ + ({\ + __asm__ __volatile__ (\ + "textrch r15, %0" : : "i" (n));\ + }) + +#define _mm_textrcw(n) \ + ({\ + __asm__ __volatile__ (\ + "textrcw r15, %0" : : "i" (n));\ + }) + +static __inline void +_mm_torcb () +{ + __asm __volatile ("torcb r15"); +} + +static __inline void +_mm_torch () +{ 
+ __asm __volatile ("torch r15"); +} + +static __inline void +_mm_torcw () +{ + __asm __volatile ("torcw r15"); +} + +static __inline void +_mm_torvscb () +{ + __asm __volatile ("torvscb r15"); +} + +static __inline void +_mm_torvsch () +{ + __asm __volatile ("torvsch r15"); +} + +static __inline void +_mm_torvscw () +{ + __asm __volatile ("torvscw r15"); +} + +static __inline __m64 +_mm_tbcst_pi8 (int value) +{ + return (__m64) __builtin_arm_tbcstb ((signed char) value); +} + +static __inline __m64 +_mm_tbcst_pi16 (int value) +{ + return (__m64) __builtin_arm_tbcsth ((short) value); +} + +static __inline __m64 +_mm_tbcst_pi32 (int value) { - return (__m64)__a; + return (__m64) __builtin_arm_tbcstw (value); } #define _m_packsswb _mm_packs_pi16 @@ -1250,5 +1820,10 @@ _m_from_int (int __a) #define _m_paligniq _mm_align_si64 #define _m_cvt_si2pi _mm_cvtsi64_m64 #define _m_cvt_pi2si _mm_cvtm64_si64 +#define _m_from_int _mm_cvtsi32_si64 +#define _m_to_int _mm_cvtsi64_si32 +#if defined __cplusplus +}; /* End "C" */ +#endif /* __cplusplus */ #endif /* _MMINTRIN_H_INCLUDED */ ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change 2011-07-06 10:15 Xinyu Qi @ 2011-08-18 2:35 ` Ramana Radhakrishnan 2011-09-26 4:31 ` PING: " Xinyu Qi ` (4 more replies) 0 siblings, 5 replies; 6+ messages in thread From: Ramana Radhakrishnan @ 2011-08-18 2:35 UTC (permalink / raw) To: Xinyu Qi; +Cc: gcc-patches On 6 July 2011 11:11, Xinyu Qi <xyqi@marvell.com> wrote: > Hi, > > It is the second part of iWMMXt maintenance. > > *config/arm/mmintrin.h: > Revise the iWMMXt intrinsics head file. Fix some intrinsics and add some new intrinsics Is there a document somewhere that lists these intrinsics and what each of them is supposed to do? Details are missing again. We seem to be changing quite a few things. > + > +/* We will treat __int64 as a long long type > + and __m64 as an unsigned long long type to conform to VSC++. */ > +typedef unsigned long long __m64; > +typedef long long __int64; Interesting. Why make this sort of change, where you are changing the type to conform to VSC++? This just means old code that uses this is pretty much broken. Not that I have much hope of that happening by default - -flax-vector-conversions appears to be needed even with a trunk compiler. > @@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i) > static __inline __int64 > _mm_cvtsi32_si64 (int __i) > { > - return __i; > + return (__i & 0xffffffff); > } Eh? Why the & 0xffffffff before the promotion rules apply? Is this set of intrinsics documented some place? What is missing, and could be the subject of a follow-up patch, is a set of tests for the wMMX intrinsics. What's the behaviour of wandn supposed to be? Does wandn x, y, z imply x = y & ~z or x = ~y & z? If the former, then your intrinsic expansion is wrong unless the meaning of this has changed. What's the behaviour of the intrinsic _mm_andnot_si64?
@@ -985,44 +1004,83 @@ _mm_setzero_si64 (void) static __inline void _mm_setwcx (const int __value, const int __regno) { > + /*Since gcc has the imformation of all wcgr regs > + in arm backend, use builtin to access them instead > + of throw asm directly. Thus, gcc could do some > + optimization on them. */ > + Also, this comment contradicts what follows in the patch. You've prima facie replaced the builtins with bits of inline assembler. I'm not sure this comment makes a lot of sense on its own. Ramana ^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: PING: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change 2011-08-18 2:35 ` Ramana Radhakrishnan @ 2011-09-26 4:31 ` Xinyu Qi 2011-10-20 8:05 ` Xinyu Qi ` (3 subsequent siblings) 4 siblings, 0 replies; 6+ messages in thread From: Xinyu Qi @ 2011-09-26 4:31 UTC (permalink / raw) To: Ramana Radhakrishnan; +Cc: gcc-patches Ping http://gcc.gnu.org/ml/gcc-patches/2011-08/msg01963.html * config/arm/mmintrin.h: Revise. At 2011-08-24 16:14:30,"Xinyu Qi" <xyqi@marvell.com> wrote: > At 2011-08-18 09:33:27,"Ramana Radhakrishnan" > <ramana.radhakrishnan@linaro.org> wrote: > > On 6 July 2011 11:11, Xinyu Qi <xyqi@marvell.com> wrote: > > > Hi, > > > > > > It is the second part of iWMMXt maintenance. > > > > > > *config/arm/mmintrin.h: > > > Revise the iWMMXt intrinsics head file. Fix some intrinsics and add some > > new intrinsics > > > > Is there a document somewhere that lists these intrinsics and what > > each of these are supposed to be doing ? Missing details again . We > > seem to be changing quite a few things. > > Hi, > The intrinsic_doc.txt is attached. It is the piece of iWMMXt intrinsic details > doc picked out from "Intel Wireless MMX Technology Intrinsic Support" with some > modification. > > > > + > > > +/* We will treat __int64 as a long long type > > > + and __m64 as an unsigned long long type to conform to VSC++. */Is > > > +typedef unsigned long long __m64; > > > +typedef long long __int64; > > > > Interesting this sort of a change with these cases where you are > > changing the type to conform to VSC++ ? This just means old code that > > uses this is pretty much broken. Not that I have much hope of that > > happening by default - -flax-conversions appears to be needed even > > with a trunk compiler. > > I couldn't find any material to show why __int64 needs to be redefined. And > all the tests are passed without this change. So decide to discard this change. 
> > > > > > @@ -54,7 +63,7 @@ _mm_cvtsi64_si32 (__int64 __i) > > > static __inline __int64 > > > _mm_cvtsi32_si64 (int __i) > > > { > > > - return __i; > > > + return (__i & 0xffffffff); > > > } > > > > Eh ? why the & 0xffffffff before promotion rules. Is this set of > > intrinsics documented some place ? What is missing and could be the > > subject of a follow-up patch is a set of tests for the wMMX intrinsics > > .... > > See the intrinsics doc. It says the description of _mm_cvtsi32_si64 is "The > integer value is zero-extended to 64 bits. > If r = _mm_cvtsi32_si64(i), then the action is > r [0:31] = i; > r[32:63] = 0;" > > > > > What's the behaviour of wandn supposed to be ? Does wandn x, y, z > > imply x = y & ~z or x = ~y & z ? If the former then your intrinsic > > expansion is wrong unless the meaning of this has changed ? Whats the > > behaviour of the intrinsic __mm_and_not_si64 . ? > > The description of _mm_andnot_si64 is "Performs a logical NOT on the 64-bit > value in m1 and use the result in a bitwise AND with the 64-bit value in m2." > And, "wandn wRd, wRn, wRm" means "wRd = wRn & ~wRm" > I think __builtin_arm_wandn had better directly match the behavior of wandn. > Therefore, match _mm_andnot_si64 (m1, m2) to __builtin_arm_wandn (m2, m1). > > > > > @@ -985,44 +1004,83 @@ _mm_setzero_si64 (void) > > static __inline void > > _mm_setwcx (const int __value, const int __regno) > > { > > > + /*Since gcc has the imformation of all wcgr regs > > > + in arm backend, use builtin to access them instead > > > + of throw asm directly. Thus, gcc could do some > > > + optimization on them. */ > > > + > > > > Also this comment is contradictory to what follows in the patch . > > You've prima-facie replaced them with bits of inline assembler. I'm > > not sure this comment makes a lot of sense on its own. > > Sorry. This comment should be removed. > > The modified diff is attached. > > Thanks, > Xinyu > ^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: PING: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change 2011-08-18 2:35 ` Ramana Radhakrishnan 2011-09-26 4:31 ` PING: " Xinyu Qi @ 2011-10-20 8:05 ` Xinyu Qi 2011-12-29 6:26 ` Xinyu Qi ` (2 subsequent siblings) 4 siblings, 0 replies; 6+ messages in thread From: Xinyu Qi @ 2011-10-20 8:05 UTC (permalink / raw) To: Ramana Radhakrishnan, gcc-patches Ping http://gcc.gnu.org/ml/gcc-patches/2011-08/msg01963.html * config/arm/mmintrin.h: Revise. ^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: PING: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change 2011-08-18 2:35 ` Ramana Radhakrishnan 2011-09-26 4:31 ` PING: " Xinyu Qi 2011-10-20 8:05 ` Xinyu Qi @ 2011-12-29 6:26 ` Xinyu Qi 2012-02-03 2:08 ` Xinyu Qi 2012-03-13 8:55 ` Xinyu Qi 4 siblings, 0 replies; 6+ messages in thread From: Xinyu Qi @ 2011-12-29 6:26 UTC (permalink / raw) To: Richard Earnshaw; +Cc: Ramana Radhakrishnan, gcc-patches [-- Attachment #1: Type: text/plain, Size: 1957 bytes --] * config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt intrinsics. Use __IWMMXT2__ to enable iWMMXt2 intrinsics. Use C name-mangling for intrinsics. (__v8qi): Redefine. (_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise. (_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx): Likewise. (_m_from_int): Likewise. (_mm_sada_pu8, _mm_sada_pu16): New intrinsic. (_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise. (_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise. (_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise. (_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise. (_mm_tbcst_pi32): Likewise. (_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2 intrinsic. (_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise. (_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise. (_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16, _mm_maddx_pu16): Likewise. (_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise. (_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise. (_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise. (_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise. (_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8): Likewise. (_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32): Likewise. (_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise. (_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise. (_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise. 
(_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64): Likewise. (_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64): Likewise. (_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64): Likewise. (_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64): Likewise. (_mm_wmiawttn_si64, _mm_merge_si64): Likewise. (_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise. (_m_to_int): New define. Thanks, Xinyu [-- Attachment #2: 2_mmintrin.diff --] [-- Type: application/octet-stream, Size: 17960 bytes --] Index: gcc/config/arm/mmintrin.h =================================================================== --- gcc/config/arm/mmintrin.h (revision 182684) +++ gcc/config/arm/mmintrin.h (working copy) @@ -24,16 +24,30 @@ #ifndef _MMINTRIN_H_INCLUDED #define _MMINTRIN_H_INCLUDED +#ifndef __IWMMXT__ +#error You must enable WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2) to use iWMMXt/iWMMXt2 intrinsics +#else + +#ifndef __IWMMXT2__ +#warning You only enable iWMMXt intrinsics. Extended iWMMXt2 intrinsics available only if WMMX2 instructions enabled (e.g. -march=iwmmxt2) +#endif + + +#if defined __cplusplus +extern "C" { /* Begin "C" */ +/* Intrinsics use C name-mangling. */ +#endif /* __cplusplus */ + /* The data type intended for user use. */ typedef unsigned long long __m64, __int64; /* Internal data types for implementing the intrinsics. */ typedef int __v2si __attribute__ ((vector_size (8))); typedef short __v4hi __attribute__ ((vector_size (8))); -typedef char __v8qi __attribute__ ((vector_size (8))); +typedef signed char __v8qi __attribute__ ((vector_size (8))); /* "Convert" __m64 and __int64 into each other. 
*/ -static __inline __m64 +static __inline __m64 _mm_cvtsi64_m64 (__int64 __i) { return __i; @@ -54,7 +68,7 @@ _mm_cvtsi64_si32 (__int64 __i) static __inline __int64 _mm_cvtsi32_si64 (int __i) { - return __i; + return (__i & 0xffffffff); } /* Pack the four 16-bit values from M1 into the lower four 8-bit values of @@ -603,7 +617,7 @@ _mm_and_si64 (__m64 __m1, __m64 __m2) static __inline __m64 _mm_andnot_si64 (__m64 __m1, __m64 __m2) { - return __builtin_arm_wandn (__m1, __m2); + return __builtin_arm_wandn (__m2, __m1); } /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ @@ -935,7 +949,13 @@ _mm_avg2_pu16 (__m64 __A, __m64 __B) static __inline __m64 _mm_sad_pu8 (__m64 __A, __m64 __B) { - return (__m64) __builtin_arm_wsadb ((__v8qi)__A, (__v8qi)__B); + return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B); +} + +static __inline __m64 +_mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C) +{ + return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C); } /* Compute the sum of the absolute differences of the unsigned 16-bit @@ -944,9 +964,16 @@ _mm_sad_pu8 (__m64 __A, __m64 __B) static __inline __m64 _mm_sad_pu16 (__m64 __A, __m64 __B) { - return (__m64) __builtin_arm_wsadh ((__v4hi)__A, (__v4hi)__B); + return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); +} + +static __inline __m64 +_mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C) +{ + return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C); } + /* Compute the sum of the absolute differences of the unsigned 8-bit values in A and B. Return the value in the lower 16-bit word; the upper words are cleared. 
*/ @@ -965,11 +992,8 @@ _mm_sadz_pu16 (__m64 __A, __m64 __B) return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); } -static __inline __m64 -_mm_align_si64 (__m64 __A, __m64 __B, int __C) -{ - return (__m64) __builtin_arm_walign ((__v8qi)__A, (__v8qi)__B, __C); -} +#define _mm_align_si64(__A,__B, N) \ + (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N)) /* Creates a 64-bit zero. */ static __inline __m64 @@ -987,42 +1011,76 @@ _mm_setwcx (const int __value, const int { switch (__regno) { - case 0: __builtin_arm_setwcx (__value, 0); break; - case 1: __builtin_arm_setwcx (__value, 1); break; - case 2: __builtin_arm_setwcx (__value, 2); break; - case 3: __builtin_arm_setwcx (__value, 3); break; - case 8: __builtin_arm_setwcx (__value, 8); break; - case 9: __builtin_arm_setwcx (__value, 9); break; - case 10: __builtin_arm_setwcx (__value, 10); break; - case 11: __builtin_arm_setwcx (__value, 11); break; - default: break; + case 0: + __asm __volatile ("tmcr wcid, %0" :: "r"(__value)); + break; + case 1: + __asm __volatile ("tmcr wcon, %0" :: "r"(__value)); + break; + case 2: + __asm __volatile ("tmcr wcssf, %0" :: "r"(__value)); + break; + case 3: + __asm __volatile ("tmcr wcasf, %0" :: "r"(__value)); + break; + case 8: + __builtin_arm_setwcgr0 (__value); + break; + case 9: + __builtin_arm_setwcgr1 (__value); + break; + case 10: + __builtin_arm_setwcgr2 (__value); + break; + case 11: + __builtin_arm_setwcgr3 (__value); + break; + default: + break; } } static __inline int _mm_getwcx (const int __regno) { + int __value; switch (__regno) { - case 0: return __builtin_arm_getwcx (0); - case 1: return __builtin_arm_getwcx (1); - case 2: return __builtin_arm_getwcx (2); - case 3: return __builtin_arm_getwcx (3); - case 8: return __builtin_arm_getwcx (8); - case 9: return __builtin_arm_getwcx (9); - case 10: return __builtin_arm_getwcx (10); - case 11: return __builtin_arm_getwcx (11); - default: return 0; + case 0: + __asm __volatile ("tmrc %0, wcid" : 
"=r"(__value)); + break; + case 1: + __asm __volatile ("tmrc %0, wcon" : "=r"(__value)); + break; + case 2: + __asm __volatile ("tmrc %0, wcssf" : "=r"(__value)); + break; + case 3: + __asm __volatile ("tmrc %0, wcasf" : "=r"(__value)); + break; + case 8: + return __builtin_arm_getwcgr0 (); + case 9: + return __builtin_arm_getwcgr1 (); + case 10: + return __builtin_arm_getwcgr2 (); + case 11: + return __builtin_arm_getwcgr3 (); + default: + break; } + return __value; } /* Creates a vector of two 32-bit values; I0 is least significant. */ static __inline __m64 _mm_set_pi32 (int __i1, int __i0) { - union { + union + { __m64 __q; - struct { + struct + { unsigned int __i0; unsigned int __i1; } __s; @@ -1041,7 +1099,7 @@ _mm_set_pi16 (short __w3, short __w2, sh unsigned int __i1 = (unsigned short)__w3 << 16 | (unsigned short)__w2; unsigned int __i0 = (unsigned short)__w1 << 16 | (unsigned short)__w0; return _mm_set_pi32 (__i1, __i0); - + } /* Creates a vector of eight 8-bit values; B0 is least significant. */ @@ -1108,11 +1166,526 @@ _mm_set1_pi8 (char __b) return _mm_set1_pi32 (__i); } -/* Convert an integer to a __m64 object. 
*/ +#ifdef __IWMMXT2__ +static __inline __m64 +_mm_abs_pi8 (__m64 m1) +{ + return (__m64) __builtin_arm_wabsb ((__v8qi)m1); +} + +static __inline __m64 +_mm_abs_pi16 (__m64 m1) +{ + return (__m64) __builtin_arm_wabsh ((__v4hi)m1); + +} + +static __inline __m64 +_mm_abs_pi32 (__m64 m1) +{ + return (__m64) __builtin_arm_wabsw ((__v2si)m1); + +} + +static __inline __m64 +_mm_addsubhx_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_absdiff_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_absdiff_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_absdiff_pu32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_addc_pu16 (__m64 a, __m64 b) +{ + __m64 result; + __asm__ __volatile__ ("waddhc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); + return result; +} + +static __inline __m64 +_mm_addc_pu32 (__m64 a, __m64 b) +{ + __m64 result; + __asm__ __volatile__ ("waddwc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); + return result; +} + +static __inline __m64 +_mm_avg4_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_avg4r_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_maddx_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_maddx_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_msub_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_msub_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b); 
+} + +static __inline __m64 +_mm_mulhi_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mulhi_pu32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mulhir_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_mulhir_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mulhir_pu16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_mulhir_pu32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_mullo_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_qmulm_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_qmulm_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_qmulmr_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_qmulmr_pi32 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b); +} + +static __inline __m64 +_mm_subaddhx_pi16 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b); +} + +static __inline __m64 +_mm_addbhusl_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b); +} + +static __inline __m64 +_mm_addbhusm_pu8 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b); +} + +#define _mm_qmiabb_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, 
(__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiabbn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiabt_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiabtn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc=acc;\ + __m64 _m1=m1;\ + __m64 _m2=m2;\ + _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiatb_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiatbn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiatt_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_qmiattn_pi32(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabt 
(_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiabtn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiatb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiatbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiatt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiattn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawbtn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawtb_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\ + 
_acc;\ + }) + +#define _mm_wmiawtbn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawtt_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +#define _mm_wmiawttn_si64(acc, m1, m2) \ + ({\ + __m64 _acc = acc;\ + __m64 _m1 = m1;\ + __m64 _m2 = m2;\ + _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\ + _acc;\ + }) + +/* The third arguments should be an immediate. */ +#define _mm_merge_si64(a, b, n) \ + ({\ + __m64 result;\ + result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\ + result;\ + }) +#endif /* __IWMMXT2__ */ + +static __inline __m64 +_mm_alignr0_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b); +} + +static __inline __m64 +_mm_alignr1_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b); +} + +static __inline __m64 +_mm_alignr2_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b); +} + +static __inline __m64 +_mm_alignr3_si64 (__m64 a, __m64 b) +{ + return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b); +} + +static __inline void +_mm_tandcb () +{ + __asm __volatile ("tandcb r15"); +} + +static __inline void +_mm_tandch () +{ + __asm __volatile ("tandch r15"); +} + +static __inline void +_mm_tandcw () +{ + __asm __volatile ("tandcw r15"); +} + +#define _mm_textrcb(n) \ + ({\ + __asm__ __volatile__ (\ + "textrcb r15, %0" : : "i" (n));\ + }) + +#define _mm_textrch(n) \ + ({\ + __asm__ __volatile__ (\ + "textrch r15, %0" : : "i" (n));\ + }) + +#define _mm_textrcw(n) \ + ({\ + __asm__ __volatile__ (\ + "textrcw r15, %0" : : "i" (n));\ + }) + +static __inline void +_mm_torcb () +{ + __asm __volatile ("torcb r15"); +} + +static 
__inline void +_mm_torch () +{ + __asm __volatile ("torch r15"); +} + +static __inline void +_mm_torcw () +{ + __asm __volatile ("torcw r15"); +} + +#ifdef __IWMMXT2__ +static __inline void +_mm_torvscb () +{ + __asm __volatile ("torvscb r15"); +} + +static __inline void +_mm_torvsch () +{ + __asm __volatile ("torvsch r15"); +} + +static __inline void +_mm_torvscw () +{ + __asm __volatile ("torvscw r15"); +} +#endif + +static __inline __m64 +_mm_tbcst_pi8 (int value) +{ + return (__m64) __builtin_arm_tbcstb ((signed char) value); +} + +static __inline __m64 +_mm_tbcst_pi16 (int value) +{ + return (__m64) __builtin_arm_tbcsth ((short) value); +} + static __inline __m64 -_m_from_int (int __a) +_mm_tbcst_pi32 (int value) { - return (__m64)__a; + return (__m64) __builtin_arm_tbcstw (value); } #define _m_packsswb _mm_packs_pi16 @@ -1250,5 +1823,11 @@ _m_from_int (int __a) #define _m_paligniq _mm_align_si64 #define _m_cvt_si2pi _mm_cvtsi64_m64 #define _m_cvt_pi2si _mm_cvtm64_si64 +#define _m_from_int _mm_cvtsi32_si64 +#define _m_to_int _mm_cvtsi64_si32 +#if defined __cplusplus +}; /* End "C" */ +#endif /* __cplusplus */ +#endif /* __IWMMXT__ */ #endif /* _MMINTRIN_H_INCLUDED */ ^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: PING: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change 2011-08-18 2:35 ` Ramana Radhakrishnan ` (2 preceding siblings ...) 2011-12-29 6:26 ` Xinyu Qi @ 2012-02-03 2:08 ` Xinyu Qi 2012-03-13 8:55 ` Xinyu Qi 4 siblings, 0 replies; 6+ messages in thread From: Xinyu Qi @ 2012-02-03 2:08 UTC (permalink / raw) To: Richard Earnshaw; +Cc: Ramana Radhakrishnan, gcc-patches PING http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01788.html At 2011-12-29 14:22:50,"Xinyu Qi" <xyqi@marvell.com> wrote: > * config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt > intrinsics. > Use __IWMMXT2__ to enable iWMMXt2 intrinsics. > Use C name-mangling for intrinsics. > (__v8qi): Redefine. > (_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise. > (_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx): > Likewise. > (_m_from_int): Likewise. > (_mm_sada_pu8, _mm_sada_pu16): New intrinsic. > (_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise. > (_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise. > (_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise. > (_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise. > (_mm_tbcst_pi32): Likewise. > (_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2 > intrinsic. > (_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise. > (_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise. > (_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16, > _mm_maddx_pu16): Likewise. > (_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise. > (_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise. > (_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise. > (_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise. > (_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8): > Likewise. > (_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32): > Likewise. > (_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise. > (_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise. 
> (_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise. > (_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64): > Likewise. > (_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64): > Likewise. > (_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64): > Likewise. > (_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64): > Likewise. > (_mm_wmiawttn_si64, _mm_merge_si64): Likewise. > (_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise. > (_m_to_int): New define. > > Thanks, > Xinyu ^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: PING: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change 2011-08-18 2:35 ` Ramana Radhakrishnan ` (3 preceding siblings ...) 2012-02-03 2:08 ` Xinyu Qi @ 2012-03-13 8:55 ` Xinyu Qi 4 siblings, 0 replies; 6+ messages in thread From: Xinyu Qi @ 2012-03-13 8:55 UTC (permalink / raw) To: Richard Earnshaw; +Cc: Ramana Radhakrishnan, gcc-patches PING At 2012-02-03 10:05:22,"Xinyu Qi" <xyqi@marvell.com> wrote: > PING > > http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01788.html > > At 2011-12-29 14:22:50,"Xinyu Qi" <xyqi@marvell.com> wrote: > > * config/arm/mmintrin.h: Use __IWMMXT__ to enable iWMMXt > > intrinsics. > > Use __IWMMXT2__ to enable iWMMXt2 intrinsics. > > Use C name-mangling for intrinsics. > > (__v8qi): Redefine. > > (_mm_cvtsi32_si64, _mm_andnot_si64, _mm_sad_pu8): Revise. > > (_mm_sad_pu16, _mm_align_si64, _mm_setwcx, _mm_getwcx): > > Likewise. > > (_m_from_int): Likewise. > > (_mm_sada_pu8, _mm_sada_pu16): New intrinsic. > > (_mm_alignr0_si64, _mm_alignr1_si64, _mm_alignr2_si64): Likewise. > > (_mm_alignr3_si64, _mm_tandcb, _mm_tandch, _mm_tandcw): Likewise. > > (_mm_textrcb, _mm_textrch, _mm_textrcw, _mm_torcb): Likewise. > > (_mm_torch, _mm_torcw, _mm_tbcst_pi8, _mm_tbcst_pi16): Likewise. > > (_mm_tbcst_pi32): Likewise. > > (_mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32): New iWMMXt2 > > intrinsic. > > (_mm_addsubhx_pi16, _mm_absdiff_pu8, _mm_absdiff_pu16): Likewise. > > (_mm_absdiff_pu32, _mm_addc_pu16, _mm_addc_pu32): Likewise. > > (_mm_avg4_pu8, _mm_avg4r_pu8, _mm_maddx_pi16, > > _mm_maddx_pu16): Likewise. > > (_mm_msub_pi16, _mm_msub_pu16, _mm_mulhi_pi32): Likewise. > > (_mm_mulhi_pu32, _mm_mulhir_pi16, _mm_mulhir_pi32): Likewise. > > (_mm_mulhir_pu16, _mm_mulhir_pu32, _mm_mullo_pi32): Likewise. > > (_mm_qmulm_pi16, _mm_qmulm_pi32, _mm_qmulmr_pi16): Likewise. > > (_mm_qmulmr_pi32, _mm_subaddhx_pi16, _mm_addbhusl_pu8): > > Likewise. > > (_mm_addbhusm_pu8, _mm_qmiabb_pi32, _mm_qmiabbn_pi32): > > Likewise. 
> > (_mm_qmiabt_pi32, _mm_qmiabtn_pi32, _mm_qmiatb_pi32): Likewise. > > (_mm_qmiatbn_pi32, _mm_qmiatt_pi32, _mm_qmiattn_pi32): Likewise. > > (_mm_wmiabb_si64, _mm_wmiabbn_si64, _mm_wmiabt_si64): Likewise. > > (_mm_wmiabtn_si64, _mm_wmiatb_si64, _mm_wmiatbn_si64): > > Likewise. > > (_mm_wmiatt_si64, _mm_wmiattn_si64, _mm_wmiawbb_si64): > > Likewise. > > (_mm_wmiawbbn_si64, _mm_wmiawbt_si64, _mm_wmiawbtn_si64): > > Likewise. > > (_mm_wmiawtb_si64, _mm_wmiawtbn_si64, _mm_wmiawtt_si64): > > Likewise. > > (_mm_wmiawttn_si64, _mm_merge_si64): Likewise. > > (_mm_torvscb, _mm_torvsch, _mm_torvscw): Likewise. > > (_m_to_int): New define. > > > > Thanks, > > Xinyu ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2012-03-13 8:55 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2011-07-29 4:04 PING: [PATCH, ARM, iWMMXt][2/5]: intrinsic head file change Xinyu Qi -- strict thread matches above, loose matches on Subject: below -- 2011-07-06 10:15 Xinyu Qi 2011-08-18 2:35 ` Ramana Radhakrishnan 2011-09-26 4:31 ` PING: " Xinyu Qi 2011-10-20 8:05 ` Xinyu Qi 2011-12-29 6:26 ` Xinyu Qi 2012-02-03 2:08 ` Xinyu Qi 2012-03-13 8:55 ` Xinyu Qi
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).