public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* censored naked SSE reciprocals, -mrecip
@ 2007-12-29 15:35 tbp
  0 siblings, 0 replies; 4+ messages in thread
From: tbp @ 2007-12-29 15:35 UTC (permalink / raw)
  To: GCC

[-- Attachment #1: Type: text/plain, Size: 1481 bytes --]

Merry xmas,

i lately had some use for -mrecip but it turned out to come with all
sorts of strings attached and apparently no opt-out. Briefly, barring
inline asm, i can't get gcc to emit those ops without a NR fixup.

# cat src/pr-recip.c
#include <xmmintrin.h>
typedef float v4sf_t __attribute__ ((__vector_size__ (16)));

__m128 foo(__m128 a) { return _mm_sqrt_ps(a); }
__m128 bar(__m128 a) { return _mm_rsqrt_ps(a); }
__m128 baz(__m128 a) { return _mm_rcp_ps(a); }

v4sf_t nope1(v4sf_t a) { return __builtin_ia32_sqrtps(a); }
v4sf_t nope2(v4sf_t a) { return __builtin_ia32_rsqrtps(a); }
v4sf_t allright(v4sf_t a) { return __builtin_ia32_rcpps(a); }

int main() { return 0; }
# /usr/local/gcc-4.3-20071221/bin/gcc -march=native -ffast-math -mrecip -O2 src/pr-recip.c
... and as can be witnessed in the attached asm dump, foo, bar, nope1
and nope2 get mangled (at least on x86-64 linux).

While i can somehow understand the logic behind the automatic
transformation of _mm_sqrt_ps - it can be argued that's what the user
has asked for - there's no obvious way to opt out. But then i really
don't understand why gcc feels the urge to tinker when i specifically
ask for a rsqrt.
To add insult to injury -mrecip, unlike fast-math, doesn't set any
macro so kludging around is a cat / mouse game.

Questions:
  a) is that really by design?
  b) what's the official way to dodge fixups when -mrecip is active?
  c) any chance for -mrecip to set __FAST_MATH_NONE_SHALL_PASS__ or something?

[-- Attachment #2: dump.asm --]
[-- Type: application/octet-stream, Size: 3705 bytes --]

0000000000400470 <allright>:
  400470:	0f 53 c0             	rcpps  %xmm0,%xmm0
  400473:	c3                   	retq   
  400474:	66 66 66 2e 0f 1f 84 	nopw   %cs:0x0(%rax,%rax,1)
  40047b:	00 00 00 00 00 

0000000000400480 <baz>:
  400480:	0f 53 c0             	rcpps  %xmm0,%xmm0
  400483:	c3                   	retq   
  400484:	66 66 66 2e 0f 1f 84 	nopw   %cs:0x0(%rax,%rax,1)
  40048b:	00 00 00 00 00 

0000000000400490 <nope2>:
  400490:	0f 28 d0             	movaps %xmm0,%xmm2
  400493:	0f 57 c9             	xorps  %xmm1,%xmm1
  400496:	0f 28 05 d3 01 00 00 	movaps 0x1d3(%rip),%xmm0        # 400670 <_IO_stdin_used+0x10>
  40049d:	0f 28 da             	movaps %xmm2,%xmm3
  4004a0:	0f c2 d9 04          	cmpneqps %xmm1,%xmm3
  4004a4:	0f 52 ca             	rsqrtps %xmm2,%xmm1
  4004a7:	0f 54 cb             	andps  %xmm3,%xmm1
  4004aa:	0f 59 d1             	mulps  %xmm1,%xmm2
  4004ad:	0f 59 d1             	mulps  %xmm1,%xmm2
  4004b0:	0f 59 0d c9 01 00 00 	mulps  0x1c9(%rip),%xmm1        # 400680 <_IO_stdin_used+0x20>
  4004b7:	0f 5c c2             	subps  %xmm2,%xmm0
  4004ba:	0f 59 c1             	mulps  %xmm1,%xmm0
  4004bd:	c3                   	retq   
  4004be:	66 90                	xchg   %ax,%ax

00000000004004c0 <bar>:
  4004c0:	0f 28 d0             	movaps %xmm0,%xmm2
  4004c3:	0f 57 c9             	xorps  %xmm1,%xmm1
  4004c6:	0f 28 05 a3 01 00 00 	movaps 0x1a3(%rip),%xmm0        # 400670 <_IO_stdin_used+0x10>
  4004cd:	0f 28 da             	movaps %xmm2,%xmm3
  4004d0:	0f c2 d9 04          	cmpneqps %xmm1,%xmm3
  4004d4:	0f 52 ca             	rsqrtps %xmm2,%xmm1
  4004d7:	0f 54 cb             	andps  %xmm3,%xmm1
  4004da:	0f 59 d1             	mulps  %xmm1,%xmm2
  4004dd:	0f 59 d1             	mulps  %xmm1,%xmm2
  4004e0:	0f 59 0d 99 01 00 00 	mulps  0x199(%rip),%xmm1        # 400680 <_IO_stdin_used+0x20>
  4004e7:	0f 5c c2             	subps  %xmm2,%xmm0
  4004ea:	0f 59 c1             	mulps  %xmm1,%xmm0
  4004ed:	c3                   	retq   
  4004ee:	66 90                	xchg   %ax,%ax

00000000004004f0 <nope1>:
  4004f0:	0f 28 d0             	movaps %xmm0,%xmm2
  4004f3:	0f 57 c9             	xorps  %xmm1,%xmm1
  4004f6:	0f 28 05 73 01 00 00 	movaps 0x173(%rip),%xmm0        # 400670 <_IO_stdin_used+0x10>
  4004fd:	0f 28 da             	movaps %xmm2,%xmm3
  400500:	0f c2 d9 04          	cmpneqps %xmm1,%xmm3
  400504:	0f 52 ca             	rsqrtps %xmm2,%xmm1
  400507:	0f 54 cb             	andps  %xmm3,%xmm1
  40050a:	0f 59 d1             	mulps  %xmm1,%xmm2
  40050d:	0f 59 ca             	mulps  %xmm2,%xmm1
  400510:	0f 59 15 69 01 00 00 	mulps  0x169(%rip),%xmm2        # 400680 <_IO_stdin_used+0x20>
  400517:	0f 5c c1             	subps  %xmm1,%xmm0
  40051a:	0f 59 c2             	mulps  %xmm2,%xmm0
  40051d:	c3                   	retq   
  40051e:	66 90                	xchg   %ax,%ax

0000000000400520 <foo>:
  400520:	0f 28 d0             	movaps %xmm0,%xmm2
  400523:	0f 57 c9             	xorps  %xmm1,%xmm1
  400526:	0f 28 05 43 01 00 00 	movaps 0x143(%rip),%xmm0        # 400670 <_IO_stdin_used+0x10>
  40052d:	0f 28 da             	movaps %xmm2,%xmm3
  400530:	0f c2 d9 04          	cmpneqps %xmm1,%xmm3
  400534:	0f 52 ca             	rsqrtps %xmm2,%xmm1
  400537:	0f 54 cb             	andps  %xmm3,%xmm1
  40053a:	0f 59 d1             	mulps  %xmm1,%xmm2
  40053d:	0f 59 ca             	mulps  %xmm2,%xmm1
  400540:	0f 59 15 39 01 00 00 	mulps  0x139(%rip),%xmm2        # 400680 <_IO_stdin_used+0x20>
  400547:	0f 5c c1             	subps  %xmm1,%xmm0
  40054a:	0f 59 c2             	mulps  %xmm2,%xmm0
  40054d:	c3                   	retq   
  40054e:	90                   	nop    
  40054f:	90                   	nop    

^ permalink raw reply	[flat|nested] 4+ messages in thread

* RE: censored naked SSE reciprocals, -mrecip
  2007-12-30  2:06 ` tbp
@ 2007-12-30  9:16   ` Dave Korn
  0 siblings, 0 replies; 4+ messages in thread
From: Dave Korn @ 2007-12-30  9:16 UTC (permalink / raw)
  To: 'tbp', 'Uros Bizjak'; +Cc: 'GCC', 'GCC Patches'

On 29 December 2007 23:04, tbp wrote:

> Now that's blazing fast after-sales service. 

> As an extremely satisfied customer, i want to nominate you for the
> 2007 man of the year short list.

  Hear hear!  Uros works very hard and contributes a lot.  Thank you, Uros!

    cheers,
      DaveK
-- 
Can't think of a witty .sigline today....

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: censored naked SSE reciprocals, -mrecip
  2007-12-29 16:11 Uros Bizjak
@ 2007-12-30  2:06 ` tbp
  2007-12-30  9:16   ` Dave Korn
  0 siblings, 1 reply; 4+ messages in thread
From: tbp @ 2007-12-30  2:06 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC, GCC Patches

On Dec 29, 2007 4:35 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> Attached patch fixes these problems by using correct shortcuts when
> generating intrinsic functions.
>
> Patch was bootstrapped and regression tested with {,-m32} on
> x86_64-pc-linux-gnu. Patch is committed to SVN.
>
> Thanks a lot for your report,
Now that's blazing fast after-sales service. And i get no less than
two undocumented but functional builtins (as opposed to, say
__builtin_ia32_movddup, which is documented but dysfunctional) for the
same price.
As an extremely satisfied customer, i want to nominate you for the
2007 man of the year short list.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: censored naked SSE reciprocals, -mrecip
@ 2007-12-29 16:11 Uros Bizjak
  2007-12-30  2:06 ` tbp
  0 siblings, 1 reply; 4+ messages in thread
From: Uros Bizjak @ 2007-12-29 16:11 UTC (permalink / raw)
  To: GCC; +Cc: GCC Patches, tbp

[-- Attachment #1: Type: text/plain, Size: 1399 bytes --]

Hello!

> i lately had some use for -mrecip but it turned out to come with all
> sorts of strings attached and apparently no opt-out. Briefly, barring
> inline asm, i can't get gcc to emit those ops without a NR fixup.
>   

<snip>

> Questions:
>   a) is that really by design?

No.

Attached patch fixes these problems by using correct shortcuts when 
generating intrinsic functions.

2007-12-29  Uros Bizjak  <ubizjak@gmail.com>

        * config/i386/sse.md ("*divv4sf3"): Rename to "sse_divv4sf3".
        ("*sse_rsqrtv4sf2"): Export.
        ("*sqrtv4sf2"): Rename to "sse_sqrtv4sf2".
        * config/i386/i386.c (enum ix86_builtins) [IX86_BUILTIN_RSQRTPS_NR,
        IX86_BUILTIN_SQRTPS_NR]: New constants.
        (struct builtin_description) [IX86_BUILTIN_DIVPS]: Use
        CODE_FOR_sse_divv4sf3.
        [IX86_BUILTIN_SQRTPS]: Use CODE_FOR_sse_sqrtv4sf2.
        [IX86_BUILTIN_SQRTPS_NR]: New.
        [IX86_BUILTIN_RSQRTPS_NR]: Ditto.
        (ix86_init_mmx_sse_builtins): Initialize __builtin_ia32_rsqrtps_nr
        and __builtin_ia32_sqrtps_nr.
        (ix86_builtin_vectorized_function): Convert BUILT_IN_SQRTF to
        IX86_BUILTIN_SQRTPS_NR.
        (ix86_builtin_reciprocal): Convert IX86_BUILTIN_SQRTPS_NR to
        IX86_BUILTIN_RSQRTPS_NR.

Patch was bootstrapped and regression tested with {,-m32} on 
x86_64-pc-linux-gnu. Patch is committed to SVN.

Thanks a lot for your report,
Uros.

[-- Attachment #2: r.diff.txt --]
[-- Type: text/plain, Size: 6407 bytes --]

Index: sse.md
===================================================================
--- sse.md	(revision 131218)
+++ sse.md	(working copy)
@@ -490,7 +490,7 @@
     }
 })
 
-(define_insn "*divv4sf3"
+(define_insn "sse_divv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
 	(div:V4SF (match_operand:V4SF 1 "register_operand" "0")
 		  (match_operand:V4SF 2 "nonimmediate_operand" "xm")))]
@@ -532,16 +532,7 @@
   [(set_attr "type" "sse")
    (set_attr "mode" "SF")])
 
-(define_insn "*sse_rsqrtv4sf2"
-  [(set (match_operand:V4SF 0 "register_operand" "=x")
-	(unspec:V4SF
-	  [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))]
-  "TARGET_SSE"
-  "rsqrtps\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sse")
-   (set_attr "mode" "V4SF")])
-
-(define_expand "sse_rsqrtv4sf2"
+(define_expand "rsqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "")
 	(unspec:V4SF
 	  [(match_operand:V4SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))]
@@ -556,6 +547,15 @@
     }
 })
 
+(define_insn "sse_rsqrtv4sf2"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(unspec:V4SF
+	  [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))]
+  "TARGET_SSE"
+  "rsqrtps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "mode" "V4SF")])
+
 (define_insn "sse_vmrsqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
 	(vec_merge:V4SF
@@ -568,14 +568,6 @@
   [(set_attr "type" "sse")
    (set_attr "mode" "SF")])
 
-(define_insn "*sqrtv4sf2"
-  [(set (match_operand:V4SF 0 "register_operand" "=x")
-	(sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "xm")))]
-  "TARGET_SSE"
-  "sqrtps\t{%1, %0|%0, %1}"
-  [(set_attr "type" "sse")
-   (set_attr "mode" "V4SF")])
-
 (define_expand "sqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=")
 	(sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "")))]
@@ -590,6 +582,14 @@
     }
 })
 
+(define_insn "sse_sqrtv4sf2"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE"
+  "sqrtps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "mode" "V4SF")])
+
 (define_insn "sse_vmsqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
 	(vec_merge:V4SF
Index: i386.c
===================================================================
--- i386.c	(revision 131218)
+++ i386.c	(working copy)
@@ -17093,9 +17093,11 @@ enum ix86_builtins
   IX86_BUILTIN_RCPPS,
   IX86_BUILTIN_RCPSS,
   IX86_BUILTIN_RSQRTPS,
+  IX86_BUILTIN_RSQRTPS_NR,
   IX86_BUILTIN_RSQRTSS,
   IX86_BUILTIN_RSQRTF,
   IX86_BUILTIN_SQRTPS,
+  IX86_BUILTIN_SQRTPS_NR,
   IX86_BUILTIN_SQRTSS,
 
   IX86_BUILTIN_UNPCKHPS,
@@ -17849,7 +17851,7 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE, CODE_FOR_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3,  "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3,  "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3,  "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, 0 },
@@ -18158,8 +18160,10 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, UNKNOWN, 0 },
 
-  { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS_NR, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, 0, IX86_BUILTIN_RCPPS, UNKNOWN, 0 },
 
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, UNKNOWN, 0 },
@@ -19279,12 +19283,14 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
+  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS_NR);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
   ftype = build_function_type_list (float_type_node,
 				    float_type_node,
 				    NULL_TREE);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
+  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS_NR);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
 
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS);
@@ -21301,7 +21307,7 @@ ix86_builtin_vectorized_function (unsign
     case BUILT_IN_SQRTF:
       if (out_mode == SFmode && out_n == 4
 	  && in_mode == SFmode && in_n == 4)
-	return ix86_builtins[IX86_BUILTIN_SQRTPS];
+	return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
       break;
 
     case BUILT_IN_LRINT:
@@ -21463,8 +21469,8 @@ ix86_builtin_reciprocal (unsigned int fn
     switch (fn)
       {
 	/* Vectorized version of sqrt to rsqrt conversion.  */
-      case IX86_BUILTIN_SQRTPS:
-	return ix86_builtins[IX86_BUILTIN_RSQRTPS];
+      case IX86_BUILTIN_SQRTPS_NR:
+	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
 
       default:
 	return NULL_TREE;

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2007-12-30  2:06 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-12-29 15:35 censored naked SSE reciprocals, -mrecip tbp
2007-12-29 16:11 Uros Bizjak
2007-12-30  2:06 ` tbp
2007-12-30  9:16   ` Dave Korn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).