public inbox for gcc-help@gcc.gnu.org
 help / color / mirror / Atom feed
* problems with gcc inline assembly using xmm registers
@ 2004-12-03 15:29 David Palao
  2004-12-03 16:37 ` Nathan Sidwell
       [not found] ` <Pine.LNX.4.61.0412031705390.2211@tripper.tr69.homelinux.net>
  0 siblings, 2 replies; 5+ messages in thread
From: David Palao @ 2004-12-03 15:29 UTC (permalink / raw)
  To: gcc-help

Hi!

I'm a newbie at assembly, but I'm working in computational physics and we 
need to design a very fast function doing an operation which is repeated 
billions of times in our calculations.
In order to gain performance we decided to use some of the SSE features 
(properly used, xmm registers should provide a very nice increase in 
performance).
However, I'm having serious trouble using the xmm[0...7] registers with gcc 
inline asm.
One example of the code in which I got these troubles is at the end of the 
message.

The error I get is:

../Libraries/fermiqcd_fermi_actions_sse2.h:818: can't find a register in class
   `GENERAL_REGS' while reloading `asm'


what I understand is that the function I'm trying to compile uses too many 
`GENERAL_REGS' registers but...
...do xmm registers belong to this group? aren't they a special set of 
registers in the SSE area?
...Could it be that the inlines use too many intermediate registers from the 
general registers group? In that case, how could I avoid such behaviour?

The problem occurs whenever I try to use the xmm registers intensively; if I 
don't use SSE extensions everything goes ok, but I NEED xmm registers!

Thanks in advance!!!
Best regards


David


PS gcc -v:
gcc version 3.2.3 20030502 (Red Hat Linux 3.2.3-34)




CODE:

__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \
      "movsd %1, %%xmm6 \n\t" \
      "movsd %2, %%xmm4 \n\t" \
      "movsd %3, %%xmm7 \n\t" \
      "movsd %4, %%xmm5 \n\t" \
      "unpcklpd %%xmm3, %%xmm3 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm4, %%xmm4 \n\t" \
      "mulpd %%xmm0, %%xmm3 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "unpcklpd %%xmm5, %%xmm5 \n\t" \
      "mulpd %%xmm0, %%xmm4 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "mulpd %%xmm0, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %5, %%xmm6 \n\t" \
      "movsd %6, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm3 \n\t" \
      "movsd %7, %%xmm6 \n\t" \
      "movsd %8, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm4 \n\t" \
      "addpd %%xmm7, %%xmm5" \
      : \
      : \
      "m" ((u).c11.real()), \
      "m" ((u).c12.real()), \
      "m" ((u).c21.real()), \
      "m" ((u).c23.real()), \
      "m" ((u).c31.real()), \
      "m" ((u).c32.real()), \
      "m" ((u).c13.real()), \
      "m" ((u).c22.real()), \
      "m" ((u).c33.real())); \

__asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" \
      "movsd %1, %%xmm7 \n\t" \
      "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \
      "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \
      "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "xorpd %9, %%xmm0 \n\t" \
      "xorpd %9, %%xmm1 \n\t" \
      "xorpd %9, %%xmm2 \n\t" \
      "mulpd %%xmm0, %%xmm6 \n\t" \
      "mulpd %%xmm1, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %2, %%xmm6 \n\t" \
      "movsd %3, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm2, %%xmm6 \n\t" \
      "mulpd %%xmm0, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %4, %%xmm6 \n\t" \
      "movsd %5, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm0, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "addpd %%xmm7, %%xmm5 \n\t" \
      "movsd %6, %%xmm0 \n\t" \
      "movsd %7, %%xmm6 \n\t" \
      "movsd %8, %%xmm7 \n\t" \
      "unpcklpd %%xmm0, %%xmm0 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm2, %%xmm0 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm0, %%xmm3 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4" \
      : \
      : \
      "m" ((u).c11.imag()), \
      "m" ((u).c22.imag()), \
      "m" ((u).c33.imag()), \
      "m" ((u).c21.imag()), \
      "m" ((u).c12.imag()), \
      "m" ((u).c31.imag()), \
      "m" ((u).c13.imag()), \
      "m" ((u).c32.imag()), \
      "m" ((u).c23.imag()), \
      "m" (_sse_double_sgn));

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: problems with gcc inline assembly using xmm registers
  2004-12-03 15:29 problems with gcc inline assembly using xmm registers David Palao
@ 2004-12-03 16:37 ` Nathan Sidwell
  2004-12-03 17:15   ` David Palao
       [not found] ` <Pine.LNX.4.61.0412031705390.2211@tripper.tr69.homelinux.net>
  1 sibling, 1 reply; 5+ messages in thread
From: Nathan Sidwell @ 2004-12-03 16:37 UTC (permalink / raw)
  To: David Palao; +Cc: gcc-help

David Palao wrote:

> __asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \
>       "movsd %1, %%xmm6 \n\t" \
>       "movsd %2, %%xmm4 \n\t" \
>       "movsd %3, %%xmm7 \n\t" \
>       "movsd %4, %%xmm5 \n\t" \
>       "unpcklpd %%xmm3, %%xmm3 \n\t" \
>       "unpcklpd %%xmm6, %%xmm6 \n\t" \
>       "unpcklpd %%xmm4, %%xmm4 \n\t" \
>       "mulpd %%xmm0, %%xmm3 \n\t" \
....
>       "addpd %%xmm6, %%xmm5 \n\t" \
>       "addpd %%xmm7, %%xmm3 \n\t" \
>       "movsd %7, %%xmm6 \n\t" \
>       "movsd %8, %%xmm7 \n\t" \
>       "unpcklpd %%xmm6, %%xmm6 \n\t" \
>       "unpcklpd %%xmm7, %%xmm7 \n\t" \
>       "mulpd %%xmm1, %%xmm6 \n\t" \
>       "mulpd %%xmm2, %%xmm7 \n\t" \
>       "addpd %%xmm6, %%xmm4 \n\t" \
>       "addpd %%xmm7, %%xmm5" \

don't write it this way, use the mmx builtins directly and then the
compiler can handle all the register allocation for you.  You'll
have to be careful to arrange for no more than 8 mmx things
to be live at one time though.  That's not too hard to achieve
if you're careful. I had success using this technique to do some
2D FFTs, it was way simpler than writing assembly directly.

nathan

-- 
Nathan Sidwell    ::   http://www.codesourcery.com   ::     CodeSourcery LLC
nathan@codesourcery.com    ::     http://www.planetfall.pwp.blueyonder.co.uk

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: problems with gcc inline assembly using xmm registers
  2004-12-03 16:37 ` Nathan Sidwell
@ 2004-12-03 17:15   ` David Palao
  2004-12-03 17:23     ` Nathan Sidwell
  0 siblings, 1 reply; 5+ messages in thread
From: David Palao @ 2004-12-03 17:15 UTC (permalink / raw)
  To: Nathan Sidwell; +Cc: gcc-help

El Viernes, 3 de Diciembre de 2004 17:37, escribió:
> David Palao wrote:
> > __asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \
> >       "movsd %1, %%xmm6 \n\t" \
> >       "movsd %2, %%xmm4 \n\t" \
> >       "movsd %3, %%xmm7 \n\t" \
> >       "movsd %4, %%xmm5 \n\t" \
> >       "unpcklpd %%xmm3, %%xmm3 \n\t" \
> >       "unpcklpd %%xmm6, %%xmm6 \n\t" \
> >       "unpcklpd %%xmm4, %%xmm4 \n\t" \
> >       "mulpd %%xmm0, %%xmm3 \n\t" \
>
> ....
>
> >       "addpd %%xmm6, %%xmm5 \n\t" \
> >       "addpd %%xmm7, %%xmm3 \n\t" \
> >       "movsd %7, %%xmm6 \n\t" \
> >       "movsd %8, %%xmm7 \n\t" \
> >       "unpcklpd %%xmm6, %%xmm6 \n\t" \
> >       "unpcklpd %%xmm7, %%xmm7 \n\t" \
> >       "mulpd %%xmm1, %%xmm6 \n\t" \
> >       "mulpd %%xmm2, %%xmm7 \n\t" \
> >       "addpd %%xmm6, %%xmm4 \n\t" \
> >       "addpd %%xmm7, %%xmm5" \
>
> don't write it this way, use the mmx builtins directly and then the
> compiler can handle all the register allocation for you.  You'll
> have to be careful to arrange for no more than 8 mmx things
> to be live at one time though.  That's not too hard to achieve
> if you're careful. I had success using this technique to do some
> 2D FFTs, it was way simpler than writing assembly directly.
>
> nathan

First, thanks for the reply.

What do you mean by "use the mmx builtins directly"?
(remember I'm learning this stuff right now...)
If I understand correctly, it is possible to say to the compiler that it has 
to use ONLY xmm registers (you mean xmm, right?) in doing certain part(s) of 
the code (and only this part(s) ) and it will be done in the most efficient 
way, is it true???
How can I do it?

Regards

David

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: problems with gcc inline assembly using xmm registers
  2004-12-03 17:15   ` David Palao
@ 2004-12-03 17:23     ` Nathan Sidwell
  0 siblings, 0 replies; 5+ messages in thread
From: Nathan Sidwell @ 2004-12-03 17:23 UTC (permalink / raw)
  To: David Palao; +Cc: gcc-help

David Palao wrote:
> First, thanks for the reply.
> 
> What do you mean with "use the mmx builtins directly"?
> (remember I'm learning this stuff right now...)
> If I understand correctly, it is possible to say to the compiler that it has 
> to use ONLY xmm registers (you mean xmm, right?) in doing certain part(s) of 
> the code (and only this part(s) ) and it will be done in the most efficient 
> way, is it true???

see xmmintrin.h, I'm not sure what other documentation is
available, or for that matter what version of GCC it was added to.
it allows you to write stuff like,

/* great big comment snipped */
{
               /* all 8 get live here. */
               __m128 fd = _mm_loadl_pi (invert, (__m64 *)forward);
               __m128 rd = _mm_loadl_pi (invert, (__m64 *)reverse);
               __m128 fd2 = _mm_shuffle_ps (fd, fd, _MM_SHUFFLE (0,1,1,0));
               __m128 rd2 = _mm_shuffle_ps (rd, rd, _MM_SHUFFLE (0,1,1,0));
               __m128 h = _mm_add_ps (fd2, _mm_mul_ps (rd2, invert));
               __m128 h1 = _mm_shuffle_ps (h, h, _MM_SHUFFLE (1,0,1,0));
               __m128 h2r = _mm_shuffle_ps (h, h, _MM_SHUFFLE (3,2,3,2));
               __m128 h2i = _mm_shuffle_ps (h, h, _MM_SHUFFLE (2,3,2,3));
               __m128 r = _mm_add_ps (_mm_mul_ps (h2r, wr),
                                      _mm_mul_ps (_mm_mul_ps (h2i, wi),
                                                  invert));

               r = _mm_add_ps (r, _mm_mul_ps (h1, half1));

               _mm_storel_pi ((__m64 *)forward, r);
               _mm_storeh_pi ((__m64 *)reverse,r);


nathan

-- 
Nathan Sidwell    ::   http://www.codesourcery.com   ::     CodeSourcery LLC
nathan@codesourcery.com    ::     http://www.planetfall.pwp.blueyonder.co.uk

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: problems with gcc inline assembly using xmm registers
       [not found] ` <Pine.LNX.4.61.0412031705390.2211@tripper.tr69.homelinux.net>
@ 2004-12-03 17:42   ` David Palao
  0 siblings, 0 replies; 5+ messages in thread
From: David Palao @ 2004-12-03 17:42 UTC (permalink / raw)
  To: Thorsten Reinecke; +Cc: gcc-help

Thank you for the answer!

>
> I had some trouble using mmx and xmm registers, too. But my code works
> now. See the attached code snippet. You can get an idea of how to use the
> input list, output list and clobber list and also how to share xmm
> registers between different asm inline blocks.
>

I will read it, but it looks hardcore to me (as I'm new to assembly)


> You use only memory operands, so this shouldn't be a problem. But you
> haven't declared any output. You're clobbering xmm registers, but you
> don't tell the compiler that you do so. Maybe that's the problem.

What's the problem if I don't need output operands?

Concerning the clobbering part: well, I have tried clobbering xmm registers as 
well (I hope I did it right). For instance:

__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \
      "movsd %1, %%xmm6 \n\t" \
      "movsd %2, %%xmm4 \n\t" \
      "movsd %3, %%xmm7 \n\t" \
      "movsd %4, %%xmm5 \n\t" \
      "unpcklpd %%xmm3, %%xmm3 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm4, %%xmm4 \n\t" \
      "mulpd %%xmm0, %%xmm3 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "unpcklpd %%xmm5, %%xmm5 \n\t" \
      "mulpd %%xmm0, %%xmm4 \n\t" \
      "addpd %%xmm6, %%xmm3 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "mulpd %%xmm0, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm4 \n\t" \
      "movsd %5, %%xmm6 \n\t" \
      "movsd %6, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm5 \n\t" \
      "addpd %%xmm7, %%xmm3 \n\t" \
      "movsd %7, %%xmm6 \n\t" \
      "movsd %8, %%xmm7 \n\t" \
      "unpcklpd %%xmm6, %%xmm6 \n\t" \
      "unpcklpd %%xmm7, %%xmm7 \n\t" \
      "mulpd %%xmm1, %%xmm6 \n\t" \
      "mulpd %%xmm2, %%xmm7 \n\t" \
      "addpd %%xmm6, %%xmm4 \n\t" \
      "addpd %%xmm7, %%xmm5" \
      : \
      : \
      "m" ((u).c11.real()), \
      "m" ((u).c12.real()), \
      "m" ((u).c21.real()), \
      "m" ((u).c23.real()), \
      "m" ((u).c31.real()), \
      "m" ((u).c32.real()), \
      "m" ((u).c13.real()), \
      "m" ((u).c22.real()), \
      "m" ((u).c33.real())  \
: \
"%xmm0", \
"%xmm1", \
"%xmm2", \
"%xmm3", \
"%xmm4", \
"%xmm5", \
"%xmm6", \
"%xmm7" );

BUT it doesn't work either way (with/without clobbering list).
Any idea???

Regards

David

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2004-12-03 17:42 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-12-03 15:29 problems with gcc inline assembly using xmm registers David Palao
2004-12-03 16:37 ` Nathan Sidwell
2004-12-03 17:15   ` David Palao
2004-12-03 17:23     ` Nathan Sidwell
     [not found] ` <Pine.LNX.4.61.0412031705390.2211@tripper.tr69.homelinux.net>
2004-12-03 17:42   ` David Palao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).