Re: [PATCH, x86] merge movsd/movhpd pair in peephole

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Xinliang David Li <davidxl@google.com>
To: "Bin.Cheng" <amker.cheng@gmail.com>
Cc: Wei Mi <wmi@google.com>, GCC Patches <gcc-patches@gcc.gnu.org>,
		Uros Bizjak <ubizjak@gmail.com>, "H.J. Lu" <hjl.tools@gmail.com>
Subject: Re: [PATCH, x86] merge movsd/movhpd pair in peephole
Date: Mon, 21 Apr 2014 19:26:00 -0000	[thread overview]
Message-ID: <CAAkRFZL4ajQghy7LR58ob8LX4cpcsEeFM=N7HqcJiPEh9t+8Gw@mail.gmail.com> (raw)
In-Reply-To: <CAHFci28JaB_45BishBAXsoRBp_c=4Y1yVeCutYt6=y_ivApfyg@mail.gmail.com>

Bin, when will the patch for the generic pass be available for review?

David

On Wed, Apr 9, 2014 at 7:27 PM, Bin.Cheng <amker.cheng@gmail.com> wrote:
> On Thu, Apr 10, 2014 at 8:18 AM, Wei Mi <wmi@google.com> wrote:
>> Hi,
>>
>> For the testcase 1.c
>>
>> #include <emmintrin.h>
>>
>> double a[1000];
>>
>> __m128d foo1() {
>>   __m128d res;
>>   res = _mm_load_sd(&a[1]);
>>   res = _mm_loadh_pd(res, &a[2]);
>>   return res;
>> }
>>
>> llvm will merge movsd/movhpd to movupd while gcc will not. The merge
>> is beneficial on x86 machines starting from Nehalem.
>>
>> The patch is to add the merging in peephole.
>> bootstrap and regression pass. Is it ok for stage1?
>>
>> Thanks,
>> Wei.
>>
>> gcc/ChangeLog:
>>
>> 2014-04-09  Wei Mi  <wmi@google.com>
>>
>>         * config/i386/i386.c (get_memref_parts): New function.
>>         (adjacent_mem_locations): Ditto.
>>         * config/i386/i386-protos.h: Add decl for adjacent_mem_locations.
>>         * config/i386/sse.md: Add define_peephole rule.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2014-04-09  Wei Mi  <wmi@google.com>
>>
>>         * gcc.target/i386/sse2-unaligned-mov.c: New test.
>>
>> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
>> index 6e32978..3ae0d6d 100644
>> --- a/gcc/config/i386/i386-protos.h
>> +++ b/gcc/config/i386/i386-protos.h
>> @@ -312,6 +312,7 @@ extern enum attr_cpu ix86_schedule;
>>  #endif
>>
>>  extern const char * ix86_output_call_insn (rtx insn, rtx call_op);
>> +extern bool adjacent_mem_locations (rtx mem1, rtx mem2);
>>
>>  #ifdef RTX_CODE
>>  /* Target data for multipass lookahead scheduling.
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index 3eefe4a..a330e84 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -46737,6 +46737,70 @@ ix86_atomic_assign_expand_fenv (tree *hold,
>> tree *clear, tree *update)
>>                     atomic_feraiseexcept_call);
>>  }
>>
>> +/* Try to determine BASE/OFFSET/SIZE parts of the given MEM.
>> +   Return true if successful, false if all the values couldn't
>> +   be determined.
>> +
>> +   This function only looks for REG/SYMBOL or REG/SYMBOL+CONST
>> +   address forms. */
>> +
>> +static bool
>> +get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
>> +                 HOST_WIDE_INT *size)
>> +{
>> +  rtx addr_rtx;
>> +  if MEM_SIZE_KNOWN_P (mem)
>> +    *size = MEM_SIZE (mem);
>> +  else
>> +    return false;
>> +
>> +  if (GET_CODE (XEXP (mem, 0)) == CONST)
>> +    addr_rtx = XEXP (XEXP (mem, 0), 0);
>> +  else
>> +    addr_rtx = (XEXP (mem, 0));
>> +
>> +  if (GET_CODE (addr_rtx) == REG
>> +      || GET_CODE (addr_rtx) == SYMBOL_REF)
>> +    {
>> +      *base = addr_rtx;
>> +      *offset = 0;
>> +    }
>> +  else if (GET_CODE (addr_rtx) == PLUS
>> +          && CONST_INT_P (XEXP (addr_rtx, 1)))
>> +    {
>> +      *base = XEXP (addr_rtx, 0);
>> +      *offset = INTVAL (XEXP (addr_rtx, 1));
>> +    }
>> +  else
>> +    return false;
>> +
>> +  return true;
>> +}
>> +
>> +/* If MEM1 is adjacent to MEM2 and MEM1 has lower address,
>> +   return true.  */
>> +
>> +extern bool
>> +adjacent_mem_locations (rtx mem1, rtx mem2)
>> +{
>> +  rtx base1, base2;
>> +  HOST_WIDE_INT off1, size1, off2, size2;
>> +
>> +  if (get_memref_parts (mem1, &base1, &off1, &size1)
>> +      && get_memref_parts (mem2, &base2, &off2, &size2))
>> +    {
>> +      if (GET_CODE (base1) == SYMBOL_REF
>> +         && GET_CODE (base2) == SYMBOL_REF
>> +         && SYMBOL_REF_DECL (base1) == SYMBOL_REF_DECL (base2))
>> +        return (off1 + size1 == off2);
>> +      else if (REG_P (base1)
>> +              && REG_P (base2)
>> +              && REGNO (base1) == REGNO (base2))
>> +        return (off1 + size1 == off2);
>> +    }
>> +  return false;
>> +}
>> +
>>  /* Initialize the GCC target structure.  */
>>  #undef TARGET_RETURN_IN_MEMORY
>>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> index 72a4d6d..4bf8461 100644
>> --- a/gcc/config/i386/sse.md
>> +++ b/gcc/config/i386/sse.md
>> @@ -15606,3 +15606,37 @@
>>    [(set_attr "type" "sselog1")
>>     (set_attr "length_immediate" "1")
>>     (set_attr "mode" "TI")])
>> +
>> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
>> +;; is true.
>> +(define_peephole2
>> +  [(set (match_operand:DF 0 "register_operand")
>> +       (match_operand:DF 1 "memory_operand"))
>> +   (set (match_operand:V2DF 2 "register_operand")
>> +       (vec_concat:V2DF (match_dup 0)
>> +        (match_operand:DF 3 "memory_operand")))]
>> +  "TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
>> +   && REGNO (operands[0]) == REGNO (operands[2])
>> +   && adjacent_mem_locations (operands[1], operands[3])"
>> +  [(set (match_dup 2)
>> +       (unspec:V2DF [(match_dup 4)] UNSPEC_LOADU))]
>> +{
>> +  operands[4] = gen_rtx_MEM (V2DFmode, XEXP(operands[1], 0));
>> +})
>> +
>> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_STORE_OPTIMAL
>> +;; is true.
>> +(define_peephole2
>> +  [(set (match_operand:DF 0 "memory_operand")
>> +        (vec_select:DF (match_operand:V2DF 1 "register_operand")
>> +                      (parallel [(const_int 0)])))
>> +   (set (match_operand:DF 2 "memory_operand")
>> +        (vec_select:DF (match_dup 1)
>> +                       (parallel [(const_int 1)])))]
>> +  "TARGET_SSE_UNALIGNED_STORE_OPTIMAL
>> +   && adjacent_mem_locations (operands[0], operands[2])"
>> +  [(set (match_dup 3)
>> +        (unspec:V2DF [(match_dup 1)] UNSPEC_STOREU))]
>> +{
>> +  operands[3] = gen_rtx_MEM (V2DFmode, XEXP(operands[0], 0));
>> +})
>> diff --git a/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> new file mode 100644
>> index 0000000..28470ce
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> @@ -0,0 +1,20 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-mtune=corei7 -O2" } */
>> +
>> +#include <emmintrin.h>
>> +
>> +double a[1000];
>> +
>> +__m128d foo1() {
>> +  __m128d res;
>> +  res = _mm_load_sd(&a[1]);
>> +  res = _mm_loadh_pd(res, &a[2]);
>> +  return res;
>> +}
>> +
>> +void foo2(__m128d res) {
>> +  _mm_store_sd(&a[1], res);
>> +  _mm_storeh_pd(&a[2], res);
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "movup" 2 } } */
>
> Hi Wei,
> Just FYI, though I am not sure whether it can help.  I am working on
> load store pairing on ARM too.  On ARM (maybe other machines like
> mips), the problem is more general because we can pair memory accesses
> with respect to both core register and fpu register.  The current
> strategy taken by GCC is to use peephole2 but it's too weak, for
> example, it can't handle pairs which are intervened by other
> instructions.  Right now I am working on a new pass in GCC to do that
> work, and have already worked out a draft patch.  The instrumental
> data looks promising with many opportunities captured besides the
> original peephole2 pass.  Furthermore, the result could be even better
> if register renaming is enabled.  Hopefully I will try to bring back
> register renaming for this purpose later.
>
> Thanks,
> bin

next prev parent reply	other threads:[~2014-04-21 18:59 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-04-10  0:18 Wei Mi
2014-04-10  2:27 ` Bin.Cheng
2014-04-10  4:08   ` Wei Mi
2014-04-21 19:26   ` Xinliang David Li [this message]
2014-04-22  9:51     ` Bin.Cheng
2014-04-21 18:39 ` Wei Mi
2014-04-21 18:43   ` Uros Bizjak

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAAkRFZL4ajQghy7LR58ob8LX4cpcsEeFM=N7HqcJiPEh9t+8Gw@mail.gmail.com' \
    --to=davidxl@google.com \
    --cc=amker.cheng@gmail.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=hjl.tools@gmail.com \
    --cc=ubizjak@gmail.com \
    --cc=wmi@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).