public inbox for gcc-patches@gcc.gnu.org
* [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
@ 2011-05-06 13:25 Julian Brown
  2011-09-14 15:24 ` Julian Brown
  2011-09-28 14:23 ` Ramana Radhakrishnan
  0 siblings, 2 replies; 6+ messages in thread
From: Julian Brown @ 2011-05-06 13:25 UTC (permalink / raw)
  To: gcc-patches, paul, Richard Earnshaw, Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 3070 bytes --]

Hi,

This is the second of two patches to add unaligned-access support to
the ARM backend. It builds on the first patch to provide support for
unaligned accesses when expanding block moves (i.e. for builtin memcpy
operations). It makes some effort to use load/store multiple
instructions where appropriate (when accessing sufficiently-aligned
source or destination addresses), and also makes some effort to
generate fast code (for -O1/2/3) or small code (for -Os), though some
of the heuristics may need tweaking still.

Examples:

#include <string.h>

void foo (char *dest, char *src)
{
  memcpy (dest, src, AMOUNT);
}

char known[64];

void dst_aligned (char *src)
{
  memcpy (known, src, AMOUNT);
}

void src_aligned (char *dst)
{
  memcpy (dst, known, AMOUNT);
}

For "-mcpu=cortex-m4 -mthumb -O2 -DAMOUNT=15" we get:

foo:
        ldr     r2, [r1, #4]    @ unaligned
        ldr     r3, [r1, #8]    @ unaligned
        push    {r4}
        ldr     r4, [r1, #0]    @ unaligned
        str     r2, [r0, #4]    @ unaligned
        str     r4, [r0, #0]    @ unaligned
        str     r3, [r0, #8]    @ unaligned
        ldrh    r2, [r1, #12]   @ unaligned
        ldrb    r3, [r1, #14]   @ zero_extendqisi2
        strh    r2, [r0, #12]   @ unaligned
        strb    r3, [r0, #14]
        pop     {r4}
        bx      lr

dst_aligned:
        push    {r4}
        mov     r4, r0
        movw    r3, #:lower16:known
        ldr     r1, [r4, #4]    @ unaligned
        ldr     r2, [r4, #8]    @ unaligned
        ldr     r0, [r0, #0]    @ unaligned
        movt    r3, #:upper16:known
        stmia   r3!, {r0, r1, r2}
        ldrh    r1, [r4, #12]   @ unaligned
        ldrb    r2, [r4, #14]   @ zero_extendqisi2
        strh    r1, [r3, #0]    @ unaligned
        strb    r2, [r3, #2]
        pop     {r4}
        bx      lr

src_aligned:
        push    {r4}
        movw    r3, #:lower16:known
        movt    r3, #:upper16:known
        mov     r4, r0
        ldmia   r3!, {r0, r1, r2}
        str     r0, [r4, #0]    @ unaligned
        str     r1, [r4, #4]    @ unaligned
        str     r2, [r4, #8]    @ unaligned
        ldrh    r2, [r3, #0]    @ unaligned
        ldrb    r3, [r3, #2]    @ zero_extendqisi2
        strh    r2, [r4, #12]   @ unaligned
        strb    r3, [r4, #14]
        pop     {r4}
        bx      lr

Whereas for "-mcpu=cortex-m4 -mthumb -Os -DAMOUNT=15" we get, e.g.:

foo:
        add     r3, r1, #12
.L2:
        ldr     r2, [r1], #4    @ unaligned
        cmp     r1, r3
        str     r2, [r0], #4    @ unaligned
        bne     .L2
        ldrh    r3, [r1, #0]    @ unaligned
        strh    r3, [r0, #0]    @ unaligned
        ldrb    r3, [r1, #2]    @ zero_extendqisi2
        strb    r3, [r0, #2]
        bx      lr
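
For reference, here is a minimal C sketch (not part of the patch; the
function name and the use of memcpy for the individual accesses are
purely illustrative) of the straight-line strategy the expansion
implements, assuming the target permits unaligned word and halfword
accesses.  The real code in arm_block_move_unaligned_straight emits RTL
directly, may interleave several loads before the corresponding stores
to hide load latency, and substitutes ldm/stm when the source or
destination is known to be word-aligned; arm_movmemqi_unaligned
additionally switches to a loop (as in the -Os output above) once the
copy exceeds a size threshold (more than 12 bytes at -Os, more than 32
bytes otherwise).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
copy_unaligned_straight (unsigned char *dst, const unsigned char *src,
                         size_t length)
{
  size_t remaining = length;

  /* Whole words: one unaligned load/store pair per word.  */
  while (remaining >= 4)
    {
      uint32_t w;
      memcpy (&w, src, 4);      /* stands in for an unaligned LDR */
      memcpy (dst, &w, 4);      /* stands in for an unaligned STR */
      src += 4;
      dst += 4;
      remaining -= 4;
    }

  /* At most one halfword ...  */
  if (remaining >= 2)
    {
      uint16_t h;
      memcpy (&h, src, 2);      /* LDRH */
      memcpy (dst, &h, 2);      /* STRH */
      src += 2;
      dst += 2;
      remaining -= 2;
    }

  /* ... and at most one trailing byte.  */
  if (remaining)
    *dst = *src;                /* LDRB/STRB */
}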

Tested (alongside the first patch) with cross to ARM Linux. OK to apply?

Thanks,

Julian

ChangeLog

    gcc/
    * config/arm/arm.c (arm_block_move_unaligned_straight)
    (arm_adjust_block_mem, arm_block_move_unaligned_loop)
    (arm_movmemqi_unaligned): New.
    (arm_gen_movmemqi): Support unaligned block copies.

[-- Attachment #2: arm-unaligned-memcpy-fsf-1.diff --]
[-- Type: text/x-patch, Size: 12025 bytes --]

commit 16973f69fce37a2b347ea7daffd6f593aba843d5
Author: Julian Brown <julian@henry7.codesourcery.com>
Date:   Wed May 4 11:26:01 2011 -0700

    Optimize block moves when unaligned accesses are permitted.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index a18aea6..b6df0d3 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -10362,6 +10362,335 @@ gen_const_stm_seq (rtx *operands, int nops)
   return true;
 }
 
+/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
+   unaligned copies on processors which support unaligned semantics for those
+   instructions.  INTERLEAVE_FACTOR can be used to attempt to hide load latency
+   (using more registers) by doing e.g. load/load/store/store for a factor of 2.
+   An interleave factor of 1 (the minimum) will perform no interleaving. 
+   Load/store multiple are used for aligned addresses where possible.  */
+
+static void
+arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
+				   HOST_WIDE_INT length,
+				   unsigned int interleave_factor)
+{
+  rtx *regs = XALLOCAVEC (rtx, interleave_factor);
+  int *regnos = XALLOCAVEC (int, interleave_factor);
+  HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
+  HOST_WIDE_INT i, j;
+  HOST_WIDE_INT remaining = length, words;
+  rtx halfword_tmp = NULL, byte_tmp = NULL;
+  rtx dst, src;
+  bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
+  bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
+  HOST_WIDE_INT srcoffset, dstoffset;
+  HOST_WIDE_INT src_autoinc, dst_autoinc;
+  rtx mem, addr;
+  
+  gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
+  
+  /* Use hard registers if we have aligned source or destination so we can use
+     load/store multiple with contiguous registers.  */
+  if (dst_aligned || src_aligned)
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_rtx_REG (SImode, i);
+  else
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_reg_rtx (SImode);
+
+  dst = copy_addr_to_reg (XEXP (dstbase, 0));
+  src = copy_addr_to_reg (XEXP (srcbase, 0));
+
+  srcoffset = dstoffset = 0;
+  
+  /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
+     For copying the last bytes we want to subtract this offset again.  */
+  src_autoinc = dst_autoinc = 0;
+
+  for (i = 0; i < interleave_factor; i++)
+    regnos[i] = i;
+
+  /* Copy BLOCK_SIZE_BYTES chunks.  */
+
+  for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
+    {
+      /* Load words.  */
+      if (src_aligned && interleave_factor > 1)
+        {
+	  emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
+					    TRUE, srcbase, &srcoffset));
+	  src_autoinc += UNITS_PER_WORD * interleave_factor;
+	}
+      else
+        {
+	  for (j = 0; j < interleave_factor; j++)
+	    {
+	      addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
+					 - src_autoinc);
+	      mem = adjust_automodify_address (srcbase, SImode, addr,
+					       srcoffset + j * UNITS_PER_WORD);
+	      emit_insn (gen_unaligned_loadsi (regs[j], mem));
+	    }
+	  srcoffset += block_size_bytes;
+	}
+
+      /* Store words.  */
+      if (dst_aligned && interleave_factor > 1)
+        {
+          emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
+					     TRUE, dstbase, &dstoffset));
+	  dst_autoinc += UNITS_PER_WORD * interleave_factor;
+	}
+      else
+        {
+	  for (j = 0; j < interleave_factor; j++)
+	    {
+	      addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
+					 - dst_autoinc);
+	      mem = adjust_automodify_address (dstbase, SImode, addr,
+					       dstoffset + j * UNITS_PER_WORD);
+	      emit_insn (gen_unaligned_storesi (mem, regs[j]));
+	    }
+	  dstoffset += block_size_bytes;
+	}
+
+      remaining -= block_size_bytes;
+    }
+  
+  /* Copy any whole words left (note these aren't interleaved with any
+     subsequent halfword/byte load/stores in the interests of simplicity).  */
+  
+  words = remaining / UNITS_PER_WORD;
+
+  gcc_assert (words < interleave_factor);
+  
+  if (src_aligned && words > 1)
+    {
+      emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
+					&srcoffset));
+      src_autoinc += UNITS_PER_WORD * words;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+        {
+	  addr = plus_constant (src,
+				srcoffset + j * UNITS_PER_WORD - src_autoinc);
+	  mem = adjust_automodify_address (srcbase, SImode, addr,
+					   srcoffset + j * UNITS_PER_WORD);
+	  emit_insn (gen_unaligned_loadsi (regs[j], mem));
+	}
+      srcoffset += words * UNITS_PER_WORD;
+    }
+
+  if (dst_aligned && words > 1)
+    {
+      emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
+					 &dstoffset));
+      dst_autoinc += words * UNITS_PER_WORD;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+        {
+	  addr = plus_constant (dst,
+				dstoffset + j * UNITS_PER_WORD - dst_autoinc);
+	  mem = adjust_automodify_address (dstbase, SImode, addr,
+					   dstoffset + j * UNITS_PER_WORD);
+	  emit_insn (gen_unaligned_storesi (mem, regs[j]));
+	}
+      dstoffset += words * UNITS_PER_WORD;
+    }
+
+  remaining -= words * UNITS_PER_WORD;
+  
+  gcc_assert (remaining < 4);
+  
+  /* Copy a halfword if necessary.  */
+  
+  if (remaining >= 2)
+    {
+      halfword_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
+      emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
+
+      /* Either write out immediately, or delay until we've loaded the last
+	 byte, depending on interleave factor.  */
+      if (interleave_factor == 1)
+        {
+	  addr = plus_constant (dst, dstoffset - dst_autoinc);
+	  mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+	  emit_insn (gen_unaligned_storehi (mem,
+		       gen_lowpart (HImode, halfword_tmp)));
+	  halfword_tmp = NULL;
+	  dstoffset += 2;
+	}
+
+      remaining -= 2;
+      srcoffset += 2;
+    }
+  
+  gcc_assert (remaining < 2);
+  
+  /* Copy last byte.  */
+  
+  if ((remaining & 1) != 0)
+    {
+      byte_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
+      emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
+
+      if (interleave_factor == 1)
+        {
+	  addr = plus_constant (dst, dstoffset - dst_autoinc);
+	  mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+	  emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+	  byte_tmp = NULL;
+	  dstoffset++;
+	}
+
+      remaining--;
+      srcoffset++;
+    }
+  
+  /* Store last halfword if we haven't done so already.  */
+  
+  if (halfword_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+      emit_insn (gen_unaligned_storehi (mem,
+		   gen_lowpart (HImode, halfword_tmp)));
+      dstoffset += 2;
+    }
+
+  /* Likewise for last byte.  */
+
+  if (byte_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+      emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+      dstoffset++;
+    }
+  
+  gcc_assert (remaining == 0 && srcoffset == dstoffset);
+}
+
+/* From mips_adjust_block_mem:
+
+   Helper function for doing a loop-based block operation on memory
+   reference MEM.  Each iteration of the loop will operate on LENGTH
+   bytes of MEM.
+
+   Create a new base register for use within the loop and point it to
+   the start of MEM.  Create a new memory reference that uses this
+   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */
+
+static void
+arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
+		      rtx *loop_mem)
+{
+  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
+  
+  /* Although the new mem does not refer to a known location,
+     it does keep up to LENGTH bytes of alignment.  */
+  *loop_mem = change_address (mem, BLKmode, *loop_reg);
+  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+}
+
+/* From mips_block_move_loop:
+
+   Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
+   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
+   the memory regions do not overlap.  */
+
+static void
+arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
+			       unsigned int interleave_factor,
+			       HOST_WIDE_INT bytes_per_iter)
+{
+  rtx label, src_reg, dest_reg, final_src, test;
+  HOST_WIDE_INT leftover;
+  
+  leftover = length % bytes_per_iter;
+  length -= leftover;
+  
+  /* Create registers and memory references for use within the loop.  */
+  arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
+  arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
+  
+  /* Calculate the value that SRC_REG should have after the last iteration of
+     the loop.  */
+  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
+				   0, 0, OPTAB_WIDEN);
+
+  /* Emit the start of the loop.  */
+  label = gen_label_rtx ();
+  emit_label (label);
+  
+  /* Emit the loop body.  */
+  arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
+				     interleave_factor);
+
+  /* Move on to the next block.  */
+  emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
+  emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
+  
+  /* Emit the loop condition.  */
+  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
+  emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
+  
+  /* Mop up any left-over bytes.  */
+  if (leftover)
+    arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
+}
+
+/* Emit a block move when either the source or destination is unaligned (not
+   aligned to a four-byte boundary).  This may need further tuning depending on
+   core type, optimize_size setting, etc.  */
+
+static int
+arm_movmemqi_unaligned (rtx *operands)
+{
+  HOST_WIDE_INT length = INTVAL (operands[2]);
+  
+  if (optimize_size)
+    {
+      bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
+      bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
+      /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
+         size of code if optimizing for size.  We'll use ldm/stm if src_aligned
+	 or dst_aligned though: allow more interleaving in those cases since the
+	 resulting code can be smaller.  */
+      unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
+      HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
+      
+      if (length > 12)
+	arm_block_move_unaligned_loop (operands[0], operands[1], length,
+				       interleave_factor, bytes_per_iter);
+      else
+	arm_block_move_unaligned_straight (operands[0], operands[1], length,
+					   interleave_factor);
+    }
+  else
+    {
+      /* Note that the loop created by arm_block_move_unaligned_loop may be
+         subject to loop unrolling, which makes tuning this condition a little
+	 redundant.  */
+      if (length > 32)
+	arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
+      else
+	arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
+    }
+  
+  return 1;
+}
+
 int
 arm_gen_movmemqi (rtx *operands)
 {
@@ -10374,8 +10703,13 @@ arm_gen_movmemqi (rtx *operands)
 
   if (GET_CODE (operands[2]) != CONST_INT
       || GET_CODE (operands[3]) != CONST_INT
-      || INTVAL (operands[2]) > 64
-      || INTVAL (operands[3]) & 3)
+      || INTVAL (operands[2]) > 64)
+    return 0;
+
+  if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
+    return arm_movmemqi_unaligned (operands);
+
+  if (INTVAL (operands[3]) & 3)
     return 0;
 
   dstbase = operands[0];


* Re: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
  2011-05-06 13:25 [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2] Julian Brown
@ 2011-09-14 15:24 ` Julian Brown
  2011-09-28 14:23 ` Ramana Radhakrishnan
  1 sibling, 0 replies; 6+ messages in thread
From: Julian Brown @ 2011-09-14 15:24 UTC (permalink / raw)
  To: gcc-patches, paul; +Cc: Richard Earnshaw, Ramana Radhakrishnan

On Fri, 6 May 2011 14:13:32 +0100
Julian Brown <julian@codesourcery.com> wrote:

> Hi,
> 
> This is the second of two patches to add unaligned-access support to
> the ARM backend. It builds on the first patch to provide support for
> unaligned accesses when expanding block moves (i.e. for builtin memcpy
> operations). It makes some effort to use load/store multiple
> instructions where appropriate (when accessing sufficiently-aligned
> source or destination addresses), and also makes some effort to
> generate fast code (for -O1/2/3) or small code (for -Os), though some
> of the heuristics may need tweaking still.

The preceding patch to this one has now been approved & applied:

http://gcc.gnu.org/ml/gcc-patches/2011-09/msg00575.html

FAOD, is this one OK too?

Thanks,

Julian


* Re: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
  2011-05-06 13:25 [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2] Julian Brown
  2011-09-14 15:24 ` Julian Brown
@ 2011-09-28 14:23 ` Ramana Radhakrishnan
  2011-10-13 18:37   ` Julian Brown
  1 sibling, 1 reply; 6+ messages in thread
From: Ramana Radhakrishnan @ 2011-09-28 14:23 UTC (permalink / raw)
  To: Julian Brown; +Cc: gcc-patches, paul, Richard Earnshaw

On 6 May 2011 14:13, Julian Brown <julian@codesourcery.com> wrote:
> Hi,
>
> This is the second of two patches to add unaligned-access support to
> the ARM backend. It builds on the first patch to provide support for
> unaligned accesses when expanding block moves (i.e. for builtin memcpy
> operations). It makes some effort to use load/store multiple
> instructions where appropriate (when accessing sufficiently-aligned
> source or destination addresses), and also makes some effort to
> generate fast code (for -O1/2/3) or small code (for -Os), though some
> of the heuristics may need tweaking still

Sorry it's taken me a while to get around to this one. Do you know
what difference this makes to performance on some standard benchmarks
on, let's say, an A9 and an M4? I see that this only gets triggered
when we have fewer than 64 bytes to copy.

Please add a few testcases from the examples that you've shown here to
be sure that ldm's are being generated in the right cases.

cheers
Ramana


* Re: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
  2011-09-28 14:23 ` Ramana Radhakrishnan
@ 2011-10-13 18:37   ` Julian Brown
  2011-10-17 17:31     ` Ramana Radhakrishnan
  2011-10-18  5:27     ` Michael Hope
  0 siblings, 2 replies; 6+ messages in thread
From: Julian Brown @ 2011-10-13 18:37 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc-patches, paul, Richard Earnshaw

[-- Attachment #1: Type: text/plain, Size: 2160 bytes --]

On Wed, 28 Sep 2011 14:33:17 +0100
Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote:

> On 6 May 2011 14:13, Julian Brown <julian@codesourcery.com> wrote:
> > Hi,
> >
> > This is the second of two patches to add unaligned-access support to
> > the ARM backend. It builds on the first patch to provide support for
> > unaligned accesses when expanding block moves (i.e. for builtin
> > memcpy operations). It makes some effort to use load/store multiple
> > instructions where appropriate (when accessing sufficiently-aligned
> > source or destination addresses), and also makes some effort to
> > generate fast code (for -O1/2/3) or small code (for -Os), though
> > some of the heuristics may need tweaking still
> 
> Sorry it's taken me a while to get around to this one. Do you know
> what difference this makes to performance on some standard benchmarks
> on, let's say, an A9 and an M4? I see that this only gets triggered
> when we have fewer than 64 bytes to copy.

No, sorry, I don't have any benchmark results available at present. I
think we'd have to have terrifically bad luck for it to be a
performance degradation, though...

> Please add a few testcases from the examples that you've shown here to
> be sure that ldm's are being generated in the right cases.

I've added test cases which cover copies with combinations of
aligned/unaligned sources/destinations, gated on a new
require-effective-target so the tests only run when suitable support is
available.

I re-tested the patch for good measure, in case of bitrot (and the new
tests pass with the patch applied, of course). OK to apply now?

Thanks,

Julian

ChangeLog

    gcc/
    * config/arm/arm.c (arm_block_move_unaligned_straight)
    (arm_adjust_block_mem, arm_block_move_unaligned_loop)
    (arm_movmemqi_unaligned): New.
    (arm_gen_movmemqi): Support unaligned block copies.

    gcc/testsuite/
    * lib/target-supports.exp (check_effective_target_arm_unaligned):
    New.
    * gcc.target/arm/unaligned-memcpy-1.c: New.
    * gcc.target/arm/unaligned-memcpy-2.c: New.
    * gcc.target/arm/unaligned-memcpy-3.c: New.
    * gcc.target/arm/unaligned-memcpy-4.c: New.

[-- Attachment #2: arm-unaligned-memcpy-fsf-3.diff --]
[-- Type: text/x-patch, Size: 16033 bytes --]

Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c	(revision 0)
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+
+void aligned_src (char *dest)
+{
+  memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word load for the main part of the copy, but subword
+   loads/stores for the remainder.  */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char src[16];
+char dest[16];
+
+void aligned_both (void)
+{
+  memcpy (dest, src, 15);
+}
+
+/* We know both src and dest to be aligned: expect multiword loads/stores.  */
+
+/* { dg-final { scan-assembler-times "ldmia" 1 } } */
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c	(revision 0)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+void unknown_alignment (char *dest, char *src)
+{
+  memcpy (dest, src, 15);
+}
+
+/* We should see three unaligned word loads and store pairs, one unaligned
+   ldrh/strh pair, and an ldrb/strb pair.  Sanity check that.  */
+
+/* { dg-final { scan-assembler-times "@ unaligned" 8 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
Index: gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c
===================================================================
--- gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c	(revision 0)
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_unaligned } */
+/* { dg-options "-O2" } */
+
+#include <string.h>
+
+char dest[16];
+
+void aligned_dest (char *src)
+{
+  memcpy (dest, src, 15);
+}
+
+/* Expect a multi-word store for the main part of the copy, but subword
+   loads/stores for the remainder.  */
+
+/* { dg-final { scan-assembler-times "stmia" 1 } } */
+/* { dg-final { scan-assembler-times "ldrh" 1 } } */
+/* { dg-final { scan-assembler-times "strh" 1 } } */
+/* { dg-final { scan-assembler-times "ldrb" 1 } } */
+/* { dg-final { scan-assembler-times "strb" 1 } } */
Index: gcc/testsuite/lib/target-supports.exp
===================================================================
--- gcc/testsuite/lib/target-supports.exp	(revision 179860)
+++ gcc/testsuite/lib/target-supports.exp	(working copy)
@@ -1973,6 +1973,18 @@ proc check_effective_target_arm_dsp { } 
     }]
 }
 
+# Return 1 if this is an ARM target that supports unaligned word/halfword
+# load/store instructions.
+
+proc check_effective_target_arm_unaligned { } {
+    return [check_no_compiler_messages arm_unaligned assembly {
+	#ifndef __ARM_FEATURE_UNALIGNED
+	#error no unaligned support
+	#endif
+	int i;
+    }]
+}
+
 # Add the options needed for NEON.  We need either -mfloat-abi=softfp
 # or -mfloat-abi=hard, but if one is already specified by the
 # multilib, use it.  Similarly, if a -mfpu option already enables
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 179860)
+++ gcc/config/arm/arm.c	(working copy)
@@ -10766,6 +10766,335 @@ gen_const_stm_seq (rtx *operands, int no
   return true;
 }
 
+/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
+   unaligned copies on processors which support unaligned semantics for those
+   instructions.  INTERLEAVE_FACTOR can be used to attempt to hide load latency
+   (using more registers) by doing e.g. load/load/store/store for a factor of 2.
+   An interleave factor of 1 (the minimum) will perform no interleaving. 
+   Load/store multiple are used for aligned addresses where possible.  */
+
+static void
+arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
+				   HOST_WIDE_INT length,
+				   unsigned int interleave_factor)
+{
+  rtx *regs = XALLOCAVEC (rtx, interleave_factor);
+  int *regnos = XALLOCAVEC (int, interleave_factor);
+  HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
+  HOST_WIDE_INT i, j;
+  HOST_WIDE_INT remaining = length, words;
+  rtx halfword_tmp = NULL, byte_tmp = NULL;
+  rtx dst, src;
+  bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
+  bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
+  HOST_WIDE_INT srcoffset, dstoffset;
+  HOST_WIDE_INT src_autoinc, dst_autoinc;
+  rtx mem, addr;
+  
+  gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
+  
+  /* Use hard registers if we have aligned source or destination so we can use
+     load/store multiple with contiguous registers.  */
+  if (dst_aligned || src_aligned)
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_rtx_REG (SImode, i);
+  else
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_reg_rtx (SImode);
+
+  dst = copy_addr_to_reg (XEXP (dstbase, 0));
+  src = copy_addr_to_reg (XEXP (srcbase, 0));
+
+  srcoffset = dstoffset = 0;
+  
+  /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
+     For copying the last bytes we want to subtract this offset again.  */
+  src_autoinc = dst_autoinc = 0;
+
+  for (i = 0; i < interleave_factor; i++)
+    regnos[i] = i;
+
+  /* Copy BLOCK_SIZE_BYTES chunks.  */
+
+  for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
+    {
+      /* Load words.  */
+      if (src_aligned && interleave_factor > 1)
+        {
+	  emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
+					    TRUE, srcbase, &srcoffset));
+	  src_autoinc += UNITS_PER_WORD * interleave_factor;
+	}
+      else
+        {
+	  for (j = 0; j < interleave_factor; j++)
+	    {
+	      addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
+					 - src_autoinc);
+	      mem = adjust_automodify_address (srcbase, SImode, addr,
+					       srcoffset + j * UNITS_PER_WORD);
+	      emit_insn (gen_unaligned_loadsi (regs[j], mem));
+	    }
+	  srcoffset += block_size_bytes;
+	}
+
+      /* Store words.  */
+      if (dst_aligned && interleave_factor > 1)
+        {
+          emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
+					     TRUE, dstbase, &dstoffset));
+	  dst_autoinc += UNITS_PER_WORD * interleave_factor;
+	}
+      else
+        {
+	  for (j = 0; j < interleave_factor; j++)
+	    {
+	      addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
+					 - dst_autoinc);
+	      mem = adjust_automodify_address (dstbase, SImode, addr,
+					       dstoffset + j * UNITS_PER_WORD);
+	      emit_insn (gen_unaligned_storesi (mem, regs[j]));
+	    }
+	  dstoffset += block_size_bytes;
+	}
+
+      remaining -= block_size_bytes;
+    }
+  
+  /* Copy any whole words left (note these aren't interleaved with any
+     subsequent halfword/byte load/stores in the interests of simplicity).  */
+  
+  words = remaining / UNITS_PER_WORD;
+
+  gcc_assert (words < interleave_factor);
+  
+  if (src_aligned && words > 1)
+    {
+      emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
+					&srcoffset));
+      src_autoinc += UNITS_PER_WORD * words;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+        {
+	  addr = plus_constant (src,
+				srcoffset + j * UNITS_PER_WORD - src_autoinc);
+	  mem = adjust_automodify_address (srcbase, SImode, addr,
+					   srcoffset + j * UNITS_PER_WORD);
+	  emit_insn (gen_unaligned_loadsi (regs[j], mem));
+	}
+      srcoffset += words * UNITS_PER_WORD;
+    }
+
+  if (dst_aligned && words > 1)
+    {
+      emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
+					 &dstoffset));
+      dst_autoinc += words * UNITS_PER_WORD;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+        {
+	  addr = plus_constant (dst,
+				dstoffset + j * UNITS_PER_WORD - dst_autoinc);
+	  mem = adjust_automodify_address (dstbase, SImode, addr,
+					   dstoffset + j * UNITS_PER_WORD);
+	  emit_insn (gen_unaligned_storesi (mem, regs[j]));
+	}
+      dstoffset += words * UNITS_PER_WORD;
+    }
+
+  remaining -= words * UNITS_PER_WORD;
+  
+  gcc_assert (remaining < 4);
+  
+  /* Copy a halfword if necessary.  */
+  
+  if (remaining >= 2)
+    {
+      halfword_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
+      emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
+
+      /* Either write out immediately, or delay until we've loaded the last
+	 byte, depending on interleave factor.  */
+      if (interleave_factor == 1)
+        {
+	  addr = plus_constant (dst, dstoffset - dst_autoinc);
+	  mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+	  emit_insn (gen_unaligned_storehi (mem,
+		       gen_lowpart (HImode, halfword_tmp)));
+	  halfword_tmp = NULL;
+	  dstoffset += 2;
+	}
+
+      remaining -= 2;
+      srcoffset += 2;
+    }
+  
+  gcc_assert (remaining < 2);
+  
+  /* Copy last byte.  */
+  
+  if ((remaining & 1) != 0)
+    {
+      byte_tmp = gen_reg_rtx (SImode);
+
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
+      emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
+
+      if (interleave_factor == 1)
+        {
+	  addr = plus_constant (dst, dstoffset - dst_autoinc);
+	  mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+	  emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+	  byte_tmp = NULL;
+	  dstoffset++;
+	}
+
+      remaining--;
+      srcoffset++;
+    }
+  
+  /* Store last halfword if we haven't done so already.  */
+  
+  if (halfword_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+      emit_insn (gen_unaligned_storehi (mem,
+		   gen_lowpart (HImode, halfword_tmp)));
+      dstoffset += 2;
+    }
+
+  /* Likewise for last byte.  */
+
+  if (byte_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+      emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+      dstoffset++;
+    }
+  
+  gcc_assert (remaining == 0 && srcoffset == dstoffset);
+}
+
+/* From mips_adjust_block_mem:
+
+   Helper function for doing a loop-based block operation on memory
+   reference MEM.  Each iteration of the loop will operate on LENGTH
+   bytes of MEM.
+
+   Create a new base register for use within the loop and point it to
+   the start of MEM.  Create a new memory reference that uses this
+   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */
+
+static void
+arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
+		      rtx *loop_mem)
+{
+  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
+  
+  /* Although the new mem does not refer to a known location,
+     it does keep up to LENGTH bytes of alignment.  */
+  *loop_mem = change_address (mem, BLKmode, *loop_reg);
+  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+}
+
+/* From mips_block_move_loop:
+
+   Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
+   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
+   the memory regions do not overlap.  */
+
+static void
+arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
+			       unsigned int interleave_factor,
+			       HOST_WIDE_INT bytes_per_iter)
+{
+  rtx label, src_reg, dest_reg, final_src, test;
+  HOST_WIDE_INT leftover;
+  
+  leftover = length % bytes_per_iter;
+  length -= leftover;
+  
+  /* Create registers and memory references for use within the loop.  */
+  arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
+  arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
+  
+  /* Calculate the value that SRC_REG should have after the last iteration of
+     the loop.  */
+  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
+				   0, 0, OPTAB_WIDEN);
+
+  /* Emit the start of the loop.  */
+  label = gen_label_rtx ();
+  emit_label (label);
+  
+  /* Emit the loop body.  */
+  arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
+				     interleave_factor);
+
+  /* Move on to the next block.  */
+  emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
+  emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
+  
+  /* Emit the loop condition.  */
+  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
+  emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
+  
+  /* Mop up any left-over bytes.  */
+  if (leftover)
+    arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
+}
+
+/* Emit a block move when either the source or destination is unaligned (not
+   aligned to a four-byte boundary).  This may need further tuning depending on
+   core type, optimize_size setting, etc.  */
+
+static int
+arm_movmemqi_unaligned (rtx *operands)
+{
+  HOST_WIDE_INT length = INTVAL (operands[2]);
+  
+  if (optimize_size)
+    {
+      bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
+      bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
+      /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
+         size of code if optimizing for size.  We'll use ldm/stm if src_aligned
+	 or dst_aligned though: allow more interleaving in those cases since the
+	 resulting code can be smaller.  */
+      unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
+      HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
+      
+      if (length > 12)
+	arm_block_move_unaligned_loop (operands[0], operands[1], length,
+				       interleave_factor, bytes_per_iter);
+      else
+	arm_block_move_unaligned_straight (operands[0], operands[1], length,
+					   interleave_factor);
+    }
+  else
+    {
+      /* Note that the loop created by arm_block_move_unaligned_loop may be
+         subject to loop unrolling, which makes tuning this condition a little
+	 redundant.  */
+      if (length > 32)
+	arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
+      else
+	arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
+    }
+  
+  return 1;
+}
+
 int
 arm_gen_movmemqi (rtx *operands)
 {
@@ -10778,8 +11107,13 @@ arm_gen_movmemqi (rtx *operands)
 
   if (GET_CODE (operands[2]) != CONST_INT
       || GET_CODE (operands[3]) != CONST_INT
-      || INTVAL (operands[2]) > 64
-      || INTVAL (operands[3]) & 3)
+      || INTVAL (operands[2]) > 64)
+    return 0;
+
+  if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
+    return arm_movmemqi_unaligned (operands);
+
+  if (INTVAL (operands[3]) & 3)
     return 0;
 
   dstbase = operands[0];


* Re: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
  2011-10-13 18:37   ` Julian Brown
@ 2011-10-17 17:31     ` Ramana Radhakrishnan
  2011-10-18  5:27     ` Michael Hope
  1 sibling, 0 replies; 6+ messages in thread
From: Ramana Radhakrishnan @ 2011-10-17 17:31 UTC (permalink / raw)
  To: Julian Brown; +Cc: gcc-patches, paul, Richard Earnshaw

Hi Julian,


There are a couple of minor formatting nits.

>+static int
>+arm_movmemqi_unaligned (rtx *operands)

....

>+      /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
>+         size of code if optimizing for size.  We'll use ldm/stm if src_aligned
>+	 or dst_aligned though: allow more interleaving in those cases since the
>+	 resulting code can be smaller.  */

Watch out for the `or' being misaligned compared to the other text.

>+      /* Note that the loop created by arm_block_move_unaligned_loop may be
>+         subject to loop unrolling, which makes tuning this condition a little
>+	 redundant.  */

Same with `redundant'

On 13 October 2011 18:53, Julian Brown <julian@codesourcery.com> wrote:
> On Wed, 28 Sep 2011 14:33:17 +0100

> No, sorry, I don't have any benchmark results available at present. I
> think we'd have to have terrifically bad luck for it to be a
> performance degradation, though...

Hmmm, OK - but please watch out for any bug reports or test-suite
fallout this week with multilibs other than those you might have tested.

>
> I re-tested the patch for good measure, in case of bitrot (and the new
> tests pass with the patch applied, of course). OK to apply now?

OK by me with those changes.

cheers
Ramana


* Re: [PATCH, ARM] Unaligned accesses for builtin memcpy [2/2]
  2011-10-13 18:37   ` Julian Brown
  2011-10-17 17:31     ` Ramana Radhakrishnan
@ 2011-10-18  5:27     ` Michael Hope
  1 sibling, 0 replies; 6+ messages in thread
From: Michael Hope @ 2011-10-18  5:27 UTC (permalink / raw)
  To: Julian Brown; +Cc: Ramana Radhakrishnan, gcc-patches, paul, Richard Earnshaw

On Fri, Oct 14, 2011 at 6:53 AM, Julian Brown <julian@codesourcery.com> wrote:
> On Wed, 28 Sep 2011 14:33:17 +0100
> Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> wrote:
>
>> On 6 May 2011 14:13, Julian Brown <julian@codesourcery.com> wrote:
>> > Hi,
>> >
>> > This is the second of two patches to add unaligned-access support to
>> > the ARM backend. It builds on the first patch to provide support for
>> > unaligned accesses when expanding block moves (i.e. for builtin
>> > memcpy operations). It makes some effort to use load/store multiple
>> > instructions where appropriate (when accessing sufficiently-aligned
>> > source or destination addresses), and also makes some effort to
>> > generate fast code (for -O1/2/3) or small code (for -Os), though
>> > some of the heuristics may need tweaking still
>>
>> Sorry it's taken me a while to get around to this one. Do you know
>> what difference this makes to performance on some standard benchmarks
>> on, let's say, an A9 and an M4? I see that this only gets triggered
>> when we have fewer than 64 bytes to copy.
>
> No, sorry, I don't have any benchmark results available at present. I
> think we'd have to have terrifically bad luck for it to be a
> performance degradation, though...

I've backported the unaligned struct and memcpy patches to our 4.6-based
compilers and benchmarked them.  The worst result is a 0.84 % drop in
performance, the best a 7.17 % improvement, and the geomean a 0.18 %
improvement.  This was in a Cortex-A9 NEON -O3 configuration.  The
results are accurate to within 0.1 %.

I'll send you and Ramana the raw results privately.

-- Michael

