public inbox for gcc-patches@gcc.gnu.org
* [AArch64] Implement movmem for the benefit of inline memcpy
@ 2014-06-06  8:50 James Greenhalgh
  2014-06-06 10:39 ` Richard Earnshaw
                   ` (2 more replies)
  0 siblings, 3 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-06-06  8:50 UTC (permalink / raw)
  To: gcc-patches; +Cc: marcus.shawcroft

[-- Attachment #1: Type: text/plain, Size: 1642 bytes --]


Hi,

The move_by_pieces infrastructure performs a copy by repeatedly trying
the largest safe copy it can make. So for a 15-byte copy we might see:

offset   amount  bytes copied
0        8       0-7
8        4       8-11
12       2       12-13
14       1       14

However, we can implement a 15-byte copy like so:

offset   amount  bytes copied
0        8       0-7
7        8       7-14

This can prove more efficient for both space and speed.
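
At the C level, the overlapping variant corresponds to something like the
following hand-written sketch (illustrative only, not what the compiler
emits; on AArch64 each memcpy below becomes a single 8-byte load/store):

#include <string.h>

/* Copy exactly 15 bytes using two overlapping 8-byte accesses;
   byte 7 is covered by both the first and the second access.  */
static void
copy15 (char *dst, const char *src)
{
  memcpy (dst,     src,     8);   /* bytes 0-7   */
  memcpy (dst + 7, src + 7, 8);   /* bytes 7-14  */
}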

In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
implement the movmem pattern name to expand small block copy cases. Note that
this optimization does not apply for -mstrict-align targets, which must
continue copying byte-by-byte.

Setting MOVE_RATIO low in this way causes two tests to begin failing. Both
are documented in the test cases as expected to fail for low MOVE_RATIO
targets, which do not allow certain tree-level optimizations.

Bootstrapped on aarch64-unknown-linux-gnu with no issues.

OK for trunk?

Thanks,
James

---
gcc/

2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
	* config/aarch64/aarch64.c (aarch64_move_pointer): New.
	(aarch64_progress_pointer): Likewise.
	(aarch64_copy_one_block_and_progress_pointers): Likewise.
	(aarch64_expand_movmem): Likewise.
	* config/aarch64/aarch64.h (MOVE_RATIO): Set low.
	* config/aarch64/aarch64.md (movmem<mode>): New.

gcc/testsuite/

2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
	* gcc.dg/tree-ssa/sra-12.c: Likewise.

[-- Attachment #2: 0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch --]
[-- Type: text/x-patch;  name=0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch, Size: 9751 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 68d488d..c4f75b3 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
 enum aarch64_symbol_type
 aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
 bool aarch64_constant_address_p (rtx);
+bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_function_arg_regno_p (unsigned);
 bool aarch64_gen_movmemqi (rtx *);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b26e5f5..0ae21cd 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9434,6 +9434,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
   return false;
 }
 
+/* Return a new RTX holding the result of moving POINTER forward by
+   AMOUNT bytes.  */
+
+static rtx
+aarch64_move_pointer (rtx pointer, int amount)
+{
+  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
+
+  return adjust_automodify_address (pointer, GET_MODE (pointer),
+				    next, amount);
+}
+
+/* Return a new RTX holding the result of moving POINTER forward by the
+   size of the mode it points to.  */
+
+static rtx
+aarch64_progress_pointer (rtx pointer)
+{
+  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
+
+  return aarch64_move_pointer (pointer, amount);
+}
+
+/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
+   MODE bytes.  */
+
+static void
+aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
+					      enum machine_mode mode)
+{
+  rtx reg = gen_reg_rtx (mode);
+
+  /* "Cast" the pointers to the correct mode.  */
+  *src = adjust_address (*src, mode, 0);
+  *dst = adjust_address (*dst, mode, 0);
+  /* Emit the memcpy.  */
+  emit_move_insn (reg, *src);
+  emit_move_insn (*dst, reg);
+  /* Move the pointers forward.  */
+  *src = aarch64_progress_pointer (*src);
+  *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Expand movmem, as if from a __builtin_memcpy.  Return true if
+   we succeed, otherwise return false.  */
+
+bool
+aarch64_expand_movmem (rtx *operands)
+{
+  unsigned int n;
+  rtx dst = operands[0];
+  rtx src = operands[1];
+  rtx base;
+  bool speed_p = !optimize_function_for_size_p (cfun);
+
+  /* When optimizing for size, give a better estimate of the length of a
+     memcpy call, but use the default otherwise.  */
+  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+
+  /* We can't do anything smart if the amount to copy is not constant.  */
+  if (!CONST_INT_P (operands[2]))
+    return false;
+
+  n = UINTVAL (operands[2]);
+
+  /* Try to keep the number of instructions low.  For cases below 16 bytes we
+     need to make at most two moves.  For cases above 16 bytes it will be one
+     move for each 16 byte chunk, then at most two additional moves.  */
+  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+    return false;
+
+  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
+
+  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
+  src = adjust_automodify_address (src, VOIDmode, base, 0);
+
+  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
+     1-byte chunk.  */
+  if (n < 4)
+    {
+      if (n >= 2)
+	{
+	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
+	  n -= 2;
+	}
+
+      if (n == 1)
+	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
+
+      return true;
+    }
+
+  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
+     4-byte chunk, partially overlapping with the previously copied chunk.  */
+  if (n < 8)
+    {
+      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+      n -= 4;
+      if (n > 0)
+	{
+	  int move = n - 4;
+
+	  src = aarch64_move_pointer (src, move);
+	  dst = aarch64_move_pointer (dst, move);
+	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+	}
+      return true;
+    }
+
+  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
+     them, then (if applicable) an 8-byte chunk.  */
+  while (n >= 8)
+    {
+      if (n / 16)
+	{
+	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
+	  n -= 16;
+	}
+      else
+	{
+	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+	  n -= 8;
+	}
+    }
+
+  /* Finish the final bytes of the copy.  We can always do this in one
+     instruction.  We either copy the exact amount we need, or partially
+     overlap with the previous chunk we copied and copy 8-bytes.  */
+  if (n == 0)
+    return true;
+  else if (n == 1)
+    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
+  else if (n == 2)
+    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
+  else if (n == 4)
+    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+  else
+    {
+      if (n == 3)
+	{
+	  src = aarch64_move_pointer (src, -1);
+	  dst = aarch64_move_pointer (dst, -1);
+	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
+	}
+      else
+	{
+	  int move = n - 8;
+
+	  src = aarch64_move_pointer (src, move);
+	  dst = aarch64_move_pointer (dst, move);
+	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+	}
+    }
+
+  return true;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index ced5a5e..c5d144e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -659,12 +659,14 @@ do {									     \
 /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
 #define AARCH64_CALL_RATIO 8
 
-/* When optimizing for size, give a better estimate of the length of a memcpy
-   call, but use the default otherwise.  But move_by_pieces_ninsns() counts
-   memory-to-memory moves, and we'll have to generate a load & store for each,
-   so halve the value to take that into account.  */
+/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
+   move_by_pieces will continually copy the largest safe chunks.  So a
+   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
+   for both size and speed of copy, so we will instead use the "movmem"
+   standard name to implement the copy.  This logic does not apply when
+   targeting -mstrict-align, so keep a sensible default in that case.  */
 #define MOVE_RATIO(speed) \
-  (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
+  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
 
 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6e605c19f0acbe88d53f460cb513d24dde6d658f..661d784b93e60fd2f636f5b5f03c10c6d53493dd 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -883,6 +883,24 @@ (define_split
   }
 )
 
+;; 0 is dst
+;; 1 is src
+;; 2 is size of move in bytes
+;; 3 is alignment
+
+(define_expand "movmemdi"
+  [(match_operand:BLK 0 "memory_operand")
+   (match_operand:BLK 1 "memory_operand")
+   (match_operand:DI 2 "immediate_operand")
+   (match_operand:DI 3 "immediate_operand")]
+   "!STRICT_ALIGNMENT"
+{
+  if (aarch64_expand_movmem (operands))
+    DONE;
+  FAIL;
+}
+)
+
 ;; Operands 1 and 3 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
 (define_insn "load_pair<mode>"
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
index a970c85..07f575d 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
@@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
 /* Whether the structs are totally scalarized or not depends on the
    MOVE_RATIO macro definition in the back end.  The scalarization will
    not take place when using small values for MOVE_RATIO.  */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
index 59e5e6a..45aa963 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
@@ -21,5 +21,5 @@ int foo (struct S *p)
   *p = l;
 }
 
-/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "release_ssa" } } */


* Re: [AArch64] Implement movmem for the benefit of inline memcpy
  2014-06-06  8:50 [AArch64] Implement movmem for the benefit of inline memcpy James Greenhalgh
@ 2014-06-06 10:39 ` Richard Earnshaw
  2014-08-01  6:38 ` Andrew Pinski
  2014-08-01  9:21 ` pinskia
  2 siblings, 0 replies; 62+ messages in thread
From: Richard Earnshaw @ 2014-06-06 10:39 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, Marcus Shawcroft

On 06/06/14 09:50, James Greenhalgh wrote:
> 
> Hi,
> 
> The move_by_pieces infrastructure performs a copy by repeatedly trying
> the largest safe copy it can make. So for a 15-byte copy we might see:
> 
> offset   amount  bytes copied
> 0        8       0-7
> 8        4       8-11
> 12       2       12-13
> 14       1       14
> 
> However, we can implement a 15-byte copy as so:
> 
> offset   amount  bytes copied
> 0        8       0-7
> 7        8       7-14
> 
> Which can prove more efficient for both space and speed.
> 
> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
> implement the movmem pattern name to expand small block copy cases. Note, this
> optimization does not apply for -mstrict-align targets, which must continue
> copying byte-by-byte.
> 
> Setting MOVE_RATIO low in this way causes a few tests to begin failing,
> both of these are documented in the test-case as expected to fail for
> low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations.
> 
> Bootstrapped on aarch64-unknown-linux-gnu with no issues.
> 
> OK for trunk?
> 

This is OK.

Hmm, I notice sra-12 fails on ARM as well.  Is that for the same reason?
 If so, perhaps you could prepare a patch for that as well (consider it
pre-approved).

R.

> Thanks,
> James
> 
> ---
> gcc/
> 
> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
> 	* config/aarch64/aarch64.c (aarch64_move_pointer): New.
> 	(aarch64_progress_pointer): Likewise.
> 	(aarch64_copy_one_part_and_move_pointers): Likewise.
> 	(aarch64_expand_movmen): Likewise.
> 	* config/aarch64/aarch64.h (MOVE_RATIO): Set low.
> 	* config/aarch64/aarch64.md (movmem<mode>): New.
> 
> gcc/testsuite/
> 
> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
> 	* gcc.dg/tree-ssa/sra-12.c: Likewise.
> 
> 
> 0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 68d488d..c4f75b3 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
>  enum aarch64_symbol_type
>  aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
>  bool aarch64_constant_address_p (rtx);
> +bool aarch64_expand_movmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_function_arg_regno_p (unsigned);
>  bool aarch64_gen_movmemqi (rtx *);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index b26e5f5..0ae21cd 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9434,6 +9434,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
>    return false;
>  }
>  
> +/* Return a new RTX holding the result of moving POINTER forward by
> +   AMOUNT bytes.  */
> +
> +static rtx
> +aarch64_move_pointer (rtx pointer, int amount)
> +{
> +  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
> +
> +  return adjust_automodify_address (pointer, GET_MODE (pointer),
> +				    next, amount);
> +}
> +
> +/* Return a new RTX holding the result of moving POINTER forward by the
> +   size of the mode it points to.  */
> +
> +static rtx
> +aarch64_progress_pointer (rtx pointer)
> +{
> +  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
> +
> +  return aarch64_move_pointer (pointer, amount);
> +}
> +
> +/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
> +   MODE bytes.  */
> +
> +static void
> +aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> +					      enum machine_mode mode)
> +{
> +  rtx reg = gen_reg_rtx (mode);
> +
> +  /* "Cast" the pointers to the correct mode.  */
> +  *src = adjust_address (*src, mode, 0);
> +  *dst = adjust_address (*dst, mode, 0);
> +  /* Emit the memcpy.  */
> +  emit_move_insn (reg, *src);
> +  emit_move_insn (*dst, reg);
> +  /* Move the pointers forward.  */
> +  *src = aarch64_progress_pointer (*src);
> +  *dst = aarch64_progress_pointer (*dst);
> +}
> +
> +/* Expand movmem, as if from a __builtin_memcpy.  Return true if
> +   we succeed, otherwise return false.  */
> +
> +bool
> +aarch64_expand_movmem (rtx *operands)
> +{
> +  unsigned int n;
> +  rtx dst = operands[0];
> +  rtx src = operands[1];
> +  rtx base;
> +  bool speed_p = !optimize_function_for_size_p (cfun);
> +
> +  /* When optimizing for size, give a better estimate of the length of a
> +     memcpy call, but use the default otherwise.  */
> +  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
> +
> +  /* We can't do anything smart if the amount to copy is not constant.  */
> +  if (!CONST_INT_P (operands[2]))
> +    return false;
> +
> +  n = UINTVAL (operands[2]);
> +
> +  /* Try to keep the number of instructions low.  For cases below 16 bytes we
> +     need to make at most two moves.  For cases above 16 bytes it will be one
> +     move for each 16 byte chunk, then at most two additional moves.  */
> +  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
> +    return false;
> +
> +  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> +  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> +
> +  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
> +  src = adjust_automodify_address (src, VOIDmode, base, 0);
> +
> +  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
> +     1-byte chunk.  */
> +  if (n < 4)
> +    {
> +      if (n >= 2)
> +	{
> +	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> +	  n -= 2;
> +	}
> +
> +      if (n == 1)
> +	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
> +
> +      return true;
> +    }
> +
> +  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
> +     4-byte chunk, partially overlapping with the previously copied chunk.  */
> +  if (n < 8)
> +    {
> +      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +      n -= 4;
> +      if (n > 0)
> +	{
> +	  int move = n - 4;
> +
> +	  src = aarch64_move_pointer (src, move);
> +	  dst = aarch64_move_pointer (dst, move);
> +	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +	}
> +      return true;
> +    }
> +
> +  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
> +     them, then (if applicable) an 8-byte chunk.  */
> +  while (n >= 8)
> +    {
> +      if (n / 16)
> +	{
> +	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
> +	  n -= 16;
> +	}
> +      else
> +	{
> +	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
> +	  n -= 8;
> +	}
> +    }
> +
> +  /* Finish the final bytes of the copy.  We can always do this in one
> +     instruction.  We either copy the exact amount we need, or partially
> +     overlap with the previous chunk we copied and copy 8-bytes.  */
> +  if (n == 0)
> +    return true;
> +  else if (n == 1)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
> +  else if (n == 2)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> +  else if (n == 4)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +  else
> +    {
> +      if (n == 3)
> +	{
> +	  src = aarch64_move_pointer (src, -1);
> +	  dst = aarch64_move_pointer (dst, -1);
> +	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +	}
> +      else
> +	{
> +	  int move = n - 8;
> +
> +	  src = aarch64_move_pointer (src, move);
> +	  dst = aarch64_move_pointer (dst, move);
> +	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
> +	}
> +    }
> +
> +  return true;
> +}
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>  
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index ced5a5e..c5d144e 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -659,12 +659,14 @@ do {									     \
>  /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
>  #define AARCH64_CALL_RATIO 8
>  
> -/* When optimizing for size, give a better estimate of the length of a memcpy
> -   call, but use the default otherwise.  But move_by_pieces_ninsns() counts
> -   memory-to-memory moves, and we'll have to generate a load & store for each,
> -   so halve the value to take that into account.  */
> +/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
> +   move_by_pieces will continually copy the largest safe chunks.  So a
> +   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
> +   for both size and speed of copy, so we will instead use the "movmem"
> +   standard name to implement the copy.  This logic does not apply when
> +   targeting -mstrict-align, so keep a sensible default in that case.  */
>  #define MOVE_RATIO(speed) \
> -  (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
> +  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
>  
>  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
>     of the length of a memset call, but use the default otherwise.  */
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6e605c19f0acbe88d53f460cb513d24dde6d658f..661d784b93e60fd2f636f5b5f03c10c6d53493dd 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -883,6 +883,24 @@ (define_split
>    }
>  )
>  
> +;; 0 is dst
> +;; 1 is src
> +;; 2 is size of move in bytes
> +;; 3 is alignment
> +
> +(define_expand "movmemdi"
> +  [(match_operand:BLK 0 "memory_operand")
> +   (match_operand:BLK 1 "memory_operand")
> +   (match_operand:DI 2 "immediate_operand")
> +   (match_operand:DI 3 "immediate_operand")]
> +   "!STRICT_ALIGNMENT"
> +{
> +  if (aarch64_expand_movmem (operands))
> +    DONE;
> +  FAIL;
> +}
> +)
> +
>  ;; Operands 1 and 3 are tied together by the final condition; so we allow
>  ;; fairly lax checking on the second memory operation.
>  (define_insn "load_pair<mode>"
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> index a970c85..07f575d 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
>  /* Whether the structs are totally scalarized or not depends on the
>     MOVE_RATIO macro definition in the back end.  The scalarization will
>     not take place when using small values for MOVE_RATIO.  */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> index 59e5e6a..45aa963 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> @@ -21,5 +21,5 @@ int foo (struct S *p)
>    *p = l;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "release_ssa" } } */
> 



* Re: [AArch64] Implement movmem for the benefit of inline memcpy
  2014-06-06  8:50 [AArch64] Implement movmem for the benefit of inline memcpy James Greenhalgh
  2014-06-06 10:39 ` Richard Earnshaw
@ 2014-08-01  6:38 ` Andrew Pinski
  2014-08-01  9:05   ` Richard Biener
  2014-08-01  9:21 ` pinskia
  2 siblings, 1 reply; 62+ messages in thread
From: Andrew Pinski @ 2014-08-01  6:38 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, Marcus Shawcroft

On Fri, Jun 6, 2014 at 1:50 AM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
>
> Hi,
>
> The move_by_pieces infrastructure performs a copy by repeatedly trying
> the largest safe copy it can make. So for a 15-byte copy we might see:
>
> offset   amount  bytes copied
> 0        8       0-7
> 8        4       8-11
> 12       2       12-13
> 14       1       14
>
> However, we can implement a 15-byte copy as so:
>
> offset   amount  bytes copied
> 0        8       0-7
> 7        8       7-14
>
> Which can prove more efficient for both space and speed.
>
> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
> implement the movmem pattern name to expand small block copy cases. Note, this
> optimization does not apply for -mstrict-align targets, which must continue
> copying byte-by-byte.
>
> Setting MOVE_RATIO low in this way causes a few tests to begin failing,
> both of these are documented in the test-case as expected to fail for
> low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations.


I think you should reevaluate setting MOVE_RATIO this low.  It is used
for SRA and IPA-SRA, which are both very useful; more useful than what
the memmove optimizations can achieve.

In fact this optimization is not even valid for volatile variables.
Here is a testcase for the volatile issue:
struct __attribute__((packed)) t15{
  long long t8;
  int t4;
  short t2;
  unsigned char t1;
};
volatile struct t15 t15;
int f(struct t15 *a)
{
  t15 = *a;
}

Notice how, in the output code, we write byte 7 of t15 twice.
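
Expressed in C, the access pattern the overlapping expansion gives the
copy above is roughly the following (a sketch, not compiler output; the
casts drop the volatile qualifier purely for illustration):

/* Two 8-byte stores into the 15-byte volatile object: the second store
   rewrites byte 7, which is not a valid access pattern for a volatile.  */
static void
copy_t15_overlapping (volatile struct t15 *dst, const struct t15 *src)
{
  unsigned long long lo, hi;
  __builtin_memcpy (&lo, (const char *) src, 8);      /* load bytes 0-7   */
  __builtin_memcpy (&hi, (const char *) src + 7, 8);  /* load bytes 7-14  */
  __builtin_memcpy ((char *) dst, &lo, 8);            /* store bytes 0-7  */
  __builtin_memcpy ((char *) dst + 7, &hi, 8);        /* store bytes 7-14, byte 7 again */
}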

Thanks,
Andrew Pinski

>
> Bootstrapped on aarch64-unknown-linux-gnu with no issues.
>
> OK for trunk?
>
> Thanks,
> James
>
> ---
> gcc/
>
> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
>         * config/aarch64/aarch64.c (aarch64_move_pointer): New.
>         (aarch64_progress_pointer): Likewise.
>         (aarch64_copy_one_part_and_move_pointers): Likewise.
>         (aarch64_expand_movmen): Likewise.
>         * config/aarch64/aarch64.h (MOVE_RATIO): Set low.
>         * config/aarch64/aarch64.md (movmem<mode>): New.
>
> gcc/testsuite/
>
> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
>         * gcc.dg/tree-ssa/sra-12.c: Likewise.


* Re: [AArch64] Implement movmem for the benefit of inline memcpy
  2014-08-01  6:38 ` Andrew Pinski
@ 2014-08-01  9:05   ` Richard Biener
  0 siblings, 0 replies; 62+ messages in thread
From: Richard Biener @ 2014-08-01  9:05 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: James Greenhalgh, GCC Patches, Marcus Shawcroft

On Fri, Aug 1, 2014 at 8:38 AM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Fri, Jun 6, 2014 at 1:50 AM, James Greenhalgh
> <james.greenhalgh@arm.com> wrote:
>>
>> Hi,
>>
>> The move_by_pieces infrastructure performs a copy by repeatedly trying
>> the largest safe copy it can make. So for a 15-byte copy we might see:
>>
>> offset   amount  bytes copied
>> 0        8       0-7
>> 8        4       8-11
>> 12       2       12-13
>> 14       1       14
>>
>> However, we can implement a 15-byte copy as so:
>>
>> offset   amount  bytes copied
>> 0        8       0-7
>> 7        8       7-14
>>
>> Which can prove more efficient for both space and speed.
>>
>> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
>> implement the movmem pattern name to expand small block copy cases. Note, this
>> optimization does not apply for -mstrict-align targets, which must continue
>> copying byte-by-byte.
>>
>> Setting MOVE_RATIO low in this way causes a few tests to begin failing,
>> both of these are documented in the test-case as expected to fail for
>> low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations.
>
>
> I think you should reevaluate setting MOVE_RATIO this low.  It is used
> for SRA and IPA-SRA which both are very useful; more useful than
> memmove optimizations can do.

Maybe we should finally decouple SRA and IPA-SRA from MOVE_RATIO
and have a --param to control the heuristic a target can adjust separately.
(we can still default to MOVE_RATIO here).  Ok, maybe we need
two params, one for size and one for speed optimization (though
the size effects are hard to estimate).
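
Concretely, something along these lines in params.def (a sketch only;
names and defaults to be decided):

DEFPARAM (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
          "sra-max-scalarization-size-Ospeed",
          "Maximum size, in storage units, of an aggregate which should "
          "be considered for scalarization when compiling for speed",
          0, 0, 0)
/* ...plus a matching -Osize counterpart.  */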

Richard.

> In fact this optimization is not even valid for volatile variables.
> Here is a testcase for the volatile issue:
> struct __attribute__((packed)) t15{
>   long long t8;
>   int t4;
>   short t2;
>   unsigned char t1;
> };
> volatile struct t15 t15;
> int f(struct t15 *a)
> {
>   t15 = *a;
> }
>
> Notice how we are writing to byte 7 twice to t15 in the outputted code.
>
> Thanks,
> Andrew Pinski
>
>>
>> Bootstrapped on aarch64-unknown-linux-gnu with no issues.
>>
>> OK for trunk?
>>
>> Thanks,
>> James
>>
>> ---
>> gcc/
>>
>> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
>>
>>         * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
>>         * config/aarch64/aarch64.c (aarch64_move_pointer): New.
>>         (aarch64_progress_pointer): Likewise.
>>         (aarch64_copy_one_part_and_move_pointers): Likewise.
>>         (aarch64_expand_movmen): Likewise.
>>         * config/aarch64/aarch64.h (MOVE_RATIO): Set low.
>>         * config/aarch64/aarch64.md (movmem<mode>): New.
>>
>> gcc/testsuite/
>>
>> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
>>
>>         * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
>>         * gcc.dg/tree-ssa/sra-12.c: Likewise.


* Re: [AArch64] Implement movmem for the benefit of inline memcpy
  2014-06-06  8:50 [AArch64] Implement movmem for the benefit of inline memcpy James Greenhalgh
  2014-06-06 10:39 ` Richard Earnshaw
  2014-08-01  6:38 ` Andrew Pinski
@ 2014-08-01  9:21 ` pinskia
  2014-08-05  7:05   ` Andrew Pinski
  2 siblings, 1 reply; 62+ messages in thread
From: pinskia @ 2014-08-01  9:21 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, marcus.shawcroft



> On Jun 6, 2014, at 1:50 AM, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> 
> 
> Hi,
> 
> The move_by_pieces infrastructure performs a copy by repeatedly trying
> the largest safe copy it can make. So for a 15-byte copy we might see:
> 
> offset   amount  bytes copied
> 0        8       0-7
> 8        4       8-11
> 12       2       12-13
> 14       1       14
> 
> However, we can implement a 15-byte copy as so:
> 
> offset   amount  bytes copied
> 0        8       0-7
> 7        8       7-14
> 
> Which can prove more efficient for both space and speed.
> 
> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
> implement the movmem pattern name to expand small block copy cases. Note, this
> optimization does not apply for -mstrict-align targets, which must continue
> copying byte-by-byte.

Why not change move_by_pieces instead of having target-specific code? This seems like a better option. You can check the SLOW_UNALIGNED_ACCESS target macro to see if you want to do this optimization too.  As I mentioned in the other email, make sure you check the volatileness of the from and to before doing this optimization.

Thanks,
Andrew


> 
> Setting MOVE_RATIO low in this way causes a few tests to begin failing,
> both of these are documented in the test-case as expected to fail for
> low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations.
> 
> Bootstrapped on aarch64-unknown-linux-gnu with no issues.
> 
> OK for trunk?
> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
> 
>    * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
>    * config/aarch64/aarch64.c (aarch64_move_pointer): New.
>    (aarch64_progress_pointer): Likewise.
>    (aarch64_copy_one_part_and_move_pointers): Likewise.
>    (aarch64_expand_movmen): Likewise.
>    * config/aarch64/aarch64.h (MOVE_RATIO): Set low.
>    * config/aarch64/aarch64.md (movmem<mode>): New.
> 
> gcc/testsuite/
> 
> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
> 
>    * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
>    * gcc.dg/tree-ssa/sra-12.c: Likewise.
> <0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch>


* Re: [AArch64] Implement movmem for the benefit of inline memcpy
  2014-08-01  9:21 ` pinskia
@ 2014-08-05  7:05   ` Andrew Pinski
  2014-08-07 14:20     ` James Greenhalgh
  0 siblings, 1 reply; 62+ messages in thread
From: Andrew Pinski @ 2014-08-05  7:05 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, marcus.shawcroft

[-- Attachment #1: Type: text/plain, Size: 3267 bytes --]

On Fri, Aug 1, 2014 at 2:21 AM,  <pinskia@gmail.com> wrote:
>
>
>> On Jun 6, 2014, at 1:50 AM, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>>
>>
>> Hi,
>>
>> The move_by_pieces infrastructure performs a copy by repeatedly trying
>> the largest safe copy it can make. So for a 15-byte copy we might see:
>>
>> offset   amount  bytes copied
>> 0        8       0-7
>> 8        4       8-11
>> 12       2       12-13
>> 14       1       14
>>
>> However, we can implement a 15-byte copy as so:
>>
>> offset   amount  bytes copied
>> 0        8       0-7
>> 7        8       7-14
>>
>> Which can prove more efficient for both space and speed.
>>
>> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
>> implement the movmem pattern name to expand small block copy cases. Note, this
>> optimization does not apply for -mstrict-align targets, which must continue
>> copying byte-by-byte.
>
> Why not change move_by_pieces instead of having a target specific code? This seems like a better option. You can check is unaligned slow target macro to see if you want to do this optimization too.   As I mentioned in the other email make sure you check the volatile ness of the from and to before doing this optimization.

Attached is the patch which does what I mentioned. I also changed
store_by_pieces to implement a similar optimization there (for memset
and strcpy).  Also, since I used SLOW_UNALIGNED_ACCESS, this is a
generic optimization.

I had tested an earlier version on x86_64-linux-gnu and I am in the
middle of bootstrap/testing on this one.

Thanks,
Andrew Pinski

* expr.c (move_by_pieces): Take the min of max_size and len to speed
up things and to take advantage of the mode in move_by_pieces_1.
(move_by_pieces_1): Read/write the leftovers using overlapping
memory locations to reduce the number of reads/writes.
(store_by_pieces_1): Take the min of max_size and len to speed up
things and to take advantage of the mode in store_by_pieces_2.
(store_by_pieces_2): Write the leftovers using overlapping
memory locations to reduce the number of writes.


>
> Thanks,
> Andrew
>
>
>>
>> Setting MOVE_RATIO low in this way causes a few tests to begin failing,
>> both of these are documented in the test-case as expected to fail for
>> low MOVE_RATIO targets, which do not allow certain Tree-Level optimizations.
>>
>> Bootstrapped on aarch64-unknown-linux-gnu with no issues.
>>
>> OK for trunk?
>>
>> Thanks,
>> James
>>
>> ---
>> gcc/
>>
>> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
>>
>>    * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
>>    * config/aarch64/aarch64.c (aarch64_move_pointer): New.
>>    (aarch64_progress_pointer): Likewise.
>>    (aarch64_copy_one_part_and_move_pointers): Likewise.
>>    (aarch64_expand_movmen): Likewise.
>>    * config/aarch64/aarch64.h (MOVE_RATIO): Set low.
>>    * config/aarch64/aarch64.md (movmem<mode>): New.
>>
>> gcc/testsuite/
>>
>> 2014-06-06  James Greenhalgh  <james.greenhalgh@arm.com>
>>
>>    * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
>>    * gcc.dg/tree-ssa/sra-12.c: Likewise.
>> <0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch>

[-- Attachment #2: addunaligned.diff.txt --]
[-- Type: text/plain, Size: 2918 bytes --]

Index: expr.c
===================================================================
--- expr.c	(revision 213306)
+++ expr.c	(working copy)
@@ -876,6 +876,9 @@ move_by_pieces (rtx to, rtx from, unsign
   if (data.reverse) data.offset = len;
   data.len = len;
 
+  /* Use the MIN of the length and the max size we can use. */
+  max_size = max_size > (len + 1) ? (len + 1) : max_size;
+
   /* If copying requires more than two move insns,
      copy addresses to registers (to make displacements shorter)
      and use post-increment if available.  */
@@ -1073,6 +1076,32 @@ move_by_pieces_1 (insn_gen_fn genfun, ma
 
       data->len -= size;
     }
+
+  /* If we have some data left and unaligned accesses
+     are not slow, back up slightly and emit the move.  */
+  if (data->len > 0
+      && !STRICT_ALIGNMENT
+      && !SLOW_UNALIGNED_ACCESS (mode, 1)
+      /* Not a stack push */
+      && data->to
+      /* Neither side is volatile memory. */
+      && !MEM_VOLATILE_P (data->to)
+      && !MEM_VOLATILE_P (data->from)
+      && ceil_log2 (data->len) == exact_log2 (size)
+      /* No incrementing of the to or from. */
+      && data->explicit_inc_to == 0
+      && data->explicit_inc_from == 0
+      /* No auto-incrementing of the to or from. */
+      && !data->autinc_to
+      && !data->autinc_from
+      && !data->reverse)
+    {
+      unsigned offset = data->offset - (size - data->len);
+      to1 = adjust_address (data->to, mode, offset);
+      from1 = adjust_address (data->from, mode, offset);
+      emit_insn ((*genfun) (to1, from1));
+      data->len = 0;
+    }
 }
 \f
 /* Emit code to move a block Y to a block X.  This may be done with
@@ -2636,6 +2665,9 @@ store_by_pieces_1 (struct store_by_piece
   if (data->reverse)
     data->offset = data->len;
 
+  /* Use the MIN of the length and the max size we can use. */
+  max_size = max_size > (data->len + 1) ? (data->len + 1)  : max_size;
+
   /* If storing requires more than two move insns,
      copy addresses to registers (to make displacements shorter)
      and use post-increment if available.  */
@@ -2733,6 +2765,24 @@ store_by_pieces_2 (insn_gen_fn genfun, m
 
       data->len -= size;
     }
+
+  /* If we have some data left and unaligned accesses
+     are not slow, back up slightly and emit that constant.  */
+  if (data->len > 0
+      && !STRICT_ALIGNMENT
+      && !SLOW_UNALIGNED_ACCESS (mode, 1)
+      && !MEM_VOLATILE_P (data->to)
+      && ceil_log2 (data->len) == exact_log2 (size)
+      && data->explicit_inc_to == 0
+      && !data->autinc_to
+      && !data->reverse)
+    {
+      unsigned offset = data->offset - (size - data->len);
+      to1 = adjust_address (data->to, mode, offset);
+      cst = (*data->constfun) (data->constfundata, offset, mode);
+      emit_insn ((*genfun) (to1, cst));
+      data->len = 0;
+    }
 }
 \f
 /* Write zeros through the storage of OBJECT.  If OBJECT has BLKmode, SIZE is


* Re: [AArch64] Implement movmem for the benefit of inline memcpy
  2014-08-05  7:05   ` Andrew Pinski
@ 2014-08-07 14:20     ` James Greenhalgh
  2014-08-07 14:34       ` Richard Biener
  0 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-08-07 14:20 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: gcc-patches, Marcus Shawcroft

On Tue, Aug 05, 2014 at 08:05:00AM +0100, Andrew Pinski wrote:
> On Fri, Aug 1, 2014 at 2:21 AM,  <pinskia@gmail.com> wrote:
> >> On Jun 6, 2014, at 1:50 AM, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> >>
> >>
> >> Hi,
> >>
> >> The move_by_pieces infrastructure performs a copy by repeatedly trying
> >> the largest safe copy it can make. So for a 15-byte copy we might see:
> >>
> >> offset   amount  bytes copied
> >> 0        8       0-7
> >> 8        4       8-11
> >> 12       2       12-13
> >> 14       1       14
> >>
> >> However, we can implement a 15-byte copy as so:
> >>
> >> offset   amount  bytes copied
> >> 0        8       0-7
> >> 7        8       7-14
> >>
> >> Which can prove more efficient for both space and speed.
> >>
> >> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
> >> implement the movmem pattern name to expand small block copy cases. Note, this
> >> optimization does not apply for -mstrict-align targets, which must continue
> >> copying byte-by-byte.
> >
> > Why not change move_by_pieces instead of having a target specific code?
> > This seems like a better option. You can check is unaligned slow target
> > macro to see if you want to do this optimization too.   As I mentioned in
> > the other email make sure you check the volatile ness of the from and to
> > before doing this optimization.

Hi Andrew,

If we are converting these volatile copies to memcpy calls, then there is an
additional bug there. There is nothing in the C standard which requires that
memcpy/memmove read and write each byte only once, and it seems reasonable to
assume that the movmem optab inherits this lack of restrictions.  This gotcha
either needs to be fixed, or at least documented for the movmem optab.

If I'm going to fix this, I have to write it in the back-end. Failing
the expand will cause a buggy call to memcpy.  Having said that, I'm not
sure I've seen a good definition of the semantics of a volatile struct
copy. It feels to me that an element-by-element copy is more likely to
match user expectations than a chunk-by-chunk copy. It is probably too
late for me to arrange that by the time I am in movmem, so I'll have to
push the fix earlier (probably somewhere generic?).

Do you know of a good write-up/argument/discussion on volatile struct
copy semantics? The testcase you provided is obviously broken; what is less
obvious is what should happen for:

struct __attribute__((packed)) t16{
  long long t8;
  int t4;
  short t2;
  unsigned char t1;
  unsigned char t1a;
};
volatile struct t16 t16;
int f(struct t16 *a)
{
  t16 = *a;
}

We have at least two choices...
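
Roughly (a C-level sketch of the two expansions, not compiler output; the
casts in the second variant drop the volatile qualifier for illustration):

/* Choice 1: element-by-element -- each field of the volatile object is
   accessed exactly once, at its own width.  */
int f_elementwise (struct t16 *a)
{
  t16.t8 = a->t8;
  t16.t4 = a->t4;
  t16.t2 = a->t2;
  t16.t1 = a->t1;
  t16.t1a = a->t1a;
  return 0;
}

/* Choice 2: chunk-by-chunk -- two 8-byte accesses, each covering several
   fields at once (no overlap here, since the struct is exactly 16 bytes).  */
int f_chunked (struct t16 *a)
{
  __builtin_memcpy ((char *) &t16,     (char *) a,     8);
  __builtin_memcpy ((char *) &t16 + 8, (char *) a + 8, 8);
  return 0;
}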

> Attached is the patch which does what I mentioned, I also changed
> store_by_pieces to implement a similar optimization there (for memset
> and strcpy).  Also since I used SLOW_UNALIGNED_ACCESS, this is a
> generic optimization.

I'm not sure this helps your situation on AArch64. There are still AArch64
implementations for which we will want to bypass move_by_pieces and provide
a back-end implementation.

We could more reasonably be controlling this with MOVE_BY_PIECES_P, but
this is only a thin wrapper around MOVE_RATIO, so the result for you is
much the same (pending a patch fixing SRA not to read MOVE_RATIO, it should
make no difference whether we disable by MOVE_RATIO or MOVE_BY_PIECES_P).

Have you done much micro/macro-benchmarking to show that this is indeed
a sensible optimization for !SLOW_UNALIGNED_ACCESS targets? The documentation
suggests that SLOW_UNALIGNED_ACCESS should be set if unaligned accesses
are "many times" slower. This is a bit of a blunt hammer - there are likely
targets which will suffer from this optimization, but which don't set
SLOW_UNALIGNED_ACCESS. Maybe you need some finer grained cost function?

Thanks,
James

> I had tested an earlier version on x86_64-linux-gnu and I am in the
> middle of bootstrap/testing on this one.
> 
> Thanks,
> Andrew Pinski
> 
> * expr.c (move_by_pieces):
> Take the min of max_size and len to speed up things
> and to take advatage of the mode in move_by_pieces_1.
> (move_by_pieces_1): Read/write the leftovers using an overlapping
> memory locations to reduce the number of reads/writes.
> (store_by_pieces_1): Take the min of max_size and len to speed up things
> and to take advatage of the mode in store_by_pieces_2.
> (store_by_pieces_2): Write the leftovers using an overlapping
>


* Re: [AArch64] Implement movmem for the benefit of inline memcpy
  2014-08-07 14:20     ` James Greenhalgh
@ 2014-08-07 14:34       ` Richard Biener
  2014-08-20  9:10         ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
  2014-08-21 10:34         ` [Patch 1/2] Don't put out a call to memcpy for volatile struct operations James Greenhalgh
  0 siblings, 2 replies; 62+ messages in thread
From: Richard Biener @ 2014-08-07 14:34 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: Andrew Pinski, gcc-patches, Marcus Shawcroft

On Thu, Aug 7, 2014 at 4:20 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
> On Tue, Aug 05, 2014 at 08:05:00AM +0100, Andrew Pinski wrote:
>> On Fri, Aug 1, 2014 at 2:21 AM,  <pinskia@gmail.com> wrote:
>> >> On Jun 6, 2014, at 1:50 AM, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>> >>
>> >>
>> >> Hi,
>> >>
>> >> The move_by_pieces infrastructure performs a copy by repeatedly trying
>> >> the largest safe copy it can make. So for a 15-byte copy we might see:
>> >>
>> >> offset   amount  bytes copied
>> >> 0        8       0-7
>> >> 8        4       8-11
>> >> 12       2       12-13
>> >> 14       1       14
>> >>
>> >> However, we can implement a 15-byte copy as so:
>> >>
>> >> offset   amount  bytes copied
>> >> 0        8       0-7
>> >> 7        8       7-14
>> >>
>> >> Which can prove more efficient for both space and speed.
>> >>
>> >> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
>> >> implement the movmem pattern name to expand small block copy cases. Note, this
>> >> optimization does not apply for -mstrict-align targets, which must continue
>> >> copying byte-by-byte.
>> >
>> > Why not change move_by_pieces instead of having a target specific code?
>> > This seems like a better option. You can check is unaligned slow target
>> > macro to see if you want to do this optimization too.   As I mentioned in
>> > the other email make sure you check the volatile ness of the from and to
>> > before doing this optimization.
>
> Hi Andrew,
>
> If we are converting these volatile copies to memcpy calls, then there is an
> additional bug there. There is nothing in the C standard which imposes that
> memcpy/memmov read and write each byte only once and it seems reasonable to
> assume that the movmem optab inherits this lack of restrictions.  This gotcha
> either needs fixed, or at least documented for the movmem optab.
>
> If I'm going to fix this, I have to write it in the back-end. Failing
> the expand will cause a buggy call to memcpy.  Having said that, I'm not
> sure I've seen a good definition of the semantics of a volatile struct
> copy. It feels to me that an element-by-element copy is more likely to
> match user expectations than a chunk-by-chunk copy. It is probably too
> late for me to get that by the time I am in memov, so I'll have to push
> the fix earlier (probably somewhere generic?).
>
> Do you know of a good write-up/argument/discussion on volatile struct
> copy semantics? The testcase you provided is obviously broken, what is less
> obvious is what should happen for:
>
> struct __attribute__((packed)) t16{
>   long long t8;
>   int t4;
>   short t2;
>   unsigned char t1;
>   unsigned char t1a;
> };
> volatile struct t16 t16;
> int f(struct t16 *a)
> {
>   t16 = *a;
> }
>
> We have at least two choices...

It's the language frontend's job to present the middle-end with
something sensible, for example an element-wise copy.

Also consider

struct X { int i; volatile int j; int k[1024]; } x;
void f (struct X *a)
{
  x = *a;
}

Richard.

>> Attached is the patch which does what I mentioned, I also changed
>> store_by_pieces to implement a similar optimization there (for memset
>> and strcpy).  Also since I used SLOW_UNALIGNED_ACCESS, this is a
>> generic optimization.
>
> I'm not sure this helps your situation on AArch64. There are still AArch64
> implementations for which we will want to bypass move_by_pieces and provide
> a back-end implementation.
>
> We could more reasonably be controlling this with MOVE_BY_PIECES_P, but
> this is only a thin wrapper around MOVE_RATIO, so the result for you is
> much the same (pending a patch fixing SRA not to read MOVE_RATIO, it should
> make no difference whether we disable by MOVE_RATIO or MOVE_BY_PIECES_P).
>
> Have you done much micro/macro-benchmarking to show that this is indeed
> a sensible optimization for !SLOW_UNALIGNED_ACCESS targets? The documentation
> suggests that SLOW_UNALIGNED_ACCESS should be set if unaligned accesses
> are "many times" slower. This is a bit of a blunt hammer - there are likely
> targets which will suffer from this optimization, but which don't set
> SLOW_UNALIGNED_ACCESS. Maybe you need some finer grained cost function?
>
> Thanks,
> James
>
>> I had tested an earlier version on x86_64-linux-gnu and I am in the
>> middle of bootstrap/testing on this one.
>>
>> Thanks,
>> Andrew Pinski
>>
>> * expr.c (move_by_pieces):
>> Take the min of max_size and len to speed up things
>> and to take advatage of the mode in move_by_pieces_1.
>> (move_by_pieces_1): Read/write the leftovers using an overlapping
>> memory locations to reduce the number of reads/writes.
>> (store_by_pieces_1): Take the min of max_size and len to speed up things
>> and to take advatage of the mode in store_by_pieces_2.
>> (store_by_pieces_2): Write the leftovers using an overlapping
>>


* [Patch AArch64 2/2] Wire up TARGET_DEFAULT_MAX_SCALARIZATION_SIZE
  2014-08-20  9:10         ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
@ 2014-08-20  9:10           ` James Greenhalgh
  2014-08-20  9:21           ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO Richard Biener
  1 sibling, 0 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-08-20  9:10 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, marcus.shawcroft, richard.earnshaw, pinskia

[-- Attachment #1: Type: text/plain, Size: 913 bytes --]


Hi,

This patch wires up our new target hook for AArch64. This means we can
bring back the two failing SRA tests (churn :( ). For now, I've just used
the old values we had for MOVE_RATIO. We should refactor that, as we use it
in two places (more churn :( ).

Bootstrapped on AArch64 with no issues and benchmarked with no discernible
impact.

OK for trunk?

Thanks,
James

---
gcc/

2014-08-20  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64.c
	(aarch64_expand_movmem): Refactor old move costs.
	(aarch64_default_max_total_scalarization_size): New.
	(TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE): Likewise.
	* config/aarch64/aarch64.h (AARCH64_MOVE_RATIO): New.
	(MOVE_RATIO): Use it.

gcc/testsuite/

2014-08-20  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.dg/tree-ssa/pr42585.c: Bring back for AArch64.
	* gcc.dg/tree-ssa/sra-12.c: Likewise.

[-- Attachment #2: 0002-Patch-AArch64-2-2-Wire-up-TARGET_DEFAULT_MAX_SCALARI.patch --]
[-- Type: text/x-patch;  name=0002-Patch-AArch64-2-2-Wire-up-TARGET_DEFAULT_MAX_SCALARI.patch, Size: 4314 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c3c871e..d608717 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9725,7 +9725,7 @@ aarch64_expand_movmem (rtx *operands)
 
   /* When optimizing for size, give a better estimate of the length of a
      memcpy call, but use the default otherwise.  */
-  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+  unsigned int max_instructions = AARCH64_MOVE_RATIO (speed_p);
 
   /* We can't do anything smart if the amount to copy is not constant.  */
   if (!CONST_INT_P (operands[2]))
@@ -9826,6 +9826,14 @@ aarch64_expand_movmem (rtx *operands)
   return true;
 }
 
+/* Implement TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE.  */
+
+static unsigned int
+aarch64_default_max_total_scalarization_size (bool size_p)
+{
+  return AARCH64_MOVE_RATIO (!size_p);
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -9949,6 +9957,10 @@ aarch64_expand_movmem (rtx *operands)
 #undef TARGET_MANGLE_TYPE
 #define TARGET_MANGLE_TYPE aarch64_mangle_type
 
+#undef TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE
+#define TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE \
+  aarch64_default_max_total_scalarization_size
+
 #undef TARGET_MEMORY_MOVE_COST
 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index db950da..5401061 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -681,6 +681,8 @@ do {									     \
 /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
 #define AARCH64_CALL_RATIO 8
 
+#define AARCH64_MOVE_RATIO(speed) (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
+
 /* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
    move_by_pieces will continually copy the largest safe chunks.  So a
    7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
@@ -688,7 +690,7 @@ do {									     \
    standard name to implement the copy.  This logic does not apply when
    targeting -mstrict-align, so keep a sensible default in that case.  */
 #define MOVE_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
+  (!STRICT_ALIGNMENT ? 2 : AARCH64_MOVE_RATIO (speed))
 
 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
index 07f575d..a970c85 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
@@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
 /* Whether the structs are totally scalarized or not depends on the
    MOVE_RATIO macro definition in the back end.  The scalarization will
    not take place when using small values for MOVE_RATIO.  */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
index 45aa963..59e5e6a 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
@@ -21,5 +21,5 @@ int foo (struct S *p)
   *p = l;
 }
 
-/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "release_ssa" } } */


* [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-08-07 14:34       ` Richard Biener
@ 2014-08-20  9:10         ` James Greenhalgh
  2014-08-20  9:10           ` [Patch AArch64 2/2] Wire up TARGET_DEFAULT_MAX_SCALARIZATION_SIZE James Greenhalgh
  2014-08-20  9:21           ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO Richard Biener
  2014-08-21 10:34         ` [Patch 1/2] Don't put out a call to memcpy for volatile struct operations James Greenhalgh
  1 sibling, 2 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-08-20  9:10 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, marcus.shawcroft, richard.earnshaw, pinskia

[-- Attachment #1: Type: text/plain, Size: 1715 bytes --]


Hi,

Presently the decision as to whether to completely scalarize an aggregate
or not is made based on MOVE_RATIO. This is an undocumented, and unexpected,
overloading of the target macro.
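
(For illustration only; this hypothetical example is not part of the
patch.  The decision applies to code such as:

  struct coord { int x; int y; int z; };

  int
  sum (struct coord *p)
  {
    struct coord tmp = *p;   /* Aggregate copy into a local.  */
    return tmp.x + tmp.y + tmp.z;
  }

When the aggregate is small enough, SRA replaces `tmp' with independent
scalar variables and the aggregate copy with individual scalar loads;
"small enough" is currently derived from MOVE_RATIO.)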

In this patch we fix this.

First, we add a new target hook
TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE, which returns MOVE_RATIO
by default.

Then we add two new parameters:

  sra-max-total-scalarization-size-Ospeed - The maximum size of aggregate
  to consider when compiling for speed
  sra-max-total-scalarization-size-Osize - The maximum size of aggregate
  to consider when compiling for size.

Both parameters default to 0.

Finally we wire up SRA to prefer using the parameters and, if no values
have been set for them, to fall back to the target hook.

Bootstrapped and regression tested for x86, arm and aarch64 with no
issues, I've also thrown a smoke-test of popular small benchmarks at
each platform without seeing meaningful differences (as you would expect).

OK?

Thanks,
James

---
gcc/

2014-08-20  James Greenhalgh  <james.greenhalgh@arm.com>

	* doc/invoke.texi (sra-max-total-scalarization-size-Ospeed): Document.
	(sra-max-total-scalarization-size-Osize): Likewise.
	* doc/tm.texi.in
	(TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE): Add hook.
	* doc/tm.texi: Regenerate.
	* params.def (sra-max-total-scalarization-size-Ospeed): New.
	(sra-max-total-scalarization-size-Osize): Likewise.
	* target.def (default_max_total_scalarization_size): New.
	* targhooks.c (default_max_total_scalarization_size): New.
	* targhooks.h (default_max_total_scalarization_size): New.
	* tree-sra.c (get_max_total_scalarization_size): New.
	(analyze_all_variable_accesses): Use it.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Patch-1-2-Control-SRA-and-IPA-SRA-by-a-param-rather-.patch --]
[-- Type: text/x-patch;  name=0001-Patch-1-2-Control-SRA-and-IPA-SRA-by-a-param-rather-.patch, Size: 7486 bytes --]

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 6374261..2b6593d 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10232,6 +10232,15 @@ parameters only when their cumulative size is less or equal to
 @option{ipa-sra-ptr-growth-factor} times the size of the original
 pointer parameter.
 
+@item sra-max-total-scalarization-size-Ospeed
+@itemx sra-max-total-scalarization-size-Osize
+The two Scalar Replacement of Aggregates passes (SRA and IPA-SRA) aim to
+replace scalar parts of aggregates with uses of independent scalar
+variables.  These parameters control the maximum size, in words, of an
+aggregate which will be considered for replacement when compiling for
+speed (@option{sra-max-total-scalarization-size-Ospeed}) or for size
+(@option{sra-max-total-scalarization-size-Osize}) respectively.
+
 @item tm-max-aggregate-size
 When making copies of thread-local variables in a transaction, this
 parameter specifies the size in bytes after which variables are
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 9dd8d68..42ef37f 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6118,6 +6118,16 @@ value to the result of that function.  The arguments to that function
 are the same as to this target hook.
 @end deftypefn
 
+@deftypefn {Target Hook} {unsigned int} TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE (bool @var{size_p})
+This target hook is used by the Scalar Replacement of Aggregates pass
+  to determine the maximum size, in words, of aggregate to consider for
+  replacement.  @code{size_p} is used to indicate whether we are compiling
+  for size or speed.  By default, the maximum total scalarization size
+  is determined by MOVE_RATIO and can be further controlled using the
+  parameters @code{sra-max-total-scalarization-size-Ospeed} and
+  @code{sra-max-total-scalarization-size-Osize}.
+@end deftypefn
+
 @defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
 A C expression for the cost of a branch instruction.  A value of 1 is
 the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index dd72b98..d560521 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4607,6 +4607,8 @@ These macros are obsolete, new ports should use the target hook
 
 @hook TARGET_MEMORY_MOVE_COST
 
+@hook TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE
+
 @defmac BRANCH_COST (@var{speed_p}, @var{predictable_p})
 A C expression for the cost of a branch instruction.  A value of 1 is
 the default; other values are interpreted relative to that. Parameter
diff --git a/gcc/params.def b/gcc/params.def
index aefdd07..dea6fb3 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -942,6 +942,18 @@ DEFPARAM (PARAM_TM_MAX_AGGREGATE_SIZE,
 	  "pairs",
 	  9, 0, 0)
 
+DEFPARAM (PARAM_SRA_TOTAL_SCALARIZATION_SIZE_SPEED,
+	  "sra-max-total-scalarization-size-Ospeed",
+	  "Maximum size, in words, of an aggregate which should be "
+	  "considered for scalarization when compiling for speed",
+	  0, 0, 0)
+
+DEFPARAM (PARAM_SRA_TOTAL_SCALARIZATION_SIZE_SIZE,
+	  "sra-max-total-scalarization-size-Osize",
+	  "Maximum size, in words, of an aggregate which should be "
+	  "considered for scalarization when compiling for size",
+	  0, 0, 0)
+
 DEFPARAM (PARAM_IPA_CP_VALUE_LIST_SIZE,
 	  "ipa-cp-value-list-size",
 	  "Maximum size of a list of values associated with each parameter for "
diff --git a/gcc/target.def b/gcc/target.def
index 3a41db1..f879a3f 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3037,6 +3037,20 @@ are the same as to this target hook.",
  int, (enum machine_mode mode, reg_class_t rclass, bool in),
  default_memory_move_cost)
 
+/* Return the maximum size in words of an aggregate which will be
+   considered for replacement by SRA/IPA-SRA.  */
+DEFHOOK
+(default_max_total_scalarization_size,
+ "This target hook is used by the Scalar Replacement of Aggregates pass\n\
+  to determine the maximum size, in words, of aggregate to consider for\n\
+  replacement.  @code{size_p} is used to indicate whether we are compiling\n\
+  for size or speed.  By default, the maximum total scalarization size\n\
+  is determined by MOVE_RATIO and can be further controlled using the\n\
+  parameters @code{sra-max-total-scalarization-size-Ospeed} and\n\
+  @code{sra-max-total-scalarization-size-Osize}.",
+ unsigned int, (bool size_p),
+ default_max_total_scalarization_size)
+
 /* True for MODE if the target expects that registers in this mode will
    be allocated to registers in a small register class.  The compiler is
    allowed to use registers explicitly used in the rtl as spill registers
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 0f27a5a..3b2d1b8 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1375,6 +1375,15 @@ default_register_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
 #endif
 }
 
+/* Return the maximum size in words of an aggregate which will be
+   considered for replacement by SRA/IPA-SRA.  */
+
+unsigned int
+default_max_total_scalarization_size (bool size_p ATTRIBUTE_UNUSED)
+{
+  return MOVE_RATIO (!size_p);
+}
+
 bool
 default_profile_before_prologue (void)
 {
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 4be33f8..20168f4 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -177,6 +177,8 @@ extern int default_memory_move_cost (enum machine_mode, reg_class_t, bool);
 extern int default_register_move_cost (enum machine_mode, reg_class_t,
 				       reg_class_t);
 
+extern unsigned int default_max_total_scalarization_size (bool size_p);
+
 extern bool default_profile_before_prologue (void);
 extern reg_class_t default_preferred_reload_class (rtx, reg_class_t);
 extern reg_class_t default_preferred_output_reload_class (rtx, reg_class_t);
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index 2f80497..90ad068 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -2482,6 +2482,24 @@ propagate_all_subaccesses (void)
     }
 }
 
+/* Return the max_total_scalarization_size as requested by the user via
+   the parameters, or by the target through
+   TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE.  */
+
+unsigned int
+get_max_total_scalarization_size (bool size_p)
+{
+  unsigned param_max_scalarization_size
+    = size_p
+      ? PARAM_VALUE (PARAM_SRA_TOTAL_SCALARIZATION_SIZE_SIZE)
+      : PARAM_VALUE (PARAM_SRA_TOTAL_SCALARIZATION_SIZE_SPEED);
+
+  if (param_max_scalarization_size > 0)
+    return param_max_scalarization_size;
+  else
+    return targetm.default_max_total_scalarization_size (size_p);
+}
+
 /* Go through all accesses collected throughout the (intraprocedural) analysis
    stage, exclude overlapping ones, identify representatives and build trees
    out of them, making decisions about scalarization on the way.  Return true
@@ -2493,10 +2511,10 @@ analyze_all_variable_accesses (void)
   int res = 0;
   bitmap tmp = BITMAP_ALLOC (NULL);
   bitmap_iterator bi;
-  unsigned i, max_total_scalarization_size;
-
-  max_total_scalarization_size = UNITS_PER_WORD * BITS_PER_UNIT
-    * MOVE_RATIO (optimize_function_for_speed_p (cfun));
+  unsigned i;
+  unsigned max_total_scalarization_size
+    = get_max_total_scalarization_size (optimize_function_for_size_p (cfun))
+      * UNITS_PER_WORD * BITS_PER_UNIT;
 
   EXECUTE_IF_SET_IN_BITMAP (candidate_bitmap, 0, i, bi)
     if (bitmap_bit_p (should_scalarize_away_bitmap, i)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-08-20  9:10         ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
  2014-08-20  9:10           ` [Patch AArch64 2/2] Wire up TARGET_DEFAULT_MAX_SCALARIZATION_SIZE James Greenhalgh
@ 2014-08-20  9:21           ` Richard Biener
  2014-09-25 14:58             ` [Patch 0/4] " James Greenhalgh
  1 sibling, 1 reply; 62+ messages in thread
From: Richard Biener @ 2014-08-20  9:21 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: GCC Patches, Marcus Shawcroft, Richard Earnshaw, Andrew Pinski

On Wed, Aug 20, 2014 at 11:09 AM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
>
> Hi,
>
> Presently the decision as to whether to completely scalarize an aggregate
> or not is made based on MOVE_RATIO. This is an undocumented, and unexpected,
> overloading of the target macro.
>
> In this patch we fix this.
>
> First, we we add a new target hook
> TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE, which returns MOVE_RATIO
> by default.
>
> Then we add two new parameters:
>
>   sra-max-total-scalarization-size-Ospeed - The maximum size of aggregate
>   to consider when compiling for speed
>   sra-max-total-scalarization-size-Osize - The maximum size of aggregate
>   to consider when compiling for size.
>
> Set to default to 0.
>
> Finally we wire up SRA to prefer using the parameters, and if it doesn't
> find values for them, fallback to the target hook.
>
> Bootstrapped and regression tested for x86, arm and aarch64 with no
> issues, I've also thrown a smoke-test of popular small benchmarks at
> each platform without seeing meaningful differences (as you would expect).
>
> OK?

I think this is overly complicated and instead SRA should only
use the parameters.  Targets can adjust their default (like they
do for other parameters).

The default should be MOVE_RATIO which should be applied
where the common code adjusts parameters (see existing
examples for not overriding user specified ones).
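
For reference, an illustrative sketch of that existing mechanism (the
parameter name is the one from the posted patch, the value is made up;
maybe_set_param_value leaves a value the user supplied with --param
alone):

  /* Hypothetical fragment of a target's option-override code.  */
  static void
  example_option_override (struct gcc_options *opts,
                           struct gcc_options *opts_set)
  {
    maybe_set_param_value (PARAM_SRA_TOTAL_SCALARIZATION_SIZE_SPEED, 8,
                           opts->x_param_values, opts_set->x_param_values);
  }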

Thanks,
Richard.

> Thanks,
> James
>
> ---
> gcc/
>
> 2014-08-20  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * doc/invoke.texi (sra-max-total-scalarization-size-Ospeed): Document.
>         (sra-max-total-scalarization-size-Osize): Likewise.
>         * doc/tm.texi.in
>         (TARGET_DEFAULT_MAX_TOTAL_SCALARIZATION_SIZE): Add hook.
>         * doc/tm.texi: Regenerate.
>         * params.def (sra-max-total-scalarization-size-Ospeed): New.
>         (sra-max-total-scalarization-size-Osize): Likewise.
>         * target.def (default_max_total_scalarization_size): New.
>         * targhooks.c (default_max_total_scalarization_size): New.
>         * targhooks.h (default_max_total_scalarization_size): New.
>         * tree-sra.c (get_max_total_scalarization_size): New.
>         (analyze_all_variable_accesses): Use it.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 1/2] Don't put out a call to memcpy for volatile struct operations
  2014-08-07 14:34       ` Richard Biener
  2014-08-20  9:10         ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
@ 2014-08-21 10:34         ` James Greenhalgh
  2014-08-21 10:34           ` [Patch AArch64 2/2] Do not double-copy bytes in " James Greenhalgh
  2014-08-21 11:22           ` [Patch 1/2] Don't put out a call to memcpy for " Richard Biener
  1 sibling, 2 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-08-21 10:34 UTC (permalink / raw)
  To: gcc-patches; +Cc: marcus-shawcroft, richard.guenther, pinskia

[-- Attachment #1: Type: text/plain, Size: 1067 bytes --]


Hi,

Andrew is quite right: we break the contract for volatile struct copies
if we start double-copying bytes.

But, the generic code will call memcpy - at which point anything could
happen. So, we must not put out a call to memcpy if either our source or
destination operands are volatile. The same is true of memset, so also
disable that call for volatile operands, and add a fallback loop
implementation.
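
As a minimal illustration (a reduced, hypothetical variant of the new
test case below):

  struct foo { char data[1000]; };

  void
  copy (volatile struct foo *dst, volatile struct foo *src)
  {
    /* An aggregate copy of volatile objects.  Lowering this to a call
       to memcpy would drop the volatile qualification, and memcpy
       gives no guarantees about the order or width of its accesses.  */
    *dst = *src;
  }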

Bootstrapped on x86, Arm and AArch64 with no issues.

OK?

Thanks,
James

---
gcc/

2014-08-21  James Greenhalgh  <james.greenhalgh@arm.com>

	* expr.c (set_storage_via_loop): New.
	(emit_block_move_hints): Do not call memcpy with volatile operands.
	(emit_block_move_via_movmem): Clarify that targets do have to care
	about volatile operands.
	(clear_storage_hints): Do not call memset for volatile operands,
	fall back to a loop implementation.

gcc/testsuite/

2014-08-21  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.dg/large-volatile-struct-copy-1.c: New.
	* gcc.dg/large-volatile-struct-set-1.c: Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Patch-1-2-Don-t-put-out-a-call-to-memcpy-for-volatil.patch --]
[-- Type: text/x-patch;  name=0001-Patch-1-2-Don-t-put-out-a-call-to-memcpy-for-volatil.patch, Size: 4974 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index 920d47b..764525f 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -134,6 +134,7 @@ static void store_by_pieces_1 (struct store_by_pieces_d *, unsigned int);
 static void store_by_pieces_2 (insn_gen_fn, machine_mode,
 			       struct store_by_pieces_d *);
 static tree clear_storage_libcall_fn (int);
+static void set_storage_via_loop (rtx, rtx, rtx, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
 static void store_constructor_field (rtx, unsigned HOST_WIDE_INT,
@@ -1139,6 +1140,10 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
   x = adjust_address (x, BLKmode, 0);
   y = adjust_address (y, BLKmode, 0);
 
+  /* memcpy is not guaranteed to be safe for volatile operands.  */
+  may_use_call &= (!MEM_VOLATILE_P (x)
+		   && !MEM_VOLATILE_P (y));
+
   /* Set MEM_SIZE as appropriate for this block copy.  The main place this
      can be incorrect is coming from __builtin_memcpy.  */
   if (CONST_INT_P (size))
@@ -2788,15 +2793,62 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method,
 				   expected_align, expected_size,
 				   min_size, max_size, probable_max_size))
     ;
-  else if (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (object)))
+  else if (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (object))
+	   && !MEM_VOLATILE_P (object))
     return set_storage_via_libcall (object, size, const0_rtx,
 				    method == BLOCK_OP_TAILCALL);
   else
-    gcc_unreachable ();
+    set_storage_via_loop (object, size, const0_rtx, align);
 
   return NULL;
 }
 
+/* A subroutine of clear_storage.  Set the data via an explicit
+   loop.  This is used only when libcalls are forbidden.  */
+/* ??? It'd be nice to set in hunks larger than QImode.  */
+
+static void
+set_storage_via_loop (rtx object, rtx size, rtx val,
+		      unsigned int align ATTRIBUTE_UNUSED)
+{
+  rtx cmp_label, top_label, iter, object_addr, tmp;
+  enum machine_mode object_addr_mode = get_address_mode (object);
+  enum machine_mode iter_mode;
+
+  iter_mode = GET_MODE (size);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  top_label = gen_label_rtx ();
+  cmp_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  emit_move_insn (iter, const0_rtx);
+
+  object_addr = force_operand (XEXP (object, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  emit_jump (cmp_label);
+  emit_label (top_label);
+
+  tmp = convert_modes (object_addr_mode, iter_mode, iter, true);
+  object_addr = simplify_gen_binary (PLUS, object_addr_mode, object_addr, tmp);
+
+  object = change_address (object, QImode, object_addr);
+
+  emit_move_insn (object, val);
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_label (cmp_label);
+
+  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+			   true, top_label, REG_BR_PROB_BASE * 90 / 100);
+}
+
 rtx
 clear_storage (rtx object, rtx size, enum block_op_methods method)
 {
diff --git a/gcc/testsuite/gcc.dg/large-volatile-struct-copy-1.c b/gcc/testsuite/gcc.dg/large-volatile-struct-copy-1.c
new file mode 100644
index 0000000..32e4bdf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/large-volatile-struct-copy-1.c
@@ -0,0 +1,37 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#define SIZE 1000
+
+extern void abort (void);
+
+struct foo
+{
+  char data[SIZE];
+};
+
+void __attribute__ ((noinline))
+func (struct foo volatile *x, struct foo volatile *y)
+{
+  *x = *y;
+}
+
+int
+main (int argc, char** argv)
+{
+  /* We just need something to copy, it doesn't much matter what it is.  */
+  volatile struct foo y = { 1, 2, 3 };
+  volatile struct foo x;
+  int i = 0;
+
+  func (&x, &y);
+
+  for (i = 0; i < SIZE; ++i)
+    if (x.data[i] != y.data[i])
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "memcpy" } } */
+/* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.dg/large-volatile-struct-set-1.c b/gcc/testsuite/gcc.dg/large-volatile-struct-set-1.c
new file mode 100644
index 0000000..a41909c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/large-volatile-struct-set-1.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#define SIZE 1000
+
+extern void abort (void);
+
+struct foo
+{
+  char data[SIZE];
+};
+
+void __attribute__ ((__noinline__))
+func (struct foo volatile * x)
+{
+  *x = (volatile struct foo) {{0}};
+}
+
+int
+main (int argc, char** argv)
+{
+  volatile struct foo x;
+  int i = 0;
+  func (&x);
+
+  for (i = 0; i < SIZE; ++i)
+    if (x.data[i])
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "memset" } } */
+/* { dg-final { cleanup-saved-temps } } */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch AArch64 2/2] Do not double-copy bytes in volatile struct operations
  2014-08-21 10:34         ` [Patch 1/2] Don't put out a call to memcpy for volatile struct operations James Greenhalgh
@ 2014-08-21 10:34           ` James Greenhalgh
  2014-08-21 11:22           ` [Patch 1/2] Don't put out a call to memcpy for " Richard Biener
  1 sibling, 0 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-08-21 10:34 UTC (permalink / raw)
  To: gcc-patches; +Cc: marcus-shawcroft, richard.guenther, pinskia

[-- Attachment #1: Type: text/plain, Size: 345 bytes --]


Hi,

We also need to be careful in AArch64's movmem implementation: for
volatile operands we can't expand to our overlapping mode of operation.

Bootstrapped with no issues.

OK?

Thanks,
James

---

2014-08-21  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_movmem): Fail if we
	have volatile operands.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0002-Patch-AArch64-2-2-Do-not-double-copy-bytes-in-volati.patch --]
[-- Type: text/x-patch;  name=0002-Patch-AArch64-2-2-Do-not-double-copy-bytes-in-volati.patch, Size: 565 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0f3c74b..56434bc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9741,6 +9741,10 @@ aarch64_expand_movmem (rtx *operands)
   if (!CONST_INT_P (operands[2]))
     return false;
 
+  /* We can't do anything smart if either of the operands are volatile.  */
+  if (MEM_VOLATILE_P (src) || MEM_VOLATILE_P (dst))
+    return false;
+
   n = UINTVAL (operands[2]);
 
   /* Try to keep the number of instructions low.  For cases below 16 bytes we

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/2] Don't put out a call to memcpy for volatile struct operations
  2014-08-21 10:34         ` [Patch 1/2] Don't put out a call to memcpy for volatile struct operations James Greenhalgh
  2014-08-21 10:34           ` [Patch AArch64 2/2] Do not double-copy bytes in " James Greenhalgh
@ 2014-08-21 11:22           ` Richard Biener
  2014-08-21 23:47             ` Mike Stump
  1 sibling, 1 reply; 62+ messages in thread
From: Richard Biener @ 2014-08-21 11:22 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, marcus-shawcroft, Andrew Pinski

On Thu, Aug 21, 2014 at 12:34 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
>
> Hi,
>
> Andrew is quite right, we break the contract for volatile struct copies
> if we start double copying bytes.
>
> But, the generic code will call memcpy - at which point anything could
> happen. So, we must not put out a call to memcpy if either our source or
> destination operands are volatile. The same is true of memset, so also
> disable that call for volatile operands, and add a fallback loop
> implementation.
>
> Bootstrapped on x86, Arm and AArch64 with no issues.
>
> OK?

Umm... using a byte-wise clearing loop is surely always the wrong
thing for an access that _really_ cares about volatile.

I see we do the same for the block-move case... ugh.

I still say we need to solve the issue at language level - that is,
try to figure out what the language standard says about

volatile struct X x, y;

x = y;

or about

struct X { volatile int x; } x, y;

x = y;

where we don't even _have_ MEM_VOLATILE set on x or y.

I expect that most structs have volatile for a bogus reason
anyway and we slow down and enlarge code for no good reason.

So - why bother fixing this?  ISTR reading in the C standard
that structure assignments are expected to compile to memcpy.

Richard.

> Thanks,
> James
>
> ---
> gcc/
>
> 2014-08-21  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * expr.c (set_storage_via_loop): New.
>         (emit_block_move_hints): Do not call memcpy with volatile operands.
>         (emit_block_move_via_movmem): Clarify that targets do have to care
>         about volatile operands.
>         (clear_storage_hints): Do not call memset for volatile operands,
>         fall back to a loop implementation.
>
> gcc/testsuite/
>
> 2014-08-21  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * gcc.dg/large-volatile-struct-copy-1.c: New.
>         * gcc.dg/large-volatile-struct-set-1.c: Likewise.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/2] Don't put out a call to memcpy for volatile struct operations
  2014-08-21 11:22           ` [Patch 1/2] Don't put out a call to memcpy for " Richard Biener
@ 2014-08-21 23:47             ` Mike Stump
  2014-08-22 15:42               ` Joseph S. Myers
  2014-08-26  8:35               ` Richard Biener
  0 siblings, 2 replies; 62+ messages in thread
From: Mike Stump @ 2014-08-21 23:47 UTC (permalink / raw)
  To: Richard Biener
  Cc: James Greenhalgh, GCC Patches, marcus-shawcroft, Andrew Pinski

On Aug 21, 2014, at 4:22 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> I still say we need to solve the issue at language level - that is,
> try to figure out what the language standard says about
> 
> volatile struct X x, y;

> x = y;

The definition of x = y doesn’t change wrt volatile above.  See below for the semantics of x = y.  What the volatile qualifier does is make the members of x and y volatile:

       [#7] EXAMPLE 2 In:                                           |

               struct s { int i; const int ci; };
               struct s s;
               const struct s cs;
               volatile struct s vs;

       the various members have the types:                          |

               s.i     int
               s.ci    const int
               cs.i    const int
               cs.ci   const int
               vs.i    volatile int
               vs.ci   volatile const int

> or about
> 
> struct X { volatile int x; } x, y;
> 
> x = y;

So, what the C99 standard[1] says is that memcpy copies n characters from one to the other, leaving unspecified the order of the copy.  C++98 incorporates those semantics by reference.  Of course, there are quite a few memcpy implementations that don’t do that.

For x = y, in C++98, it is defined like so:

8 The implicitly-defined copy constructor for class X performs a member-
  wise  copy of its subobjects.  The order of copying is the same as the
  order of initialization of bases and members in  a  user-defined  con-
  structor  (see  _class.base.init_).   Each  subobject is copied in the
  manner appropriate to its type

which means a volatile int member translates to volatile SI read/write as appropriate, or put another way, one can’t use memcpy for it.  Now, that isn’t to say that we can’t change the language standard or improve it with different semantics.

For C99:

       [#2] In simple  assignment  (=),  the  value  of  the  right
       operand   is   converted  to  the  type  of  the  assignment
       expression and replaces  the  value  stored  in  the  object
       designated by the left operand.

which I’d claim isn’t exactly clear and precise.  Clearly what they were thinking was:

       36)Thus,  for   example,   structure   assignment   may   be                                              
          implemented element-at-a-time or via memcpy.                                                           

What is left not exactly well defined is the case of volatile.  Reasonable people would say that the volatile semantics are likely the same as in C++98 (also, C++ was mostly just noting what we thought the C standard said in the first place).

I don’t keep up on DRs that might explicitly cover the details, so I’d defer to those, if any.

> I expect that most structs have volatile for a bogus reason
> anyway and we slow down and enlarge code for no good reason.

Yes, I suspect that if we put in code to handle volatile members better, no code will care.  Why?  Because no one has asked for those semantics, and no code depends upon them.  Though, in time, some code might care.

> So - why bother fixing this?  ISTR reading in the C standard
> that structure assignments are expected to compile to memcpy.

Your ISTR is quoted for you above.  That wording isn’t a prescription of semantics.  It is an observation that there are some situations where the implementation may use memcpy.

In C99, sig_atomic_t defines when something is lock free, leaving unspecified what else may be.  In later C++ standards (for example C++14), [atomics.lockfree] defines additional types that are atomic.


1 - I use n843 for C99, which is slightly different from the standard, but in this case I suspect it is the same.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/2] Don't put out a call to memcpy for volatile struct operations
  2014-08-21 23:47             ` Mike Stump
@ 2014-08-22 15:42               ` Joseph S. Myers
  2014-08-22 17:33                 ` Mike Stump
  2014-08-26  8:35               ` Richard Biener
  1 sibling, 1 reply; 62+ messages in thread
From: Joseph S. Myers @ 2014-08-22 15:42 UTC (permalink / raw)
  To: Mike Stump
  Cc: Richard Biener, James Greenhalgh, GCC Patches, marcus-shawcroft,
	Andrew Pinski

On Thu, 21 Aug 2014, Mike Stump wrote:

> 1 - I use n843 for C99, which is slightly different from the standard, but in this case I suspect it is the same.

Use N1256 (C99+TC1+TC2+TC3) instead.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/2] Don't put out a call to memcpy for volatile struct operations
  2014-08-22 15:42               ` Joseph S. Myers
@ 2014-08-22 17:33                 ` Mike Stump
  0 siblings, 0 replies; 62+ messages in thread
From: Mike Stump @ 2014-08-22 17:33 UTC (permalink / raw)
  To: Joseph S. Myers
  Cc: Richard Biener, James Greenhalgh, GCC Patches, marcus-shawcroft,
	Andrew Pinski

On Aug 22, 2014, at 8:42 AM, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Thu, 21 Aug 2014, Mike Stump wrote:
>> 1 - I use n843 for C99, which is slightly different from the standard, but in this case I suspect it is the same.
> 
> Use N1256 (C99+TC1+TC2+TC3) instead.

Thanks.  I had the C99 PDF to quote from; it is just that I find the plain .txt file much easier to work with than the PDF.  I’ve updated to n1256 as I found a .txt version of it.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/2] Don't put out a call to memcpy for volatile struct operations
  2014-08-21 23:47             ` Mike Stump
  2014-08-22 15:42               ` Joseph S. Myers
@ 2014-08-26  8:35               ` Richard Biener
  2014-08-26 16:42                 ` Mike Stump
  1 sibling, 1 reply; 62+ messages in thread
From: Richard Biener @ 2014-08-26  8:35 UTC (permalink / raw)
  To: Mike Stump; +Cc: James Greenhalgh, GCC Patches, marcus-shawcroft, Andrew Pinski

On Fri, Aug 22, 2014 at 1:47 AM, Mike Stump <mikestump@comcast.net> wrote:
> On Aug 21, 2014, at 4:22 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>> I still say we need to solve the issue at language level - that is,
>> try to figure out what the language standard says about
>>
>> volatile struct X x, y;
>
>> x = y;
>
> The definition of x = y doesn’t change wrt volatile above.  See below for the semantic of x = y;  What this does is it makes the members of x and y volatile:
>
>        [#7] EXAMPLE 2 In:                                           |
>
>                struct s { int i; const int ci; };
>                struct s s;
>                const struct s cs;
>                volatile struct s vs;
>
>        the various members have the types:                          |
>
>                s.i     int
>                s.ci    const int
>                cs.i    const int
>                cs.ci   const int
>                vs.i    volatile int
>                vs.ci   volatile const int
>
>> or about
>>
>> struct X { volatile int x; } x, y;
>>
>> x = y;
>
> So, what the C99 standard[1] says is that memcpy copies n characters from one to the other, leaving unspecified the order of the copy.  C++98 reuses by reference those semantics.  Of course, there are quite a few memcpy implementations that don’t do that.
>
> For x = y, in C++98, it is defined like so:
>
> 8 The implicitly-defined copy constructor for class X performs a member-
>   wise  copy of its subobjects.  The order of copying is the same as the
>   order of initialization of bases and members in  a  user-defined  con-
>   structor  (see  _class.base.init_).   Each  subobject is copied in the
>   manner appropriate to its type

Thats quite specific ;)

> which means a volatile int member translates to volatile SI read/write as appropriate, or put another way, one can’t use memcpy for it.  Now, that isn’t to say that we can’t change the language standard or improve it with different semantics.
>
> For C99:
>
>        [#2] In simple  assignment  (=),  the  value  of  the  right
>        operand   is   converted  to  the  type  of  the  assignment
>        expression and replaces  the  value  stored  in  the  object
>        designated by the left operand.
>
> which I’d claim isn’t exactly clear and precise.  Clearly what they were thinking was:

Indeed.

>        36)Thus,  for   example,   structure   assignment   may   be
>           implemented element-at-a-time or via memcpy.
>
> left not exactly well defined is the case of volatile.  Reasonable people would say that volatile semantics are likely the same as C++98 (also, C++ was mostly just noting what we thought the C standard said in the first place).
>
> I don’t keep up on DRs that might explicitly cover details, so I’d defer those those if any.
>
>> I expect that most structs have volatile for a bogus reason
>> anyway and we slow down and enlarge code for no good reason.
>
> Yes, I suspect if we put in code to handle volatile members better, that no code will care.  Why, cause no one has asked for those semantics, no code depends upon those semantics.  Though, in time, some code might care.
>
>> So - why bother fixing this?  ISTR reading in the C standard
>> that structure assignments are expected to compile to memcpy.
>
> Your ISTR is quoted for you above.  That wording isn’t a prescription of semantics.  It is an observation that there are some situations where the implementation may use memcpy.
>
> In C99, sig_atomic_t defines when something is lock free, leaving unspecific what else may be.  In later C++ standards (for example C++14), [atomics.lockfree] defines additional types that are atomic.
>
>
> 1 - I use n843 for C99, which is slightly different from the standard, but in this case I suspect it is the same.

So after reading the std quotations I still think that if we want to
fix anything
here then we want to fix it in the frontends (only the C++ FE knows
init order in the details required - though I suppose the description was for
non-POD types where the FE may already do this).

_If_ we want to do this in the middle-end then I suggest to do the
decomposition during gimplification as the rest of the middle-end
doesn't treat the second example as a volatile aggregate copy at all.
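
To make that concrete (an illustrative sketch only), for a struct with
volatile members the gimplifier would emit per-field volatile accesses
instead of one block copy, roughly:

  struct X { volatile int a; volatile int b; } x, y;

  /* x = y; decomposed during gimplification, conceptually:  */
  x.a = y.a;   /* One volatile int load and one volatile int store.  */
  x.b = y.b;

rather than a BLKmode move or a call to memcpy.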

Fixing this with the proposed patch doesn't really fix it and it will
perform the worst of all implementations (a byte-by-byte copy
certainly will break that hardware access the patch was meant to fix,
also thinking of a struct with volatile bitfields and
-fstrict-volatile-bitfields).

I'd still lean towards doing this in frontends (or c-family/ code).

Richard.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/2] Don't put out a call to memcpy for volatile struct operations
  2014-08-26  8:35               ` Richard Biener
@ 2014-08-26 16:42                 ` Mike Stump
  0 siblings, 0 replies; 62+ messages in thread
From: Mike Stump @ 2014-08-26 16:42 UTC (permalink / raw)
  To: Richard Biener
  Cc: James Greenhalgh, GCC Patches, marcus-shawcroft, Andrew Pinski

On Aug 26, 2014, at 1:35 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> 
>> 8 The implicitly-defined copy constructor for class X performs a member-
>>  wise  copy of its subobjects.  The order of copying is the same as the
>>  order of initialization of bases and members in  a  user-defined  con-
>>  structor  (see  _class.base.init_).   Each  subobject is copied in the
>>  manner appropriate to its type
> 
> Thats quite specific ;)

I think you are making fun of it, but actually, it is very specific.  There are a ton of other words that back it up.  For example, you might have to print Hello World for each member copied.  The front end would generate calls to printf and those would be the semantics.  This last case is what happens in the general case of user-defined copy constructors.  However, the case we care about is a narrow one, where various front-end-specific bits are checked in the front end and we decide whether or not we can use memcpy to implement the copy.  The bits used to make that decision are front-end bits:

/* Nonzero for class type means that copy initialization of this type can use
   a bitwise copy.  */
#define TYPE_HAS_TRIVIAL_COPY_CTOR(NODE) ...

So, whenever this is true, we can use a bitwise copy (aka memcpy).  This is communicated to the middle end by the primitives generated.  If you want the middle end to generate the code, then this bit has to be communicated to the middle end.  The problem, of course, is that if you want the middle end to generate the printf, then more of the C++ type/object system would have to be communicated.

> So after reading the std quotations I still think that if we want to
> fix anything
> here then we want to fix it in the frontends (only the C++ FE knows
> init order in the details required - though I suppose the description was for
> non-POD types where the FE may already do this).

Yes.  Or, put another way, once you want to fix it in the middle end, you discover you are pulling large amounts of code in from the front end…  This is reasonable if one wants to share with other front ends that have similar rules and semantics, but then you want cooperating front-end people to figure out what to push down, and how, and why.  For example, we had to push exception handling down because, if for no other reason, the optimizer had to be aware of it.

I’ll give you a concrete case where pushing down would be beneficial.  For example, suppose there is a field in a structure for which, after LTO optimization runs, we discover the semantics of the copy and decide that the copy is trivial enough to do as a bitwise copy.  That can be pushed up (into TYPE_HAS_TRIVIAL_COPY_CTOR), and then all decisions based upon TYPE_HAS_TRIVIAL_COPY_CTOR can be redone and further optimized.

Why do this?  This type of optimization removes the abstraction penalty of code and allows people to write with more abstractions and yet not pay the price for those at runtime.  Not an unreasonable goal.  I mention this, just so someone might be able to see why one might want to do this.  I’m not arguing for it.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  2014-09-25 14:58             ` [Patch 0/4] " James Greenhalgh
                                 ` (2 preceding siblings ...)
  2014-09-25 14:58               ` [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
@ 2014-09-25 14:58               ` James Greenhalgh
  2014-09-25 15:09                 ` Steven Bosscher
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
  3 siblings, 2 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-09-25 14:58 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, richard.earnshaw, marcus.shawcroft, pinskia

[-- Attachment #1: Type: text/plain, Size: 1224 bytes --]


Hi,

This patch started off by hookizing MOVE_RATIO, but pulling on that
thread made it clear that most users of MOVE_RATIO really want to know
whether move_by_pieces is going to be used or not. For that we have
MOVE_BY_PIECES_P.

We can hookize this, and clean up most other callers of MOVE_RATIO.
We leave behind one in SRA and one in tree-inline, which we will clean
up shortly.
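
For illustration, a hypothetical port implementation of the new hook
could look like the following (example only, not part of this series;
the real AArch64 wiring is in patch 4/4):

  /* Example only: allow larger inline copies when optimizing for
     speed, and insist on at least 8-byte alignment.  SIZE and
     ALIGNMENT are measured in storage units.  */
  static bool
  example_move_by_pieces_profitable_p (unsigned int size,
                                       unsigned int alignment,
                                       bool speed_p)
  {
    return size <= (speed_p ? 64 : 16) && alignment >= 8;
  }

  #undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
  #define TARGET_MOVE_BY_PIECES_PROFITABLE_P \
    example_move_by_pieces_profitable_p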

Bootstrapped on x86_64, AArch64 and ARM. OK for trunk?

Thanks,
James

---
gcc/

2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>

	* target.def (move_by_pieces_profitable_p): New.
	* doc/tm.texi.in (MOVE_BY_PIECES_P): Reduce documentation to a stub
	describing that this macro is deprecated.
	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Add hook.
	* doc/tm.texi: Regenerate.
	* expr.c (MOVE_BY_PIECES_P): Remove.
	(STORE_BY_PIECES_P): Rewrite in terms of
	TARGET_MOVE_BY_PIECES_PROFITABLE_P.
	(can_move_by_pieces): Likewise.
	(emit_block_move_hints): Rewrite in terms of can_move_by_pieces.
	(emit_push_insn): Likewise.
	(expand_constructor): Likewise.
	* targhooks.c (get_move_ratio): New.
	(default_move_by_pieces_profitable_p): Likewise.
	* targhooks.h (default_move_by_pieces_profitable_p): New.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Patch-1-4-Hookize-MOVE_BY_PIECES_P-remove-most-uses-.patch --]
[-- Type: text/x-patch;  name=0001-Patch-1-4-Hookize-MOVE_BY_PIECES_P-remove-most-uses-.patch, Size: 10395 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 10af50e..162aa30 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6114,11 +6114,38 @@ If you don't define this, a reasonable default is used.
 @end defmac
 
 @defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{move_by_pieces} will be used to
-copy a chunk of memory, or whether some other block move mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
-@end defmac
+A C expression used to implement the default behaviour of
+@code{TARGET_MOVE_BY_PIECES_PROFITABLE_P}.  New ports should implement
+that hook in preference to this macro, which is deprecated.
+@end defmac
+
+@deftypefn {Target Hook} bool TARGET_MOVE_BY_PIECES_PROFITABLE_P (unsigned int @var{size}, unsigned int @var{alignment}, bool @var{speed_p})
+GCC will attempt several strategies when asked to copy between
+two areas of memory, for example when copying a @code{struct}.
+@code{move_by_pieces} implements such a copy as a sequence of
+memory-to-memory move insns.  Alternate strategies are to expand the
+@code{movmem} optab, to emit a library call, or to emit a unit-by-unit
+loop-based copy.
+
+This target hook should return true if, for a memory move with a given
+@var{size} and @var{alignment}, using the @code{move_by_pieces}
+infrastructure is expected to result in better code generation.
+Both @var{size} and @var{alignment} are measured in terms of storage
+units.
+
+The parameter @var{speed_p} is true if the code is currently being
+optimized for speed rather than size.
+
+Returning true for higher values of @var{size} can improve code generation
+for speed if the target does not provide an implementation of the
+@code{movmem} standard name, if the @code{movmem} implementation would be
+more expensive than a sequence of move insns, or if the overhead of a
+library call would dominate that of the body of the copy.
+
+Returning true for higher values of @var{size} may also cause an increase
+in code size, for example where the number of insns emitted to perform a
+move would be greater than that of a library call.
+@end deftypefn
 
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f6f241b..1894745 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4597,12 +4597,13 @@ If you don't define this, a reasonable default is used.
 @end defmac
 
 @defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{move_by_pieces} will be used to
-copy a chunk of memory, or whether some other block move mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
+A C expression used to implement the default behaviour of
+@code{TARGET_MOVE_BY_PIECES_PROFITABLE_P}.  New ports should implement
+that hook in preference to this macro, which is deprecated.
 @end defmac
 
+@hook TARGET_MOVE_BY_PIECES_PROFITABLE_P
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/expr.c b/gcc/expr.c
index 0af9b9a..59a85f7 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -157,14 +157,6 @@ static void do_tablejump (rtx, enum machine_mode, rtx, rtx, rtx, int);
 static rtx const_vector_from_tree (tree);
 static void write_complex_part (rtx, rtx, bool);
 
-/* This macro is used to determine whether move_by_pieces should be called
-   to perform a structure copy.  */
-#ifndef MOVE_BY_PIECES_P
-#define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, MOVE_MAX_PIECES) \
-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
-#endif
-
 /* This macro is used to determine whether clear_by_pieces should be
    called to clear storage.  */
 #ifndef CLEAR_BY_PIECES_P
@@ -185,8 +177,7 @@ static void write_complex_part (rtx, rtx, bool);
    called to "memcpy" storage when the source is a constant string.  */
 #ifndef STORE_BY_PIECES_P
 #define STORE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES) \
-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
+  (targetm.move_by_pieces_profitable_p (SIZE, ALIGN, optimize_insn_for_speed_p ()))
 #endif
 \f
 /* This is run to set up which modes can be used
@@ -837,7 +828,8 @@ int
 can_move_by_pieces (unsigned HOST_WIDE_INT len ATTRIBUTE_UNUSED,
 		    unsigned int align ATTRIBUTE_UNUSED)
 {
-  return MOVE_BY_PIECES_P (len, align);
+  return targetm.move_by_pieces_profitable_p (len, align,
+					      optimize_insn_for_speed_p ());
 }
 
 /* Generate several move instructions to copy LEN bytes from block FROM to
@@ -1180,7 +1172,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       set_mem_size (y, INTVAL (size));
     }
 
-  if (CONST_INT_P (size) && MOVE_BY_PIECES_P (INTVAL (size), align))
+  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
     move_by_pieces (x, y, INTVAL (size), align, 0);
   else if (emit_block_move_via_movmem (x, y, size, align,
 				       expected_align, expected_size,
@@ -4224,7 +4216,7 @@ emit_push_insn (rtx x, enum machine_mode mode, tree type, rtx size,
 	  && CONST_INT_P (size)
 	  && skip == 0
 	  && MEM_ALIGN (xinner) >= align
-	  && (MOVE_BY_PIECES_P ((unsigned) INTVAL (size) - used, align))
+	  && can_move_by_pieces ((unsigned) INTVAL (size) - used, align)
 	  /* Here we avoid the case of a structure whose weak alignment
 	     forces many pushes of a small amount of data,
 	     and such small pushes do rounding that causes trouble.  */
@@ -7845,7 +7837,7 @@ expand_constructor (tree exp, rtx target, enum expand_modifier modifier,
 	    && ! (target != 0 && safe_from_p (target, exp, 1)))
 		  || TREE_ADDRESSABLE (exp)
 		  || (tree_fits_uhwi_p (TYPE_SIZE_UNIT (type))
-		      && (! MOVE_BY_PIECES_P
+		      && (! can_move_by_pieces
 				     (tree_to_uhwi (TYPE_SIZE_UNIT (type)),
 				      TYPE_ALIGN (type)))
 		      && ! mostly_zeros_p (exp))))
diff --git a/gcc/target.def b/gcc/target.def
index ce11eae..0fd6235 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3049,6 +3049,36 @@ are the same as to this target hook.",
  int, (enum machine_mode mode, reg_class_t rclass, bool in),
  default_memory_move_cost)
 
+DEFHOOK
+(move_by_pieces_profitable_p,
+ "GCC will attempt several strategies when asked to copy between\n\
+two areas of memory, for example when copying a @code{struct}.\n\
+@code{move_by_pieces} implements such a copy as a sequence of\n\
+memory-to-memory move insns.  Alternate strategies are to expand the\n\
+@code{movmem} optab, to emit a library call, or to emit a unit-by-unit\n\
+loop-based copy.\n\
+\n\
+This target hook should return true if, for a memory move with a given\n\
+@var{size} and @var{alignment}, using the @code{move_by_pieces}\n\
+infrastructure is expected to result in better code generation.\n\
+Both @var{size} and @var{alignment} are measured in terms of storage\n\
+units.\n\
+\n\
+The parameter @var{speed_p} is true if the code is currently being\n\
+optimized for speed rather than size.\n\
+\n\
+Returning true for higher values of @var{size} can improve code generation\n\
+for speed if the target does not provide an implementation of the\n\
+@code{movmem} standard name, if the @code{movmem} implementation would be\n\
+more expensive than a sequence of move insns, or if the overhead of a\n\
+library call would dominate that of the body of the copy.\n\
+\n\
+Returning true for higher values of @var{size} may also cause an increase\n\
+in code size, for example where the number of insns emitted to perform a\n\
+move would be greater than that of a library call.",
+ bool, (unsigned int size, unsigned int alignment, bool speed_p),
+ default_move_by_pieces_profitable_p)
+
 /* True for MODE if the target expects that registers in this mode will
    be allocated to registers in a small register class.  The compiler is
    allowed to use registers explicitly used in the rtl as spill registers
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 9f15559..ffe7080 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1402,6 +1402,41 @@ default_register_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
 #endif
 }
 
+/* For hooks which use the MOVE_RATIO macro, this gives the legacy default
+   behaviour.  SPEED_P is true if we are compiling for speed.  */
+
+static unsigned int
+get_move_ratio (bool speed_p ATTRIBUTE_UNUSED)
+{
+  unsigned int move_ratio;
+#ifdef MOVE_RATIO
+  move_ratio = (unsigned int) MOVE_RATIO (speed_p);
+#else
+#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti)
+  move_ratio = 2;
+#else /* No movmem patterns, pick a default.  */
+  move_ratio = ((speed_p) ? 15 : 3);
+#endif
+#endif
+  return move_ratio;
+}
+
+/* Default implementation of TARGET_MOVE_BY_PIECES_PROFITABLE_P: return
+   true if move_by_pieces should be used for this copy.  */
+
+bool
+default_move_by_pieces_profitable_p (unsigned int size ATTRIBUTE_UNUSED,
+				     unsigned int alignment ATTRIBUTE_UNUSED,
+				     bool speed_p ATTRIBUTE_UNUSED)
+{
+#ifndef MOVE_BY_PIECES_P
+  return move_by_pieces_ninsns (size, alignment, MOVE_MAX_PIECES)
+	 < get_move_ratio (speed_p);
+#else
+  return !!(MOVE_BY_PIECES_P (size, alignment));
+#endif
+}
+
 bool
 default_profile_before_prologue (void)
 {
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 9178c30..93f21f8 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -181,6 +181,9 @@ extern int default_memory_move_cost (enum machine_mode, reg_class_t, bool);
 extern int default_register_move_cost (enum machine_mode, reg_class_t,
 				       reg_class_t);
 
+extern bool default_move_by_pieces_profitable_p (unsigned int,
+						 unsigned int, bool);
+
 extern bool default_profile_before_prologue (void);
 extern reg_class_t default_preferred_reload_class (rtx, reg_class_t);
 extern reg_class_t default_preferred_output_reload_class (rtx, reg_class_t);

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 2/4] Hack out a use of MOVE_RATIO in tree-inline.c
  2014-09-25 14:58             ` [Patch 0/4] " James Greenhalgh
  2014-09-25 14:58               ` [Patch AArch64 4/4] Wire up New target hooks James Greenhalgh
@ 2014-09-25 14:58               ` James Greenhalgh
  2014-09-26  8:58                 ` Richard Biener
  2014-09-25 14:58               ` [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
  2014-09-25 14:58               ` [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
  3 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-09-25 14:58 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, richard.earnshaw, marcus.shawcroft, pinskia

[-- Attachment #1: Type: text/plain, Size: 1153 bytes --]


Hi,

This patch hookizes the use of MOVE_RATIO in
tree-inline.c:estimate_move_cost as TARGET_ESTIMATE_BLOCK_COPY_NINSNS.
This hook should return an estimate for the number of instructions
which will be emitted to copy a block of memory.

tree-inline.c uses this in inlining heuristics to estimate the cost of
moving an object. The implementation is lacking, and will likely
underestimate the cost of most copies.

An initial iteration of this patch migrated tree-inline.c to use
move_by_pieces_profitable_p and move_by_pieces_ninsns, but this
proved painful for performance on ARM.

This patch puts the control in the hands of the backend, and uses
the existing logic as a default.
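
As an example of the sort of refinement a backend could make
(hypothetical, not part of this series), a target with 16-byte
load/store pairs might provide:

  /* Example only.  Assume one load pair plus one store pair moves 16
     bytes, and that an out-of-line memcpy costs about four insns to
     set up and call.  */
  static unsigned int
  example_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
  {
    if (size < 0 || size > (speed_p ? 256 : 64))
      return 4;
    return 2 * ((size + 15) / 16);
  }

  #undef TARGET_ESTIMATE_BLOCK_COPY_NINSNS
  #define TARGET_ESTIMATE_BLOCK_COPY_NINSNS \
    example_estimate_block_copy_ninsns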

Bootstrapped on x86_64, ARM, AArch64.

Ok?

Thanks,
James

---
2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>

	* target.def (estimate_block_copy_ninsns): New.
	* targhooks.h (default_estimate_block_copy_ninsns): New.
	* targhooks.c (default_estimate_block_copy_ninsns): New.
	* tree-inline.c (estimate_move_cost): Use new target hook.
	* doc/tm.texi.in (TARGET_ESTIMATE_BLOCK_COPY_NINSNS): New.
	* doc/tm.texi: Regenerate.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0002-Patch-2-4-Hack-out-a-use-of-MOVE_RATIO-in-tree-inlin.patch --]
[-- Type: text/x-patch;  name=0002-Patch-2-4-Hack-out-a-use-of-MOVE_RATIO-in-tree-inlin.patch, Size: 4705 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 162aa30..f59641a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6147,6 +6147,19 @@ in code size, for example where the number of insns emitted to perform a
 move would be greater than that of a library call.
 @end deftypefn
 
+@deftypefn {Target Hook} {unsigned int} TARGET_ESTIMATE_BLOCK_COPY_NINSNS (HOST_WIDE_INT @var{size}, bool @var{speed_p})
+This target hook should return an estimate of the number of
+instructions which will be emitted when copying an object whose size
+in storage units is @var{size}.
+
+The parameter @var{speed_p} is true if the code is currently being
+optimized for speed rather than size.
+
+Where the block copy would be implemented using a library call, the
+estimate should be for the number of instructions required to set up
+and perform that call.
+@end deftypefn
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 1894745..d2a4386 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4604,6 +4604,8 @@ that hook in preference to this macro, which is deprecated.
 
 @hook TARGET_MOVE_BY_PIECES_PROFITABLE_P
 
+@hook TARGET_ESTIMATE_BLOCK_COPY_NINSNS
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/target.def b/gcc/target.def
index 0fd6235..10f3b2e 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3079,6 +3079,21 @@ move would be greater than that of a library call.",
  bool, (unsigned int size, unsigned int alignment, bool speed_p),
  default_move_by_pieces_profitable_p)
 
+DEFHOOK
+(estimate_block_copy_ninsns,
+ "This target hook should return an estimate of the number of\n\
+instructions which will be emitted when copying an object whose size\n\
+in storage units is @var{size}.\n\
+\n\
+The parameter @var{speed_p} is true if the code is currently being\n\
+optimized for speed rather than size.\n\
+\n\
+Where the block copy would be implemented using a library call, the\n\
+estimate should be for the number of instructions required to set up\n\
+and perform that call.",
+ unsigned int, (HOST_WIDE_INT size, bool speed_p),
+ default_estimate_block_copy_ninsns)
+
 /* True for MODE if the target expects that registers in this mode will
    be allocated to registers in a small register class.  The compiler is
    allowed to use registers explicitly used in the rtl as spill registers
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index ffe7080..eb0a4cd 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1437,6 +1437,16 @@ default_move_by_pieces_profitable_p (unsigned int size ATTRIBUTE_UNUSED,
 #endif
 }
 
+unsigned int
+default_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
+{
+  if (size < 0 || size > MOVE_MAX_PIECES * get_move_ratio (speed_p))
+    /* Cost of a memcpy call, 3 arguments and the call.  */
+    return 4;
+  else
+    return ((size + MOVE_MAX_PIECES - 1) / MOVE_MAX_PIECES);
+}
+
 bool
 default_profile_before_prologue (void)
 {
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 93f21f8..f76ad31 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -183,6 +183,7 @@ extern int default_register_move_cost (enum machine_mode, reg_class_t,
 
 extern bool default_move_by_pieces_profitable_p (unsigned int,
 						 unsigned int, bool);
+extern unsigned int default_estimate_block_copy_ninsns (HOST_WIDE_INT, bool);
 
 extern bool default_profile_before_prologue (void);
 extern reg_class_t default_preferred_reload_class (rtx, reg_class_t);
diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c
index ad474a5..e5f8653 100644
--- a/gcc/tree-inline.c
+++ b/gcc/tree-inline.c
@@ -3617,7 +3617,7 @@ tree_inlinable_function_p (tree fn)
    cost based on whether optimizing for size or speed according to SPEED_P.  */
 
 int
-estimate_move_cost (tree type, bool ARG_UNUSED (speed_p))
+estimate_move_cost (tree type, bool speed_p)
 {
   HOST_WIDE_INT size;
 
@@ -3635,11 +3635,7 @@ estimate_move_cost (tree type, bool ARG_UNUSED (speed_p))
 
   size = int_size_in_bytes (type);
 
-  if (size < 0 || size > MOVE_MAX_PIECES * MOVE_RATIO (speed_p))
-    /* Cost of a memcpy call, 3 arguments and the call.  */
-    return 4;
-  else
-    return ((size + MOVE_MAX_PIECES - 1) / MOVE_MAX_PIECES);
+  return targetm.estimate_block_copy_ninsns (size, speed_p);
 }
 
 /* Returns cost of operation CODE, according to WEIGHTS  */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-09-25 14:58             ` [Patch 0/4] " James Greenhalgh
  2014-09-25 14:58               ` [Patch AArch64 4/4] Wire up New target hooks James Greenhalgh
  2014-09-25 14:58               ` [Patch 2/4] Hack out a use of MOVE_RATIO in tree-inline.c James Greenhalgh
@ 2014-09-25 14:58               ` James Greenhalgh
  2014-09-26  9:11                 ` Richard Biener
  2014-09-25 14:58               ` [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
  3 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-09-25 14:58 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, richard.earnshaw, marcus.shawcroft, pinskia

[-- Attachment #1: Type: text/plain, Size: 1964 bytes --]


Hi,

After hookizing MOVE_BY_PIECES_P and migrating tree-inline.c, we are
left with only one user of MOVE_RATIO - deciding the maximum size of
aggregate for SRA.

Past discussions have made it clear [1] that keeping this use of
MOVE_RATIO is undesirable. Clearly it is now also misnamed.

The previous iteration of this patch was rejected as too complicated. I
went off and tried simplifying it to use MOVE_RATIO, but if we do that we
end up breaking some interface boundaries between the driver and the
backend.

This patch partially hookizes MOVE_RATIO under the new name
TARGET_MAX_SCALARIZATION_SIZE and uses it to set default values for two
new parameters:

  sra-max-scalarization-size-Ospeed - The maximum size of aggregate
  to consider when compiling for speed.
  sra-max-scalarization-size-Osize - The maximum size of aggregate
  to consider when compiling for size.

We then modify SRA to use these parameters rather than MOVE_RATIO.
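
For example, a user could raise the limit for a speed-optimized build by
passing something like --param sra-max-scalarization-size-Ospeed=64 (the
value here is purely illustrative); when either parameter is left at its
default of zero, the value supplied by the new target hook is used
instead.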

Bootstrapped and regression tested for x86, arm and aarch64 with no
issues.

OK for trunk?

[1]: https://gcc.gnu.org/ml/gcc-patches/2014-08/msg01997.html

---
gcc/

2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>

	* doc/invoke.texi (sra-max-scalarization-size-Ospeed): Document.
	(sra-max-scalarization-size-Osize): Likewise.
	* doc/tm.texi.in
	(MOVE_RATIO): Reduce documentation to a stub, deprecate.
	(TARGET_MAX_SCALARIZATION_SIZE): Add hook.
	* doc/tm.texi: Regenerate.
	* defaults.h (MOVE_RATIO): Remove default implementation.
	(SET_RATIO): Add a default implementation if MOVE_RATIO
	is not defined.
	* params.def (sra-max-scalarization-size-Ospeed): New.
	(sra-max-scalarization-size-Osize): Likewise.
	* target.def (max_scalarization_size): New.
	* targhooks.c (default_max_scalarization_size): New.
	* targhooks.h (default_max_scalarization_size): New.
	* tree-sra.c (get_max_scalarization_size): New.
	(analyze_all_variable_accesses): Use it.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0003-Patchv2-3-4-Control-SRA-and-IPA-SRA-by-a-param-rathe.patch --]
[-- Type: text/x-patch;  name=0003-Patchv2-3-4-Control-SRA-and-IPA-SRA-by-a-param-rathe.patch, Size: 11182 bytes --]

diff --git a/gcc/defaults.h b/gcc/defaults.h
index c1776b0..f723e2c 100644
--- a/gcc/defaults.h
+++ b/gcc/defaults.h
@@ -1191,18 +1191,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define BRANCH_COST(speed_p, predictable_p) 1
 #endif
 
-/* If a memory-to-memory move would take MOVE_RATIO or more simple
-   move-instruction sequences, we will do a movmem or libcall instead.  */
-
-#ifndef MOVE_RATIO
-#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti)
-#define MOVE_RATIO(speed) 2
-#else
-/* If we are optimizing for space (-Os), cut down the default move ratio.  */
-#define MOVE_RATIO(speed) ((speed) ? 15 : 3)
-#endif
-#endif
-
 /* If a clear memory operation would take CLEAR_RATIO or more simple
    move-instruction sequences, we will do a setmem or libcall instead.  */
 
@@ -1219,7 +1207,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    SET_RATIO or more simple move-instruction sequences, we will do a movmem
    or libcall instead.  */
 #ifndef SET_RATIO
+#ifdef MOVE_RATIO
 #define SET_RATIO(speed) MOVE_RATIO (speed)
+#elif defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti)
+#define SET_RATIO(speed) 2
+#else
+/* If we are optimizing for space (-Os), cut down the default move ratio.  */
+#define SET_RATIO(speed) ((speed) ? 15 : 3)
+#endif
 #endif
 
 /* Supply a default definition for FUNCTION_ARG_PADDING:
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index eae4ab1..c3e6eaa 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10301,6 +10301,16 @@ parameters only when their cumulative size is less or equal to
 @option{ipa-sra-ptr-growth-factor} times the size of the original
 pointer parameter.
 
+@item sra-max-scalarization-size-Ospeed
+@item sra-max-scalarization-size-Osize
+The two Scalar Reduction of Aggregates passes (SRA and IPA-SRA) aim to
+replace scalar parts of aggregates with uses of independent scalar
+variables.  These parameters control the maximum size, in storage units,
+of aggregate which will be considered for replacement when compiling for
+speed
+(@option{sra-max-scalarization-size-Ospeed}) or size
+(@option{sra-max-scalarization-size-Osize}) respectively.
+
 @item tm-max-aggregate-size
 When making copies of thread-local variables in a transaction, this
 parameter specifies the size in bytes after which variables are
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index f59641a..b4061eb 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6098,20 +6098,25 @@ this macro is defined, it should produce a nonzero value when
 @end defmac
 
 @defmac MOVE_RATIO (@var{speed})
-The threshold of number of scalar memory-to-memory move insns, @emph{below}
-which a sequence of insns should be generated instead of a
-string move insn or a library call.  Increasing the value will always
-make code faster, but eventually incurs high cost in increased code size.
+This macro is deprecated and is only used to guide the default behaviours
+of @code{TARGET_MOVE_BY_PIECES_PROFITABLE_P} and
+@code{TARGET_MAX_SCALARIZATION_SIZE}.  New ports should implement
+those hooks in preference to this macro.
+@end defmac
 
-Note that on machines where the corresponding move insn is a
-@code{define_expand} that emits a sequence of insns, this macro counts
-the number of such sequences.
+@deftypefn {Target Hook} {unsigned int} TARGET_MAX_SCALARIZATION_SIZE (bool @var{speed_p})
+This target hook is used by the Scalar Replacement of Aggregates passes
+(SRA and IPA-SRA).  This hook gives the maximum size, in storage units,
+of aggregate to consider for replacement.  @var{speed_p} is true if we are
+currently compiling for speed.
 
-The parameter @var{speed} is true if the code is currently being
-optimized for speed rather than size.
+By default, the maximum scalarization size is determined by MOVE_RATIO,
+if it is defined.  Otherwise, a sensible default is chosen.
 
-If you don't define this, a reasonable default is used.
-@end defmac
+Note that a user may choose to override this target hook with the
+parameters @code{sra-max-scalarization-size-Ospeed} and
+@code{sra-max-scalarization-size-Osize}.
+@end deftypefn
 
 @defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
 A C expression used to implement the default behaviour of
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index d2a4386..bdd1ec4 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4581,21 +4581,14 @@ this macro is defined, it should produce a nonzero value when
 @end defmac
 
 @defmac MOVE_RATIO (@var{speed})
-The threshold of number of scalar memory-to-memory move insns, @emph{below}
-which a sequence of insns should be generated instead of a
-string move insn or a library call.  Increasing the value will always
-make code faster, but eventually incurs high cost in increased code size.
-
-Note that on machines where the corresponding move insn is a
-@code{define_expand} that emits a sequence of insns, this macro counts
-the number of such sequences.
-
-The parameter @var{speed} is true if the code is currently being
-optimized for speed rather than size.
-
-If you don't define this, a reasonable default is used.
+This macro is deprecated and is only used to guide the default behaviours
+of @code{TARGET_MOVE_BY_PIECES_PROFITABLE_P} and
+@code{TARGET_MAX_SCALARIZATION_SIZE}.  New ports should implement
+those hooks in preference to this macro.
 @end defmac
 
+@hook TARGET_MAX_SCALARIZATION_SIZE
+
 @defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
 A C expression used to implement the default behaviour of
 @code{TARGET_MOVE_BY_PIECES_PROFITABLE_P}.  New ports should implement
diff --git a/gcc/params.def b/gcc/params.def
index aefdd07..7b6c7e2 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -942,6 +942,18 @@ DEFPARAM (PARAM_TM_MAX_AGGREGATE_SIZE,
 	  "pairs",
 	  9, 0, 0)
 
+DEFPARAM (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
+	  "sra-max-scalarization-size-Ospeed",
+	  "Maximum size, in storage units, of an aggregate which should be "
+	  "considered for scalarization when compiling for speed",
+	  0, 0, 0)
+
+DEFPARAM (PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE,
+	  "sra-max-scalarization-size-Osize",
+	  "Maximum size, in storage units, of an aggregate which should be "
+	  "considered for scalarization when compiling for size",
+	  0, 0, 0)
+
 DEFPARAM (PARAM_IPA_CP_VALUE_LIST_SIZE,
 	  "ipa-cp-value-list-size",
 	  "Maximum size of a list of values associated with each parameter for "
diff --git a/gcc/target.def b/gcc/target.def
index 10f3b2e..4e19845 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3049,6 +3049,24 @@ are the same as to this target hook.",
  int, (enum machine_mode mode, reg_class_t rclass, bool in),
  default_memory_move_cost)
 
+/* Return the maximum size in bytes of aggregate which will be considered
+   for replacement by SRA/IP-SRA.  */
+DEFHOOK
+(max_scalarization_size,
+ "This target hook is used by the Scalar Replacement of Aggregates passes\n\
+(SRA and IPA-SRA).  This hook gives the maximimum size, in storage units,\n\
+of aggregate to consider for replacement.  @var{speed_p} is true if we are\n\
+currently compiling for speed.\n\
+\n\
+By default, the maximum scalarization size is determined by MOVE_RATIO,\n\
+if it is defined.  Otherwise, a sensible default is chosen.\n\
+\n\
+Note that a user may choose to override this target hook with the\n\
+parameters @code{sra-max-scalarization-size-Ospeed} and\n\
+@code{sra-max-scalarization-size-Osize}.",
+ unsigned int, (bool speed_p),
+ default_max_scalarization_size)
+
 DEFHOOK
 (move_by_pieces_profitable_p,
  "GCC will attempt several strategies when asked to copy between\n\
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index eb0a4cd..abc94ff 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1421,6 +1421,15 @@ get_move_ratio (bool speed_p ATTRIBUTE_UNUSED)
   return move_ratio;
 }
 
+/* Return the maximum size, in storage units, of aggregate
+   which will be considered for replacement by SRA/IP-SRA.  */
+
+unsigned int
+default_max_scalarization_size (bool speed_p ATTRIBUTE_UNUSED)
+{
+  return get_move_ratio (speed_p) * MOVE_MAX_PIECES;
+}
+
 /* The threshold of move insns below which the movmem optab is expanded or a
    call to memcpy is emitted.  */
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index f76ad31..35467f8 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -181,6 +181,7 @@ extern int default_memory_move_cost (enum machine_mode, reg_class_t, bool);
 extern int default_register_move_cost (enum machine_mode, reg_class_t,
 				       reg_class_t);
 
+extern unsigned int default_max_scalarization_size (bool size_p);
 extern bool default_move_by_pieces_profitable_p (unsigned int,
 						 unsigned int, bool);
 extern unsigned int default_estimate_block_copy_ninsns (HOST_WIDE_INT, bool);
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index 8259dba..c611d29 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -2482,6 +2482,25 @@ propagate_all_subaccesses (void)
     }
 }
 
+/* Return the appropriate parameter value giving the maximum size of
+   aggregate (in storage units) to be considered for scalarization.
+   SPEED_P is true if we are currently optimizing for speed
+   rather than size.  */
+
+unsigned int
+get_max_scalarization_size (bool speed_p)
+{
+  unsigned param_max_scalarization_size
+    = speed_p
+      ? PARAM_VALUE (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED)
+      : PARAM_VALUE (PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE);
+
+  if (!param_max_scalarization_size)
+    return targetm.max_scalarization_size (speed_p);
+
+  return param_max_scalarization_size;
+}
+
 /* Go through all accesses collected throughout the (intraprocedural) analysis
    stage, exclude overlapping ones, identify representatives and build trees
    out of them, making decisions about scalarization on the way.  Return true
@@ -2493,10 +2512,10 @@ analyze_all_variable_accesses (void)
   int res = 0;
   bitmap tmp = BITMAP_ALLOC (NULL);
   bitmap_iterator bi;
-  unsigned i, max_total_scalarization_size;
-
-  max_total_scalarization_size = UNITS_PER_WORD * BITS_PER_UNIT
-    * MOVE_RATIO (optimize_function_for_speed_p (cfun));
+  unsigned i;
+  unsigned int max_scalarization_size
+    = get_max_scalarization_size (optimize_function_for_size_p (cfun))
+      * BITS_PER_UNIT;
 
   EXECUTE_IF_SET_IN_BITMAP (candidate_bitmap, 0, i, bi)
     if (bitmap_bit_p (should_scalarize_away_bitmap, i)
@@ -2508,7 +2527,7 @@ analyze_all_variable_accesses (void)
 	    && type_consists_of_records_p (TREE_TYPE (var)))
 	  {
 	    if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var)))
-		<= max_total_scalarization_size)
+		<= max_scalarization_size)
 	      {
 		completely_scalarize_var (var);
 		if (dump_file && (dump_flags & TDF_DETAILS))

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch AArch64 4/4] Wire up New target hooks
  2014-09-25 14:58             ` [Patch 0/4] " James Greenhalgh
@ 2014-09-25 14:58               ` James Greenhalgh
  2014-09-26 13:31                 ` James Greenhalgh
  2014-09-25 14:58               ` [Patch 2/4] Hack out a use of MOVE_RATIO in tree-inline.c James Greenhalgh
                                 ` (2 subsequent siblings)
  3 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-09-25 14:58 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, richard.earnshaw, marcus.shawcroft, pinskia

[-- Attachment #1: Type: text/plain, Size: 893 bytes --]


Hi,

This patch wires up our new target hooks for AArch64. This also means
we can bring back the two failing SRA tests.

Bootstrapped on AArch64 with no issues.

OK for trunk?

Thanks,
James

---
gcc/

2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64.c
	(aarch64_estimate_movmem_ninsns): New.
	(aarch64_expand_movmem): Refactor old move costs.
	(aarch64_move_by_pieces_profitable_p): New.
	(aarch64_estimate_block_copy_ninsns): Likewise.
	(aarch64_max_scalarization_size): Likewise.
	(TARGET_MAX_SCALARIZATION_SIZE): Likewise.
	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
	* config/aarch64/aarch64.h (AARCH64_MOVE_RATIO): New.
	(MOVE_RATIO): Delete.

gcc/testsuite/

2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.dg/tree-ssa/pr42585.c: Bring back for AArch64.
	* gcc.dg/tree-ssa/sra-12.c: Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0004-Patch-AArch64-4-4-Wire-up-New-target-hooks.patch --]
[-- Type: text/x-patch;  name=0004-Patch-AArch64-4-4-Wire-up-New-target-hooks.patch, Size: 7981 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3483081..d8b5a4a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9616,6 +9616,34 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
   return false;
 }
 
+static unsigned int
+aarch64_estimate_movmem_ninsns (HOST_WIDE_INT size)
+{
+  HOST_WIDE_INT chunks = 0;
+  int n = size;
+
+  /* 3 bytes is a 2-byte then a 1-byte copy.  */
+  if (n == 3)
+    return 2;
+
+  /* 5, 6, 7 bytes need an extra copy.  */
+  if (n > 4 && n < 8)
+    chunks++;
+
+  /* If n was greater than 8, it is dealt with in 8/16-byte chunks
+     first.  */
+  chunks += n / 16;
+  n %= 16;
+  chunks += n / 8;
+  n %= 8;
+
+  /* Anything left is dealt with in one instruction.  */
+  if (n != 0)
+    chunks++;
+
+  return chunks;
+}
+
 /* Return a new RTX holding the result of moving POINTER forward by
    AMOUNT bytes.  */
 
@@ -9673,7 +9701,7 @@ aarch64_expand_movmem (rtx *operands)
 
   /* When optimizing for size, give a better estimate of the length of a
      memcpy call, but use the default otherwise.  */
-  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+  unsigned int max_instructions = AARCH64_MOVE_RATIO (speed_p);
 
   /* We can't do anything smart if the amount to copy is not constant.  */
   if (!CONST_INT_P (operands[2]))
@@ -9681,10 +9709,9 @@ aarch64_expand_movmem (rtx *operands)
 
   n = UINTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For cases below 16 bytes we
-     need to make at most two moves.  For cases above 16 bytes it will be one
-     move for each 16 byte chunk, then at most two additional moves.  */
-  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+  /* Try to keep the number of instructions we emit low, fail expansion
+     if we are unable to and leave it to memcpy.  */
+  if (aarch64_estimate_movmem_ninsns (n) > max_instructions)
     return false;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
@@ -9774,6 +9801,57 @@ aarch64_expand_movmem (rtx *operands)
   return true;
 }
 
+/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
+
+bool
+aarch64_move_by_pieces_profitable_p (unsigned int size,
+				     unsigned int align,
+				     bool speed_p)
+{
+  /* For strict alignment we don't want to use our unaligned
+     movmem implementation.  */
+  if (STRICT_ALIGNMENT)
+    return (AARCH64_MOVE_RATIO (speed_p)
+	    < move_by_pieces_ninsns (size, align, speed_p));
+
+  /* If we have an overhang of 3, 6 or 7 bytes, we would emit an unaligned
+     load to cover it, if this is likely to be slow we would do better
+     going through move_by_pieces.  */
+  if (size % 8 > 5)
+    return SLOW_UNALIGNED_ACCESS (DImode, 1);
+  else if (size % 4 == 3)
+    return SLOW_UNALIGNED_ACCESS (SImode, 1);
+
+  /* We can likely do a better job than the move_by_pieces infrastructure
+     can.  */
+  return false;
+}
+
+/* Implement TARGET_ESTIMATE_BLOCK_COPY_NINSNS.  */
+
+unsigned int
+aarch64_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
+{
+  if (aarch64_move_by_pieces_profitable_p (size, 8, speed_p))
+    return move_by_pieces_ninsns (size, 8, MOVE_MAX_PIECES);
+  else if (aarch64_estimate_movmem_ninsns (size)
+	   < AARCH64_MOVE_RATIO (speed_p))
+    return aarch64_estimate_movmem_ninsns (size);
+  else
+    /* memcpy.  Set up 3 arguments and make a call.  */
+    return 4;
+}
+
+/* Implement TARGET_MAX_SCALARIZATION_SIZE.  */
+
+unsigned int
+aarch64_max_scalarization_size (bool speed_p)
+{
+  /* The maximum number of instructions we are willing to use * the
+     maximum size we can move in one instruction (LDP/STP).  */
+  return AARCH64_MOVE_RATIO (speed_p) * 16;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -9843,6 +9921,10 @@ aarch64_expand_movmem (rtx *operands)
 #undef TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
 
+#undef TARGET_ESTIMATE_BLOCK_COPY_NINSNS
+#define TARGET_ESTIMATE_BLOCK_COPY_NINSNS \
+  aarch64_estimate_block_copy_ninsns
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
@@ -9897,9 +9979,17 @@ aarch64_expand_movmem (rtx *operands)
 #undef TARGET_MANGLE_TYPE
 #define TARGET_MANGLE_TYPE aarch64_mangle_type
 
+#undef TARGET_MAX_SCALARIZATION_SIZE
+#define TARGET_MAX_SCALARIZATION_SIZE \
+  aarch64_max_scalarization_size
+
 #undef TARGET_MEMORY_MOVE_COST
 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
 
+#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
+#define TARGET_MOVE_BY_PIECES_PROFITABLE_P \
+  aarch64_move_by_pieces_profitable_p
+
 #undef TARGET_MUST_PASS_IN_STACK
 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index db950da..5c8d37d 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -678,17 +678,10 @@ do {									     \
 /* Maximum bytes moved by a single instruction (load/store pair).  */
 #define MOVE_MAX (UNITS_PER_WORD * 2)
 
-/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
+/* The base cost overhead of a memcpy call, for CLEAR_RATIO and friends.  */
 #define AARCH64_CALL_RATIO 8
 
-/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
-   move_by_pieces will continually copy the largest safe chunks.  So a
-   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
-   for both size and speed of copy, so we will instead use the "movmem"
-   standard name to implement the copy.  This logic does not apply when
-   targeting -mstrict-align, so keep a sensible default in that case.  */
-#define MOVE_RATIO(speed) \
-  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
+#define AARCH64_MOVE_RATIO(speed) (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
 
 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
index 07f575d..a970c85 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
@@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
 /* Whether the structs are totally scalarized or not depends on the
    MOVE_RATIO macro definition in the back end.  The scalarization will
    not take place when using small values for MOVE_RATIO.  */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
-/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
index 45aa963..59e5e6a 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
@@ -21,5 +21,5 @@ int foo (struct S *p)
   *p = l;
 }
 
-/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
 /* { dg-final { cleanup-tree-dump "release_ssa" } } */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 0/4] Re: Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-08-20  9:21           ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO Richard Biener
@ 2014-09-25 14:58             ` James Greenhalgh
  2014-09-25 14:58               ` [Patch AArch64 4/4] Wire up New target hooks James Greenhalgh
                                 ` (3 more replies)
  0 siblings, 4 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-09-25 14:58 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, richard.earnshaw, marcus.shawcroft, pinskia

[-- Attachment #1: Type: text/plain, Size: 2405 bytes --]

On Wed, Aug 20, 2014 at 10:21:41AM +0100, Richard Biener wrote:
> I think this is overly complicated and instead SRA should only
> use the parameters.  Targets can adjust their default (like they
> do for other parameters).
> 
> The default should be MOVE_RATIO which should be applied
> where the common code adjusts parameters (see existing
> examples for not overriding user specified ones).

Hi,

My attempts to simplify this patch didn't work out so well...

If I move the target hook to the driver, I can't use MOVE_RATIO to
find a default value. MOVE_RATIO for some targets is wired to a
function in the back-end, or otherwise references symbols we don't
want to pull in to libcommon/libcommon-target.

My next approach was to hookize just this one use of MOVE_RATIO - again,
this was a failure as libcommon-target doesn't have enough access to the
CPU tuning tables used by backends (nor should it).

That took me to my current approach: hookize each of the three unique
uses of MOVE_RATIO, allowing us to eliminate it entirely. This still
doesn't let us simplify the patch I sent in August, but it does neaten
up the users of MOVE_RATIO and lets us separate out the concerns.

This gives targets and micro-architectures much more fine-grained
control over the tuning parameters for inlining, SRA and move_by_pieces,
which were all previously wrapped in MOVE_RATIO.
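
To give a concrete feel for the inlining side of this: with the default
TARGET_ESTIMATE_BLOCK_COPY_NINSNS implementation from patch 2/4, on a
target where MOVE_MAX_PIECES is, say, 16, a 40-byte aggregate copy is
costed at ceil (40 / 16) = 3 instructions, while anything larger than
MOVE_MAX_PIECES times the move ratio falls back to the flat 4-instruction
estimate for a memcpy call. A target overriding the hook can replace that
rough model with its own.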

I've bootstrapped and tested the series on x86_64, ARM and AArch64 with
no issues.

The patches coming are:

  [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO

    Which moves everything consulting MOVE_RATIO to decide whether
    the move_by_pieces infrastructure will be used to a new hook
    TARGET_MOVE_BY_PIECES_PROFITABLE_P.

  [Patch 2/4] Hack out a use of MOVE_RATIO in tree-inline.c

    Which adds the target hook TARGET_ESTIMATE_BLOCK_COPY_NINSNS,
    used to estimate the number of instructions a target will require
    to move a block. This is used by inlining to estimate the cost of
    various parameters.

  [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than
    MOVE_RATIO

    Which is a similar patch to that I sent in August, adding new
    parameters and a new target hook to control when SRA should be used.

  [Patch AArch64 4/4] Wire up New target hooks

    Which wires all of this up for AArch64.

Thanks,
James

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  2014-09-25 14:58               ` [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
@ 2014-09-25 15:09                 ` Steven Bosscher
  2014-09-26  9:16                   ` Richard Biener
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
  1 sibling, 1 reply; 62+ messages in thread
From: Steven Bosscher @ 2014-09-25 15:09 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: GCC Patches, Richard Biener, Richard Earnshaw, Marcus Shawcroft,
	Andrew Pinski

On Thu, Sep 25, 2014 at 4:57 PM, James Greenhalgh wrote:
>         * doc/tm.texi.in (MOVE_BY_PIECES_P): Reduce documentation to a stub
>         describing that this macro is deprecated.

Remove it entirely and poison it in system.h?
It takes changes to only a few targets: mips, arc, s390, and sh.
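
(For reference, the poisoning would just be a one-liner added to the
existing poison list in system.h, along the lines of:

  /* Sketch: reject any remaining definitions or uses of the old macro.  */
  #pragma GCC poison MOVE_BY_PIECES_P

so any port that still defines or tests it fails to build.)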

Thanks for hookizing this!

Ciao!
Steven

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 2/4] Hack out a use of MOVE_RATIO in tree-inline.c
  2014-09-25 14:58               ` [Patch 2/4] Hack out a use of MOVE_RATIO in tree-inline.c James Greenhalgh
@ 2014-09-26  8:58                 ` Richard Biener
  0 siblings, 0 replies; 62+ messages in thread
From: Richard Biener @ 2014-09-26  8:58 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: GCC Patches, Richard Earnshaw, Marcus Shawcroft, Andrew Pinski

On Thu, Sep 25, 2014 at 4:57 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
>
> Hi,
>
> This patch hookizes the use of MOVE_RATIO in
> tree-inline.c:estimate_move_cost as TARGET_ESTIMATE_BLOCK_COPY_NINSNS.
> This hook should return an estimate for the number of instructions
> which will be emitted to copy a block of memory.
>
> tree-inline.c uses this in inlining heuristics to estimate the cost of
> moving an object. The implementation is lacking, and will likely
> underestimate the size of most copies.
>
> An initial iteration of this patch migrated tree-inline.c to use
> move_by_pieces_profitable_p and move_by_pieces_ninsns, but this
> proved painful for performance on ARM.
>
> This patch puts the control in the hands of the backend, and uses
> the existing logic as a default.
>
> Bootstrapped on x86_64, ARM, AArch64.
>
> Ok?

Note that if you are here then one issue is that the inliner uses
this very same function to estimate cost of function parameters/returns
that are eventually passed/returned in registers.  That's of course
a pre-existing issue.

+ "This target hook should return an estimate of the number of\n\
+instructions which will be emitted when copying an object with a size\n\
+in units @var{size}.\n\

I'm confused by this sentence.  Doesn't it mean to say
"when copying an object with size @var{size} in units of word_mode."?

It's always difficult when transforming a heuristic using existing
target macros to a new hook.  It would be best to think about the
heuristic itself again and make the hook more closely match
the uses of the heuristic.  In this case it would mean splitting
this up into the load/store and the function parameter case.

Note that estimate_move_cost is used elsewhere as well.

Richard.

> Thanks,
> James
>
> ---
> 2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * target.def (estimate_block_copy_ninsns): New.
>         * targhooks.h (default_estimate_block_copy_ninsns): New.
>         * targhooks.c (default_estimate_block_copy_ninsns): New.
>         * tree-inline.c (estimate_move_cost): Use new target hook.
>         * doc/tm.texi.in (TARGET_ESTIMATE_BLOCK_COPY_NINSNS): New.
>         * doc/tm.texi: Regenerate.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-09-25 14:58               ` [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
@ 2014-09-26  9:11                 ` Richard Biener
  2014-10-01 16:38                   ` James Greenhalgh
  0 siblings, 1 reply; 62+ messages in thread
From: Richard Biener @ 2014-09-26  9:11 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: GCC Patches, Richard Earnshaw, Marcus Shawcroft, Andrew Pinski

On Thu, Sep 25, 2014 at 4:57 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
>
> Hi,
>
> After hookizing MOVE_BY_PIECES_P and migrating tree-inline.c, we are
> left with only one user of MOVE_RATIO - deciding the maximum size of
> aggregate for SRA.
>
> Past discussions have made it clear [1] that keeping this use of
> MOVE_RATIO is undesirable. Clearly it is now also misnamed.
>
> The previous iteration of this patch was rejected as too complicated. I
> went off and tried simplifying it to use MOVE_RATIO, but if we do that we
> end up breaking some interface boundaries between the driver and the
> backend.
>
> This patch partially hookizes MOVE_RATIO under the new name
> TARGET_MAX_SCALARIZATION_SIZE and uses it to set default values for two
> new parameters:
>
>   sra-max-scalarization-size-Ospeed - The maximum size of aggregate
>   to consider when compiling for speed
>   sra-max-scalarization-size-Osize - The maximum size of aggregate
>   to consider when compiling for size.
>
> We then modify SRA to use these parameters rather than MOVE_RATIO.
>
> Bootstrapped and regression tested for x86, arm and aarch64 with no
> issues.
>
> OK for trunk?

+/* Return the maximum size in bytes of aggregate which will be considered
+   for replacement by SRA/IP-SRA.  */
+DEFHOOK
+(max_scalarization_size,
+ "This target hook is used by the Scalar Replacement of Aggregates passes\n\
+(SRA and IPA-SRA).  This hook gives the maximum size, in storage units,\n\
+of aggregate to consider for replacement.  @var{speed_p} is true if we are\n\
+currently compiling for speed.\n\
+\n\
+By default, the maximum scalarization size is determined by MOVE_RATIO,\n\
+if it is defined.  Otherwise, a sensible default is chosen.\n\

doesn't match

+unsigned int
+default_max_scalarization_size (bool speed_p ATTRIBUTE_UNUSED)
+{
+  return get_move_ratio (speed_p) * MOVE_MAX_PIECES;

+unsigned int
+get_max_scalarization_size (bool speed_p)
+{
+  unsigned param_max_scalarization_size
+    = speed_p
+      ? PARAM_VALUE (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED)
+      : PARAM_VALUE (PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE);
+
+  if (!param_max_scalarization_size)
+    return targetm.max_scalarization_size (speed_p);
+

the target-hook takes a size_p parameter, here you have a speed_p
parameter but call it as

+  unsigned i;
+  unsigned int max_scalarization_size
+    = get_max_scalarization_size (optimize_function_for_size_p (cfun))
+      * BITS_PER_UNIT;

there is some mismatch.  Not sure if we generally prefer speed_p
over size_p, grepping headers shows zero size_p parameters and
some speed_p ones.

Given that the special value denoting the default for the new --params is
zero, a user cannot disable scalarization that way.

I still somehow dislike that you need a target hook to compute the
default.  Why doesn't it work to do, in opts.c:default_options_optimization

maybe_set_param_value
  (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
   get_move_ratio (speed_p) * MOVE_MAX_PIECES,
   opts->x_param_values, opts_set->x_param_values);

and override that default in targets option_override hook the same way?

Thanks,
Richard.

> [1]: https://gcc.gnu.org/ml/gcc-patches/2014-08/msg01997.html
>
> ---
> gcc/
>
> 2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * doc/invoke.texi (sra-max-scalarization-size-Ospeed): Document.
>         (sra-max-scalarization-size-Osize): Likewise.
>         * doc/tm.texi.in
>         (MOVE_RATIO): Reduce documentation to a stub, deprecate.
>         (TARGET_MAX_SCALARIZATION_SIZE): Add hook.
>         * doc/tm.texi: Regenerate.
>         * defaults.h (MOVE_RATIO): Remove default implementation.
>         (SET_RATIO): Add a default implementation if MOVE_RATIO
>         is not defined.
>         * params.def (sra-max-scalarization-size-Ospeed): New.
>         (sra-max-scalarization-size-Osize): Likewise.
>         * target.def (max_scalarization_size): New.
>         * targhooks.c (default_max_scalarization_size): New.
>         * targhooks.h (default_max_scalarization_size): New.
>         * tree-sra.c (get_max_scalarization_size): New.
>         (analyze_all_variable_accesses): Use it.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  2014-09-25 15:09                 ` Steven Bosscher
@ 2014-09-26  9:16                   ` Richard Biener
  0 siblings, 0 replies; 62+ messages in thread
From: Richard Biener @ 2014-09-26  9:16 UTC (permalink / raw)
  To: Steven Bosscher
  Cc: James Greenhalgh, GCC Patches, Richard Earnshaw,
	Marcus Shawcroft, Andrew Pinski

On Thu, Sep 25, 2014 at 5:08 PM, Steven Bosscher <stevenb.gcc@gmail.com> wrote:
> On Thu, Sep 25, 2014 at 4:57 PM, James Greenhalgh wrote:
>>         * doc/tm.texi.in (MOVE_BY_PIECES_P): Reduce documentation to a stub
>>         describing that this macro is deprecated.
>
> Remove it entirely and poison it in system.h?
> It takes changes to only a few targets: mips, arc, s390, and sh.
>
> Thanks for hookizing this!

Indeed.

The patch is ok - please consider doing what Steven suggested as
followup.

Thanks,
Richard.

> Ciao!
> Steven

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch AArch64 4/4] Wire up New target hooks
  2014-09-25 14:58               ` [Patch AArch64 4/4] Wire up New target hooks James Greenhalgh
@ 2014-09-26 13:31                 ` James Greenhalgh
  0 siblings, 0 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-09-26 13:31 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, Richard Earnshaw, Marcus Shawcroft, pinskia

On Thu, Sep 25, 2014 at 03:57:36PM +0100, James Greenhalgh wrote:
> 
> Hi,
> 
> This patch wires up our new target hooks for AArch64. This also means
> we can bring back the two failing SRA tests.
> 
> Bootstrapped on AArch64 with no issues.
> 
> OK for trunk?

No way! This patch is nonsense as it stands!

I'd like to withdraw this for now while I have a think about what
has gone wrong!

Thanks,
James

> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64.c
> 	(aarch64_estimate_movmem_ninsns): New.
> 	(aarch64_expand_movmem): Refactor old move costs.
> 	(aarch64_move_by_pieces_profitable_p): New.
> 	(aarch64_estimate_block_copy_ninsns): Likewise.
> 	(aarch64_max_scalarization_size): Likewise.
> 	(TARGET_MAX_SCALARIZATION_SIZE): Likewise.
> 	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
> 	* config/aarch64/aarch64.h (AARCH64_MOVE_RATIO): New.
> 	(MOVE_RATIO): Delete.
> 
> gcc/testsuite/
> 
> 2014-09-25  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* gcc.dg/tree-ssa/pr42585.c: Bring back for AArch64.
> 	* gcc.dg/tree-ssa/sra-12.c: Likewise.
> 

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 3483081..d8b5a4a 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9616,6 +9616,34 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
>    return false;
>  }
>  
> +static unsigned int
> +aarch64_estimate_movmem_ninsns (HOST_WIDE_INT size)
> +{
> +  HOST_WIDE_INT chunks = 0;
> +  int n = size;
> +
> +  /* 3 bytes is a 2-byte then a 1-byte copy.  */
> +  if (n == 3)
> +    return 2;
> +
> +  /* 5, 6, 7 bytes need an extra copy.  */
> +  if (n > 4 && n < 8)
> +    chunks++;
> +
> +  /* If n was greater than 8, it is dealt with in 8/16-byte chunks
> +     first.  */
> +  chunks += n / 16;
> +  n %= 16;
> +  chunks += n / 8;
> +  n %= 8;
> +
> +  /* Anything left is dealt with in one instruction.  */
> +  if (n != 0)
> +    chunks++;
> +
> +  return chunks;
> +}
> +
>  /* Return a new RTX holding the result of moving POINTER forward by
>     AMOUNT bytes.  */
>  
> @@ -9673,7 +9701,7 @@ aarch64_expand_movmem (rtx *operands)
>  
>    /* When optimizing for size, give a better estimate of the length of a
>       memcpy call, but use the default otherwise.  */
> -  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
> +  unsigned int max_instructions = AARCH64_MOVE_RATIO (speed_p);
>  
>    /* We can't do anything smart if the amount to copy is not constant.  */
>    if (!CONST_INT_P (operands[2]))
> @@ -9681,10 +9709,9 @@ aarch64_expand_movmem (rtx *operands)
>  
>    n = UINTVAL (operands[2]);
>  
> -  /* Try to keep the number of instructions low.  For cases below 16 bytes we
> -     need to make at most two moves.  For cases above 16 bytes it will be one
> -     move for each 16 byte chunk, then at most two additional moves.  */
> -  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
> +  /* Try to keep the number of instructions we emit low, fail expansion
> +     if we are unable to and leave it to memcpy.  */
> +  if (aarch64_estimate_movmem_ninsns (n) > max_instructions)
>      return false;
>  
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> @@ -9774,6 +9801,57 @@ aarch64_expand_movmem (rtx *operands)
>    return true;
>  }
>  
> +/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
> +
> +bool
> +aarch64_move_by_pieces_profitable_p (unsigned int size,
> +				     unsigned int align,
> +				     bool speed_p)
> +{
> +  /* For strict alignment we don't want to use our unaligned
> +     movmem implementation.  */
> +  if (STRICT_ALIGNMENT)
> +    return (AARCH64_MOVE_RATIO (speed_p)
> +	    < move_by_pieces_ninsns (size, align, speed_p));
> +
> +  /* If we have an overhang of 3, 6 or 7 bytes, we would emit an unaligned
> +     load to cover it, if this is likely to be slow we would do better
> +     going through move_by_pieces.  */
> +  if (size % 8 > 5)
> +    return SLOW_UNALIGNED_ACCESS (DImode, 1);
> +  else if (size % 4 == 3)
> +    return SLOW_UNALIGNED_ACCESS (SImode, 1);
> +
> +  /* We can likely do a better job than the move_by_pieces infrastructure
> +     can.  */
> +  return false;
> +}
> +
> +/* Implement TARGET_ESTIMATE_BLOCK_COPY_NINSNS.  */
> +
> +unsigned int
> +aarch64_estimate_block_copy_ninsns (HOST_WIDE_INT size, bool speed_p)
> +{
> +  if (aarch64_move_by_pieces_profitable_p (size, 8, speed_p))
> +    return move_by_pieces_ninsns (size, 8, MOVE_MAX_PIECES);
> +  else if (aarch64_estimate_movmem_ninsns (size)
> +	   < AARCH64_MOVE_RATIO (speed_p))
> +    return aarch64_estimate_movmem_ninsns (size);
> +  else
> +    /* memcpy.  Set up 3 arguments and make a call.  */
> +    return 4;
> +}
> +
> +/* Implement TARGET_MAX_SCALARIZATION_SIZE.  */
> +
> +unsigned int
> +aarch64_max_scalarization_size (bool speed_p)
> +{
> +  /* The maximum number of instructions we are willing to use * the
> +     maximum size we can move in one instruction (LDP/STP).  */
> +  return AARCH64_MOVE_RATIO (speed_p) * 16;
> +}
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>  
> @@ -9843,6 +9921,10 @@ aarch64_expand_movmem (rtx *operands)
>  #undef TARGET_BUILTIN_DECL
>  #define TARGET_BUILTIN_DECL aarch64_builtin_decl
>  
> +#undef TARGET_ESTIMATE_BLOCK_COPY_NINSNS
> +#define TARGET_ESTIMATE_BLOCK_COPY_NINSNS \
> +  aarch64_estimate_block_copy_ninsns
> +
>  #undef  TARGET_EXPAND_BUILTIN
>  #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
>  
> @@ -9897,9 +9979,17 @@ aarch64_expand_movmem (rtx *operands)
>  #undef TARGET_MANGLE_TYPE
>  #define TARGET_MANGLE_TYPE aarch64_mangle_type
>  
> +#undef TARGET_MAX_SCALARIZATION_SIZE
> +#define TARGET_MAX_SCALARIZATION_SIZE \
> +  aarch64_max_scalarization_size
> +
>  #undef TARGET_MEMORY_MOVE_COST
>  #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
>  
> +#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
> +#define TARGET_MOVE_BY_PIECES_PROFITABLE_P \
> +  aarch64_move_by_pieces_profitable_p
> +
>  #undef TARGET_MUST_PASS_IN_STACK
>  #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
>  
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index db950da..5c8d37d 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -678,17 +678,10 @@ do {									     \
>  /* Maximum bytes moved by a single instruction (load/store pair).  */
>  #define MOVE_MAX (UNITS_PER_WORD * 2)
>  
> -/* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
> +/* The base cost overhead of a memcpy call, for CLEAR_RATIO and friends.  */
>  #define AARCH64_CALL_RATIO 8
>  
> -/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
> -   move_by_pieces will continually copy the largest safe chunks.  So a
> -   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
> -   for both size and speed of copy, so we will instead use the "movmem"
> -   standard name to implement the copy.  This logic does not apply when
> -   targeting -mstrict-align, so keep a sensible default in that case.  */
> -#define MOVE_RATIO(speed) \
> -  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
> +#define AARCH64_MOVE_RATIO(speed) (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
>  
>  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
>     of the length of a memset call, but use the default otherwise.  */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> index 07f575d..a970c85 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
>  /* Whether the structs are totally scalarized or not depends on the
>     MOVE_RATIO macro definition in the back end.  The scalarization will
>     not take place when using small values for MOVE_RATIO.  */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> index 45aa963..59e5e6a 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> @@ -21,5 +21,5 @@ int foo (struct S *p)
>    *p = l;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "release_ssa" } } */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-09-26  9:11                 ` Richard Biener
@ 2014-10-01 16:38                   ` James Greenhalgh
  2014-10-29 14:39                     ` James Greenhalgh
  0 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-01 16:38 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Richard Earnshaw, Marcus Shawcroft, Andrew Pinski

On Fri, Sep 26, 2014 at 10:11:13AM +0100, Richard Biener wrote:
> On Thu, Sep 25, 2014 at 4:57 PM, James Greenhalgh
> <james.greenhalgh@arm.com> wrote:
> Given that the special value denoting the default for the new --params is
> zero, a user cannot disable scalarization that way.
> 
> I still somehow dislike that you need a target hook to compute the
> default.  Why doesn't it work to do, in opts.c:default_options_optimization
> 
> maybe_set_param_value
>   (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
>    get_move_ratio (speed_p) * MOVE_MAX_PIECES,
>    opts->x_param_values, opts_set->x_param_values);
> 
> and override that default in targets option_override hook the same way?

The problem I am having is getting "get_move_ratio" right, without breaking
the modular design.

default_options_optimization, and the rest of opts.c, are going to end up in
libcommon-target.a, so we are not going to have access to any
backend-specific symbols.

An early draft of this patch used the MOVE_RATIO macro to set the default
value. This worked fine for AArch64 and ARM targets (which both use a
simple C expression for MOVE_RATIO), but failed for x86_64 which defines
MOVE_RATIO as so:

  #define MOVE_RATIO(speed) ((speed) ? ix86_cost->move_ratio : 3)

Dealing with that ix86_cost symbol is what causes us the pain.

It seems reasonable that a target might want to define MOVE_RATIO
as some function of its tuning parameters, so I don't want to
disallow that usage.

This inspired me to try turning this in to a target hook, but this
doesn't help as opts.c only gets access to common-target.def target
hooks. These suffer the same problem, they don't have access to any
backend symbols.

I suppose I could port any target with a definition of MOVE_RATIO to
override the default parameter value in its option-overriding code,
but that makes this a very large patch set (many targets define
MOVE_RATIO).
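
To make that concrete, each such port would need something along these
lines in its option-override code (a rough sketch only: the function name
is made up, the exact opts/opts_set plumbing differs between ports, and it
assumes the parameters from patch 3/4 plus GCC's params.h):

  static void
  example_option_override (void)
  {
    /* Only set the params if the user did not specify them explicitly;
       maybe_set_param_value already performs that check.  */
    maybe_set_param_value (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
			   MOVE_RATIO (true) * MOVE_MAX_PIECES,
			   global_options.x_param_values,
			   global_options_set.x_param_values);
    maybe_set_param_value (PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE,
			   MOVE_RATIO (false) * MOVE_MAX_PIECES,
			   global_options.x_param_values,
			   global_options_set.x_param_values);
  }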

Is this an avenue worth exploring? I agree the very special target
hook is not ideal.

Thanks,
James

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 0/6] Hookize MOVE_BY_PIECES_P
  2014-09-25 14:58               ` [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
  2014-09-25 15:09                 ` Steven Bosscher
@ 2014-10-29 10:45                 ` James Greenhalgh
  2014-10-29 10:47                   ` [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
                                     ` (5 more replies)
  1 sibling, 6 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 10:45 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, stevenb.gcc

[-- Attachment #1: Type: text/plain, Size: 1221 bytes --]

Hi,

As discussed in the thread starting at:
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02359.html
it would be useful to completely remove MOVE_BY_PIECES_P, rather
than leaving it half-dead.

This patch series has a small respin of the patch approved in that
thread, followed by patches for each of the architectures using
MOVE_BY_PIECES_P, followed by a final patch removing and poisoning the
target macro.

I haven't been able to test the target patches beyond building a
compiler as I don't have access to hardware or emulators for these
platforms. I would appreciate help from the maintainers of those ports
where it can be given.

The target-independent patches I've bootstrapped and tested on
x86_64/ARM/AArch64 with no issues.

OK for trunk?

Thanks,
James

---

James Greenhalgh (6):
  [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  [Patch 2/6 s390] Deprecate MOVE_BY_PIECES_P, move to hookized version
  [Patch 3/6 arc] Deprecate MOVE_BY_PIECES_P, move to hookized version
  [Patch 4/6 sh] Deprecate MOVE_BY_PIECES_P, move to hookized version
  [Patch 5/6 mips] Deprecate MOVE_BY_PIECES_P, move to hookized version
  [Patch 6/6] Remove MOVE_BY_PIECES_P

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
@ 2014-10-29 10:47                   ` James Greenhalgh
  2014-10-29 12:29                     ` Matthew Fortune
  2014-10-29 10:49                   ` [Patch 2/6 s390] Deprecate MOVE_BY_PIECES_P, move to hookized version James Greenhalgh
                                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 10:47 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, stevenb.gcc

[-- Attachment #1: Type: text/plain, Size: 1086 bytes --]


Hi,

This is a very minor respin of the patch at:
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02359.html

dropping the dependency on the refactor in:
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01925.html

The patch is otherwise unmodified from what was approved in
September.

Is this still OK?

Thanks,
James

---
gcc/

2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>

	* target.def (move_by_pieces_profitable_p): New.
	* doc/tm.texi.in (MOVE_BY_PIECES_P): Reduce documentation to a stub
	describing that this macro is deprecated.
	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Add hook.
	* doc/tm.texi: Regenerate.
	* expr.c (MOVE_BY_PIECES_P): Remove.
	(STORE_BY_PIECES_P): Rewrite in terms of
	TARGET_MOVE_BY_PIECES_PROFITABLE_P.
	(can_move_by_pieces): Likewise.
	(emit_block_move_hints): Rewrite in terms of can_move_by_pieces.
	(emit_push_insn): Likewise.
	(expand_constructor): Likewise.
	* targhooks.c (get_move_ratio): New.
	(default_move_by_pieces_profitable_p): Likewise.
	* targhooks.h (default_move_by_pieces_profitable_p): New.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Patch-1-6-Hookize-MOVE_BY_PIECES_P-remove-most-uses-.patch --]
[-- Type: text/x-patch;  name=0001-Patch-1-6-Hookize-MOVE_BY_PIECES_P-remove-most-uses-.patch, Size: 10407 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 5036d4f..c50227a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6124,11 +6124,38 @@ If you don't define this, a reasonable default is used.
 @end defmac
 
 @defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{move_by_pieces} will be used to
-copy a chunk of memory, or whether some other block move mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
-@end defmac
+A C expression used to implement the default behaviour of
+@code{TARGET_MOVE_BY_PIECES_PROFITABLE_P}.  New ports should implement
+that hook in preference to this macro, which is deprecated.
+@end defmac
+
+@deftypefn {Target Hook} bool TARGET_MOVE_BY_PIECES_PROFITABLE_P (unsigned int @var{size}, unsigned int @var{alignment}, bool @var{speed_p})
+GCC will attempt several strategies when asked to copy between
+two areas of memory, for example when copying a @code{struct}.
+@code{move_by_pieces} implements such a copy as a sequence of
+memory-to-memory move insns.  Alternate strategies are to expand the
+@code{movmem} optab, to emit a library call, or to emit a unit-by-unit
+loop-based copy.
+
+This target hook should return true if, for a memory move with a given
+@var{size} and @var{alignment}, using the @code{move_by_pieces}
+infrastructure is expected to result in better code generation.
+Both @var{size} and @var{alignment} are measured in terms of storage
+units.
+
+The parameter @var{speed_p} is true if the code is currently being
+optimized for speed rather than size.
+
+Returning true for higher values of @var{size} can improve code generation
+for speed if the target does not provide an implementation of the
+@code{movmem} standard name, if the @code{movmem} implementation would be
+more expensive than a sequence of move insns, or if the overhead of a
+library call would dominate that of the body of the copy.
+
+Returning true for higher values of @code{size} may also cause an increase
+in code size, for example where the number of insns emitted to perform a
+move would be greater than that of a library call.
+@end deftypefn
 
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 5674e6c..f3c90f8 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4601,12 +4601,13 @@ If you don't define this, a reasonable default is used.
 @end defmac
 
 @defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{move_by_pieces} will be used to
-copy a chunk of memory, or whether some other block move mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
+A C expression used to implement the default behaviour of
+@code{TARGET_MOVE_BY_PIECES_PROFITABLE_P}.  New ports should implement
+that hook in preference to this macro, which is deprecated.
 @end defmac
 
+@hook TARGET_MOVE_BY_PIECES_PROFITABLE_P
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/expr.c b/gcc/expr.c
index a5bf13a..6b3291f 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -164,14 +164,6 @@ static void do_tablejump (rtx, enum machine_mode, rtx, rtx, rtx, int);
 static rtx const_vector_from_tree (tree);
 static void write_complex_part (rtx, rtx, bool);
 
-/* This macro is used to determine whether move_by_pieces should be called
-   to perform a structure copy.  */
-#ifndef MOVE_BY_PIECES_P
-#define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, MOVE_MAX_PIECES + 1) \
-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
-#endif
-
 /* This macro is used to determine whether clear_by_pieces should be
    called to clear storage.  */
 #ifndef CLEAR_BY_PIECES_P
@@ -192,8 +184,7 @@ static void write_complex_part (rtx, rtx, bool);
    called to "memcpy" storage when the source is a constant string.  */
 #ifndef STORE_BY_PIECES_P
 #define STORE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
+  (targetm.move_by_pieces_profitable_p (SIZE, ALIGN, STORE_MAX_PIECES))
 #endif
 \f
 /* This is run to set up which modes can be used
@@ -839,7 +830,8 @@ int
 can_move_by_pieces (unsigned HOST_WIDE_INT len ATTRIBUTE_UNUSED,
 		    unsigned int align ATTRIBUTE_UNUSED)
 {
-  return MOVE_BY_PIECES_P (len, align);
+  return targetm.move_by_pieces_profitable_p (len, align,
+					      optimize_insn_for_speed_p ());
 }
 
 /* Generate several move instructions to copy LEN bytes from block FROM to
@@ -1176,7 +1168,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       set_mem_size (y, INTVAL (size));
     }
 
-  if (CONST_INT_P (size) && MOVE_BY_PIECES_P (INTVAL (size), align))
+  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
     move_by_pieces (x, y, INTVAL (size), align, 0);
   else if (emit_block_move_via_movmem (x, y, size, align,
 				       expected_align, expected_size,
@@ -4219,7 +4211,7 @@ emit_push_insn (rtx x, enum machine_mode mode, tree type, rtx size,
 	  && CONST_INT_P (size)
 	  && skip == 0
 	  && MEM_ALIGN (xinner) >= align
-	  && (MOVE_BY_PIECES_P ((unsigned) INTVAL (size) - used, align))
+	  && can_move_by_pieces ((unsigned) INTVAL (size) - used, align)
 	  /* Here we avoid the case of a structure whose weak alignment
 	     forces many pushes of a small amount of data,
 	     and such small pushes do rounding that causes trouble.  */
@@ -7840,7 +7832,7 @@ expand_constructor (tree exp, rtx target, enum expand_modifier modifier,
 	    && ! (target != 0 && safe_from_p (target, exp, 1)))
 		  || TREE_ADDRESSABLE (exp)
 		  || (tree_fits_uhwi_p (TYPE_SIZE_UNIT (type))
-		      && (! MOVE_BY_PIECES_P
+		      && (! can_move_by_pieces
 				     (tree_to_uhwi (TYPE_SIZE_UNIT (type)),
 				      TYPE_ALIGN (type)))
 		      && ! mostly_zeros_p (exp))))
diff --git a/gcc/target.def b/gcc/target.def
index 4d90fc2..ed33563 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3049,6 +3049,36 @@ are the same as to this target hook.",
  int, (enum machine_mode mode, reg_class_t rclass, bool in),
  default_memory_move_cost)
 
+DEFHOOK
+(move_by_pieces_profitable_p,
+ "GCC will attempt several strategies when asked to copy between\n\
+two areas of memory, for example when copying a @code{struct}.\n\
+@code{move_by_pieces} implements such a copy as a sequence of\n\
+memory-to-memory move insns.  Alternate strategies are to expand the\n\
+@code{movmem} optab, to emit a library call, or to emit a unit-by-unit\n\
+loop-based copy.\n\
+\n\
+This target hook should return true if, for a memory move with a given\n\
+@var{size} and @var{alignment}, using the @code{move_by_pieces}\n\
+infrastructure is expected to result in better code generation.\n\
+Both @var{size} and @var{alignment} are measured in terms of storage\n\
+units.\n\
+\n\
+The parameter @var{speed_p} is true if the code is currently being\n\
+optimized for speed rather than size.\n\
+\n\
+Returning true for higher values of @var{size} can improve code generation\n\
+for speed if the target does not provide an implementation of the\n\
+@code{movmem} standard name, if the @code{movmem} implementation would be\n\
+more expensive than a sequence of move insns, or if the overhead of a\n\
+library call would dominate that of the body of the copy.\n\
+\n\
+Returning true for higher values of @code{size} may also cause an increase\n\
+in code size, for example where the number of insns emitted to perform a\n\
+move would be greater than that of a library call.",
+ bool, (unsigned int size, unsigned int alignment, bool speed_p),
+ default_move_by_pieces_profitable_p)
+
 /* True for MODE if the target expects that registers in this mode will
    be allocated to registers in a small register class.  The compiler is
    allowed to use registers explicitly used in the rtl as spill registers
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index b6745f0..9ba3f8b 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1406,6 +1406,41 @@ default_register_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
 #endif
 }
 
+/* For hooks which use the MOVE_RATIO macro, this gives the legacy default
+   behaviour.  SPEED_P is true if we are compiling for speed.  */
+
+static unsigned int
+get_move_ratio (bool speed_p ATTRIBUTE_UNUSED)
+{
+  unsigned int move_ratio;
+#ifdef MOVE_RATIO
+  move_ratio = (unsigned int) MOVE_RATIO (speed_p);
+#else
+#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti)
+  move_ratio = 2;
+#else /* No movmem patterns, pick a default.  */
+  move_ratio = ((speed_p) ? 15 : 3);
+#endif
+#endif
+  return move_ratio;
+}
+
+/* The threshold of move insns below which the movmem optab is expanded or a
+   call to memcpy is emitted.  */
+
+bool
+default_move_by_pieces_profitable_p (unsigned int size ATTRIBUTE_UNUSED,
+				     unsigned int alignment ATTRIBUTE_UNUSED,
+				     bool speed_p ATTRIBUTE_UNUSED)
+{
+#ifndef MOVE_BY_PIECES_P
+  return move_by_pieces_ninsns (size, alignment, MOVE_MAX_PIECES + 1)
+	 < get_move_ratio (speed_p);
+#else
+  return !!(MOVE_BY_PIECES_P (size, alignment));
+#endif
+}
+
 bool
 default_profile_before_prologue (void)
 {
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 9178c30..93f21f8 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -181,6 +181,9 @@ extern int default_memory_move_cost (enum machine_mode, reg_class_t, bool);
 extern int default_register_move_cost (enum machine_mode, reg_class_t,
 				       reg_class_t);
 
+extern bool default_move_by_pieces_profitable_p (unsigned int,
+						 unsigned int, bool);
+
 extern bool default_profile_before_prologue (void);
 extern reg_class_t default_preferred_reload_class (rtx, reg_class_t);
 extern reg_class_t default_preferred_output_reload_class (rtx, reg_class_t);

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 2/6 s390] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
  2014-10-29 10:47                   ` [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
@ 2014-10-29 10:49                   ` James Greenhalgh
  2014-10-29 21:09                     ` Jeff Law
  2014-10-29 10:50                   ` [Patch 4/6 sh] " James Greenhalgh
                                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 10:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: uweigand, Andreas.Krebbel

[-- Attachment #1: Type: text/plain, Size: 534 bytes --]


Hi,

This patch moves s390 to TARGET_MOVE_BY_PIECES_PROFITABLE_P.

I tried building a compiler and there were no fires, but otherwise,
I have no reasonable way to test this patch. If one of the s390
maintainers wants to pick it up and test it, that would be much
appreciated.

Ok?

James

---
2014-10-29  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/s390/s390.c (s390_move_by_pieces_profitable_p): New.
	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
	* config/s390/s390.h (MOVE_BY_PIECES_P): Remove.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0002-Patch-2-6-s390-Deprecate-MOVE_BY_PIECES_P-move-to-ho.patch --]
[-- Type: text/x-patch;  name=0002-Patch-2-6-s390-Deprecate-MOVE_BY_PIECES_P-move-to-ho.patch, Size: 2137 bytes --]

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 1b10805..f531e12 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -12043,6 +12043,17 @@ s390_option_override (void)
   register_pass (&insert_pass_s390_early_mach);
 }
 
+/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
+
+static bool
+s390_move_by_pieces_profitable_p (unsigned int size,
+				  unsigned int align ATTRIBUTE_UNUSED,
+				  bool speed_p ATTRIBUTE_UNUSED)
+{
+  return (size == 1 || size == 2
+	  || size == 4 || (TARGET_ZARCH && size == 8));
+}
+
 /* Initialize GCC target structure.  */
 
 #undef  TARGET_ASM_ALIGNED_HI_OP
@@ -12228,6 +12239,9 @@ s390_option_override (void)
 #undef TARGET_SET_UP_BY_PROLOGUE
 #define TARGET_SET_UP_BY_PROLOGUE s300_set_up_by_prologue
 
+#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
+#define TARGET_MOVE_BY_PIECES_PROFITABLE_P s390_move_by_pieces_profitable_p
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-s390.h"
diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h
index c5edace..688c2fb 100644
--- a/gcc/config/s390/s390.h
+++ b/gcc/config/s390/s390.h
@@ -744,11 +744,6 @@ do {									\
 #define MOVE_MAX_PIECES (TARGET_ZARCH ? 8 : 4)
 #define MAX_MOVE_MAX 16
 
-/* Determine whether to use move_by_pieces or block move insn.  */
-#define MOVE_BY_PIECES_P(SIZE, ALIGN)		\
-  ( (SIZE) == 1 || (SIZE) == 2 || (SIZE) == 4	\
-    || (TARGET_ZARCH && (SIZE) == 8) )
-
 /* Determine whether to use clear_by_pieces or block clear insn.  */
 #define CLEAR_BY_PIECES_P(SIZE, ALIGN)		\
   ( (SIZE) == 1 || (SIZE) == 2 || (SIZE) == 4	\
@@ -756,7 +751,9 @@ do {									\
 
 /* This macro is used to determine whether store_by_pieces should be
    called to "memcpy" storage when the source is a constant string.  */
-#define STORE_BY_PIECES_P(SIZE, ALIGN) MOVE_BY_PIECES_P (SIZE, ALIGN)
+#define STORE_BY_PIECES_P(SIZE, ALIGN) \
+  targetm.move_by_pieces_profitable_p \
+    (SIZE, ALIGN, optimize_function_for_size_p (cfun))
 
 /* Likewise to decide whether to "memset" storage with byte values
    other than zero.  */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 4/6 sh] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
  2014-10-29 10:47                   ` [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
  2014-10-29 10:49                   ` [Patch 2/6 s390] Deprecate MOVE_BY_PIECES_P, move to hookized version James Greenhalgh
@ 2014-10-29 10:50                   ` James Greenhalgh
  2014-10-29 21:10                     ` Jeff Law
  2014-10-30  0:49                     ` Kaz Kojima
  2014-10-29 10:50                   ` [Patch 3/6 arc] " James Greenhalgh
                                     ` (2 subsequent siblings)
  5 siblings, 2 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 10:50 UTC (permalink / raw)
  To: gcc-patches; +Cc: aoliva, kkojima, olegendo

[-- Attachment #1: Type: text/plain, Size: 530 bytes --]


Hi,

This patch moves sh to TARGET_MOVE_BY_PIECES_PROFITABLE_P.

I tried building a compiler and there were no fires, but otherwise,
I have no reasonable way to test this patch. If one of the sh
maintainers wants to pick it up and test it, that would be much
appreciated.

Thanks,
James

---
gcc/

2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/sh/sh.c (TARGET_MOVE_BY_PIECES_PROFITABLE_P): New.
	(sh_move_by_pieces_profitable_p): Likewise.
	* config/sh/sh.h (MOVE_BY_PIECES_P): Remove.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0004-Patch-4-6-sh-Deprecate-MOVE_BY_PIECES_P-move-to-hook.patch --]
[-- Type: text/x-patch;  name=0004-Patch-4-6-sh-Deprecate-MOVE_BY_PIECES_P-move-to-hook.patch, Size: 2228 bytes --]

diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c
index 1662b55..0b907b9 100644
--- a/gcc/config/sh/sh.c
+++ b/gcc/config/sh/sh.c
@@ -338,6 +338,9 @@ static void sh_conditional_register_usage (void);
 static bool sh_legitimate_constant_p (enum machine_mode, rtx);
 static int mov_insn_size (enum machine_mode, bool);
 static int mov_insn_alignment_mask (enum machine_mode, bool);
+static bool sh_move_by_pieces_profitable_p (unsigned int size,
+					    unsigned int align,
+					    bool speed_p);
 static bool sequence_insn_p (rtx_insn *);
 static void sh_canonicalize_comparison (int *, rtx *, rtx *, bool);
 static void sh_canonicalize_comparison (enum rtx_code&, rtx&, rtx&,
@@ -640,6 +643,9 @@ static const struct attribute_spec sh_attribute_table[] =
 #undef TARGET_FIXED_CONDITION_CODE_REGS
 #define TARGET_FIXED_CONDITION_CODE_REGS sh_fixed_condition_code_regs
 
+#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
+#define TARGET_MOVE_BY_PIECES_PROFITABLE_P sh_move_by_pieces_profitable_p
+
 /* Machine-specific symbol_ref flags.  */
 #define SYMBOL_FLAG_FUNCVEC_FUNCTION	(SYMBOL_FLAG_MACH_DEP << 0)
 
@@ -13674,4 +13680,15 @@ sh_mode_priority (int entity ATTRIBUTE_UNUSED, int n)
   return ((TARGET_FPU_SINGLE != 0) ^ (n) ? FP_MODE_SINGLE : FP_MODE_DOUBLE);
 }
 
+/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
+
+static bool
+sh_move_by_pieces_profitable_p (unsigned int size,
+				unsigned int align,
+				bool speed_p)
+{
+  return move_by_pieces_ninsns (size, align, MOVE_MAX_PIECES + 1)
+	 < (!speed_p ? 2 : (align >= 32) ? 16 : 2);
+}
+
 #include "gt-sh.h"
diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h
index 5b8b4a1..e115b1e 100644
--- a/gcc/config/sh/sh.h
+++ b/gcc/config/sh/sh.h
@@ -1591,10 +1591,6 @@ struct sh_args {
 #define USE_STORE_PRE_DECREMENT(mode)    ((mode == SImode || mode == DImode) \
 					  ? 0 : TARGET_SH1)
 
-#define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, MOVE_MAX_PIECES + 1) \
-   < (optimize_size ? 2 : ((ALIGN >= 32) ? 16 : 2)))
-
 #define STORE_BY_PIECES_P(SIZE, ALIGN) \
   (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
    < (optimize_size ? 2 : ((ALIGN >= 32) ? 16 : 2)))

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 3/6 arc] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
                                     ` (2 preceding siblings ...)
  2014-10-29 10:50                   ` [Patch 4/6 sh] " James Greenhalgh
@ 2014-10-29 10:50                   ` James Greenhalgh
  2014-10-29 21:10                     ` Jeff Law
  2014-10-29 10:51                   ` [Patch 5/6 mips] " James Greenhalgh
  2014-10-29 10:53                   ` [Patch 6/6] Remove MOVE_BY_PIECES_P James Greenhalgh
  5 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 10:50 UTC (permalink / raw)
  To: gcc-patches; +Cc: joern.rennecke

[-- Attachment #1: Type: text/plain, Size: 682 bytes --]


Hi,

This patch moves arc to TARGET_MOVE_BY_PIECES_PROFITABLE_P.

While I am there, arc defines a macro CAN_MOVE_BY_PIECES, which is
unused, so clean that up too.

I tried building a compiler but no amount of fiddling with target
strings got me to a sensible result, so this patch is completely
untested.

If one of the arc maintainers could give it a spin that would be
helpful.

OK?

Thanks,
James

 ---
2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/arc/arc.c (TARGET_MOVE_BY_PIECES_PROFITABLE_P): New.
	(arc_move_by_pieces_profitable_p): Likewise.
	* config/arc/arc.h (MOVE_BY_PIECES_P): Delete.
	(CAN_MOVE_BY_PIECES): Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0003-Patch-3-6-arc-Deprecate-MOVE_BY_PIECES_P-move-to-hoo.patch --]
[-- Type: text/x-patch;  name=0003-Patch-3-6-arc-Deprecate-MOVE_BY_PIECES_P-move-to-hoo.patch, Size: 2010 bytes --]

diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index 8bfebfd..fcebe59 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -415,6 +415,10 @@ static void output_short_suffix (FILE *file);
 
 static bool arc_frame_pointer_required (void);
 
+static bool arc_move_by_pieces_profitable_p (unsigned int,
+					     unsigned int,
+					     bool);
+
 /* Implements target hook vector_mode_supported_p.  */
 
 static bool
@@ -530,6 +534,9 @@ static void arc_finalize_pic (void);
 #undef TARGET_DELEGITIMIZE_ADDRESS
 #define TARGET_DELEGITIMIZE_ADDRESS arc_delegitimize_address
 
+#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
+#define TARGET_MOVE_BY_PIECES_PROFITABLE_P arc_move_by_pieces_profitable_p
+
 /* Usually, we will be able to scale anchor offsets.
    When this fails, we want LEGITIMIZE_ADDRESS to kick in.  */
 #undef TARGET_MIN_ANCHOR_OFFSET
@@ -9383,6 +9390,16 @@ arc_legitimize_reload_address (rtx *p, enum machine_mode mode, int opnum,
   return false;
 }
 
+/* Implement TARGET_MOVE_BY_PIECES_PROFITABLE_P.  */
+
+static bool
+arc_move_by_pieces_profitable_p (unsigned int size ATTRIBUTE_UNUSED,
+				 unsigned int align ATTRIBUTE_UNUSED,
+				 bool speed_p ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-arc.h"
diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
index 2b0a04c..1a2c6b1 100644
--- a/gcc/config/arc/arc.h
+++ b/gcc/config/arc/arc.h
@@ -1553,12 +1553,6 @@ extern int arc_return_address_regs[4];
    in one reasonably fast instruction.  */
 #define MOVE_MAX 4
 
-/* Let the movmem expander handle small block moves.  */
-#define MOVE_BY_PIECES_P(LEN, ALIGN)  0
-#define CAN_MOVE_BY_PIECES(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, MOVE_MAX_PIECES + 1) \
-   < (unsigned int) MOVE_RATIO (!optimize_size))
-
 /* Undo the effects of the movmem pattern presence on STORE_BY_PIECES_P .  */
 #define MOVE_RATIO(SPEED) ((SPEED) ? 15 : 3)
 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 5/6 mips] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
                                     ` (3 preceding siblings ...)
  2014-10-29 10:50                   ` [Patch 3/6 arc] " James Greenhalgh
@ 2014-10-29 10:51                   ` James Greenhalgh
  2014-10-29 21:18                     ` Jeff Law
  2014-10-29 10:53                   ` [Patch 6/6] Remove MOVE_BY_PIECES_P James Greenhalgh
  5 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 10:51 UTC (permalink / raw)
  To: gcc-patches; +Cc: clm, echristo, matthew.fortune

[-- Attachment #1: Type: text/plain, Size: 663 bytes --]


Hi,

This patch moves mips to TARGET_MOVE_BY_PIECES_PROFITABLE_P.

I tried building a compiler and there were no fires, but I don't have access
to any MIPS hardware, so if one of the MIPS maintainers wanted to pick
this up and test it, that would be very much appreciated.

OK?

Thanks,
James

---
gcc/

2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/mips/mips.h (MOVE_BY_PIECES_P): Remove.
	* config/mips/mips.c (TARGET_MOVE_BY_PIECES_PROFITABLE_P): New.
	(mips_move_by_pieces_p): Rename to...
	(mips_move_by_pieces_profitable_p): ...this, use new hook
	parameters, use the default hook implementation as a
	fall-back.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0005-Patch-5-6-mips-Deprecate-MOVE_BY_PIECES_P-move-to-ho.patch --]
[-- Type: text/x-patch;  name=0005-Patch-5-6-mips-Deprecate-MOVE_BY_PIECES_P-move-to-ho.patch, Size: 1948 bytes --]

diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 2f9d2da..4d7ef81 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -7172,7 +7172,9 @@ mips_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
 /* Implement MOVE_BY_PIECES_P.  */
 
 bool
-mips_move_by_pieces_p (unsigned HOST_WIDE_INT size, unsigned int align)
+mips_move_by_pieces_profitable_p (unsigned int size,
+				  unsigned int align,
+				  bool speed_p)
 {
   if (HAVE_movmemsi)
     {
@@ -7191,10 +7193,8 @@ mips_move_by_pieces_p (unsigned HOST_WIDE_INT size, unsigned int align)
 	return size < UNITS_PER_WORD;
       return size <= MIPS_MAX_MOVE_BYTES_STRAIGHT;
     }
-  /* The default value.  If this becomes a target hook, we should
-     call the default definition instead.  */
-  return (move_by_pieces_ninsns (size, align, MOVE_MAX_PIECES + 1)
-	  < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()));
+
+  return default_move_by_pieces_profitable_p (size, align, speed_p);
 }
 
 /* Implement STORE_BY_PIECES_P.  */
@@ -19116,6 +19116,9 @@ mips_lra_p (void)
 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
 
+#undef TARGET_MOVE_BY_PIECES_PROFITABLE_P
+#define TARGET_MOVE_BY_PIECES_PROFITABLE_P mips_move_by_pieces_profitable_p
+
 #undef TARGET_SPILL_CLASS
 #define TARGET_SPILL_CLASS mips_spill_class
 #undef TARGET_LRA_P
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index c7b998b..6872940 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -2872,9 +2872,6 @@ while (0)
    ? MIPS_MAX_MOVE_BYTES_STRAIGHT / MOVE_MAX		\
    : MIPS_CALL_RATIO / 2)
 
-#define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  mips_move_by_pieces_p (SIZE, ALIGN)
-
 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */
 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 6/6] Remove MOVE_BY_PIECES_P
  2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
                                     ` (4 preceding siblings ...)
  2014-10-29 10:51                   ` [Patch 5/6 mips] " James Greenhalgh
@ 2014-10-29 10:53                   ` James Greenhalgh
  2014-10-29 21:20                     ` Jeff Law
  5 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 10:53 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, stevenb.gcc

[-- Attachment #1: Type: text/plain, Size: 391 bytes --]


Hi,

This final patch gets rid of MOVE_BY_PIECES_P.

Bootstrapped on x86_64, ARM and AArch64.

Thanks,
James

---
gcc/

2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>

	* doc/tm.texi.in (MOVE_BY_PIECES_P): Remove.
	* doc/tm.texi: Regenerate.
	* system.h: Poison MOVE_BY_PIECES_P.
	* targhooks.c (default_move_by_pieces_profitable_p): Remove
	MOVE_BY_PIECES_P.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0006-Patch-6-6-Remove-MOVE_BY_PIECES_P.patch --]
[-- Type: text/x-patch;  name=0006-Patch-6-6-Remove-MOVE_BY_PIECES_P.patch, Size: 2650 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index c50227a..86d783e 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6123,12 +6123,6 @@ optimized for speed rather than size.
 If you don't define this, a reasonable default is used.
 @end defmac
 
-@defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to implement the default behaviour of
-@code{TARGET_MOVE_BY_PIECES_PROFITABLE_P}.  New ports should implement
-that hook in preference to this macro, which is deprecated.
-@end defmac
-
 @deftypefn {Target Hook} bool TARGET_MOVE_BY_PIECES_PROFITABLE_P (unsigned int @var{size}, unsigned int @var{alignment}, bool @var{speed_p})
 GCC will attempt several strategies when asked to copy between
 two areas of memory, for example when copying a @code{struct}.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f3c90f8..f085796 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4600,12 +4600,6 @@ optimized for speed rather than size.
 If you don't define this, a reasonable default is used.
 @end defmac
 
-@defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to implement the default behaviour of
-@code{TARGET_MOVE_BY_PIECES_PROFITABLE_P}.  New ports should implement
-that hook in preference to this macro, which is deprecated.
-@end defmac
-
 @hook TARGET_MOVE_BY_PIECES_PROFITABLE_P
 
 @defmac MOVE_MAX_PIECES
diff --git a/gcc/system.h b/gcc/system.h
index dbe1ceb..b9b90d4 100644
--- a/gcc/system.h
+++ b/gcc/system.h
@@ -847,7 +847,8 @@ extern void fancy_abort (const char *, int, const char *) ATTRIBUTE_NORETURN;
 	HOT_TEXT_SECTION_NAME LEGITIMATE_CONSTANT_P ALWAYS_STRIP_DOTDOT	\
 	OUTPUT_ADDR_CONST_EXTRA SMALL_REGISTER_CLASSES ASM_OUTPUT_IDENT	\
 	ASM_BYTE_OP MEMBER_TYPE_FORCES_BLK LIBGCC2_HAS_SF_MODE		\
-	LIBGCC2_HAS_DF_MODE LIBGCC2_HAS_XF_MODE LIBGCC2_HAS_TF_MODE
+	LIBGCC2_HAS_DF_MODE LIBGCC2_HAS_XF_MODE LIBGCC2_HAS_TF_MODE	\
+	MOVE_BY_PIECES_P
 
 /* Target macros only used for code built for the target, that have
    moved to libgcc-tm.h or have never been present elsewhere.  */
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 9ba3f8b..068e24e 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1433,12 +1433,8 @@ default_move_by_pieces_profitable_p (unsigned int size ATTRIBUTE_UNUSED,
 				     unsigned int alignment ATTRIBUTE_UNUSED,
 				     bool speed_p ATTRIBUTE_UNUSED)
 {
-#ifndef MOVE_BY_PIECES_P
   return move_by_pieces_ninsns (size, alignment, MOVE_MAX_PIECES + 1)
 	 < get_move_ratio (speed_p);
-#else
-  return !!(MOVE_BY_PIECES_P (size, alignment));
-#endif
 }
 
 bool

^ permalink raw reply	[flat|nested] 62+ messages in thread

* RE: [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  2014-10-29 10:47                   ` [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
@ 2014-10-29 12:29                     ` Matthew Fortune
  2014-10-29 16:11                       ` James Greenhalgh
  0 siblings, 1 reply; 62+ messages in thread
From: Matthew Fortune @ 2014-10-29 12:29 UTC (permalink / raw)
  To: James Greenhalgh, gcc-patches; +Cc: richard.guenther, stevenb.gcc

Hi James,

I think you have a bug in the following hunk where you pass
STORE_MAX_PIECES in place of the optimise for speed flag. I guess you
would need an extra argument to pass a different *_MAX_PIECES value
in.
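
As a rough sketch of the distinction (keeping the three-argument hook as
posted; the four-argument form at the end is purely hypothetical), the
hunk quoted below would need to keep passing the speed flag, and an extra
parameter would be needed before STORE_MAX_PIECES could reach the hook:

#ifndef STORE_BY_PIECES_P
#define STORE_BY_PIECES_P(SIZE, ALIGN) \
  (targetm.move_by_pieces_profitable_p (SIZE, ALIGN, \
					optimize_insn_for_speed_p ()))
#endif

/* Hypothetical extra argument carrying the size limit as well:
   targetm.move_by_pieces_profitable_p (SIZE, ALIGN, STORE_MAX_PIECES,
					optimize_insn_for_speed_p ())  */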

thanks,
Matthew

>@@ -192,8 +184,7 @@ static void write_complex_part (rtx, rtx, bool);
>    called to "memcpy" storage when the source is a constant string.  */
> #ifndef STORE_BY_PIECES_P
> #define STORE_BY_PIECES_P(SIZE, ALIGN) \
>-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
>-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
>+  (targetm.move_by_pieces_profitable_p (SIZE, ALIGN, STORE_MAX_PIECES))
> #endif
> 
>
> /* This is run to set up which modes can be use



^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-10-01 16:38                   ` James Greenhalgh
@ 2014-10-29 14:39                     ` James Greenhalgh
  2014-10-31 10:58                       ` Richard Biener
  0 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 14:39 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Richard Earnshaw, Marcus Shawcroft, Andrew Pinski

On Wed, Oct 01, 2014 at 05:38:12PM +0100, James Greenhalgh wrote:
> On Fri, Sep 26, 2014 at 10:11:13AM +0100, Richard Biener wrote:
> > On Thu, Sep 25, 2014 at 4:57 PM, James Greenhalgh
> > <james.greenhalgh@arm.com> wrote:
> > Given the special value to note the default for the new --params is
> > zero a user cannot disable scalarization that way.
> > 
> > I still somehow dislike that you need a target hook to compute the
> > default.  Why doesn't it work to do, in opts.c:default_options_optimization
> > 
> > maybe_set_param_value
> >   (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
> >    get_move_ratio (speed_p) * MOVE_MAX_PIECES,
> >    opts->x_param_values, opts_set->x_param_values);
> > 
> > and override that default in targets option_override hook the same way?
> 
> The problem I am having is getting "get_move_ratio" right, without breaking
> the modular design.
> 
> default_options_optimization, and the rest of opts.c is going to end up in
> libcommon-target.a, so we are not going to have access to any
> backend-specific symbols.
> 
> An early draft of this patch used the MOVE_RATIO macro to set the default
> value. This worked fine for AArch64 and ARM targets (which both use a
> simple C expression for MOVE_RATIO), but failed for x86_64 which defines
> MOVE_RATIO as so:
> 
>   #define MOVE_RATIO(speed) ((speed) ? ix86_cost->move_ratio : 3)
> 
> Dealing with that ix86_cost symbol is what causes us the pain.
> 
> It seems reasonable that a target might want to define MOVE_RATIO
> as some function of their tuning parameters, so I don't want to
> disallow that usage.
> 
> This inspired me to try turning this in to a target hook, but this
> doesn't help as opts.c only gets access to common-target.def target
> hooks. These suffer the same problem, they don't have access to any
> backend symbols.
> 
> I suppose I could port any target with a definition of MOVE_RATIO to
> override the default parameter value in their option overriding code,
> but that makes this a very large patch set (many targets define
> MOVE_RATIO).
> 
> Is this an avenue worth exploring? I agree the very special target
> hook is not ideal.

Hi,

Did you have any further thoughts on this? I'm still unable to come up
with a way to set these parameters which allows them to default to their
current (MOVE_RATIO derived) values.

If the only way to make this work is to add code to
TARGET_OPTION_OVERRIDE for all targets that define MOVE_RATIO, then I
suppose I can do that, but I'd prefer a neater way to make it work, if
you can think of one.

Thanks,
James

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  2014-10-29 12:29                     ` Matthew Fortune
@ 2014-10-29 16:11                       ` James Greenhalgh
  2014-10-31 15:09                         ` James Greenhalgh
  0 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-29 16:11 UTC (permalink / raw)
  To: Matthew Fortune; +Cc: gcc-patches, richard.guenther, stevenb.gcc

On Wed, Oct 29, 2014 at 11:42:06AM +0000, Matthew Fortune wrote:
> Hi James,
> 
> I think you have a bug in the following hunk where you pass
> STORE_MAX_PIECES in place of the optimise for speed flag. I guess you
> would need an extra argument to pass a different *_MAX_PIECES value
> in.

Yup, good spot and agreed. I think I'll respin this series and get rid of all
the *_BY_PIECES_P in one sweep. I'm thinking of something like:

use_by_pieces_infrastructure_p (unsigned int size,
				unsigned int alignment,
				enum by_pieces_mode mode,
				bool speed_p)

which will take the type of by_pieces operation as the third parameter.
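
A rough sketch of how the generic default could then dispatch on that
parameter to keep the behaviour of the existing macros (the enum values
and the function name are provisional, and the bare ratio macros would
need the same fall-back handling as get_move_ratio in the earlier
version of the patch):

static bool
default_use_by_pieces_infrastructure_p (unsigned int size,
					unsigned int alignment,
					enum by_pieces_mode mode,
					bool speed_p)
{
  /* Pick the size limit and ratio the old macros would have used.  */
  unsigned int max_pieces, ratio;

  switch (mode)
    {
    case CLEAR_BY_PIECES:
      max_pieces = STORE_MAX_PIECES;
      ratio = CLEAR_RATIO (speed_p);
      break;
    case SET_BY_PIECES:
      max_pieces = STORE_MAX_PIECES;
      ratio = SET_RATIO (speed_p);
      break;
    case STORE_BY_PIECES:
      max_pieces = STORE_MAX_PIECES;
      ratio = MOVE_RATIO (speed_p);
      break;
    case MOVE_BY_PIECES:
    default:
      max_pieces = MOVE_MAX_PIECES;
      ratio = MOVE_RATIO (speed_p);
      break;
    }

  return move_by_pieces_ninsns (size, alignment, max_pieces + 1) < ratio;
}

Targets that currently define one of the macros would then implement the
hook once and switch on the same parameter.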

Thanks,
James

> >@@ -192,8 +184,7 @@ static void write_complex_part (rtx, rtx, bool);
> >    called to "memcpy" storage when the source is a constant string.  */
> > #ifndef STORE_BY_PIECES_P
> > #define STORE_BY_PIECES_P(SIZE, ALIGN) \
> >-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
> >-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
> >+  (targetm.move_by_pieces_profitable_p (SIZE, ALIGN, STORE_MAX_PIECES))
> > #endif
> > 
> >
> > /* This is run to set up which modes can be use
> 
> 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 2/6 s390] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:49                   ` [Patch 2/6 s390] Deprecate MOVE_BY_PIECES_P, move to hookized version James Greenhalgh
@ 2014-10-29 21:09                     ` Jeff Law
  0 siblings, 0 replies; 62+ messages in thread
From: Jeff Law @ 2014-10-29 21:09 UTC (permalink / raw)
  To: James Greenhalgh, gcc-patches; +Cc: uweigand, Andreas.Krebbel

On 10/29/14 04:46, James Greenhalgh wrote:
>
> Hi,
>
> This patch moves s390 to TARGET_MOVE_BY_PIECES_PROFITABLE_P.
>
> I tried building a compiler and there were no fires, but otherwise,
> I have no reasonable way to test this patch. If one of the s390
> maintainers wants to pick it up and test it, that would be much
> appreciated.
>
> Ok?
>
> James
>
> ---
> 2014-10-29  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* config/s390/s390.c (s390_move_by_pieces_profitable_p): New.
> 	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Likewise.
> 	* config/s390/s390.h (MOVE_BY_PIECES_P): Remove.
This will be fine once patch #1 is approved (my understanding is you'll 
be resubmitting patch #1 after some minor fixes).

jeff

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 4/6 sh] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:50                   ` [Patch 4/6 sh] " James Greenhalgh
@ 2014-10-29 21:10                     ` Jeff Law
  2014-10-30  0:49                     ` Kaz Kojima
  1 sibling, 0 replies; 62+ messages in thread
From: Jeff Law @ 2014-10-29 21:10 UTC (permalink / raw)
  To: James Greenhalgh, gcc-patches; +Cc: aoliva, kkojima, olegendo

On 10/29/14 04:49, James Greenhalgh wrote:
>
> Hi,
>
> This patch moves sh to TARGET_MOVE_BY_PIECES_PROFITABLE_P.
>
> I tried building a compiler and there were no fires, but otherwise,
> I have no reasonable way to test this patch. If one of the sh
> maintainers wants to pick it up and test it, that would be much
> appreciated.
>
> Thanks,
> James
>
> ---
> gcc/
>
> 2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* config/sh/sh.c (TARGET_MOVE_BY_PIECES_PROFITABLE_P): New.
> 	(sh_move_by_pieces_profitable_p): Likewise.
> 	* config/sh/sh.h (MOVE_BY_PIECES_P): Remove.
OK.
jeff

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 3/6 arc] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:50                   ` [Patch 3/6 arc] " James Greenhalgh
@ 2014-10-29 21:10                     ` Jeff Law
  0 siblings, 0 replies; 62+ messages in thread
From: Jeff Law @ 2014-10-29 21:10 UTC (permalink / raw)
  To: James Greenhalgh, gcc-patches; +Cc: joern.rennecke

On 10/29/14 04:48, James Greenhalgh wrote:
>
> Hi,
>
> This patch moves arc to TARGET_MOVE_BY_PIECES_PROFITABLE_P.
>
> While I am there, arc defines a macro CAN_MOVE_BY_PIECES, which is
> unused, so clean that up too.
>
> I tried building a compiler but no amount of fiddling with target
> strings got me to a sensible result, so this patch is completely
> untested.
>
> If one of the arc maintainers could give it a spin that would be
> helpful.
>
> OK?
>
> Thanks,
> James
>
>   ---
> 2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* config/arc/arc.c (TARGET_MOVE_BY_PIECES_PROFITABLE_P): New.
> 	(arc_move_by_pieces_profitable_p): Likewise.
> 	* config/arc/arc.h (MOVE_BY_PIECES_P): Delete.
> 	(CAN_MOVE_BY_PIECES): Likewise.
OK unless one of the ARC maintainers objects.  Thanks for trying to 
test this.

jeff

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 5/6 mips] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:51                   ` [Patch 5/6 mips] " James Greenhalgh
@ 2014-10-29 21:18                     ` Jeff Law
  0 siblings, 0 replies; 62+ messages in thread
From: Jeff Law @ 2014-10-29 21:18 UTC (permalink / raw)
  To: James Greenhalgh, gcc-patches; +Cc: clm, echristo, matthew.fortune

On 10/29/14 04:50, James Greenhalgh wrote:
>
> Hi,
>
> This patch moves mips to TARGET_MOVE_BY_PIECES_PROFITABLE_P.
>
> I tried building a compiler and there were no fires, but I don't have access
> to any MIPS hardware, so if one of the MIPS maintainers wanted to pick
> this up and test it, that would be very much appreciated.
>
> OK?
>
> Thanks,
> James
>
> ---
> gcc/
>
> 2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* config/mips/mips.h (MOVE_BY_PIECES_P): Remove.
> 	* config/mips/mips.c (TARGET_MOVE_BY_PIECES_PROFITABLE_P): New.
> 	(mips_move_by_pieces_p): Rename to...
> 	(mips_move_by_pieces_profitable_p): ...this, use new hook
> 	parameters, use the default hook implementation as a
> 	fall-back.
OK.
Jeff

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 6/6] Remove MOVE_BY_PIECES_P
  2014-10-29 10:53                   ` [Patch 6/6] Remove MOVE_BY_PIECES_P James Greenhalgh
@ 2014-10-29 21:20                     ` Jeff Law
  0 siblings, 0 replies; 62+ messages in thread
From: Jeff Law @ 2014-10-29 21:20 UTC (permalink / raw)
  To: James Greenhalgh, gcc-patches; +Cc: richard.guenther, stevenb.gcc

On 10/29/14 04:50, James Greenhalgh wrote:
>
> Hi,
>
> This final patch gets rid of MOVE_BY_PIECES_P.
>
> Bootstrapped on x86_64, ARM and AArch64.
>
> Thanks,
> James
>
> ---
> gcc/
>
> 2014-10-28  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* doc/tm.texi.in (MOVE_BY_PIECES_P): Remove.
> 	* doc/tm.texi: Regenerate.
> 	* system.h: Poison MOVE_BY_PIECES_P.
> 	* targhooks.c (default_move_by_pieces_profitable_p): Remove
> 	MOVE_BY_PIECES_P.
Ok.
jeff

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 4/6 sh] Deprecate MOVE_BY_PIECES_P, move to hookized version
  2014-10-29 10:50                   ` [Patch 4/6 sh] " James Greenhalgh
  2014-10-29 21:10                     ` Jeff Law
@ 2014-10-30  0:49                     ` Kaz Kojima
  1 sibling, 0 replies; 62+ messages in thread
From: Kaz Kojima @ 2014-10-30  0:49 UTC (permalink / raw)
  To: james.greenhalgh; +Cc: gcc-patches

James Greenhalgh <james.greenhalgh@arm.com> wrote:
> This patch moves sh to TARGET_MOVE_BY_PIECES_PROFITABLE_P.
> 
> I tried building a compiler and there were no fires, but otherwise,
> I have no reasonable way to test this patch. If one of the sh
> maintainers wants to pick it up and test it, that would be much
> appreciated.

FYI, I've tested the patch with the top level "make -k check" on
sh4-unknown-linux-gnu with no new failures.

Regards,
        kaz

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-10-29 14:39                     ` James Greenhalgh
@ 2014-10-31 10:58                       ` Richard Biener
  2014-11-06 11:53                         ` [Patchv3] " James Greenhalgh
  0 siblings, 1 reply; 62+ messages in thread
From: Richard Biener @ 2014-10-31 10:58 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: GCC Patches, Richard Earnshaw, Marcus Shawcroft, Andrew Pinski

On Wed, Oct 29, 2014 at 3:39 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
> On Wed, Oct 01, 2014 at 05:38:12PM +0100, James Greenhalgh wrote:
>> On Fri, Sep 26, 2014 at 10:11:13AM +0100, Richard Biener wrote:
>> > On Thu, Sep 25, 2014 at 4:57 PM, James Greenhalgh
>> > <james.greenhalgh@arm.com> wrote:
>> > Given the special value to note the default for the new --params is
>> > zero a user cannot disable scalarization that way.
>> >
>> > I still somehow dislike that you need a target hook to compute the
>> > default.  Why doesn't it work to do, in opts.c:default_options_optimization
>> >
>> > maybe_set_param_value
>> >   (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
>> >    get_move_ratio (speed_p) * MOVE_MAX_PIECES,
>> >    opts->x_param_values, opts_set->x_param_values);
>> >
>> > and override that default in targets option_override hook the same way?
>>
>> The problem I am having is getting "get_move_ratio" right, without breaking
>> the modular design.
>>
>> default_options_optimization, and the rest of opts.c is going to end up in
>> libcommon-target.a, so we are not going to have access to any
>> backend-specific symbols.
>>
>> An early draft of this patch used the MOVE_RATIO macro to set the default
>> value. This worked fine for AArch64 and ARM targets (which both use a
>> simple C expression for MOVE_RATIO), but failed for x86_64 which defines
>> MOVE_RATIO as so:
>>
>>   #define MOVE_RATIO(speed) ((speed) ? ix86_cost->move_ratio : 3)
>>
>> Dealing with that ix86_cost symbol is what causes us the pain.
>>
>> It seems reasonable that a target might want to define MOVE_RATIO
>> as some function of their tuning parameters, so I don't want to
>> disallow that usage.
>>
>> This inspired me to try turning this in to a target hook, but this
>> doesn't help as opts.c only gets access to common-target.def target
>> hooks. These suffer the same problem, they don't have access to any
>> backend symbols.
>>
>> I suppose I could port any target with a definition of MOVE_RATIO to
>> override the default parameter value in their option overriding code,
>> but that makes this a very large patch set (many targets define
>> MOVE_RATIO).
>>
>> Is this an avenue worth exploring? I agree the very special target
>> hook is not ideal.
>
> Hi,
>
> Did you have any further thoughts on this? I'm still unable to come up
> with a way to set these parameters which allows them to default to their
> current (MOVE_RATIO derived) values.
>
> If the only way to make this work is to add code to
> TARGET_OPTION_OVERRIDE for all targets that define MOVE_RATIO, then I
> suppose I can do that, but I'd prefer a neater way to make it work, if
> you can think of one.

Maybe instead of putting the code in opts.c put it right before we call
targetm.target_option.override () in toplev.c:process_options.  With
a comment on why it cannot be in opts.c.
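
Roughly, and only as a sketch (this assumes a get_move_ratio helper that
is visible from toplev.c rather than static in targhooks.c):

  /* Just before the call to targetm.target_option.override () in
     toplev.c:process_options.  This cannot live in opts.c because
     MOVE_RATIO may refer to backend symbols such as ix86_cost, which
     libcommon-target.a cannot see.  */
  maybe_set_param_value
    (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
     get_move_ratio (true) * MOVE_MAX_PIECES,
     global_options.x_param_values, global_options_set.x_param_values);
  /* ...and likewise for the size variant of the param.  */

  targetm.target_option.override ();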

Thanks,
Richard.

> Thanks,
> James
>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* RE: [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO
  2014-10-29 16:11                       ` James Greenhalgh
@ 2014-10-31 15:09                         ` James Greenhalgh
  2014-10-31 15:10                           ` [Patch 2/7 s390] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
                                             ` (6 more replies)
  0 siblings, 7 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, stevenb.gcc, law, matthew.fortune

[-- Attachment #1: Type: text/plain, Size: 1978 bytes --]

On Wed, Oct 29, 2014 at 03:31:54PM +0000, James Greenhalgh wrote:
> On Wed, Oct 29, 2014 at 11:42:06AM +0000, Matthew Fortune wrote:
> > Hi James,
> > 
> > I think you have a bug in the following hunk where you pass
> > STORE_MAX_PIECES in place of the optimise for speed flag. I guess you
> > would need an extra argument to pass a different *_MAX_PIECES value
> > in.
> 
> Yup, good spot and agreed. I think I'll respin this series and get rid of all
> the *_BY_PIECES_P in one sweep. I'm thinking of something like:
> 
> use_by_pieces_infrastructure_p (unsigned int size,
> 				unsigned int alignment,
> 				enum by_pieces_mode mode,
> 				bool speed_p)
> 
> which will take the type of by_pieces operation as the third parameter.

...and that patch series would look something like this.

We hookize all of CLEAR_BY_PIECES_P, MOVE_BY_PIECES_P, SET_BY_PIECES_P
and STORE_BY_PIECES_P behind one target hook:
TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.

We then clean up each of the targets that use these macros, and finally
delete the macros entirely and poison them.

Jeff, thanks for reviewing the previous patch set. Unfortunately, finishing
the job off properly costs a little extra for each target, so I guess I'll
need a new round of review. Sorry about that.

The series looks like this:

[Patch 1/7] Hookize *_BY_PIECES_P
[Patch 2/7 s390] Deprecate *_BY_PIECES_P, move to hookized version
[Patch 3/7 arc] Deprecate *_BY_PIECES_P, move to hookized version
[Patch 4/7 sh] Deprecate *_BY_PIECES_P, move to hookized version
[Patch 5/7 mips] Deprecate *_BY_PIECES_P, move to hookized version
[Patch 6/7 AArch64] Deprecate *_BY_PIECES_P, move to hookized version
[Patch 7/7] Remove *_BY_PIECES_P

I've bootstrapped and tested the full patch series on x86_64, ARM and
AArch64, and as before I've successfully built toolchains for the
other architectures (except arc, which I've struggled to build a binutils
for).

OK?

Thanks,
James

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 2/7 s390] Deprecate *_BY_PIECES_P, move to hookized version
  2014-10-31 15:09                         ` James Greenhalgh
@ 2014-10-31 15:10                           ` James Greenhalgh
  2014-10-31 15:10                           ` [Patch 1/7] Hookize *_BY_PIECES_P James Greenhalgh
                                             ` (5 subsequent siblings)
  6 siblings, 0 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:10 UTC (permalink / raw)
  To: gcc-patches; +Cc: uweigand, Andreas.Krebbel

[-- Attachment #1: Type: text/plain, Size: 755 bytes --]


Hi,

This patch moves s390 to TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.

It looks to me that s390 wires all the hooks in this set to the
same return value, so that is what I've implemented.

I tried building a compiler and there were no fires, but otherwise,
I have no reasonable way to test this patch. If one of the s390
maintainers wants to pick it up and test it, that would be much
appreciated.

Ok?

James

---
2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/s390/s390.c (s390_use_by_pieces_infrastructure_p): New.
	(TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Likewise.
	* config/s390/s390.h (MOVE_BY_PIECES_P): Remove.
	(CLEAR_BY_PIECES): Likewise.
	(SET_BY_PIECES): Likewise.
	(STORE_BY_PIECES): Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0002-Patch-2-7-s390-Deprecate-_BY_PIECES_P-move-to-hookiz.patch --]
[-- Type: text/x-patch;  name=0002-Patch-2-7-s390-Deprecate-_BY_PIECES_P-move-to-hookiz.patch, Size: 2238 bytes --]

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 874eb7c..51ae90c 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -12032,6 +12032,18 @@ s390_option_override (void)
   register_pass (&insert_pass_s390_early_mach);
 }
 
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */
+
+static bool
+s390_use_by_pieces_infrastructure_p (unsigned int size,
+				     unsigned int align ATTRIBUTE_UNUSED,
+				     enum by_pieces_operation op ATTRIBUTE_UNUSED,
+				     bool speed_p ATTRIBUTE_UNUSED)
+{
+  return (size == 1 || size == 2
+	  || size == 4 || (TARGET_ZARCH && size == 8));
+}
+
 /* Initialize GCC target structure.  */
 
 #undef  TARGET_ASM_ALIGNED_HI_OP
@@ -12217,6 +12229,10 @@ s390_option_override (void)
 #undef TARGET_SET_UP_BY_PROLOGUE
 #define TARGET_SET_UP_BY_PROLOGUE s300_set_up_by_prologue
 
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+  s390_use_by_pieces_infrastructure_p
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-s390.h"
diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h
index 0a935ee..d933b8d 100644
--- a/gcc/config/s390/s390.h
+++ b/gcc/config/s390/s390.h
@@ -744,24 +744,6 @@ do {									\
 #define MOVE_MAX_PIECES (TARGET_ZARCH ? 8 : 4)
 #define MAX_MOVE_MAX 16
 
-/* Determine whether to use move_by_pieces or block move insn.  */
-#define MOVE_BY_PIECES_P(SIZE, ALIGN)		\
-  ( (SIZE) == 1 || (SIZE) == 2 || (SIZE) == 4	\
-    || (TARGET_ZARCH && (SIZE) == 8) )
-
-/* Determine whether to use clear_by_pieces or block clear insn.  */
-#define CLEAR_BY_PIECES_P(SIZE, ALIGN)		\
-  ( (SIZE) == 1 || (SIZE) == 2 || (SIZE) == 4	\
-    || (TARGET_ZARCH && (SIZE) == 8) )
-
-/* This macro is used to determine whether store_by_pieces should be
-   called to "memcpy" storage when the source is a constant string.  */
-#define STORE_BY_PIECES_P(SIZE, ALIGN) MOVE_BY_PIECES_P (SIZE, ALIGN)
-
-/* Likewise to decide whether to "memset" storage with byte values
-   other than zero.  */
-#define SET_BY_PIECES_P(SIZE, ALIGN) STORE_BY_PIECES_P (SIZE, ALIGN)
-
 /* Don't perform CSE on function addresses.  */
 #define NO_FUNCTION_CSE
 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 1/7] Hookize *_BY_PIECES_P
  2014-10-31 15:09                         ` James Greenhalgh
  2014-10-31 15:10                           ` [Patch 2/7 s390] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
@ 2014-10-31 15:10                           ` James Greenhalgh
  2014-10-31 21:08                             ` Jeff Law
  2014-10-31 15:11                           ` [Patch 4/7 sh] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
                                             ` (4 subsequent siblings)
  6 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:10 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, stevenb.gcc, law, matthew.fortune

[-- Attachment #1: Type: text/plain, Size: 1076 bytes --]


Hi,

This patch prepares for removing all the *BY_PIECES_P macros by
introducing a new target hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.

Tested on ARM/AArch64/x86_64 with no issues.

Ok for trunk?

Thanks,
James

---
gcc/

2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>

	* target.def (use_by_pieces_infrastructure_p): New.
	* doc/tm.texi.in (MOVE_BY_PIECES_P): Describe that this macro
	is deprecated.
	(STORE_BY_PIECES_P): Likewise.
	(CLEAR_BY_PIECES_P): Likewise.
	(SET_BY_PIECES_P): Likewise.
	(TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Add hook.
	* doc/tm.texi: Regenerate.
	* expr.c (MOVE_BY_PIECES_P): Rewrite in terms of
	TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.
	(STORE_BY_PIECES_P): Likewise.
	(CLEAR_BY_PIECES_P): Likewise.
	(SET_BY_PIECES_P): Likewise.
	(STORE_MAX_PIECES): Move to...
	* defaults.h (STORE_MAX_PIECES): ...here.
	* targhooks.c (get_move_ratio): New.
	(default_use_by_pieces_infrastructure_p): Likewise.
	* targhooks.h (default_use_by_pieces_infrastructure_p): New.
	* target.h (by_pieces_operation): New.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Patch-1-7-Hookize-_BY_PIECES_P.patch --]
[-- Type: text/x-patch; name=0001-Patch-1-7-Hookize-_BY_PIECES_P.patch, Size: 14685 bytes --]

diff --git a/gcc/defaults.h b/gcc/defaults.h
index c1776b0..d2609e7 100644
--- a/gcc/defaults.h
+++ b/gcc/defaults.h
@@ -1006,6 +1006,15 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define MOVE_MAX_PIECES   MOVE_MAX
 #endif
 
+/* STORE_MAX_PIECES is the number of bytes at a time that we can
+   store efficiently.  Due to internal GCC limitations, this is
+   MOVE_MAX_PIECES limited by the number of bytes GCC can represent
+   for an immediate constant.  */
+
+#ifndef STORE_MAX_PIECES
+#define STORE_MAX_PIECES  MIN (MOVE_MAX_PIECES, 2 * sizeof (HOST_WIDE_INT))
+#endif
+
 #ifndef MAX_MOVE_MAX
 #define MAX_MOVE_MAX MOVE_MAX
 #endif
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index bb04401..cfb8388 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6128,8 +6128,45 @@ A C expression used to determine whether @code{move_by_pieces} will be used to
 copy a chunk of memory, or whether some other block move mechanism
 will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{MOVE_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
+@deftypefn {Target Hook} bool TARGET_USE_BY_PIECES_INFRASTRUCTURE_P (unsigned int @var{size}, unsigned int @var{alignment}, enum by_pieces_operation @var{op}, bool @var{speed_p})
+GCC will attempt several strategies when asked to copy between
+two areas of memory, or to set, clear or store to memory, for example
+when copying a @code{struct}. The @code{by_pieces} infrastructure
+implements such memory operations as a sequence of load, store or move
+insns.  Alternate strategies are to expand the
+@code{movmem} or @code{setmem} optabs, to emit a library call, or to emit
+unit-by-unit, loop-based operations.
+
+This target hook should return true if, for a memory operation with a
+given @var{size} and @var{alignment}, using the @code{by_pieces}
+infrastructure is expected to result in better code generation.
+Both @var{size} and @var{alignment} are measured in terms of storage
+units.
+
+The parameter @var{op} is one of: @code{CLEAR_BY_PIECES},
+@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES}.
+These describe the type of memory operation under consideration.
+
+The parameter @var{speed_p} is true if the code is currently being
+optimized for speed rather than size.
+
+Returning true for higher values of @var{size} can improve code generation
+for speed if the target does not provide an implementation of the
+@code{movmem} or @code{setmem} standard names, if the @code{movmem} or
+@code{setmem} implementation would be more expensive than a sequence of
+insns, or if the overhead of a library call would dominate that of
+the body of the memory operation.
+
+Returning true for higher values of @code{size} may also cause an increase
+in code size, for example where the number of insns emitted to perform a
+move would be greater than that of a library call.
+@end deftypefn
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
@@ -6152,6 +6189,9 @@ A C expression used to determine whether @code{clear_by_pieces} will be used
 to clear a chunk of memory, or whether some other block clear mechanism
 will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{CLEAR_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
 @defmac SET_RATIO (@var{speed})
@@ -6174,6 +6214,9 @@ other mechanism will be used.  Used by @code{__builtin_memset} when
 storing values other than constant zero.
 Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{SET_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
 @defmac STORE_BY_PIECES_P (@var{size}, @var{alignment})
@@ -6183,6 +6226,9 @@ other mechanism will be used.  Used by @code{__builtin_strcpy} when
 called with a constant source string.
 Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{MOVE_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
 @defmac USE_LOAD_POST_INCREMENT (@var{mode})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index aa19360..3f66543 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4605,8 +4605,13 @@ A C expression used to determine whether @code{move_by_pieces} will be used to
 copy a chunk of memory, or whether some other block move mechanism
 will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{MOVE_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
+@hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
@@ -4629,6 +4634,9 @@ A C expression used to determine whether @code{clear_by_pieces} will be used
 to clear a chunk of memory, or whether some other block clear mechanism
 will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{CLEAR_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
 @defmac SET_RATIO (@var{speed})
@@ -4651,6 +4659,9 @@ other mechanism will be used.  Used by @code{__builtin_memset} when
 storing values other than constant zero.
 Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{SET_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
 @defmac STORE_BY_PIECES_P (@var{size}, @var{alignment})
@@ -4660,6 +4671,9 @@ other mechanism will be used.  Used by @code{__builtin_strcpy} when
 called with a constant source string.
 Defaults to 1 if @code{move_by_pieces_ninsns} returns less
 than @code{MOVE_RATIO}.
+
+This macro is deprecated.  New ports should implement
+@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
 @end defmac
 
 @defmac USE_LOAD_POST_INCREMENT (@var{mode})
diff --git a/gcc/expr.c b/gcc/expr.c
index 9b81e62..ef85177 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -171,32 +171,32 @@ static void write_complex_part (rtx, rtx, bool);
    to perform a structure copy.  */
 #ifndef MOVE_BY_PIECES_P
 #define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, MOVE_MAX_PIECES + 1) \
-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
+  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, MOVE_BY_PIECES, \
+					   optimize_insn_for_speed_p ()))
 #endif
 
 /* This macro is used to determine whether clear_by_pieces should be
    called to clear storage.  */
 #ifndef CLEAR_BY_PIECES_P
 #define CLEAR_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
-   < (unsigned int) CLEAR_RATIO (optimize_insn_for_speed_p ()))
+  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, CLEAR_BY_PIECES, \
+					   optimize_insn_for_speed_p ()))
 #endif
 
 /* This macro is used to determine whether store_by_pieces should be
    called to "memset" storage with byte values other than zero.  */
 #ifndef SET_BY_PIECES_P
 #define SET_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
-   < (unsigned int) SET_RATIO (optimize_insn_for_speed_p ()))
+  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, SET_BY_PIECES, \
+					   optimize_insn_for_speed_p ()))
 #endif
 
 /* This macro is used to determine whether store_by_pieces should be
    called to "memcpy" storage when the source is a constant string.  */
 #ifndef STORE_BY_PIECES_P
 #define STORE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
-   < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()))
+  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, STORE_BY_PIECES, \
+					   optimize_insn_for_speed_p ()))
 #endif
 \f
 /* This is run to set up which modes can be used
@@ -827,13 +827,6 @@ widest_int_mode_for_size (unsigned int size)
   return mode;
 }
 
-/* STORE_MAX_PIECES is the number of bytes at a time that we can
-   store efficiently.  Due to internal GCC limitations, this is
-   MOVE_MAX_PIECES limited by the number of bytes GCC can represent
-   for an immediate constant.  */
-
-#define STORE_MAX_PIECES  MIN (MOVE_MAX_PIECES, 2 * sizeof (HOST_WIDE_INT))
-
 /* Determine whether the LEN bytes can be moved by using several move
    instructions.  Return nonzero if a call to move_by_pieces should
    succeed.  */
diff --git a/gcc/target.def b/gcc/target.def
index 14e19e8..23cae25 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3049,6 +3049,43 @@ are the same as to this target hook.",
  int, (machine_mode mode, reg_class_t rclass, bool in),
  default_memory_move_cost)
 
+DEFHOOK
+(use_by_pieces_infrastructure_p,
+ "GCC will attempt several strategies when asked to copy between\n\
+two areas of memory, or to set, clear or store to memory, for example\n\
+when copying a @code{struct}. The @code{by_pieces} infrastructure\n\
+implements such memory operations as a sequence of load, store or move\n\
+insns.  Alternate strategies are to expand the\n\
+@code{movmem} or @code{setmem} optabs, to emit a library call, or to emit\n\
+unit-by-unit, loop-based operations.\n\
+\n\
+This target hook should return true if, for a memory operation with a\n\
+given @var{size} and @var{alignment}, using the @code{by_pieces}\n\
+infrastructure is expected to result in better code generation.\n\
+Both @var{size} and @var{alignment} are measured in terms of storage\n\
+units.\n\
+\n\
+The parameter @var{op} is one of: @code{CLEAR_BY_PIECES},\n\
+@code{MOVE_BY_PIECES}, @code{SET_BY_PIECES}, @code{STORE_BY_PIECES}.\n\
+These describe the type of memory operation under consideration.\n\
+\n\
+The parameter @var{speed_p} is true if the code is currently being\n\
+optimized for speed rather than size.\n\
+\n\
+Returning true for higher values of @var{size} can improve code generation\n\
+for speed if the target does not provide an implementation of the\n\
+@code{movmem} or @code{setmem} standard names, if the @code{movmem} or\n\
+@code{setmem} implementation would be more expensive than a sequence of\n\
+insns, or if the overhead of a library call would dominate that of\n\
+the body of the memory operation.\n\
+\n\
+Returning true for higher values of @code{size} may also cause an increase\n\
+in code size, for example where the number of insns emitted to perform a\n\
+move would be greater than that of a library call.",
+ bool, (unsigned int size, unsigned int alignment,
+        enum by_pieces_operation op, bool speed_p),
+ default_use_by_pieces_infrastructure_p)
+
 /* True for MODE if the target expects that registers in this mode will
    be allocated to registers in a small register class.  The compiler is
    allowed to use registers explicitly used in the rtl as spill registers
diff --git a/gcc/target.h b/gcc/target.h
index 7be94b8..40d7841 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -80,6 +80,17 @@ enum print_switch_type
   SWITCH_TYPE_LINE_END		/* Please emit a line terminator.  */
 };
 
+/* Types of memory operation understood by the "by_pieces" infrastructure.
+   Used by the TARGET_USE_BY_PIECES_INFRASTRUCTURE_P target hook.  */
+
+enum by_pieces_operation
+{
+  CLEAR_BY_PIECES,
+  MOVE_BY_PIECES,
+  SET_BY_PIECES,
+  STORE_BY_PIECES
+};
+
 typedef int (* print_switch_fn_type) (print_switch_type, const char *);
 
 /* An example implementation for ELF targets.  Defined in varasm.c  */
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index e482991..eef3d45 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1406,6 +1406,61 @@ default_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
 #endif
 }
 
+/* For hooks which use the MOVE_RATIO macro, this gives the legacy default
+   behaviour.  SPEED_P is true if we are compiling for speed.  */
+
+static unsigned int
+get_move_ratio (bool speed_p ATTRIBUTE_UNUSED)
+{
+  unsigned int move_ratio;
+#ifdef MOVE_RATIO
+  move_ratio = (unsigned int) MOVE_RATIO (speed_p);
+#else
+#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti)
+  move_ratio = 2;
+#else /* No movmem patterns, pick a default.  */
+  move_ratio = ((speed_p) ? 15 : 3);
+#endif
+#endif
+  return move_ratio;
+}
+
+/* Return TRUE if the move_by_pieces/set_by_pieces infrastructure should be
+   used; return FALSE if the movmem/setmem optab should be expanded, or
+   a call to memcpy emitted.  */
+
+bool
+default_use_by_pieces_infrastructure_p (unsigned int size,
+					unsigned int alignment,
+					enum by_pieces_operation op,
+					bool speed_p)
+{
+  unsigned int max_size = 0;
+  unsigned int ratio = 0;
+
+  switch (op)
+    {
+      case CLEAR_BY_PIECES:
+	max_size = STORE_MAX_PIECES;
+	ratio = CLEAR_RATIO (speed_p);
+	break;
+      case MOVE_BY_PIECES:
+	max_size = MOVE_MAX_PIECES;
+	ratio = get_move_ratio (speed_p);
+	break;
+      case SET_BY_PIECES:
+	max_size = STORE_MAX_PIECES;
+	ratio = SET_RATIO (speed_p);
+	break;
+      case STORE_BY_PIECES:
+	max_size = STORE_MAX_PIECES;
+	ratio = get_move_ratio (speed_p);
+	break;
+    }
+
+  return move_by_pieces_ninsns (size, alignment, max_size + 1) < ratio;
+}
+
 bool
 default_profile_before_prologue (void)
 {
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 25f4fed..4bbf492 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -181,6 +181,11 @@ extern int default_memory_move_cost (machine_mode, reg_class_t, bool);
 extern int default_register_move_cost (machine_mode, reg_class_t,
 				       reg_class_t);
 
+extern bool default_use_by_pieces_infrastructure_p (unsigned int,
+						    unsigned int,
+						    enum by_pieces_operation,
+						    bool);
+
 extern bool default_profile_before_prologue (void);
 extern reg_class_t default_preferred_reload_class (rtx, reg_class_t);
 extern reg_class_t default_preferred_output_reload_class (rtx, reg_class_t);

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 3/7 arc] Deprecate *_BY_PIECES_P, move to hookized version
  2014-10-31 15:09                         ` James Greenhalgh
                                             ` (2 preceding siblings ...)
  2014-10-31 15:11                           ` [Patch 4/7 sh] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
@ 2014-10-31 15:11                           ` James Greenhalgh
  2014-11-04 12:08                             ` Joern Rennecke
  2014-10-31 15:12                           ` [Patch 5/7 mips] " James Greenhalgh
                                             ` (2 subsequent siblings)
  6 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:11 UTC (permalink / raw)
  To: gcc-patches; +Cc: joern.rennecke

[-- Attachment #1: Type: text/plain, Size: 833 bytes --]


Hi,

This patch moves arc to TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.

While I am there, arc defines a macro CAN_MOVE_BY_PIECES, which is
unused, so clean that up too.

arc only implements MOVE_BY_PIECES_P, wiring it to false. Mirror that
behaviour, and use the default hook for other by_pieces operations.

I tried building a compiler but no amount of fiddling with target
strings got me to a sensible result, so this patch is completely
untested.

If one of the arc maintainers could give it a spin that would be
helpful.

OK?

Thanks,
James

 ---
2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/arc/arc.c (TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): New.
	(arc_use_by_pieces_infrastructure_p): Likewise.
	* config/arc/arc.h (MOVE_BY_PIECES_P): Delete.
	(CAN_MOVE_BY_PIECES): Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0003-Patch-3-7-arc-Deprecate-_BY_PIECES_P-move-to-hookize.patch --]
[-- Type: text/x-patch;  name=0003-Patch-3-7-arc-Deprecate-_BY_PIECES_P-move-to-hookize.patch, Size: 2229 bytes --]

diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c
index d04be01..c5b8b80 100644
--- a/gcc/config/arc/arc.c
+++ b/gcc/config/arc/arc.c
@@ -415,6 +415,11 @@ static void output_short_suffix (FILE *file);
 
 static bool arc_frame_pointer_required (void);
 
+static bool arc_use_by_pieces_infrastructure_p (unsigned int,
+						unsigned int,
+						enum by_pieces_operation op,
+						bool);
+
 /* Implements target hook vector_mode_supported_p.  */
 
 static bool
@@ -530,6 +535,10 @@ static void arc_finalize_pic (void);
 #undef TARGET_DELEGITIMIZE_ADDRESS
 #define TARGET_DELEGITIMIZE_ADDRESS arc_delegitimize_address
 
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+  arc_use_by_pieces_infrastructure_p
+
 /* Usually, we will be able to scale anchor offsets.
    When this fails, we want LEGITIMIZE_ADDRESS to kick in.  */
 #undef TARGET_MIN_ANCHOR_OFFSET
@@ -9383,6 +9392,21 @@ arc_legitimize_reload_address (rtx *p, machine_mode mode, int opnum,
   return false;
 }
 
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */
+
+static bool
+arc_use_by_pieces_infrastructure_p (unsigned int size,
+				    unsigned int align,
+				    enum by_pieces_operation op,
+				    bool speed_p)
+{
+  /* Let the movmem expander handle small block moves.  */
+  if (op == MOVE_BY_PIECES)
+    return false;
+
+  return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
+}
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-arc.h"
diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h
index d40f5c3..2d27787 100644
--- a/gcc/config/arc/arc.h
+++ b/gcc/config/arc/arc.h
@@ -1553,12 +1553,6 @@ extern int arc_return_address_regs[4];
    in one reasonably fast instruction.  */
 #define MOVE_MAX 4
 
-/* Let the movmem expander handle small block moves.  */
-#define MOVE_BY_PIECES_P(LEN, ALIGN)  0
-#define CAN_MOVE_BY_PIECES(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, MOVE_MAX_PIECES + 1) \
-   < (unsigned int) MOVE_RATIO (!optimize_size))
-
 /* Undo the effects of the movmem pattern presence on STORE_BY_PIECES_P .  */
 #define MOVE_RATIO(SPEED) ((SPEED) ? 15 : 3)
 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 4/7 sh] Deprecate *_BY_PIECES_P, move to hookized version
  2014-10-31 15:09                         ` James Greenhalgh
  2014-10-31 15:10                           ` [Patch 2/7 s390] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
  2014-10-31 15:10                           ` [Patch 1/7] Hookize *_BY_PIECES_P James Greenhalgh
@ 2014-10-31 15:11                           ` James Greenhalgh
  2014-11-01 23:27                             ` Kaz Kojima
  2014-10-31 15:11                           ` [Patch 3/7 arc] " James Greenhalgh
                                             ` (3 subsequent siblings)
  6 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:11 UTC (permalink / raw)
  To: gcc-patches; +Cc: aoliva, kkojima, olegendo

[-- Attachment #1: Type: text/plain, Size: 777 bytes --]


Hi,

This patch moves sh to TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.

For sh, STORE_BY_PIECES and SET_BY_PIECES share an implementation;
MOVE_BY_PIECES is different. Implement the hook accordingly, and use the default
implementation for CLEAR_BY_PIECES.

I tried building a compiler and there were no fires, but otherwise,
I have no reasonable way to test this patch. If one of the sh
maintainers wants to pick it up and test it, that would be much
appreciated.

Thanks,
James

---
gcc/

2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/sh/sh.c (TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): New.
	(sh_use_by_pieces_infrastructure_p): Likewise.
	* config/sh/sh.h (MOVE_BY_PIECES_P): Remove.
	(STORE_BY_PIECES_P): Likewise.
	(SET_BY_PIECES_P): Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0004-Patch-4-7-sh-Deprecate-_BY_PIECES_P-move-to-hookized.patch --]
[-- Type: text/x-patch;  name=0004-Patch-4-7-sh-Deprecate-_BY_PIECES_P-move-to-hookized.patch, Size: 2861 bytes --]

diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c
index 1dc1bf4..3bbbd23 100644
--- a/gcc/config/sh/sh.c
+++ b/gcc/config/sh/sh.c
@@ -338,6 +338,10 @@ static void sh_conditional_register_usage (void);
 static bool sh_legitimate_constant_p (machine_mode, rtx);
 static int mov_insn_size (machine_mode, bool);
 static int mov_insn_alignment_mask (machine_mode, bool);
+static bool sh_use_by_pieces_infrastructure_p (unsigned int,
+					       unsigned int,
+					       enum by_pieces_operation,
+					       bool);
 static bool sequence_insn_p (rtx_insn *);
 static void sh_canonicalize_comparison (int *, rtx *, rtx *, bool);
 static void sh_canonicalize_comparison (enum rtx_code&, rtx&, rtx&,
@@ -640,6 +644,10 @@ static const struct attribute_spec sh_attribute_table[] =
 #undef TARGET_FIXED_CONDITION_CODE_REGS
 #define TARGET_FIXED_CONDITION_CODE_REGS sh_fixed_condition_code_regs
 
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+  sh_use_by_pieces_infrastructure_p
+
 /* Machine-specific symbol_ref flags.  */
 #define SYMBOL_FLAG_FUNCVEC_FUNCTION	(SYMBOL_FLAG_MACH_DEP << 0)
 
@@ -13674,4 +13682,27 @@ sh_mode_priority (int entity ATTRIBUTE_UNUSED, int n)
   return ((TARGET_FPU_SINGLE != 0) ^ (n) ? FP_MODE_SINGLE : FP_MODE_DOUBLE);
 }
 
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */
+
+static bool
+sh_use_by_pieces_infrastructure_p (unsigned int size,
+				   unsigned int align,
+				   enum by_pieces_operation op,
+				   bool speed_p)
+{
+  switch (op)
+    {
+      case MOVE_BY_PIECES:
+	return move_by_pieces_ninsns (size, align, MOVE_MAX_PIECES + 1)
+	  < (!speed_p ? 2 : (align >= 32) ? 16 : 2);
+      case STORE_BY_PIECES:
+      case SET_BY_PIECES:
+	return move_by_pieces_ninsns (size, align, STORE_MAX_PIECES + 1)
+	  < (!speed_p ? 2 : (align >= 32) ? 16 : 2);
+      default:
+	return default_use_by_pieces_infrastructure_p (size, align,
+						       op, speed_p);
+    }
+}
+
 #include "gt-sh.h"
diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h
index 5b8b4a1..92835d7 100644
--- a/gcc/config/sh/sh.h
+++ b/gcc/config/sh/sh.h
@@ -1591,16 +1591,6 @@ struct sh_args {
 #define USE_STORE_PRE_DECREMENT(mode)    ((mode == SImode || mode == DImode) \
 					  ? 0 : TARGET_SH1)
 
-#define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, MOVE_MAX_PIECES + 1) \
-   < (optimize_size ? 2 : ((ALIGN >= 32) ? 16 : 2)))
-
-#define STORE_BY_PIECES_P(SIZE, ALIGN) \
-  (move_by_pieces_ninsns (SIZE, ALIGN, STORE_MAX_PIECES + 1) \
-   < (optimize_size ? 2 : ((ALIGN >= 32) ? 16 : 2)))
-
-#define SET_BY_PIECES_P(SIZE, ALIGN) STORE_BY_PIECES_P(SIZE, ALIGN)
-
 /* If a memory clear move would take CLEAR_RATIO or more simple
    move-instruction pairs, we will do a setmem instead.  */
 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 5/7 mips] Deprecate *_BY_PIECES_P, move to hookized version
  2014-10-31 15:09                         ` James Greenhalgh
                                             ` (3 preceding siblings ...)
  2014-10-31 15:11                           ` [Patch 3/7 arc] " James Greenhalgh
@ 2014-10-31 15:12                           ` James Greenhalgh
  2014-10-31 15:16                           ` [Patch 6/7 AArch64] " James Greenhalgh
  2014-10-31 15:34                           ` [Patch 7/7] Remove *_BY_PIECES_P James Greenhalgh
  6 siblings, 0 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:12 UTC (permalink / raw)
  To: gcc-patches; +Cc: clm, echristo, matthew.fortune

[-- Attachment #1: Type: text/plain, Size: 907 bytes --]


Hi,

This patch moves mips to TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.

For MIPS, this means adapting mips_move_by_pieces_p (and fixing the
long-standing comment "if this becomes a target hook, we should call
the default definition instead") and mips_store_by_pieces_p.

I tried building a compiler and there were no fires, but I don't have access
to any MIPS hardware, so if one of the MIPS maintainers wanted to pick
this up and test it, that would be very much appreciated.

OK?

Thanks,
James

---
gcc/

2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/mips/mips.h (MOVE_BY_PIECES_P): Remove.
	(STORE_BY_PIECES_P): Likewise.
	* config/mips/mips.c (TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): New.
	(mips_move_by_pieces_p): Rename to...
	(mips_use_by_pieces_infrastructure_p): ...this, use new hook
	parameters, use the default hook implementation as a
	fall-back.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0005-Patch-5-7-mips-Deprecate-_BY_PIECES_P-move-to-hookiz.patch --]
[-- Type: text/x-patch;  name=0005-Patch-5-7-mips-Deprecate-_BY_PIECES_P-move-to-hookiz.patch, Size: 2919 bytes --]

diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 3d9db92..ac7746c 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -7172,12 +7172,17 @@ mips_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
   return true;
 }
 \f
-/* Implement MOVE_BY_PIECES_P.  */
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */
 
 bool
-mips_move_by_pieces_p (unsigned HOST_WIDE_INT size, unsigned int align)
+mips_use_by_pieces_infrastructure_p (unsigned int size,
+				     unsigned int align,
+				     enum by_pieces_operation op,
+				     bool speed_p)
 {
-  if (HAVE_movmemsi)
+  if (op == STORE_BY_PIECES)
+    return mips_store_by_pieces_p (size, align);
+  if (op == MOVE_BY_PIECES && HAVE_movmemsi)
     {
       /* movmemsi is meant to generate code that is at least as good as
 	 move_by_pieces.  However, movmemsi effectively uses a by-pieces
@@ -7194,13 +7199,12 @@ mips_move_by_pieces_p (unsigned HOST_WIDE_INT size, unsigned int align)
 	return size < UNITS_PER_WORD;
       return size <= MIPS_MAX_MOVE_BYTES_STRAIGHT;
     }
-  /* The default value.  If this becomes a target hook, we should
-     call the default definition instead.  */
-  return (move_by_pieces_ninsns (size, align, MOVE_MAX_PIECES + 1)
-	  < (unsigned int) MOVE_RATIO (optimize_insn_for_speed_p ()));
+
+  return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
 }
 
-/* Implement STORE_BY_PIECES_P.  */
+/* Implement a handler for STORE_BY_PIECES operations
+   for TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */
 
 bool
 mips_store_by_pieces_p (unsigned HOST_WIDE_INT size, unsigned int align)
@@ -19119,6 +19123,10 @@ mips_lra_p (void)
 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
 
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+  mips_use_by_pieces_infrastructure_p
+
 #undef TARGET_SPILL_CLASS
 #define TARGET_SPILL_CLASS mips_spill_class
 #undef TARGET_LRA_P
diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h
index c7b998b..bf19920 100644
--- a/gcc/config/mips/mips.h
+++ b/gcc/config/mips/mips.h
@@ -2872,9 +2872,6 @@ while (0)
    ? MIPS_MAX_MOVE_BYTES_STRAIGHT / MOVE_MAX		\
    : MIPS_CALL_RATIO / 2)
 
-#define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  mips_move_by_pieces_p (SIZE, ALIGN)
-
 /* For CLEAR_RATIO, when optimizing for size, give a better estimate
    of the length of a memset call, but use the default otherwise.  */
 
@@ -2887,9 +2884,6 @@ while (0)
 
 #define SET_RATIO(speed) \
   ((speed) ? 15 : MIPS_CALL_RATIO - 2)
-
-#define STORE_BY_PIECES_P(SIZE, ALIGN) \
-  mips_store_by_pieces_p (SIZE, ALIGN)
 \f
 /* Since the bits of the _init and _fini function is spread across
    many object files, each potentially with its own GP, we must assume

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 6/7 AArch64] Deprecate *_BY_PIECES_P, move to hookized version
  2014-10-31 15:09                         ` James Greenhalgh
                                             ` (4 preceding siblings ...)
  2014-10-31 15:12                           ` [Patch 5/7 mips] " James Greenhalgh
@ 2014-10-31 15:16                           ` James Greenhalgh
  2014-10-31 15:34                           ` [Patch 7/7] Remove *_BY_PIECES_P James Greenhalgh
  6 siblings, 0 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:16 UTC (permalink / raw)
  To: gcc-patches; +Cc: marcus.shawcroft, richard.earnshaw

[-- Attachment #1: Type: text/plain, Size: 626 bytes --]


Hi,

This patch moves aarch64 to TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.

AArch64 turns off STORE_BY_PIECES, so honour that and use the default
implementation for other operations.

Unlike the other patches in this series I do actually have some
hardware for AArch64! So this one has been through a bootstrap run
with no issues.

OK?

Cheers,
James

---
gcc/

2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64.c
	(aarch64_use_by_pieces_infrastructure_p): New.
	(TARGET_USE_BY_PIECES_INFRASTRUCTURE_P): Likewise.
	* config/aarch64/aarch64.h (STORE_BY_PIECES_P): Delete.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0006-Patch-6-7-AArch64-Deprecate-_BY_PIECES_P-move-to-hoo.patch --]
[-- Type: text/x-patch;  name=0006-Patch-6-7-AArch64-Deprecate-_BY_PIECES_P-move-to-hoo.patch, Size: 2148 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0400fd5..9aeac7c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10001,6 +10001,22 @@ aarch64_asan_shadow_offset (void)
   return (HOST_WIDE_INT_1 << 36);
 }
 
+static bool
+aarch64_use_by_pieces_infrastructure_p (unsigned int size,
+					unsigned int align,
+					enum by_pieces_operation op,
+					bool speed_p)
+{
+  /* STORE_BY_PIECES can be used when copying a constant string, but
+     in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
+     For now we always fail this and let the move_by_pieces code copy
+     the string from read-only memory.  */
+  if (op == STORE_BY_PIECES)
+    return false;
+
+  return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -10253,6 +10269,10 @@ aarch64_asan_shadow_offset (void)
 #undef TARGET_LEGITIMIZE_ADDRESS
 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
 
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+  aarch64_use_by_pieces_infrastructure_p
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-aarch64.h"
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 97b1848..e22163e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -723,12 +723,6 @@ do {									     \
 #define SET_RATIO(speed) \
   ((speed) ? 15 : AARCH64_CALL_RATIO - 2)
 
-/* STORE_BY_PIECES_P can be used when copying a constant string, but
-   in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
-   For now we always fail this and let the move_by_pieces code copy
-   the string from read-only memory.  */
-#define STORE_BY_PIECES_P(SIZE, ALIGN) 0
-
 /* Disable auto-increment in move_by_pieces et al.  Use of auto-increment is
    rarely a good idea in straight-line code since it adds an extra address
    dependency between each instruction.  Better to use incrementing offsets.  */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patch 7/7] Remove *_BY_PIECES_P
  2014-10-31 15:09                         ` James Greenhalgh
                                             ` (5 preceding siblings ...)
  2014-10-31 15:16                           ` [Patch 6/7 AArch64] " James Greenhalgh
@ 2014-10-31 15:34                           ` James Greenhalgh
  6 siblings, 0 replies; 62+ messages in thread
From: James Greenhalgh @ 2014-10-31 15:34 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, stevenb.gcc, law, matthew.fortune

[-- Attachment #1: Type: text/plain, Size: 899 bytes --]


Hi,

This final patch gets rid of all the *_BY_PIECES_P macros.

Bootstrapped on x86_64, ARM and AArch64.

Thanks,
James

---
gcc/

2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>

	* doc/tm.texi.in (MOVE_BY_PIECES_P): Remove.
	(CLEAR_BY_PIECES_P): Likewise.
	(SET_BY_PIECES_P): Likewise.
	(STORE_BY_PIECES_P): Likewise.
	* doc/tm.texi: Regenerate.
	* system.h: Poison MOVE_BY_PIECES_P, CLEAR_BY_PIECES_P,
	SET_BY_PIECES_P, STORE_BY_PIECES_P.
	* expr.c (MOVE_BY_PIECES_P): Remove.
	(CLEAR_BY_PIECES_P): Likewise.
	(SET_BY_PIECES_P): Likewise.
	(STORE_BY_PIECES_P): Likewise.
	(can_move_by_pieces): Rewrite in terms of
	targetm.use_by_pieces_infrastructure_p.
	(emit_block_move_hints): Likewise.
	(can_store_by_pieces): Likewise.
	(store_by_pieces): Likewise.
	(clear_storage_hints): Likewise.
	(emit_push_insn): Likewise.
	(expand_constructor): Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0007-Patch-7-7-Remove-_BY_PIECES_P.patch --]
[-- Type: text/x-patch; name=0007-Patch-7-7-Remove-_BY_PIECES_P.patch, Size: 12000 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index cfb8388..0d1f149 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6123,16 +6123,6 @@ optimized for speed rather than size.
 If you don't define this, a reasonable default is used.
 @end defmac
 
-@defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{move_by_pieces} will be used to
-copy a chunk of memory, or whether some other block move mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
 @deftypefn {Target Hook} bool TARGET_USE_BY_PIECES_INFRASTRUCTURE_P (unsigned int @var{size}, unsigned int @var{alignment}, enum by_pieces_operation @var{op}, bool @var{speed_p})
 GCC will attempt several strategies when asked to copy between
 two areas of memory, or to set, clear or store to memory, for example
@@ -6184,16 +6174,6 @@ optimized for speed rather than size.
 If you don't define this, a reasonable default is used.
 @end defmac
 
-@defmac CLEAR_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{clear_by_pieces} will be used
-to clear a chunk of memory, or whether some other block clear mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{CLEAR_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
 @defmac SET_RATIO (@var{speed})
 The threshold of number of scalar move insns, @emph{below} which a sequence
 of insns should be generated to set memory to a constant value, instead of
@@ -6207,30 +6187,6 @@ optimized for speed rather than size.
 If you don't define this, it defaults to the value of @code{MOVE_RATIO}.
 @end defmac
 
-@defmac SET_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{store_by_pieces} will be
-used to set a chunk of memory to a constant value, or whether some
-other mechanism will be used.  Used by @code{__builtin_memset} when
-storing values other than constant zero.
-Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{SET_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
-@defmac STORE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{store_by_pieces} will be
-used to set a chunk of memory to a constant string value, or whether some
-other mechanism will be used.  Used by @code{__builtin_strcpy} when
-called with a constant source string.
-Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
 @defmac USE_LOAD_POST_INCREMENT (@var{mode})
 A C expression used to determine whether a load postincrement is a good
 thing to use for a given mode.  Defaults to the value of
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3f66543..679b3d1 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4600,16 +4600,6 @@ optimized for speed rather than size.
 If you don't define this, a reasonable default is used.
 @end defmac
 
-@defmac MOVE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{move_by_pieces} will be used to
-copy a chunk of memory, or whether some other block move mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
 @hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
 
 @defmac MOVE_MAX_PIECES
@@ -4629,16 +4619,6 @@ optimized for speed rather than size.
 If you don't define this, a reasonable default is used.
 @end defmac
 
-@defmac CLEAR_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{clear_by_pieces} will be used
-to clear a chunk of memory, or whether some other block clear mechanism
-will be used.  Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{CLEAR_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
 @defmac SET_RATIO (@var{speed})
 The threshold of number of scalar move insns, @emph{below} which a sequence
 of insns should be generated to set memory to a constant value, instead of
@@ -4652,30 +4632,6 @@ optimized for speed rather than size.
 If you don't define this, it defaults to the value of @code{MOVE_RATIO}.
 @end defmac
 
-@defmac SET_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{store_by_pieces} will be
-used to set a chunk of memory to a constant value, or whether some
-other mechanism will be used.  Used by @code{__builtin_memset} when
-storing values other than constant zero.
-Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{SET_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
-@defmac STORE_BY_PIECES_P (@var{size}, @var{alignment})
-A C expression used to determine whether @code{store_by_pieces} will be
-used to set a chunk of memory to a constant string value, or whether some
-other mechanism will be used.  Used by @code{__builtin_strcpy} when
-called with a constant source string.
-Defaults to 1 if @code{move_by_pieces_ninsns} returns less
-than @code{MOVE_RATIO}.
-
-This macro is deprecated.  New ports should implement
-@code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P} instead.
-@end defmac
-
 @defmac USE_LOAD_POST_INCREMENT (@var{mode})
 A C expression used to determine whether a load postincrement is a good
 thing to use for a given mode.  Defaults to the value of
diff --git a/gcc/expr.c b/gcc/expr.c
index ef85177..af42b61 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -167,37 +167,6 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, rtx, int);
 static rtx const_vector_from_tree (tree);
 static void write_complex_part (rtx, rtx, bool);
 
-/* This macro is used to determine whether move_by_pieces should be called
-   to perform a structure copy.  */
-#ifndef MOVE_BY_PIECES_P
-#define MOVE_BY_PIECES_P(SIZE, ALIGN) \
-  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, MOVE_BY_PIECES, \
-					   optimize_insn_for_speed_p ()))
-#endif
-
-/* This macro is used to determine whether clear_by_pieces should be
-   called to clear storage.  */
-#ifndef CLEAR_BY_PIECES_P
-#define CLEAR_BY_PIECES_P(SIZE, ALIGN) \
-  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, CLEAR_BY_PIECES, \
-					   optimize_insn_for_speed_p ()))
-#endif
-
-/* This macro is used to determine whether store_by_pieces should be
-   called to "memset" storage with byte values other than zero.  */
-#ifndef SET_BY_PIECES_P
-#define SET_BY_PIECES_P(SIZE, ALIGN) \
-  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, SET_BY_PIECES, \
-					   optimize_insn_for_speed_p ()))
-#endif
-
-/* This macro is used to determine whether store_by_pieces should be
-   called to "memcpy" storage when the source is a constant string.  */
-#ifndef STORE_BY_PIECES_P
-#define STORE_BY_PIECES_P(SIZE, ALIGN) \
-  (targetm.use_by_pieces_infrastructure_p (SIZE, ALIGN, STORE_BY_PIECES, \
-					   optimize_insn_for_speed_p ()))
-#endif
 \f
 /* This is run to set up which modes can be used
    directly in memory and to initialize the block move optab.  It is run
@@ -832,10 +801,11 @@ widest_int_mode_for_size (unsigned int size)
    succeed.  */
 
 int
-can_move_by_pieces (unsigned HOST_WIDE_INT len ATTRIBUTE_UNUSED,
-		    unsigned int align ATTRIBUTE_UNUSED)
+can_move_by_pieces (unsigned HOST_WIDE_INT len,
+		    unsigned int align)
 {
-  return MOVE_BY_PIECES_P (len, align);
+  return targetm.use_by_pieces_infrastructure_p (len, align, MOVE_BY_PIECES,
+						 optimize_insn_for_speed_p ());
 }
 
 /* Generate several move instructions to copy LEN bytes from block FROM to
@@ -1172,7 +1142,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       set_mem_size (y, INTVAL (size));
     }
 
-  if (CONST_INT_P (size) && MOVE_BY_PIECES_P (INTVAL (size), align))
+  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
     move_by_pieces (x, y, INTVAL (size), align, 0);
   else if (emit_block_move_via_movmem (x, y, size, align,
 				       expected_align, expected_size,
@@ -2489,9 +2459,11 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
   if (len == 0)
     return 1;
 
-  if (! (memsetp
-	 ? SET_BY_PIECES_P (len, align)
-	 : STORE_BY_PIECES_P (len, align)))
+  if (!targetm.use_by_pieces_infrastructure_p (len, align,
+					       memsetp
+						 ? SET_BY_PIECES
+						 : STORE_BY_PIECES,
+					       optimize_insn_for_speed_p ()))
     return 0;
 
   align = alignment_for_piecewise_move (STORE_MAX_PIECES, align);
@@ -2567,9 +2539,13 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
       return to;
     }
 
-  gcc_assert (memsetp
-	      ? SET_BY_PIECES_P (len, align)
-	      : STORE_BY_PIECES_P (len, align));
+  gcc_assert (targetm.use_by_pieces_infrastructure_p
+		(len, align,
+		 memsetp
+		   ? SET_BY_PIECES
+		   : STORE_BY_PIECES,
+		 optimize_insn_for_speed_p ()));
+
   data.constfun = constfun;
   data.constfundata = constfundata;
   data.len = len;
@@ -2806,7 +2782,9 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method,
   align = MEM_ALIGN (object);
 
   if (CONST_INT_P (size)
-      && CLEAR_BY_PIECES_P (INTVAL (size), align))
+      && targetm.use_by_pieces_infrastructure_p (INTVAL (size), align,
+						 CLEAR_BY_PIECES,
+						 optimize_insn_for_speed_p ()))
     clear_by_pieces (object, INTVAL (size), align);
   else if (set_storage_via_setmem (object, size, const0_rtx, align,
 				   expected_align, expected_size,
@@ -4215,7 +4193,7 @@ emit_push_insn (rtx x, machine_mode mode, tree type, rtx size,
 	  && CONST_INT_P (size)
 	  && skip == 0
 	  && MEM_ALIGN (xinner) >= align
-	  && (MOVE_BY_PIECES_P ((unsigned) INTVAL (size) - used, align))
+	  && can_move_by_pieces ((unsigned) INTVAL (size) - used, align)
 	  /* Here we avoid the case of a structure whose weak alignment
 	     forces many pushes of a small amount of data,
 	     and such small pushes do rounding that causes trouble.  */
@@ -7836,7 +7814,7 @@ expand_constructor (tree exp, rtx target, enum expand_modifier modifier,
 	    && ! (target != 0 && safe_from_p (target, exp, 1)))
 		  || TREE_ADDRESSABLE (exp)
 		  || (tree_fits_uhwi_p (TYPE_SIZE_UNIT (type))
-		      && (! MOVE_BY_PIECES_P
+		      && (! can_move_by_pieces
 				     (tree_to_uhwi (TYPE_SIZE_UNIT (type)),
 				      TYPE_ALIGN (type)))
 		      && ! mostly_zeros_p (exp))))
diff --git a/gcc/system.h b/gcc/system.h
index dbe1ceb..74ddfe9 100644
--- a/gcc/system.h
+++ b/gcc/system.h
@@ -847,7 +847,9 @@ extern void fancy_abort (const char *, int, const char *) ATTRIBUTE_NORETURN;
 	HOT_TEXT_SECTION_NAME LEGITIMATE_CONSTANT_P ALWAYS_STRIP_DOTDOT	\
 	OUTPUT_ADDR_CONST_EXTRA SMALL_REGISTER_CLASSES ASM_OUTPUT_IDENT	\
 	ASM_BYTE_OP MEMBER_TYPE_FORCES_BLK LIBGCC2_HAS_SF_MODE		\
-	LIBGCC2_HAS_DF_MODE LIBGCC2_HAS_XF_MODE LIBGCC2_HAS_TF_MODE
+	LIBGCC2_HAS_DF_MODE LIBGCC2_HAS_XF_MODE LIBGCC2_HAS_TF_MODE	\
+	CLEAR_BY_PIECES_P MOVE_BY_PIECES_P SET_BY_PIECES_P		\
+	STORE_BY_PIECES_P
 
 /* Target macros only used for code built for the target, that have
    moved to libgcc-tm.h or have never been present elsewhere.  */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 1/7] Hookize *_BY_PIECES_P
  2014-10-31 15:10                           ` [Patch 1/7] Hookize *_BY_PIECES_P James Greenhalgh
@ 2014-10-31 21:08                             ` Jeff Law
  0 siblings, 0 replies; 62+ messages in thread
From: Jeff Law @ 2014-10-31 21:08 UTC (permalink / raw)
  To: James Greenhalgh, gcc-patches
  Cc: richard.guenther, stevenb.gcc, matthew.fortune

On 10/31/14 09:08, James Greenhalgh wrote:
>
> Hi,
>
> This patch prepares for removing all the *BY_PIECES_P macros by
> introducing a new target hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.
>
> Tested on ARM/AArch64/x86_64 with no issues.
>
> Ok for trunk?
>
> Thanks,
> James
>
> ---
> gcc/
>
> 2014-10-31  James Greenhalgh  <james.greenhalgh@arm.com>
>
> 	* target.def (use_by_pieces_infrastructure_p): New.
> 	* doc/tm.texi.in (MOVE_BY_PIECES_P): Describe that this macro
> 	is deprecated.
> 	(STORE_BY_PIECES_P): Likewise.
> 	(CLEAR_BY_PIECES_P): Likewise.
> 	(SET_BY_PIECES_P): Likewise.
> 	(TARGET_MOVE_BY_PIECES_PROFITABLE_P): Add hook.
> 	* doc/tm.texi: Regenerate.
> 	* expr.c (MOVE_BY_PIECES_P): Rewrite in terms of
> 	TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.
> 	(STORE_BY_PIECES_P): Likewise.
> 	(CLEAR_BY_PIECES_P): Likewise.
> 	(SET_BY_PIECES_P): Likewise.
> 	(STORE_MAX_PIECES): Move to...
> 	* defaults.h (STORE_MAX_PIECES): ...here.
> 	* targhooks.c (get_move_ratio): New.
> 	(default_use_by_pieces_infrastructure_p): Likewise.
> 	* targhooks.h (default_use_by_pieces_infrastructure_p): New.
> 	* target.h (by_pieces_operation): New.
This isn't radically different than what's been reviewed, thankfully :-)

All 7 components approved.

jeff

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 4/7 sh] Deprecate *_BY_PIECES_P, move to hookized version
  2014-10-31 15:11                           ` [Patch 4/7 sh] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
@ 2014-11-01 23:27                             ` Kaz Kojima
  0 siblings, 0 replies; 62+ messages in thread
From: Kaz Kojima @ 2014-11-01 23:27 UTC (permalink / raw)
  To: james.greenhalgh; +Cc: gcc-patches

James Greenhalgh <james.greenhalgh@arm.com> wrote:
> I tried building a compiler and there were no fires, but otherwise,
> I have no reasonable way to test this patch. If one of the sh
> maintainers wants to pick it up and test it, that would be much
> appreciated.

SH portion looks fine.  No new failures with the top level
"make -k check" on sh4-unknown-linux-gnu. 

Regards,
        kaz

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 3/7 arc] Deprecate *_BY_PIECES_P, move to hookized version
  2014-10-31 15:11                           ` [Patch 3/7 arc] " James Greenhalgh
@ 2014-11-04 12:08                             ` Joern Rennecke
  2014-11-04 14:24                               ` James Greenhalgh
  0 siblings, 1 reply; 62+ messages in thread
From: Joern Rennecke @ 2014-11-04 12:08 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

On 31 October 2014 15:10, James Greenhalgh <james.greenhalgh@arm.com> wrote:

> While I am there, arc defines a macro CAN_MOVE_BY_PIECES, which is
> unused, so clean that up too.

That's not a clean-up.  This pertains to PR 39350.
Which, incidentally, this hookization completely ignores, entrenching the
conflation of move expander and move cost estimates.
Thus, can_move_by_pieces gives the wrong result for purposes of rtl
optimizations when a target-specific movmem etc expander emits
target-specific code.
The patch at https://gcc.gnu.org/ml/gcc-patches/2009-03/txt00018.txt
shows a number of call sites that are affected.
>
> arc only implements MOVE_BY_PIECES_P, wiring it to false. Mirror that
> behaviour, and use the default hook for other by_pieces operations.
>
> I tried building a compiler but no amount of fiddling with target
> strings got me to a sensible result, so this patch is completely
> untested.

You could just pick one of the configs in contrib/config-list.mk

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 3/7 arc] Deprecate *_BY_PIECES_P, move to hookized version
  2014-11-04 12:08                             ` Joern Rennecke
@ 2014-11-04 14:24                               ` James Greenhalgh
  2014-11-04 16:20                                 ` Joern Rennecke
  0 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-11-04 14:24 UTC (permalink / raw)
  To: Joern Rennecke; +Cc: GCC Patches

On Tue, Nov 04, 2014 at 12:07:56PM +0000, Joern Rennecke wrote:
> On 31 October 2014 15:10, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> 
> > While I am there, arc defines a macro CAN_MOVE_BY_PIECES, which is
> > unused, so clean that up too.
> 
> That's not a clean-up.  This pertains to PR 39350.

Well, it is a clean-up in the sense that this macro is completely unused
in the compiler and has no effect, but please revert this hunk if that
is your preference.

> Which, incidentally, this hookization completely ignores, entrenching
> the conflation of move expander and move cost estimates.

No, I have to disagree. The use_by_pieces_infrastructure_p hook doesn't
conflate anything - it gives a response to the question "Should the
by_pieces infrastructure be used?". A target specific movmem pattern
- though it might itself choose to move things by pieces, is
categorically not using the move_by_pieces infrastructure.

If we want to keep a clean separation of concerns here, we would
want a similar target hook asking the single question "will your
movmem/setmem expander succeed?".
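
(A minimal sketch, under stated assumptions, of how such a query might be
declared in target.def - the hook name, signature and default below are
illustrative only and not part of any posted patch:)

/* Illustrative sketch only; nothing with this name exists in the tree.  */
DEFHOOK
(can_expand_mem_op_p,
 "Return true if the @code{movmem} or @code{setmem} expander for this\n\
target is expected to succeed for an operation of @var{size} storage\n\
units at the given @var{alignment}.  @var{op} selects the kind of\n\
operation, as for @code{TARGET_USE_BY_PIECES_INFRASTRUCTURE_P}.",
 bool, (unsigned HOST_WIDE_INT size, unsigned int alignment,
        enum by_pieces_operation op),
 default_can_expand_mem_op_p)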

> Thus, can_move_by_pieces gives the wrong result for purposes of rtl
> optimizations
> when a target-specific movmem etc expander emits target-specific code.
> The patch at https://gcc.gnu.org/ml/gcc-patches/2009-03/txt00018.txt
> shows a number of call sites that are affected.

can_move_by_pieces (likewise can_store_by_pieces) gives the right
result; the RTL expanders are using it wrong.

I disagree with the approach taken in your patch as it overloads the
purpose of can_move_by_pieces. However, I would support a patch pulling
this out in to two hooks, so the call in
value-prof.c:gimple_stringops_transform would change from:

  if (!can_move_by_pieces (val, MIN (dest_align, src_align)))
    return false;

to something like:

  if (!can_move_by_pieces (val, MIN (dest_align, src_align))
      && !targetm.can_expand_mem_op_p (val, MIN (dest_align, src_align),
				       MOVE_BY_PIECES))
    return false;

But let's not confuse the use of what should be a simple hook!
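
(Purely as a sketch under the same assumptions, a conservative default for
the hypothetical targetm.can_expand_mem_op_p used above could simply
decline, leaving behaviour unchanged for ports that do not implement it:)

/* Hypothetical default for the sketch above: assume the movmem/setmem
   expander will not succeed, so callers keep relying on the existing
   can_move_by_pieces / can_store_by_pieces checks.  */

bool
default_can_expand_mem_op_p (unsigned HOST_WIDE_INT size ATTRIBUTE_UNUSED,
			     unsigned int alignment ATTRIBUTE_UNUSED,
			     enum by_pieces_operation op ATTRIBUTE_UNUSED)
{
  return false;
}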

> > arc only implements MOVE_BY_PIECES_P, wiring it to false. Mirror that
> > behaviour, and use the default hook for other by_pieces operations.
> >
> > I tried building a compiler but no amount of fiddling with target
> > strings got me to a sensible result, so this patch is completely
> > untested.
> 
> You could just pick one of the configs in contrib/config-list.mk

Digging into this, my scripts like to integrate a GDB build - which
doesn't work for arc-unknown-elf. I've been following the builds on
Jan-Benedict Glaw's build robot since I put the patches in on Saturday morning,
and it doesn't look like I broke anything for arc. If I have any more
arc patches I'll keep this in mind.

Thanks,
James

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patch 3/7 arc] Deprecate *_BY_PIECES_P, move to hookized version
  2014-11-04 14:24                               ` James Greenhalgh
@ 2014-11-04 16:20                                 ` Joern Rennecke
  0 siblings, 0 replies; 62+ messages in thread
From: Joern Rennecke @ 2014-11-04 16:20 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

On 4 November 2014 14:24, James Greenhalgh <james.greenhalgh@arm.com> wrote:
> On Tue, Nov 04, 2014 at 12:07:56PM +0000, Joern Rennecke wrote:
>> On 31 October 2014 15:10, James Greenhalgh <james.greenhalgh@arm.com> wrote:
>>
>> > While I am there, arc defines a macro CAN_MOVE_BY_PIECES, which is
>> > unused, so clean that up too.
>>
>> That's not a clean-up.  This pertains to PR 39350.
>
> Well, it is a clean-up in the sense that this macro is completely unused
> in the compiler and has no effect, but please revert this hunk if that
> is your preference.
>
>> Which, incidentally, this hookization completely ignores, entrenching
>> the conflation of move expander and move cost estimates.
>
> No, I have to disagree. The use_by_pieces_infrastructure_p hook doesn't
> conflate anything - it gives a response to the question "Should the
> by_pieces infrastructure be used?". A target specific movmem pattern
> - though it might itself choose to move things by pieces, is
> categorically not using the move_by_pieces infrastructure.
>
> If we want to keep a clean separation of concerns here, we would
> want a similar target hook asking the single question "will your
> movmem/setmem expander succeed?".

That would not be helpful.  What the rtl optimizers actually want to know is
"will this block copy / memset be cheap?".
A movmem expander might succeed (or not) for various reasons.  The one that's
interesting for the above question is whether the call has been inlined with
a fast set of instructions.

>> Thus, can_move_by_pieces gives the wrong result for purposes of rtl
>> optimizations
>> when a target-specific movmem etc expander emits target-specific code.
>> The patch at https://gcc.gnu.org/ml/gcc-patches/2009-03/txt00018.txt
>> shows a number of call sites that are affected.
>
> can_move_by_pieces (likewise can_store_by_pieces) gives the right
> result, the RTL expanders are using it wrong.

I could agree with that view if there were an agreed strategy for what the rtl
expanders should do instead.

> I disagree with the approach taken in your patch as it overloads the
> purpose of can_move_by_pieces. However, I would support a patch pulling
> this out in to two hooks, so the call in
> value-prof.c:gimple_stringops_transform would change from:
>
>   if (!can_move_by_pieces (val, MIN (dest_align, src_align)))
>     return false;
>
> to something like:
>
>   if (!can_move_by_pieces (val, MIN (dest_align, src_align))
>       && !targetm.can_expand_mem_op_p (val, MIN (dest_align, src_align),
>                                        MOVE_BY_PIECES))
>     return false;

But this goes back to the problem that it's not about whether we can expand the
mem op at all, but whether we have a fast expansion.  We can always expand via
libcall (the middle end does this as a fall-back).  Also, the target might do
some target-specific slow expansion, e.g. call a function with another name and
maybe a modified ABI, which still works but is relatively slow.

So, either the new hook would answer the wrong question, or it would be
misnamed, in which case it's likely that the semantics will sooner or later
follow the name and it will gravitate to answering the wrong question again.

> But let's not confuse the use of what should be a simple hook!

What would that be?  TARGET_RTX_COST is unsuitable because the RTL
for the call hasn't been made yet, and if it was, it would tend to be multiple
instructions, maybe even a loop.
Should we have an analogous TARGET_TREE_COST hook, so that you can ask the
target what it thinks the cost of a tree will be once it's expanded?

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [Patchv3] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-10-31 10:58                       ` Richard Biener
@ 2014-11-06 11:53                         ` James Greenhalgh
  2014-11-06 14:10                           ` Richard Biener
  0 siblings, 1 reply; 62+ messages in thread
From: James Greenhalgh @ 2014-11-06 11:53 UTC (permalink / raw)
  To: gcc-patches; +Cc: richard.guenther, pinskia

[-- Attachment #1: Type: text/plain, Size: 1818 bytes --]


On Fri, Oct 31, 2014 at 10:46:12AM +0000, Richard Biener wrote:
> On Wed, Oct 29, 2014 at 3:39 PM, James Greenhalgh wrote:
> >> I suppose I could port any target with a definition of MOVE_RATIO to
> >> override the default parameter value in their option overriding code,
> >> but that makes this a very large patch set (many targets define
> >> MOVE_RATIO).
> >>
> >> Is this an avenue worth exploring? I agree the very special target
> >> hook is not ideal.
> >
> > Hi,
> >
> > Did you have any further thoughts on this? I'm still unable to come up
> > with a way to set these parameters which allows them to default to their
> > current (MOVE_RATIO derived) values.
> >
> > If the only way to make this work is to add code to
> > TARGET_OPTION_OVERRIDE for all targets that define MOVE_RATIO, then I
> > suppose I can do that, but I'd prefer a neater way to make it work, if
> > you can think of one.
>
> Maybe instead of putting the code in opts.c put it right before we call
> targetm.target_option.override () in toplev.c:process_options.  With
> a comment on why it cannot be in opts.c.

Ah, that makes sense. The (much simplified) patch then looks like
this...

Bootstrapped on x86_64, ARM and AArch64 with no issues.

OK?

Thanks,
James

---
gcc/

2014-11-06  James Greenhalgh  <james.greenhalgh@arm.com>

	* params.def (sra-max-scalarization-size-Ospeed): New.
	(sra-max-scalarization-size-Osize): Likewise.
	* doc/invoke.texi (sra-max-scalarization-size-Ospeed): Document.
	(sra-max-scalarization-size-Osize): Likewise.
	* toplev.c (process_options): Set default values for new
	parameters.
	* tree-sra.c (analyze_all_variable_accesses): Use new parameters.
	* targhooks.c (get_move_ratio): Remove static designator.
	* target.h (get_move_ratio): Declare.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Patchv3-Control-SRA-and-IPA-SRA-by-a-param-rather-th.patch --]
[-- Type: text/x-patch;  name=0001-Patchv3-Control-SRA-and-IPA-SRA-by-a-param-rather-th.patch, Size: 5162 bytes --]

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 792f25b..fcc5c89 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10403,6 +10403,16 @@ parameters only when their cumulative size is less or equal to
 @option{ipa-sra-ptr-growth-factor} times the size of the original
 pointer parameter.
 
+@item sra-max-scalarization-size-Ospeed
+@item sra-max-scalarization-size-Osize
+The two Scalar Reduction of Aggregates passes (SRA and IPA-SRA) aim to
+replace scalar parts of aggregates with uses of independent scalar
+variables.  These parameters control the maximum size, in storage units,
+of an aggregate which will be considered for replacement when compiling for
+speed
+(@option{sra-max-scalarization-size-Ospeed}) or size
+(@option{sra-max-scalarization-size-Osize}) respectively.
+
 @item tm-max-aggregate-size
 When making copies of thread-local variables in a transaction, this
 parameter specifies the size in bytes after which variables are
diff --git a/gcc/params.def b/gcc/params.def
index beff7e6..7cba3b3 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -950,6 +950,18 @@ DEFPARAM (PARAM_TM_MAX_AGGREGATE_SIZE,
 	  "pairs",
 	  9, 0, 0)
 
+DEFPARAM (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
+	  "sra-max-scalarization-size-Ospeed",
+	  "Maximum size, in storage units, of an aggregate which should be "
+	  "considered for scalarization when compiling for speed",
+	  0, 0, 0)
+
+DEFPARAM (PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE,
+	  "sra-max-scalarization-size-Osize",
+	  "Maximum size, in storage units, of an aggregate which should be "
+	  "considered for scalarization when compiling for size",
+	  0, 0, 0)
+
 DEFPARAM (PARAM_IPA_CP_VALUE_LIST_SIZE,
 	  "ipa-cp-value-list-size",
 	  "Maximum size of a list of values associated with each parameter for "
diff --git a/gcc/target.h b/gcc/target.h
index 40d7841..65250ed 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -102,6 +102,10 @@ extern int elf_record_gcc_switches (print_switch_type type, const char *);
    we disable such optimizations on such targets, using this function.  */
 extern bool target_default_pointer_address_modes_p (void);
 
+/* For hooks which use the MOVE_RATIO macro, this gives the legacy default
+   behaviour.  */
+extern unsigned int get_move_ratio (bool);
+
 struct stdarg_info;
 struct spec_info_def;
 struct hard_reg_set_container;
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index eef3d45..d798217 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1409,7 +1409,7 @@ default_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
 /* For hooks which use the MOVE_RATIO macro, this gives the legacy default
    behaviour.  SPEED_P is true if we are compiling for speed.  */
 
-static unsigned int
+unsigned int
 get_move_ratio (bool speed_p ATTRIBUTE_UNUSED)
 {
   unsigned int move_ratio;
diff --git a/gcc/toplev.c b/gcc/toplev.c
index 2c570d4..71589f2 100644
--- a/gcc/toplev.c
+++ b/gcc/toplev.c
@@ -1265,6 +1265,20 @@ process_options (void)
      so we can correctly initialize debug output.  */
   no_backend = lang_hooks.post_options (&main_input_filename);
 
+  /* Set default values for parameters relating to the Scalar Reduction
+     of Aggregates passes (SRA and IPA-SRA).  We must do this here, rather
+     than in opts.c:default_options_optimization as historically these
+     tuning heuristics have been based on MOVE_RATIO, which on some
+     targets requires other symbols from the backend.  */
+  maybe_set_param_value
+    (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED,
+     get_move_ratio (true) * UNITS_PER_WORD,
+     global_options.x_param_values, global_options_set.x_param_values);
+  maybe_set_param_value
+    (PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE,
+     get_move_ratio (false) * UNITS_PER_WORD,
+     global_options.x_param_values, global_options_set.x_param_values);
+
   /* Some machines may reject certain combinations of options.  */
   targetm.target_option.override ();
 
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index b723ca5..1e629bc 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -2511,10 +2511,12 @@ analyze_all_variable_accesses (void)
   int res = 0;
   bitmap tmp = BITMAP_ALLOC (NULL);
   bitmap_iterator bi;
-  unsigned i, max_total_scalarization_size;
-
-  max_total_scalarization_size = UNITS_PER_WORD * BITS_PER_UNIT
-    * MOVE_RATIO (optimize_function_for_speed_p (cfun));
+  unsigned i;
+  unsigned max_scalarization_size
+    = (optimize_function_for_size_p (cfun)
+	? PARAM_VALUE (PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE)
+	: PARAM_VALUE (PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED))
+      * BITS_PER_UNIT;
 
   EXECUTE_IF_SET_IN_BITMAP (candidate_bitmap, 0, i, bi)
     if (bitmap_bit_p (should_scalarize_away_bitmap, i)
@@ -2526,7 +2528,7 @@ analyze_all_variable_accesses (void)
 	    && type_consists_of_records_p (TREE_TYPE (var)))
 	  {
 	    if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var)))
-		<= max_total_scalarization_size)
+		<= max_scalarization_size)
 	      {
 		completely_scalarize_var (var);
 		if (dump_file && (dump_flags & TDF_DETAILS))

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [Patchv3] Control SRA and IPA-SRA by a param rather than MOVE_RATIO
  2014-11-06 11:53                         ` [Patchv3] " James Greenhalgh
@ 2014-11-06 14:10                           ` Richard Biener
  0 siblings, 0 replies; 62+ messages in thread
From: Richard Biener @ 2014-11-06 14:10 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, Andrew Pinski

On Thu, Nov 6, 2014 at 12:53 PM, James Greenhalgh
<james.greenhalgh@arm.com> wrote:
>
> On Fri, Oct 31, 2014 at 10:46:12AM +0000, Richard Biener wrote:
>> On Wed, Oct 29, 2014 at 3:39 PM, James Greenhalgh wrote:
>> >> I suppose I could port any target with a definition of MOVE_RATIO to
>> >> override the default parameter value in their option overriding code,
>> >> but that makes this a very large patch set (many targets define
>> >> MOVE_RATIO).
>> >>
>> >> Is this an avenue worth exploring? I agree the very special target
>> >> hook is not ideal.
>> >
>> > Hi,
>> >
>> > Did you have any further thoughts on this? I'm still unable to come up
>> > with a way to set these parameters which allows them to default to their
>> > current (MOVE_RATIO derived) values.
>> >
>> > If the only way to make this work is to add code to
>> > TARGET_OPTION_OVERRIDE for all targets that define MOVE_RATIO, then I
>> > suppose I can do that, but I'd prefer a neater way to make it work, if
>> > you can think of one.
>>
>> Maybe instead of putting the code in opts.c put it right before we call
>> targetm.target_option.override () in toplev.c:process_options.  With
>> a comment on why it cannot be in opts.c.
>
> Ah, that makes sense. The (much simplified) patch then looks like
> this...
>
> Bootstrapped on x86_64, ARM and AArch64 with no issues.
>
> OK?

Ok.

Thanks,
Richard.

> Thanks,
> James
>
> ---
> gcc/
>
> 2014-11-06  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * params.def (sra-max-scalarization-size-Ospeed): New.
>         (sra-max-scalarization-size-Osize): Likewise.
>         * doc/invoke.texi (sra-max-scalarization-size-Ospeed): Document.
>         (sra-max-scalarization-size-Osize): Likewise.
>         * toplev.c (process_options): Set default values for new
>         parameters.
>         * tree-sra.c (analyze_all_variable_accesses): Use new parameters.
>         * targhooks.c (get_move_ratio): Remove static designator.
>         * target.h (get_move_ratio): Declare.


end of thread, other threads:[~2014-11-06 14:10 UTC | newest]

Thread overview: 62+ messages (download: mbox.gz / follow: Atom feed)
2014-06-06  8:50 [AArch64] Implement movmem for the benefit of inline memcpy James Greenhalgh
2014-06-06 10:39 ` Richard Earnshaw
2014-08-01  6:38 ` Andrew Pinski
2014-08-01  9:05   ` Richard Biener
2014-08-01  9:21 ` pinskia
2014-08-05  7:05   ` Andrew Pinski
2014-08-07 14:20     ` James Greenhalgh
2014-08-07 14:34       ` Richard Biener
2014-08-20  9:10         ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
2014-08-20  9:10           ` [Patch AArch64 2/2] Wire up TARGET_DEFAULT_MAX_SCALARIZATION_SIZE James Greenhalgh
2014-08-20  9:21           ` [Patch 1/2] Control SRA and IPA-SRA by a param rather than MOVE_RATIO Richard Biener
2014-09-25 14:58             ` [Patch 0/4] " James Greenhalgh
2014-09-25 14:58               ` [Patch AArch64 4/4] Wire up New target hooks James Greenhalgh
2014-09-26 13:31                 ` James Greenhalgh
2014-09-25 14:58               ` [Patch 2/4] Hack out a use of MOVE_RATIO in tree-inline.c James Greenhalgh
2014-09-26  8:58                 ` Richard Biener
2014-09-25 14:58               ` [Patchv2 3/4] Control SRA and IPA-SRA by a param rather than MOVE_RATIO James Greenhalgh
2014-09-26  9:11                 ` Richard Biener
2014-10-01 16:38                   ` James Greenhalgh
2014-10-29 14:39                     ` James Greenhalgh
2014-10-31 10:58                       ` Richard Biener
2014-11-06 11:53                         ` [Patchv3] " James Greenhalgh
2014-11-06 14:10                           ` Richard Biener
2014-09-25 14:58               ` [Patch 1/4] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
2014-09-25 15:09                 ` Steven Bosscher
2014-09-26  9:16                   ` Richard Biener
2014-10-29 10:45                 ` [Patch 0/6] Hookize MOVE_BY_PIECES_P James Greenhalgh
2014-10-29 10:47                   ` [Patch 1/6] Hookize MOVE_BY_PIECES_P, remove most uses of MOVE_RATIO James Greenhalgh
2014-10-29 12:29                     ` Matthew Fortune
2014-10-29 16:11                       ` James Greenhalgh
2014-10-31 15:09                         ` James Greenhalgh
2014-10-31 15:10                           ` [Patch 2/7 s390] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
2014-10-31 15:10                           ` [Patch 1/7] Hookize *_BY_PIECES_P James Greenhalgh
2014-10-31 21:08                             ` Jeff Law
2014-10-31 15:11                           ` [Patch 4/7 sh] Deprecate *_BY_PIECES_P, move to hookized version James Greenhalgh
2014-11-01 23:27                             ` Kaz Kojima
2014-10-31 15:11                           ` [Patch 3/7 arc] " James Greenhalgh
2014-11-04 12:08                             ` Joern Rennecke
2014-11-04 14:24                               ` James Greenhalgh
2014-11-04 16:20                                 ` Joern Rennecke
2014-10-31 15:12                           ` [Patch 5/7 mips] " James Greenhalgh
2014-10-31 15:16                           ` [Patch 6/7 AArch64] " James Greenhalgh
2014-10-31 15:34                           ` [Patch 7/7] Remove *_BY_PIECES_P James Greenhalgh
2014-10-29 10:49                   ` [Patch 2/6 s390] Deprecate MOVE_BY_PIECES_P, move to hookized version James Greenhalgh
2014-10-29 21:09                     ` Jeff Law
2014-10-29 10:50                   ` [Patch 4/6 sh] " James Greenhalgh
2014-10-29 21:10                     ` Jeff Law
2014-10-30  0:49                     ` Kaz Kojima
2014-10-29 10:50                   ` [Patch 3/6 arc] " James Greenhalgh
2014-10-29 21:10                     ` Jeff Law
2014-10-29 10:51                   ` [Patch 5/6 mips] " James Greenhalgh
2014-10-29 21:18                     ` Jeff Law
2014-10-29 10:53                   ` [Patch 6/6] Remove MOVE_BY_PIECES_P James Greenhalgh
2014-10-29 21:20                     ` Jeff Law
2014-08-21 10:34         ` [Patch 1/2] Don't put out a call to memcpy for volatile struct operations James Greenhalgh
2014-08-21 10:34           ` [Patch AArch64 2/2] Do not double-copy bytes in " James Greenhalgh
2014-08-21 11:22           ` [Patch 1/2] Don't put out a call to memcpy for " Richard Biener
2014-08-21 23:47             ` Mike Stump
2014-08-22 15:42               ` Joseph S. Myers
2014-08-22 17:33                 ` Mike Stump
2014-08-26  8:35               ` Richard Biener
2014-08-26 16:42                 ` Mike Stump
