public inbox for gcc-patches@gcc.gnu.org
* [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
@ 2017-06-07 11:38 Tamar Christina
  2017-06-08  8:01 ` Richard Sandiford
  0 siblings, 1 reply; 14+ messages in thread
From: Tamar Christina @ 2017-06-07 11:38 UTC (permalink / raw)
  To: GCC Patches; +Cc: nd, James Greenhalgh, Richard Earnshaw, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 2142 bytes --]

Hi All, 


This patch allows larger bitsizes to be used as the copy size
when the target does not have SLOW_UNALIGNED_ACCESS.

It also provides an optimized routine for MEM to REG
copying which avoids reconstructing the value piecewise on the stack
and instead uses a combination of shifts and ORs.
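
In C terms the transformation is roughly the following (an illustrative
sketch only, not the emitted RTL; the function and the casts are made up
for this example):

/* Assemble a 3-byte struct into a register with narrow loads, shifts
   and ORs.  Little-endian layout assumed; the casts are illustrative
   and ignore strict aliasing.  */
unsigned int
copy3 (const unsigned char *src)
{
  unsigned int v = 0;
  v |= *(const unsigned short *) src;	/* bytes 0-1: one ldrh.  */
  v |= (unsigned int) src[2] << 16;	/* byte 2: one ldrb.  */
  return v;
}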

This now generates

	adrp	x0, .LANCHOR0
	add	x0, x0, :lo12:.LANCHOR0
	sub	sp, sp, #16
	ldr	w1, [x0, 120]
	str	w1, [sp, 8]
	ldr	x0, [x0, 112]
	ldr	x1, [sp, 8]
	add	sp, sp, 16

instead of:

	adrp	x3, .LANCHOR0
	add	x3, x3, :lo12:.LANCHOR0
	mov	x0, 0
	mov	x1, 0
	sub	sp, sp, #16
	ldr	x2, [x3, 112]
	ldr	w3, [x3, 120]
	add	sp, sp, 16
	ubfx	x5, x2, 8, 8
	bfi	x0, x2, 0, 8
	ubfx	x4, x2, 16, 8
	lsr	w9, w2, 24
	bfi	x0, x5, 8, 8
	ubfx	x7, x2, 32, 8
	ubfx	x5, x2, 40, 8
	ubfx	x8, x3, 8, 8
	bfi	x0, x4, 16, 8
	bfi	x1, x3, 0, 8
	ubfx	x4, x2, 48, 8
	ubfx	x6, x3, 16, 8
	bfi	x0, x9, 24, 8
	bfi	x1, x8, 8, 8
	lsr	x2, x2, 56
	lsr	w3, w3, 24
	bfi	x0, x7, 32, 8
	bfi	x1, x6, 16, 8
	bfi	x0, x5, 40, 8
	bfi	x1, x3, 24, 8
	bfi	x0, x4, 48, 8
	bfi	x0, x2, 56, 8

To load a struct of 12 1-byte elements.

and

	adrp	x0, .LANCHOR0
	add	x0, x0, :lo12:.LANCHOR0
	sub	sp, sp, #16
	ldrb	w1, [x0, 18]
	ldrh	w0, [x0, 16]
	orr	w0, w0, w1, lsr 16
	str	w0, [sp, 8]
	add	sp, sp, 16

instead of

	adrp	x2, .LANCHOR0
	add	x2, x2, :lo12:.LANCHOR0
	mov	x0, 0
	sub	sp, sp, #16
	ldrh	w1, [x2, 16]
	ldrb	w2, [x2, 18]
	add	sp, sp, 16
	bfi	x0, x1, 0, 8
	ubfx	x1, x1, 8, 8
	bfi	x0, x1, 8, 8
	bfi	x0, x2, 16, 8

These changes only have an effect on structures smaller than 16 bytes.

The remaining stores come from an existing incomplete data-flow analysis
which thinks the value on the stack is being used and doesn't mark
the value as dead.

Regression tested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu with no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  <tamar.christina@arm.com>

	* expr.c (copy_blkmode_to_reg): Fix bitsize for targets
	with fast unaligned access.
	* config/aarch64/aarch64.c (aarch64_expand_movmem):
	Add MEM to REG optimized case.

[-- Attachment #2: struct-copy.patch --]
[-- Type: text/x-patch; name="struct-copy.patch", Size: 2423 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+   {
+     unsigned int max_align = UINTVAL (operands[2]);
+     max_align = n < max_align ? max_align : n;
+     machine_mode mov_mode, dest_mode
+       = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
+     rtx result = gen_reg_rtx (dest_mode);
+     emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+     unsigned int shift_cnt = 0;
+     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+       {
+	 int nearest = 0;
+	 /* Find the mode to use, but limit the max to TI mode.  */
+	 for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
+	      nearest = max;
+
+	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
+	  rtx reg = gen_reg_rtx (mov_mode);
+
+	  src = adjust_address (src, mov_mode, 0);
+	  emit_insn (gen_move_insn (reg, src));
+	  src = aarch64_progress_pointer (src);
+
+	  reg = gen_rtx_ASHIFT (dest_mode, reg,
+				GEN_INT (shift_cnt * BITS_PER_UNIT));
+	  result = gen_rtx_IOR (dest_mode, reg, result);
+      }
+
+    dst = adjust_address (dst, dest_mode, 0);
+    emit_insn (gen_move_insn (dst, result));
+    return true;
+  }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
      1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/expr.c b/gcc/expr.c
index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
-  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  bitsize = BITS_PER_WORD;
+  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
+    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
 
   /* Copy the structure BITSIZE bits at a time.  */
   for (bitpos = 0, xbitpos = padding_correction;


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-06-07 11:38 [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access Tamar Christina
@ 2017-06-08  8:01 ` Richard Sandiford
  2017-06-09  9:04   ` Tamar Christina
  2017-06-09 15:51   ` Tamar Christina
  0 siblings, 2 replies; 14+ messages in thread
From: Richard Sandiford @ 2017-06-08  8:01 UTC (permalink / raw)
  To: Tamar Christina
  Cc: GCC Patches, nd, James Greenhalgh, Richard Earnshaw, Marcus Shawcroft

Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi All, 
>
> This patch allows larger bitsizes to be used as the copy size
> when the target does not have SLOW_UNALIGNED_ACCESS.
>
> It also provides an optimized routine for MEM to REG
> copying which avoids reconstructing the value piecewise on the stack
> and instead uses a combination of shifts and ORs.
>
> This now generates
>
> 	adrp	x0, .LANCHOR0
> 	add	x0, x0, :lo12:.LANCHOR0
> 	sub	sp, sp, #16
> 	ldr	w1, [x0, 120]
> 	str	w1, [sp, 8]
> 	ldr	x0, [x0, 112]
> 	ldr	x1, [sp, 8]
> 	add	sp, sp, 16
>
> instead of:
>
> 	adrp	x3, .LANCHOR0
> 	add	x3, x3, :lo12:.LANCHOR0
> 	mov	x0, 0
> 	mov	x1, 0
> 	sub	sp, sp, #16
> 	ldr	x2, [x3, 112]
> 	ldr	w3, [x3, 120]
> 	add	sp, sp, 16
> 	ubfx	x5, x2, 8, 8
> 	bfi	x0, x2, 0, 8
> 	ubfx	x4, x2, 16, 8
> 	lsr	w9, w2, 24
> 	bfi	x0, x5, 8, 8
> 	ubfx	x7, x2, 32, 8
> 	ubfx	x5, x2, 40, 8
> 	ubfx	x8, x3, 8, 8
> 	bfi	x0, x4, 16, 8
> 	bfi	x1, x3, 0, 8
> 	ubfx	x4, x2, 48, 8
> 	ubfx	x6, x3, 16, 8
> 	bfi	x0, x9, 24, 8
> 	bfi	x1, x8, 8, 8
> 	lsr	x2, x2, 56
> 	lsr	w3, w3, 24
> 	bfi	x0, x7, 32, 8
> 	bfi	x1, x6, 16, 8
> 	bfi	x0, x5, 40, 8
> 	bfi	x1, x3, 24, 8
> 	bfi	x0, x4, 48, 8
> 	bfi	x0, x2, 56, 8
>
> To load a struct of 12 1-byte elements.

Nice!

[...]

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> +  /* Optimize routines for MEM to REG copies.  */
> +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))

This seems to be checking that the address of the original destination
memory isn't a plain base register.  Why's it important to reject that
case but allow e.g. base+offset?

> +   {
> +     unsigned int max_align = UINTVAL (operands[2]);
> +     max_align = n < max_align ? max_align : n;

Might be misunderstanding, but isn't max_align always equal to n here,
since n was set by:

  n = UINTVAL (operands[2]);

Indentation of the enclosing { ... } is slightly off.

> +     machine_mode mov_mode, dest_mode
> +       = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
> +     rtx result = gen_reg_rtx (dest_mode);
> +     emit_insn (gen_move_insn (result, GEN_INT (0)));
> +
> +     unsigned int shift_cnt = 0;
> +     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
> +       {
> +	 int nearest = 0;
> +	 /* Find the mode to use, but limit the max to TI mode.  */
> +	 for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
> +	      nearest = max;

In the if statement above, you required n < 8, so can max ever be > 16 here?

> +	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
> +	  rtx reg = gen_reg_rtx (mov_mode);
> +
> +	  src = adjust_address (src, mov_mode, 0);
> +	  emit_insn (gen_move_insn (reg, src));
> +	  src = aarch64_progress_pointer (src);
> +
> +	  reg = gen_rtx_ASHIFT (dest_mode, reg,
> +				GEN_INT (shift_cnt * BITS_PER_UNIT));

This seems to be mixing modes: reg has mode "mov_mode" but the result has
mode "dest_mode".  That isn't well-formed: the mode of a shift result needs
to be the same as the mode of the operand.  I think the load would need
to be a zero-extend of "src" into "reg", with "reg" having mode "dest_mode".
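
Something along these lines, perhaps (an untested sketch):

	  /* Load zero-extended so that everything is in dest_mode.  */
	  rtx reg = gen_reg_rtx (dest_mode);
	  src = adjust_address (src, mov_mode, 0);
	  emit_insn (gen_rtx_SET (reg, gen_rtx_ZERO_EXTEND (dest_mode, src)));

	  rtx shifted = gen_rtx_ASHIFT (dest_mode, reg,
					GEN_INT (shift_cnt * BITS_PER_UNIT));
	  result = gen_rtx_IOR (dest_mode, shifted, result);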

> +	  result = gen_rtx_IOR (dest_mode, reg, result);
> +      }
> +
> +    dst = adjust_address (dst, dest_mode, 0);
> +    emit_insn (gen_move_insn (dst, result));

dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
Doesn't that mean that we'll write beyond the end of the copy region
when n is an awkward number?

> diff --git a/gcc/expr.c b/gcc/expr.c
> index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
>  
>    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
>    dst_words = XALLOCAVEC (rtx, n_regs);
> -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> +  bitsize = BITS_PER_WORD;
> +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
> +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);

I think this ought to be testing word_mode instead of BLKmode.
(Testing BLKmode doesn't really make sense in general, because the
mode doesn't have a meaningful alignment.)
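
I.e. something like:

  if (SLOW_UNALIGNED_ACCESS (word_mode, TYPE_ALIGN (TREE_TYPE (src))))
    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);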

Thanks,
Richard


* RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-06-08  8:01 ` Richard Sandiford
@ 2017-06-09  9:04   ` Tamar Christina
  2017-06-09 15:51   ` Tamar Christina
  1 sibling, 0 replies; 14+ messages in thread
From: Tamar Christina @ 2017-06-09  9:04 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: GCC Patches, nd, James Greenhalgh, Richard Earnshaw, Marcus Shawcroft

> > +  /* Optimize routines for MEM to REG copies.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
> 
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case but
> allow e.g. base+offset?
> 

In the case of e.g.

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}


> > +   {
> > +     unsigned int max_align = UINTVAL (operands[2]);
> > +     max_align = n < max_align ? max_align : n;
> 
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:
> 
>   n = UINTVAL (operands[2]);
> 
> Indentation of the enclosing { ... } is slightly off.
> 
> > +     machine_mode mov_mode, dest_mode
> > +       = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
> > +     rtx result = gen_reg_rtx (dest_mode);
> > +     emit_insn (gen_move_insn (result, GEN_INT (0)));
> > +
> > +     unsigned int shift_cnt = 0;
> > +     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
> > +       {
> > +	 int nearest = 0;
> > +	 /* Find the mode to use, but limit the max to TI mode.  */
> > +	 for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
> > +	      nearest = max;
> 
> In the if statement above, you required n < 8, so can max ever by > 16 here?
> 
> > +	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
> > +	  rtx reg = gen_reg_rtx (mov_mode);
> > +
> > +	  src = adjust_address (src, mov_mode, 0);
> > +	  emit_insn (gen_move_insn (reg, src));
> > +	  src = aarch64_progress_pointer (src);
> > +
> > +	  reg = gen_rtx_ASHIFT (dest_mode, reg,
> > +				GEN_INT (shift_cnt * BITS_PER_UNIT));
> 
> This seems to be mixing modes: reg has mode "mov_mode" but the result
> has mode "dest_mode".  That isn't well-formed: the mode of a shift result
> needs to be the same as the mode of the operand.  I think the load would
> need to be a zero-extend of "src" into "reg", with "reg" having mode
> "dest_mode".
> 
> > +	  result = gen_rtx_IOR (dest_mode, reg, result);
> > +      }
> > +
> > +    dst = adjust_address (dst, dest_mode, 0);
> > +    emit_insn (gen_move_insn (dst, result));
> 
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?
> 
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
> >
> >    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >    dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
> > +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> 
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)
> 
> Thanks,
> Richard


* RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-06-08  8:01 ` Richard Sandiford
  2017-06-09  9:04   ` Tamar Christina
@ 2017-06-09 15:51   ` Tamar Christina
  2017-07-03  6:18     ` Tamar Christina
  1 sibling, 1 reply; 14+ messages in thread
From: Tamar Christina @ 2017-06-09 15:51 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: GCC Patches, nd, James Greenhalgh, Richard Earnshaw, Marcus Shawcroft


> > +  /* Optimize routines for MEM to REG copies.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
> 
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case but
> allow e.g. base+offset?

this function is (as far as I could tell) only being called with two types of destinations:
a location on the stack or a plain register.

When the destination is a register such as with

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}

You run into the issue you had pointed to below where we might write too much. Ideally the constraint
I would like is to check if the destination is either a new register (no data) or that the structure was padded.

I couldn't figure out how to do this, and the gains over the existing code for this case were quite small.
So I just disallowed it, leaving it to the existing code, which isn't so bad: only 1 extra instruction.

> 
> > +   {
> > +     unsigned int max_align = UINTVAL (operands[2]);
> > +     max_align = n < max_align ? max_align : n;
> 
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:

Correct, previously this patch was made to allow n < 16. These were left over from the cleanup.
I'll correct.

> 
> > +	  result = gen_rtx_IOR (dest_mode, reg, result);
> > +      }
> > +
> > +    dst = adjust_address (dst, dest_mode, 0);
> > +    emit_insn (gen_move_insn (dst, result));
> 
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?

Yes, see my answer above. For the other case, when we write onto a location on the stack, this is fine
due to the alignment. 

> 
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
> >
> >    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >    dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
> > +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> 
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)

Ah, yes that makes sense. I'll update the patch.

New patch is validating and will submit it soon.

> 
> Thanks,
> Richard


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-06-09 15:51   ` Tamar Christina
@ 2017-07-03  6:18     ` Tamar Christina
  2017-08-25 19:35       ` Jeff Law
                         ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Tamar Christina @ 2017-07-03  6:18 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: GCC Patches, nd, James Greenhalgh, Richard Earnshaw, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 3305 bytes --]

Hi All,

Sorry I just realized I never actually sent the updated patch...

So here it is :)

Regards,
Tamar
________________________________________
From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org> on behalf of Tamar Christina <Tamar.Christina@arm.com>
Sent: Friday, June 9, 2017 4:51:52 PM
To: Richard Sandiford
Cc: GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
Subject: RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

> > +  /* Optimize routines for MEM to REG copies.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
>
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case but
> allow e.g. base+offset?

this function is (as far as I could tell) only being called with two types of destinations:
a location on the stack or a plain register.

When the destination is a register such as with

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}

You run into the issue you had pointed to below where we might write too much. Ideally the constraint
I would like is to check if the destination is either a new register (no data) or that the structure was padded.

I couldn't figure out how to do this, and the gains over the existing code for this case were quite small.
So I just disallowed it, leaving it to the existing code, which isn't so bad: only 1 extra instruction.

>
> > +   {
> > +     unsigned int max_align = UINTVAL (operands[2]);
> > +     max_align = n < max_align ? max_align : n;
>
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:

Correct, previously this patch was made to allow n < 16. These were left over from the cleanup.
I'll correct.

>
> > +     result = gen_rtx_IOR (dest_mode, reg, result);
> > +      }
> > +
> > +    dst = adjust_address (dst, dest_mode, 0);
> > +    emit_insn (gen_move_insn (dst, result));
>
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?

Yes, see my answer above. For the other case, when we write onto a location on the stack, this is fine
due to the alignment.

>
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
> >
> >    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >    dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
> > +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
>
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)

Ah, yes that makes sense. I'll update the patch.

New patch is validating and will submit it soon.

>
> Thanks,
> Richard

[-- Attachment #2: memcopy-2.patch --]
[-- Type: text/x-patch; name="memcopy-2.patch", Size: 3202 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ab1bdc0233afe7a3c41501cb724a5c4c719123b8..1cf6c4b891571745f740d7dbd03754bbba9264a6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13466,7 +13466,6 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 
 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
    we succeed, otherwise return false.  */
-
 bool
 aarch64_expand_movmem (rtx *operands)
 {
@@ -13498,16 +13497,55 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.
+     This particular function only handles copying to two
+     destination types: 1) a regular register and 2) the stack.
+     When writing to a regular register we may end up writting too much in cases
+     where the register already contains a live value or when no data padding is
+     happening so we disallow it.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+   {
+     machine_mode mov_mode, dest_mode
+       = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
+     rtx result = gen_reg_rtx (dest_mode);
+     emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+     unsigned int shift_cnt = 0;
+     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+       {
+	 int nearest = 0;
+	 /* Find the mode to use.  */
+	 for (unsigned max = 1; max <= (n - shift_cnt); max *= 2)
+	      nearest = max;
+
+	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
+	  rtx reg = gen_reg_rtx (mov_mode);
+
+	  src = adjust_address (src, mov_mode, 0);
+	  emit_insn (gen_move_insn (reg, src));
+	  src = aarch64_progress_pointer (src);
+
+	  reg = gen_rtx_ZERO_EXTEND (dest_mode, reg);
+	  reg = gen_rtx_ASHIFT (dest_mode, reg,
+				GEN_INT (shift_cnt * BITS_PER_UNIT));
+	  result = gen_rtx_IOR (dest_mode, reg, result);
+       }
+
+    dst = adjust_address (dst, dest_mode, 0);
+    emit_insn (gen_move_insn (dst, result));
+    return true;
+  }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
      1-byte chunk.  */
   if (n < 4)
     {
       if (n >= 2)
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-	  n -= 2;
-	}
 
+      {
+	aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
+	n -= 2;
+      }
       if (n == 1)
 	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
 
diff --git a/gcc/expr.c b/gcc/expr.c
index 2d8868e52cefdfdc2d81135811cbf1cb3e6ff82b..7eca579e769acb131bd0db344815e344fdc948f2 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
-  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  bitsize = BITS_PER_WORD;
+  if (SLOW_UNALIGNED_ACCESS (word_mode, TYPE_ALIGN (TREE_TYPE (src))))
+    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
 
   /* Copy the structure BITSIZE bits at a time.  */
   for (bitpos = 0, xbitpos = padding_correction;


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-07-03  6:18     ` Tamar Christina
@ 2017-08-25 19:35       ` Jeff Law
  2017-08-25 19:51       ` H.J. Lu
  2017-08-30 16:54       ` James Greenhalgh
  2 siblings, 0 replies; 14+ messages in thread
From: Jeff Law @ 2017-08-25 19:35 UTC (permalink / raw)
  To: Tamar Christina, Richard Sandiford
  Cc: GCC Patches, nd, James Greenhalgh, Richard Earnshaw, Marcus Shawcroft

On 07/03/2017 12:17 AM, Tamar Christina wrote:
> Hi All,
> 
> Sorry I just realized I never actually sent the updated patch...
AFAICT this never got resolved.

I'll let the aarch64 folks own those changes.  I'm going to focus just
on the small change to expr.c where you change the computation of
bitsize in copy_blkmode_to_reg.

My biggest worry is that this could end up breaking STRICT_ALIGNMENT
targets.  Thankfully, the default definition of SLOW_UNALIGNED_ACCESS is
STRICT_ALIGNMENT.  So on a STRICT_ALIGNMENT target that does not define
SLOW_UNALIGNED_ACCESS we should be OK.  I also did a quick scan of
STRICT_ALIGNMENT targets that override SLOW_UNALIGNED_ACCESS and they
seem reasonable.
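
(If I remember right, the fallback in defaults.h is just

  #define SLOW_UNALIGNED_ACCESS(MODE, ALIGN) STRICT_ALIGNMENT

so a STRICT_ALIGNMENT target that doesn't override the macro keeps the
old, conservative behaviour.)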

So I'll ack the expr.c change.  You'll need to chase down an ack on the
aarch64 specific changes.


jeff

> 
> So here it is :)
> 
> Regards,
> Tamar
> ________________________________________
> From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org> on behalf of Tamar Christina <Tamar.Christina@arm.com>
> Sent: Friday, June 9, 2017 4:51:52 PM
> To: Richard Sandiford
> Cc: GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
> Subject: RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
> 
>>> +  /* Optimize routines for MEM to REG copies.  */
>>> +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
>>
>> This seems to be checking that the address of the original destination
>> memory isn't a plain base register.  Why's it important to reject that case but
>> allow e.g. base+offset?
> 
> this function is (as far as I could tell) only being called with two types of destinations:
> a location on the stack or a plain register.
> 
> When the destination is a register such as with
> 
> void Fun3(struct struct3 foo3)
> {
>   L3 = foo3;
> }
> 
> You run into the issue you had pointed to below where we might write too much. Ideally the constraint
> I would like is to check if the destination is either a new register (no data) or that the structure was padded.
> 
> I couldn't figure out how to do this, and the gains over the existing code for this case were quite small.
> So I just disallowed it, leaving it to the existing code, which isn't so bad: only 1 extra instruction.
> 
>>
>>> +   {
>>> +     unsigned int max_align = UINTVAL (operands[2]);
>>> +     max_align = n < max_align ? max_align : n;
>>
>> Might be misunderstanding, but isn't max_align always equal to n here, since
>> n was set by:
> 
> Correct, previously this patch was made to allow n < 16. These were left over from the cleanup.
> I'll correct.
> 
>>
>>> +     result = gen_rtx_IOR (dest_mode, reg, result);
>>> +      }
>>> +
>>> +    dst = adjust_address (dst, dest_mode, 0);
>>> +    emit_insn (gen_move_insn (dst, result));
>>
>> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
>> Doesn't that mean that we'll write beyond the end of the copy region when
>> n is an awkward number?
> 
> Yes, see my answer above. For the other case, when we write onto a location on the stack, this is fine
> due to the alignment.
> 
>>
>>> diff --git a/gcc/expr.c b/gcc/expr.c
>>> index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
>>> --- a/gcc/expr.c
>>> +++ b/gcc/expr.c
>>> @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
>>>
>>>    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
>>>    dst_words = XALLOCAVEC (rtx, n_regs);
>>> -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
>>> +  bitsize = BITS_PER_WORD;
>>> +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
>>> +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
>>
>> I think this ought to be testing word_mode instead of BLKmode.
>> (Testing BLKmode doesn't really make sense in general, because the mode
>> doesn't have a meaningful alignment.)
> 
> Ah, yes that makes sense. I'll update the patch.
> 
> New patch is validating and will submit it soon.
> 
>>
>> Thanks,
>> Richard


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-07-03  6:18     ` Tamar Christina
  2017-08-25 19:35       ` Jeff Law
@ 2017-08-25 19:51       ` H.J. Lu
  2017-08-30  9:46         ` Tamar Christina
  2017-08-30 16:54       ` James Greenhalgh
  2 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2017-08-25 19:51 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Sandiford, GCC Patches, nd, James Greenhalgh,
	Richard Earnshaw, Marcus Shawcroft

On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
<Tamar.Christina@arm.com> wrote:
> Hi All,
>
> Sorry I just realized I never actually sent the updated patch...
>
> So here it is :)
>

Are there any testcases?

-- 
H.J.


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-08-25 19:51       ` H.J. Lu
@ 2017-08-30  9:46         ` Tamar Christina
  2017-08-30 14:19           ` H.J. Lu
  0 siblings, 1 reply; 14+ messages in thread
From: Tamar Christina @ 2017-08-30  9:46 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Richard Sandiford, GCC Patches, nd, James Greenhalgh,
	Richard Earnshaw, Marcus Shawcroft

Hi,

The test I used was testsuite/gcc.c-torture/compile/structs.c

Thanks,
Tamar
________________________________________
From: H.J. Lu <hjl.tools@gmail.com>
Sent: Friday, August 25, 2017 8:10:22 PM
To: Tamar Christina
Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
<Tamar.Christina@arm.com> wrote:
> Hi All,
>
> Sorry I just realized I never actually sent the updated patch...
>
> So here it is :)
>

Are there any testcases?

--
H.J.


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-08-30  9:46         ` Tamar Christina
@ 2017-08-30 14:19           ` H.J. Lu
  2017-08-30 15:31             ` Tamar Christina
  0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2017-08-30 14:19 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Sandiford, GCC Patches, nd, James Greenhalgh,
	Richard Earnshaw, Marcus Shawcroft

On Wed, Aug 30, 2017 at 2:21 AM, Tamar Christina
<Tamar.Christina@arm.com> wrote:
> Hi,
>
> The test I used was testsuite/gcc.c-torture/compile/structs.c

Results look very nice on x86-64.  Can we add a testcase to scan the
compiler output
to verify this optimization?

Thanks.

> Thanks,
> Tamar
> ________________________________________
> From: H.J. Lu <hjl.tools@gmail.com>
> Sent: Friday, August 25, 2017 8:10:22 PM
> To: Tamar Christina
> Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
> Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
>
> On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
> <Tamar.Christina@arm.com> wrote:
>> Hi All,
>>
>> Sorry I just realized I never actually sent the updated patch...
>>
>> So here it is :)
>>
>
> Are there any testcases?
>
> --
> H.J.



-- 
H.J.


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-08-30 14:19           ` H.J. Lu
@ 2017-08-30 15:31             ` Tamar Christina
  0 siblings, 0 replies; 14+ messages in thread
From: Tamar Christina @ 2017-08-30 15:31 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Richard Sandiford, GCC Patches, nd, James Greenhalgh,
	Richard Earnshaw, Marcus Shawcroft

Sure, I'll split mid-end parts off and add a test.

Thanks,
Tamar
________________________________________
From: H.J. Lu <hjl.tools@gmail.com>
Sent: Wednesday, August 30, 2017 2:16:12 PM
To: Tamar Christina
Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access

On Wed, Aug 30, 2017 at 2:21 AM, Tamar Christina
<Tamar.Christina@arm.com> wrote:
> Hi,
>
> The test I used was testsuite/gcc.c-torture/compile/structs.c

Results look very nice on x86-64.  Can we add a testcase to scan the
compiler output
to verify this optimization?

Thanks.

> Thanks,
> Tamar
> ________________________________________
> From: H.J. Lu <hjl.tools@gmail.com>
> Sent: Friday, August 25, 2017 8:10:22 PM
> To: Tamar Christina
> Cc: Richard Sandiford; GCC Patches; nd; James Greenhalgh; Richard Earnshaw; Marcus Shawcroft
> Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
>
> On Sun, Jul 2, 2017 at 11:17 PM, Tamar Christina
> <Tamar.Christina@arm.com> wrote:
>> Hi All,
>>
>> Sorry I just realized I never actually sent the updated patch...
>>
>> So here it is :)
>>
>
> Are there any testcases?
>
> --
> H.J.



--
H.J.


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-07-03  6:18     ` Tamar Christina
  2017-08-25 19:35       ` Jeff Law
  2017-08-25 19:51       ` H.J. Lu
@ 2017-08-30 16:54       ` James Greenhalgh
  2017-11-14 16:08         ` Tamar Christina
  2 siblings, 1 reply; 14+ messages in thread
From: James Greenhalgh @ 2017-08-30 16:54 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Sandiford, GCC Patches, nd, Richard Earnshaw, Marcus Shawcroft

Hi Tamar,

I think the AArch64 parts of this patch can be substantially simplified.

On Mon, Jul 03, 2017 at 07:17:51AM +0100, Tamar Christina wrote:
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index ab1bdc0233afe7a3c41501cb724a5c4c719123b8..1cf6c4b891571745f740d7dbd03754bbba9264a6 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13466,7 +13466,6 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
>  
>  /* Expand movmem, as if from a __builtin_memcpy.  Return true if
>     we succeed, otherwise return false.  */
> -
>  bool
>  aarch64_expand_movmem (rtx *operands)
>  {
> @@ -13498,16 +13497,55 @@ aarch64_expand_movmem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> +  /* Optimize routines for MEM to REG copies.
> +     This particular function only handles copying to two
> +     destination types: 1) a regular register and 2) the stack.
> +     When writing to a regular register we may end up writing too much in cases
> +     where the register already contains a live value or when no data padding is
> +     happening so we disallow it.  */
> +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))

I don't understand how this comment lines up with the condition on this
code path. On the one hand you say you permit regular registers, but on the
other hand you say you disallow regular registers because you may overwrite.


> +   {
> +     machine_mode mov_mode, dest_mode
> +       = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
> +     rtx result = gen_reg_rtx (dest_mode);
> +     emit_insn (gen_move_insn (result, GEN_INT (0)));
> +
> +     unsigned int shift_cnt = 0;

Any reason not to just initialise the shift_cnt in place here?

> +     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))


     for (unsigned int shift_cnt = 0;
          shift_cnt <=  n;
          shift_cnt += GET_MODE_SIZE (mov_mode))

Having the loop counter first in the comparison is personal preference.

mov_mode looks uninitialised, but I guess by the time you check the loop
condition you have actually initialized it.

> +       {
> +	 int nearest = 0;

This isn't required; the loop below could do without it.

> +	 /* Find the mode to use.  */
> +	 for (unsigned max = 1; max <= (n - shift_cnt); max *= 2)
> +	      nearest = max;

Wrong formatting.

So, when this loop terminates for the first time (shift_cnt == 0) nearest
is the first power of two greater than n.

> +
> +	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);

That means this is always a mode of at least the right size, which in turn
means that we can't execute round the loop again (GET_MODE_SIZE (mov_mode)
will always be greater than n). So, we can be sure this loop only executes
once, and we can fold mov_mode and dest_mode to be equal.
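
Something like this, for instance (an untested sketch):

  /* One load and one store, both in dest_mode.  */
  machine_mode dest_mode
    = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
  rtx result = gen_reg_rtx (dest_mode);
  src = adjust_address (src, dest_mode, 0);
  emit_insn (gen_move_insn (result, src));
  dst = adjust_address (dst, dest_mode, 0);
  emit_insn (gen_move_insn (dst, result));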

> +	  rtx reg = gen_reg_rtx (mov_mode);
> +
> +	  src = adjust_address (src, mov_mode, 0);
> +	  emit_insn (gen_move_insn (reg, src));
> +	  src = aarch64_progress_pointer (src);
> +
> +	  reg = gen_rtx_ZERO_EXTEND (dest_mode, reg);
> +	  reg = gen_rtx_ASHIFT (dest_mode, reg,
> +				GEN_INT (shift_cnt * BITS_PER_UNIT));
> +	  result = gen_rtx_IOR (dest_mode, reg, result);
> +       }
> +
> +    dst = adjust_address (dst, dest_mode, 0);
> +    emit_insn (gen_move_insn (dst, result));
> +    return true;
> +  }
> +
>    /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
>       1-byte chunk.  */
>    if (n < 4)
>      {
>        if (n >= 2)
> -	{
> -	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> -	  n -= 2;
> -	}
>  
> +      {
> +	aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> +	n -= 2;
> +      }

These formatting changes leave a blank newline between if (n >= 2) and the
remainder of this block. Why?

>        if (n == 1)
>  	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
>  

Thanks,
James


* RE: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-08-30 16:54       ` James Greenhalgh
@ 2017-11-14 16:08         ` Tamar Christina
  2017-11-14 18:01           ` James Greenhalgh
  0 siblings, 1 reply; 14+ messages in thread
From: Tamar Christina @ 2017-11-14 16:08 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: Richard Sandiford, GCC Patches, nd, Richard Earnshaw, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 5366 bytes --]

Hi James,

I have split the aarch64 bit off from the generic parts and processed your feedback.

Attached is the reworked patch.

Ok for trunk?

Thanks,
Tamar

gcc/
2017-11-14  Tamar Christina  <tamar.christina@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_movmem):
	Add MEM to REG optimized case.

gcc/testsuite/
2017-11-14  Tamar Christina  <tamar.christina@arm.com>

	* gcc.target/aarch64/structs.c: New test.

> -----Original Message-----
> From: James Greenhalgh [mailto:james.greenhalgh@arm.com]
> Sent: Wednesday, August 30, 2017 16:56
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Sandiford <richard.sandiford@linaro.org>; GCC Patches <gcc-
> patches@gcc.gnu.org>; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH][GCC][AARCH64]Bad code-gen for
> structure/block/unaligned memory access
> 
> Hi Tamar,
> 
> I think the AArch64 parts of this patch can be substantially simplified.
> 
> On Mon, Jul 03, 2017 at 07:17:51AM +0100, Tamar Christina wrote:
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index ab1bdc0233afe7a3c41501cb724a5c4c719123b8..1cf6c4b891571745f740d7dbd03754bbba9264a6 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -13466,7 +13466,6 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> >
> >  /* Expand movmem, as if from a __builtin_memcpy.  Return true if
> >     we succeed, otherwise return false.  */
> > -
> >  bool
> >  aarch64_expand_movmem (rtx *operands)
> >  {
> > @@ -13498,16 +13497,55 @@ aarch64_expand_movmem (rtx *operands)
> >    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
> >    src = adjust_automodify_address (src, VOIDmode, base, 0);
> >
> > +  /* Optimize routines for MEM to REG copies.
> > +     This particular function only handles copying to two
> > +     destination types: 1) a regular register and 2) the stack.
> > +     When writing to a regular register we may end up writing too much in cases
> > +     where the register already contains a live value or when no data padding is
> > +     happening so we disallow it.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
> 
> I don't understand how this comment lines up with the condition on this
> code path. On the one hand you say you permit regular registers, but on the
> other hand you say you disallow regular registers because you may overwrite.
> 
> 
> > +   {
> > +     machine_mode mov_mode, dest_mode
> > +       = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
> > +     rtx result = gen_reg_rtx (dest_mode);
> > +     emit_insn (gen_move_insn (result, GEN_INT (0)));
> > +
> > +     unsigned int shift_cnt = 0;
> 
> Any reason not to just initialise the shift_cnt in place here?
> 
> > +     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
> 
> 
>      for (unsigned int shift_cnt = 0;
>           shift_cnt <=  n;
>           shift_cnt += GET_MODE_SIZE (mov_mode))
> 
> Having the loop counter first in the comparison is personal preference.
> 
> mov_mode looks uninitialised, but I guess by the time you check the loop
> condition you have actually initialized it.
> 
> > +       {
> > +	 int nearest = 0;
> 
> This isn't required; the loop below could do without it.
> 
> > +	 /* Find the mode to use.  */
> > +	 for (unsigned max = 1; max <= (n - shift_cnt); max *= 2)
> > +	      nearest = max;
> 
> Wrong formatting.
> 
> So, when this loop terminates for the first time (shift_cnt == 0) nearest is the
> first power of two greater than n.
> 
> > +
> > +	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
> 
> That means this is always a mode of at least the right size, which in turn
> means that we can't execute round the loop again (GET_MODE_SIZE
> (mov_mode) will always be greater than n). So, we can be sure this loop only
> executes once, and we can fold mov_mode and dest_mode to be equal.
> 
> > +	  rtx reg = gen_reg_rtx (mov_mode);
> > +
> > +	  src = adjust_address (src, mov_mode, 0);
> > +	  emit_insn (gen_move_insn (reg, src));
> > +	  src = aarch64_progress_pointer (src);
> > +
> > +	  reg = gen_rtx_ZERO_EXTEND (dest_mode, reg);
> > +	  reg = gen_rtx_ASHIFT (dest_mode, reg,
> > +				GEN_INT (shift_cnt * BITS_PER_UNIT));
> > +	  result = gen_rtx_IOR (dest_mode, reg, result);
> > +       }
> > +
> > +    dst = adjust_address (dst, dest_mode, 0);
> > +    emit_insn (gen_move_insn (dst, result));
> > +    return true;
> > +  }
> > +
> >    /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
> >       1-byte chunk.  */
> >    if (n < 4)
> >      {
> >        if (n >= 2)
> > -	{
> > -	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> > -	  n -= 2;
> > -	}
> >
> > +      {
> > +	aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> > +	n -= 2;
> > +      }
> 
> These formatting changes leave a blank newline between if (n >= 2) and the
> remainder of this block. Why?
> 
> >        if (n == 1)
> >  	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
> >
> 
> Thanks,
> James

[-- Attachment #2: memcpy-aarch64.patch --]
[-- Type: application/octet-stream, Size: 6602 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e89c8156976cecf200cd67c1e938c8156c1240c4..bc197cc973a2bbc2c263809c7e8ccfbc86309c26 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13974,6 +13974,28 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.
+     This particular function only handles copying to two
+     destination types: 1) a regular register and 2) the stack.
+     When writing to a regular register we may end up writing too much in cases
+     where the register already contains a live value or when no data padding is
+     happening so we disallow regular registers to use this new code path.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+    {
+       machine_mode dest_mode
+	 = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
+       rtx result = gen_reg_rtx (dest_mode);
+       emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+       src = adjust_address (src, dest_mode, 0);
+       emit_insn (gen_move_insn (result, src));
+       src = aarch64_progress_pointer (src);
+
+       dst = adjust_address (dst, dest_mode, 0);
+       emit_insn (gen_move_insn (dst, result));
+       return true;
+    }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
      1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/testsuite/gcc.target/aarch64/structs.c b/gcc/testsuite/gcc.target/aarch64/structs.c
new file mode 100644
index 0000000000000000000000000000000000000000..2be8fae9479500cdbd5f22c6ab4f6adcdd7fcbce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/structs.c
@@ -0,0 +1,226 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* Copyright 1996, 1999, 2007 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.
+
+   Please email any bugs, comments, and/or additions to this file to:
+   bug-gdb@prep.ai.mit.edu  */
+
+struct struct1 { char a;};
+struct struct2 { char a, b;};
+struct struct3 { char a, b, c; };
+struct struct4 { char a, b, c, d; };
+struct struct5 { char a, b, c, d, e; };
+struct struct6 { char a, b, c, d, e, f; };
+struct struct7 { char a, b, c, d, e, f, g; };
+struct struct8 { char a, b, c, d, e, f, g, h; };
+struct struct9 { char a, b, c, d, e, f, g, h, i; };
+struct struct10 { char a, b, c, d, e, f, g, h, i, j; };
+struct struct11 { char a, b, c, d, e, f, g, h, i, j, k; };
+struct struct12 { char a, b, c, d, e, f, g, h, i, j, k, l; };
+struct struct16 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; };
+
+struct struct1 foo1 = {'1'},  L1;
+struct struct2 foo2 = { 'a', 'b'},  L2;
+struct struct3 foo3 = { 'A', 'B', 'C'},  L3;
+struct struct4 foo4 = {'1', '2', '3', '4'},  L4;
+struct struct5 foo5 = {'a', 'b', 'c', 'd', 'e'},  L5;
+struct struct6 foo6 = {'A', 'B', 'C', 'D', 'E', 'F'},  L6;
+struct struct7 foo7 = {'1', '2', '3', '4', '5', '6', '7'},  L7;
+struct struct8 foo8 = {'1', '2', '3', '4', '5', '6', '7', '8'},  L8;
+struct struct9 foo9 = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'},  L9;
+struct struct10 foo10 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'},  L10;
+struct struct11 foo11 = {
+  '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'}, L11;
+struct struct12 foo12 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'}, L12;
+struct struct16 foo16 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p'}, L16;
+
+struct struct1  fun1()
+{
+  return foo1;
+}
+struct struct2  fun2()
+{
+  return foo2;
+}
+struct struct3  fun3()
+{
+  return foo3;
+}
+struct struct4  fun4()
+{
+  return foo4;
+}
+struct struct5  fun5()
+{
+  return foo5;
+}
+struct struct6  fun6()
+{
+  return foo6;
+}
+struct struct7  fun7()
+{
+  return foo7;
+}
+struct struct8  fun8()
+{
+  return foo8;
+}
+struct struct9  fun9()
+{
+  return foo9;
+}
+struct struct10 fun10()
+{
+  return foo10;
+}
+struct struct11 fun11()
+{
+  return foo11;
+}
+struct struct12 fun12()
+{
+  return foo12;
+}
+struct struct16 fun16()
+{
+  return foo16;
+}
+
+#ifdef PROTOTYPES
+void Fun1(struct struct1 foo1)
+#else
+void Fun1(foo1)
+     struct struct1 foo1;
+#endif
+{
+  L1 = foo1;
+}
+#ifdef PROTOTYPES
+void Fun2(struct struct2 foo2)
+#else
+void Fun2(foo2)
+     struct struct2 foo2;
+#endif
+{
+  L2 = foo2;
+}
+#ifdef PROTOTYPES
+void Fun3(struct struct3 foo3)
+#else
+void Fun3(foo3)
+     struct struct3 foo3;
+#endif
+{
+  L3 = foo3;
+}
+#ifdef PROTOTYPES
+void Fun4(struct struct4 foo4)
+#else
+void Fun4(foo4)
+     struct struct4 foo4;
+#endif
+{
+  L4 = foo4;
+}
+#ifdef PROTOTYPES
+void Fun5(struct struct5 foo5)
+#else
+void Fun5(foo5)
+     struct struct5 foo5;
+#endif
+{
+  L5 = foo5;
+}
+#ifdef PROTOTYPES
+void Fun6(struct struct6 foo6)
+#else
+void Fun6(foo6)
+     struct struct6 foo6;
+#endif
+{
+  L6 = foo6;
+}
+#ifdef PROTOTYPES
+void Fun7(struct struct7 foo7)
+#else
+void Fun7(foo7)
+     struct struct7 foo7;
+#endif
+{
+  L7 = foo7;
+}
+#ifdef PROTOTYPES
+void Fun8(struct struct8 foo8)
+#else
+void Fun8(foo8)
+     struct struct8 foo8;
+#endif
+{
+  L8 = foo8;
+}
+#ifdef PROTOTYPES
+void Fun9(struct struct9 foo9)
+#else
+void Fun9(foo9)
+     struct struct9 foo9;
+#endif
+{
+  L9 = foo9;
+}
+#ifdef PROTOTYPES
+void Fun10(struct struct10 foo10)
+#else
+void Fun10(foo10)
+     struct struct10 foo10;
+#endif
+{
+  L10 = foo10;
+}
+#ifdef PROTOTYPES
+void Fun11(struct struct11 foo11)
+#else
+void Fun11(foo11)
+     struct struct11 foo11;
+#endif
+{
+  L11 = foo11;
+}
+#ifdef PROTOTYPES
+void Fun12(struct struct12 foo12)
+#else
+void Fun12(foo12)
+     struct struct12 foo12;
+#endif
+{
+  L12 = foo12;
+}
+#ifdef PROTOTYPES
+void Fun16(struct struct16 foo16)
+#else
+void Fun16(foo16)
+     struct struct16 foo16;
+#endif
+{
+  L16 = foo16;
+}
+
+/* { dg-final { scan-assembler-not {bfi} } } */
+/* { dg-final { scan-assembler-times {bfx} 5 } } */


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-11-14 16:08         ` Tamar Christina
@ 2017-11-14 18:01           ` James Greenhalgh
  2017-11-14 18:59             ` Tamar Christina
  0 siblings, 1 reply; 14+ messages in thread
From: James Greenhalgh @ 2017-11-14 18:01 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Sandiford, GCC Patches, nd, Richard Earnshaw, Marcus Shawcroft

On Tue, Nov 14, 2017 at 04:05:12PM +0000, Tamar Christina wrote:
> Hi James,
> 
> I have split the aarch64 bit off from the generic parts and processed your feedback.
> 
> Attached is the reworked patch.
> 
> Ok for trunk?

Thanks for the respin. I'm a bit confused by this comment.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index e89c8156976cecf200cd67c1e938c8156c1240c4..bc197cc973a2bbc2c263809c7e8ccfbc86309c26 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13974,6 +13974,28 @@ aarch64_expand_movmem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> +  /* Optimize routines for MEM to REG copies.
> +     This particular function only handles copying to two
> +     destination types: 1) a regular register and 2) the stack.
> > +     When writing to a regular register we may end up writing too much in cases
> +     where the register already contains a live value or when no data padding is
> +     happening so we disallow regular registers to use this new code path.  */

I'm struggling to understand when you'd end up with a struct held in
a partial register going through this code path. Maybe the restriction
makes sense, can you write a testcase, or show some sample code where
this goes wrong (or simplify the comment).

Thanks,
James


* Re: [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access
  2017-11-14 18:01           ` James Greenhalgh
@ 2017-11-14 18:59             ` Tamar Christina
  0 siblings, 0 replies; 14+ messages in thread
From: Tamar Christina @ 2017-11-14 18:59 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: nd, richard.sandiford, gcc-patches, Richard.Earnshaw, Marcus.Shawcroft

[-- Attachment #1: Type: text/plain, Size: 1688 bytes --]

On 11/14/2017 17:48, James Greenhalgh wrote:
> On Tue, Nov 14, 2017 at 04:05:12PM +0000, Tamar Christina wrote:
> > Hi James,
> > 
> > I have split off the aarch64 bit off from the generic parts and processed your feedback.
> > 
> > Attached is the reworked patch.
> > 
> > Ok for Tunk?
> 
> Thanks for the respin, I'm a bit confused by this comment.

Sorry, the comment was a bit nonsensical. I updated the last part but didn't re-read the comment
as a whole.

I've respun the patch again.

Thanks,
Tamar
> 
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index e89c8156976cecf200cd67c1e938c8156c1240c4..bc197cc973a2bbc2c263809c7e8ccfbc86309c26 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -13974,6 +13974,28 @@ aarch64_expand_movmem (rtx *operands)
> >    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
> >    src = adjust_automodify_address (src, VOIDmode, base, 0);
> >  
> > +  /* Optimize routines for MEM to REG copies.
> > +     This particular function only handles copying to two
> > +     destination types: 1) a regular register and 2) the stack.
> > +     When writing to a regular register we may end up writing too much in cases
> > +     where the register already contains a live value or when no data padding is
> > +     happening so we disallow regular registers to use this new code path.  */
> 
> I'm struggling to understand when you'd end up with a struct held in
> a partial register going through this code path. Maybe the restriction
> makes sense, can you write a testcase, or show some sample code where
> this goes wrong (or simplify the comment).
> 
> Thanks,
> James
> 

-- 

[-- Attachment #2: memcopy-aarch64-2.patch --]
[-- Type: text/x-diff, Size: 6427 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e89c8156976cecf200cd67c1e938c8156c1240c4..2597ec70ff2c49c8276622d3f9d6fe394a1f80c1 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13974,6 +13974,26 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.
+     This particular function only handles copying to two
+     destination types: 1) a regular register and 2) the stack and only when
+     the amount to copy is less than 8 bytes.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+    {
+       machine_mode dest_mode
+	 = smallest_mode_for_size (n * BITS_PER_UNIT, MODE_INT);
+       rtx result = gen_reg_rtx (dest_mode);
+       emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+       src = adjust_address (src, dest_mode, 0);
+       emit_insn (gen_move_insn (result, src));
+       src = aarch64_progress_pointer (src);
+
+       dst = adjust_address (dst, dest_mode, 0);
+       emit_insn (gen_move_insn (dst, result));
+       return true;
+    }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
      1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/testsuite/gcc.target/aarch64/structs.c b/gcc/testsuite/gcc.target/aarch64/structs.c
new file mode 100644
index 0000000000000000000000000000000000000000..2be8fae9479500cdbd5f22c6ab4f6adcdd7fcbce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/structs.c
@@ -0,0 +1,226 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* Copyright 1996, 1999, 2007 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.
+
+   Please email any bugs, comments, and/or additions to this file to:
+   bug-gdb@prep.ai.mit.edu  */
+
+struct struct1 { char a;};
+struct struct2 { char a, b;};
+struct struct3 { char a, b, c; };
+struct struct4 { char a, b, c, d; };
+struct struct5 { char a, b, c, d, e; };
+struct struct6 { char a, b, c, d, e, f; };
+struct struct7 { char a, b, c, d, e, f, g; };
+struct struct8 { char a, b, c, d, e, f, g, h; };
+struct struct9 { char a, b, c, d, e, f, g, h, i; };
+struct struct10 { char a, b, c, d, e, f, g, h, i, j; };
+struct struct11 { char a, b, c, d, e, f, g, h, i, j, k; };
+struct struct12 { char a, b, c, d, e, f, g, h, i, j, k, l; };
+struct struct16 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; };
+
+struct struct1 foo1 = {'1'},  L1;
+struct struct2 foo2 = { 'a', 'b'},  L2;
+struct struct3 foo3 = { 'A', 'B', 'C'},  L3;
+struct struct4 foo4 = {'1', '2', '3', '4'},  L4;
+struct struct5 foo5 = {'a', 'b', 'c', 'd', 'e'},  L5;
+struct struct6 foo6 = {'A', 'B', 'C', 'D', 'E', 'F'},  L6;
+struct struct7 foo7 = {'1', '2', '3', '4', '5', '6', '7'},  L7;
+struct struct8 foo8 = {'1', '2', '3', '4', '5', '6', '7', '8'},  L8;
+struct struct9 foo9 = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'},  L9;
+struct struct10 foo10 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'},  L10;
+struct struct11 foo11 = {
+  '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'}, L11;
+struct struct12 foo12 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'}, L12;
+struct struct16 foo16 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p'}, L16;
+
+struct struct1  fun1()
+{
+  return foo1;
+}
+struct struct2  fun2()
+{
+  return foo2;
+}
+struct struct3  fun3()
+{
+  return foo3;
+}
+struct struct4  fun4()
+{
+  return foo4;
+}
+struct struct5  fun5()
+{
+  return foo5;
+}
+struct struct6  fun6()
+{
+  return foo6;
+}
+struct struct7  fun7()
+{
+  return foo7;
+}
+struct struct8  fun8()
+{
+  return foo8;
+}
+struct struct9  fun9()
+{
+  return foo9;
+}
+struct struct10 fun10()
+{
+  return foo10;
+}
+struct struct11 fun11()
+{
+  return foo11;
+}
+struct struct12 fun12()
+{
+  return foo12;
+}
+struct struct16 fun16()
+{
+  return foo16;
+}
+
+#ifdef PROTOTYPES
+void Fun1(struct struct1 foo1)
+#else
+void Fun1(foo1)
+     struct struct1 foo1;
+#endif
+{
+  L1 = foo1;
+}
+#ifdef PROTOTYPES
+void Fun2(struct struct2 foo2)
+#else
+void Fun2(foo2)
+     struct struct2 foo2;
+#endif
+{
+  L2 = foo2;
+}
+#ifdef PROTOTYPES
+void Fun3(struct struct3 foo3)
+#else
+void Fun3(foo3)
+     struct struct3 foo3;
+#endif
+{
+  L3 = foo3;
+}
+#ifdef PROTOTYPES
+void Fun4(struct struct4 foo4)
+#else
+void Fun4(foo4)
+     struct struct4 foo4;
+#endif
+{
+  L4 = foo4;
+}
+#ifdef PROTOTYPES
+void Fun5(struct struct5 foo5)
+#else
+void Fun5(foo5)
+     struct struct5 foo5;
+#endif
+{
+  L5 = foo5;
+}
+#ifdef PROTOTYPES
+void Fun6(struct struct6 foo6)
+#else
+void Fun6(foo6)
+     struct struct6 foo6;
+#endif
+{
+  L6 = foo6;
+}
+#ifdef PROTOTYPES
+void Fun7(struct struct7 foo7)
+#else
+void Fun7(foo7)
+     struct struct7 foo7;
+#endif
+{
+  L7 = foo7;
+}
+#ifdef PROTOTYPES
+void Fun8(struct struct8 foo8)
+#else
+void Fun8(foo8)
+     struct struct8 foo8;
+#endif
+{
+  L8 = foo8;
+}
+#ifdef PROTOTYPES
+void Fun9(struct struct9 foo9)
+#else
+void Fun9(foo9)
+     struct struct9 foo9;
+#endif
+{
+  L9 = foo9;
+}
+#ifdef PROTOTYPES
+void Fun10(struct struct10 foo10)
+#else
+void Fun10(foo10)
+     struct struct10 foo10;
+#endif
+{
+  L10 = foo10;
+}
+#ifdef PROTOTYPES
+void Fun11(struct struct11 foo11)
+#else
+void Fun11(foo11)
+     struct struct11 foo11;
+#endif
+{
+  L11 = foo11;
+}
+#ifdef PROTOTYPES
+void Fun12(struct struct12 foo12)
+#else
+void Fun12(foo12)
+     struct struct12 foo12;
+#endif
+{
+  L12 = foo12;
+}
+#ifdef PROTOTYPES
+void Fun16(struct struct16 foo16)
+#else
+void Fun16(foo16)
+     struct struct16 foo16;
+#endif
+{
+  L16 = foo16;
+}
+
+/* { dg-final { scan-assembler-not {bfi} } } */
+/* { dg-final { scan-assembler-times {bfx} 5 } } */


end of thread

Thread overview: 14+ messages
2017-06-07 11:38 [PATCH][GCC][AARCH64]Bad code-gen for structure/block/unaligned memory access Tamar Christina
2017-06-08  8:01 ` Richard Sandiford
2017-06-09  9:04   ` Tamar Christina
2017-06-09 15:51   ` Tamar Christina
2017-07-03  6:18     ` Tamar Christina
2017-08-25 19:35       ` Jeff Law
2017-08-25 19:51       ` H.J. Lu
2017-08-30  9:46         ` Tamar Christina
2017-08-30 14:19           ` H.J. Lu
2017-08-30 15:31             ` Tamar Christina
2017-08-30 16:54       ` James Greenhalgh
2017-11-14 16:08         ` Tamar Christina
2017-11-14 18:01           ` James Greenhalgh
2017-11-14 18:59             ` Tamar Christina
