public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations
@ 2021-04-29 12:54 H.J. Lu
  2021-04-29 12:54 ` [PATCH 01/12] Update alignment_for_piecewise_move H.J. Lu
                   ` (11 more replies)
  0 siblings, 12 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

The maximum size of the current op_by_pieces operations is limited by
MAX_FIXED_MODE_SIZE, which is an integer expression for the size in bits
of the largest integer machine mode that should actually be used.  But
a target can use TImode/OImode/XImode, which can be larger than
MAX_FIXED_MODE_SIZE, to perform op_by_pieces operations.  Here is a
set of patches to remove this limitation so that TImode/OImode/XImode
can be used for piecewise move and store:

1. Remove MAX_FIXED_MODE_SIZE limit in alignment_for_piecewise_move.
2. Allow generating pseudo register with specific alignment for hard
registers which will never be spilled onto stack to avoid re-aligning
stack.
3. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.
4. x86: Avoid stack realignment when copying data
5. x86: Remove MAX_BITSIZE_MODE_ANY_INT.  Only the x86 backend defines it.
6. x86: Use TImode/OImode/XImode integers for piecewise move and store.
7. x86: Add tests for TImode/OImode/XImode for piecewise move and store.
8. x86: Adjust existing tests.

On x86-64, SPEC CPU 2017 performance impact is neutral.  Glibc code size
differences with -O2 build are:

             Before         After
libc.so     1870718        1870222
ld.so        185120         184984

Some code sequence differences in libc.so are:

            Before                                                           After
	mov    0x10(%rsp),%edx						mov    0x10(%rsp),%edx
	mov    %edx,(%rax)						mov    %edx,(%rax)
	movzwl 0x14(%rsp),%edx				      |		mov    0x13(%rsp),%edx
	mov    %dx,0x4(%rax)				      |		mov    %edx,0x3(%rax)
	movzbl 0x16(%rsp),%edx				      <
	mov    %dl,0x6(%rax)				      <
	add    %rcx,%rax						add    %rcx,%rax
	ret    								ret    

	movdqu (%rsi),%xmm1				      |		movdqu (%rcx),%xmm1
	mov    %rdi,0x20(%rsp)						mov    %rdi,0x20(%rsp)
	movups %xmm1,(%rax)						movups %xmm1,(%rax)
	mov    0x10(%rsi),%rdx				      |		movdqu 0xc(%rcx),%xmm2
	mov    %rdx,0x10(%rax)				      |		movups %xmm2,0xc(%rax)
	mov    0x18(%rsi),%edx				      |		mov    %rax,(%r14,%rdx,8)
	mov    %edx,0x18(%rax)				      |		add    $0x1,%rdx
	mov    %rax,(%r14,%rcx,8)			      |		cmp    %r8,%rdx
	add    $0x1,%rcx				      |		je     <__resolv_conf_allocate+0x22d>
	cmp    %r8,%rcx					      |		mov    0x20(%rsp),%rsi
	je     <__resolv_conf_allocate+0x22f>		      |		mov    (%r9,%rdx,8),%rcx

	test   %eax,%eax						test   %eax,%eax
	mov    $0xff,%eax						mov    $0xff,%eax
	cmove  %eax,%ebx						cmove  %eax,%ebx
	movzbl %bl,%ecx					      |		movd   %ebx,%xmm0
	mov    %ebx,0xc(%rsp)						mov    %ebx,0xc(%rsp)
	mov    %rcx,%rax				      |		punpcklbw %xmm0,%xmm0
	mov    %rcx,%rsi				      |		punpcklwd %xmm0,%xmm0
	mul    %rdi					      |		pshufd $0x0,%xmm0,%xmm0
	imul   %rdi,%rsi				      |		movups %xmm0,0x50(%r12)
	mov    %rax,0x50(%r12)				      |		movups %xmm0,0x60(%r12)
	mov    %rcx,%rax				      |		movups %xmm0,0x70(%r12)
	add    %rdx,%rsi				      |		movups %xmm0,0x80(%r12)
	mul    %rdi					      |		movups %xmm0,0x90(%r12)
	mov    %rsi,0x58(%r12)				      |		movups %xmm0,0xa0(%r12)
	mov    %rsi,0x68(%r12)				      |		movups %xmm0,0xb0(%r12)
	mov    %rax,0x60(%r12)				      |		movups %xmm0,0xc0(%r12)
	mov    %rcx,%rax				      |		movups %xmm0,0xd0(%r12)
	mul    %rdi					      |		movups %xmm0,0xe0(%r12)
	mov    %rsi,0x78(%r12)				      |		movups %xmm0,0xf0(%r12)
	mov    %rsi,0x88(%r12)				      |		movups %xmm0,0x100(%r12)
	mov    %rsi,0x98(%r12)				      |		movups %xmm0,0x110(%r12)
	mov    %rax,0x70(%r12)				      |		movups %xmm0,0x120(%r12)
	mov    %rcx,%rax				      |		movups %xmm0,0x130(%r12)
	mul    %rdi					      |		movups %xmm0,0x140(%r12)
	mov    %rsi,0xa8(%r12)				      <
	mov    %rsi,0xb8(%r12)				      <
	mov    %rsi,0xc8(%r12)				      <
	mov    %rax,0x80(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rsi,0xd8(%r12)				      <
	mov    %rsi,0xe8(%r12)				      <
	mov    %rsi,0xf8(%r12)				      <
	mov    %rax,0x90(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rsi,0x108(%r12)				      <
	mov    %rsi,0x118(%r12)				      <
	mov    %rsi,0x128(%r12)				      <
	mov    %rax,0xa0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rsi,0x138(%r12)				      <
	mov    %rax,0xb0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xc0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xd0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xe0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xf0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x100(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x110(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x120(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x130(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %r12,%rdi				      <
	mov    %rax,0x140(%r12)				      <
	mov    %rsi,0x148(%r12)				      <
	call   <xprt_register@GLIBC_2.2.5>				call   <xprt_register@GLIBC_2.2.5>
	add    $0x28,%rsp						add    $0x28,%rsp
	mov    %r12,%rax						mov    %r12,%rax
	pop    %rbx							pop    %rbx
	pop    %rbp							pop    %rbp
	pop    %r12							pop    %r12
	pop    %r13							pop    %r13
	pop    %r14							pop    %r14
	pop    %r15							pop    %r15
	ret    								ret    

H.J. Lu (12):
  Update alignment_for_piecewise_move
  Allow generating pseudo register with specific alignment
  Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  x86: Avoid stack realignment when copying data
  Remove MAX_BITSIZE_MODE_ANY_INT
  x86: Update piecewise move and store
  x86: Add AVX2 tests for PR middle-end/90773
  x86: Add tests for piecewise move and store
  x86: Also pass -mno-avx to pr72839.c
  x86: Also pass -mno-avx to cold-attribute-1.c
  x86: Also pass -mno-avx to sw-1.c for ia32
  x86: Update gcc.target/i386/incoming-11.c

 gcc/builtins.c                                |  45 +--
 gcc/config/i386/i386-expand.c                 |   9 +-
 gcc/config/i386/i386-modes.def                |  15 +-
 gcc/config/i386/i386-protos.h                 |   2 +
 gcc/config/i386/i386.c                        | 257 +++++++++++++++++-
 gcc/config/i386/i386.h                        |  31 ++-
 gcc/doc/tm.texi                               |  16 ++
 gcc/doc/tm.texi.in                            |   4 +
 gcc/emit-rtl.c                                |   5 +-
 gcc/explow.c                                  |   6 +-
 gcc/explow.h                                  |   2 +-
 gcc/expr.c                                    |  13 +-
 gcc/expr.h                                    |   6 +-
 gcc/rtl.h                                     |   2 +-
 gcc/target.def                                |  20 ++
 gcc/targhooks.c                               |  54 ++++
 gcc/targhooks.h                               |   4 +
 .../gcc.target/i386/cold-attribute-1.c        |   2 +-
 gcc/testsuite/gcc.target/i386/incoming-11.c   |   2 +-
 .../gcc.target/i386/pieces-memcpy-10.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-11.c        |  17 ++
 .../gcc.target/i386/pieces-memcpy-12.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-13.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-14.c        |  17 ++
 .../gcc.target/i386/pieces-memcpy-15.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-16.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-7.c         |  15 +
 .../gcc.target/i386/pieces-memcpy-8.c         |  14 +
 .../gcc.target/i386/pieces-memcpy-9.c         |  14 +
 .../gcc.target/i386/pieces-memset-1.c         |  16 ++
 .../gcc.target/i386/pieces-memset-10.c        |  16 ++
 .../gcc.target/i386/pieces-memset-11.c        |  16 ++
 .../gcc.target/i386/pieces-memset-12.c        |  16 ++
 .../gcc.target/i386/pieces-memset-13.c        |  16 ++
 .../gcc.target/i386/pieces-memset-14.c        |  16 ++
 .../gcc.target/i386/pieces-memset-15.c        |  16 ++
 .../gcc.target/i386/pieces-memset-16.c        |  16 ++
 .../gcc.target/i386/pieces-memset-17.c        |  16 ++
 .../gcc.target/i386/pieces-memset-18.c        |  16 ++
 .../gcc.target/i386/pieces-memset-19.c        |  17 ++
 .../gcc.target/i386/pieces-memset-2.c         |  12 +
 .../gcc.target/i386/pieces-memset-20.c        |  17 ++
 .../gcc.target/i386/pieces-memset-21.c        |  17 ++
 .../gcc.target/i386/pieces-memset-22.c        |  17 ++
 .../gcc.target/i386/pieces-memset-23.c        |  17 ++
 .../gcc.target/i386/pieces-memset-24.c        |  17 ++
 .../gcc.target/i386/pieces-memset-25.c        |  17 ++
 .../gcc.target/i386/pieces-memset-26.c        |  17 ++
 .../gcc.target/i386/pieces-memset-27.c        |  17 ++
 .../gcc.target/i386/pieces-memset-28.c        |  17 ++
 .../gcc.target/i386/pieces-memset-29.c        |  17 ++
 .../gcc.target/i386/pieces-memset-3.c         |  18 ++
 .../gcc.target/i386/pieces-memset-30.c        |  17 ++
 .../gcc.target/i386/pieces-memset-31.c        |  17 ++
 .../gcc.target/i386/pieces-memset-32.c        |  17 ++
 .../gcc.target/i386/pieces-memset-33.c        |  17 ++
 .../gcc.target/i386/pieces-memset-34.c        |  17 ++
 .../gcc.target/i386/pieces-memset-35.c        |  17 ++
 .../gcc.target/i386/pieces-memset-36.c        |  17 ++
 .../gcc.target/i386/pieces-memset-37.c        |  15 +
 .../gcc.target/i386/pieces-memset-38.c        |  17 ++
 .../gcc.target/i386/pieces-memset-39.c        |  16 ++
 .../gcc.target/i386/pieces-memset-4.c         |  16 ++
 .../gcc.target/i386/pieces-memset-40.c        |  17 ++
 .../gcc.target/i386/pieces-memset-41.c        |  16 ++
 .../gcc.target/i386/pieces-memset-42.c        |  17 ++
 .../gcc.target/i386/pieces-memset-43.c        |  17 ++
 .../gcc.target/i386/pieces-memset-5.c         |  12 +
 .../gcc.target/i386/pieces-memset-6.c         |  16 ++
 .../gcc.target/i386/pieces-memset-7.c         |  16 ++
 .../gcc.target/i386/pieces-memset-8.c         |  16 ++
 .../gcc.target/i386/pieces-memset-9.c         |  16 ++
 gcc/testsuite/gcc.target/i386/pr72839.c       |   2 +-
 gcc/testsuite/gcc.target/i386/pr90773-1.c     |  10 +-
 gcc/testsuite/gcc.target/i386/pr90773-14.c    |   2 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-16.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-17.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-18.c    |  15 +
 gcc/testsuite/gcc.target/i386/pr90773-19.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-20.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-21.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-22.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-23.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-4.c     |   2 +-
 gcc/testsuite/gcc.target/i386/sw-1.c          |   1 +
 86 files changed, 1404 insertions(+), 91 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c

-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 01/12] Update alignment_for_piecewise_move
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-30  8:59   ` Richard Sandiford
  2021-04-29 12:54 ` [PATCH 02/12] Allow generating pseudo register with specific alignment H.J. Lu
                   ` (10 subsequent siblings)
  11 siblings, 1 reply; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

alignment_for_piecewise_move is called only with MOVE_MAX_PIECES or
STORE_MAX_PIECES, which are the number of bytes at a time that we
can move or store efficiently.  We should call mode_for_size without
limiting it to MAX_FIXED_MODE_SIZE.  MAX_FIXED_MODE_SIZE is an integer
expression for the size in bits of the largest integer machine mode
that should actually be used, and it may be smaller than MOVE_MAX_PIECES
or STORE_MAX_PIECES, which may use vectors.

	* expr.c (alignment_for_piecewise_move): Call mode_for_size
	without limit to MAX_FIXED_MODE_SIZE.
---
 gcc/expr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/expr.c b/gcc/expr.c
index e0167b77410..b4c110f8c17 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -746,7 +746,7 @@ static unsigned int
 alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
 {
   scalar_int_mode tmode
-    = int_mode_for_size (max_pieces * BITS_PER_UNIT, 1).require ();
+    = int_mode_for_size (max_pieces * BITS_PER_UNIT, 0).require ();
 
   if (align >= GET_MODE_ALIGNMENT (tmode))
     align = GET_MODE_ALIGNMENT (tmode);
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
  2021-04-29 12:54 ` [PATCH 01/12] Update alignment_for_piecewise_move H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-30  9:06   ` Richard Sandiford
  2021-04-29 12:54 ` [PATCH 03/12] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
                   ` (9 subsequent siblings)
  11 siblings, 1 reply; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

gen_reg_rtx tracks stack alignment needed for pseudo registers so that
associated hard registers can be properly spilled onto stack.  But there
are cases where associated hard registers will never be spilled onto
stack.  gen_reg_rtx is changed to take an argument for register alignment
so that stack realignment can be avoided when not needed.

	* emit-rtl.c (gen_reg_rtx): Add an argument for register
	alignment and use it if it isn't zero.
	* explow.c (force_reg): Add an argument for register alignment
	and pass it to gen_reg_rtx.
	* explow.h (force_reg): Add an argument for register alignment
	and default it to 0.
	* expr.h (convert_to_mode): Likewise.
	(convert_modes): Likewise.
	* expr.c (convert_to_mode): Add an argument for register
	alignment and pass it to convert_modes.
	(convert_modes): Add an argument for register alignment and
	pass it to gen_reg_rtx.
---
 gcc/emit-rtl.c |  5 +++--
 gcc/explow.c   |  6 +++---
 gcc/explow.h   |  2 +-
 gcc/expr.c     | 10 ++++++----
 gcc/expr.h     |  6 ++++--
 gcc/rtl.h      |  2 +-
 6 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
index 07e908624a0..4accf851d23 100644
--- a/gcc/emit-rtl.c
+++ b/gcc/emit-rtl.c
@@ -1160,10 +1160,11 @@ subreg_memory_offset (const_rtx x)
    This pseudo is assigned the next sequential register number.  */
 
 rtx
-gen_reg_rtx (machine_mode mode)
+gen_reg_rtx (machine_mode mode, unsigned int align)
 {
   rtx val;
-  unsigned int align = GET_MODE_ALIGNMENT (mode);
+  if (align == 0)
+    align = GET_MODE_ALIGNMENT (mode);
 
   gcc_assert (can_create_pseudo_p ());
 
diff --git a/gcc/explow.c b/gcc/explow.c
index b6da277f689..c8673ce512d 100644
--- a/gcc/explow.c
+++ b/gcc/explow.c
@@ -663,7 +663,7 @@ copy_to_mode_reg (machine_mode mode, rtx x)
    since we mark it as a "constant" register.  */
 
 rtx
-force_reg (machine_mode mode, rtx x)
+force_reg (machine_mode mode, rtx x, unsigned int reg_align)
 {
   rtx temp, set;
   rtx_insn *insn;
@@ -673,7 +673,7 @@ force_reg (machine_mode mode, rtx x)
 
   if (general_operand (x, mode))
     {
-      temp = gen_reg_rtx (mode);
+      temp = gen_reg_rtx (mode, reg_align);
       insn = emit_move_insn (temp, x);
     }
   else
@@ -683,7 +683,7 @@ force_reg (machine_mode mode, rtx x)
 	insn = get_last_insn ();
       else
 	{
-	  rtx temp2 = gen_reg_rtx (mode);
+	  rtx temp2 = gen_reg_rtx (mode, reg_align);
 	  insn = emit_move_insn (temp2, temp);
 	  temp = temp2;
 	}
diff --git a/gcc/explow.h b/gcc/explow.h
index 698f2a2a21c..621cdd7d356 100644
--- a/gcc/explow.h
+++ b/gcc/explow.h
@@ -40,7 +40,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
 
 /* Copy a value to a register if it isn't already a register.
    Args are mode (in case value is a constant) and the value.  */
-extern rtx force_reg (machine_mode, rtx);
+extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);
 
 /* Return given rtx, copied into a new temp reg if it was in memory.  */
 extern rtx force_not_mem (rtx);
diff --git a/gcc/expr.c b/gcc/expr.c
index b4c110f8c17..42db4ddbe0a 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -658,9 +658,10 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
    or by copying to a new temporary with conversion.  */
 
 rtx
-convert_to_mode (machine_mode mode, rtx x, int unsignedp)
+convert_to_mode (machine_mode mode, rtx x, int unsignedp,
+		 unsigned int reg_align)
 {
-  return convert_modes (mode, VOIDmode, x, unsignedp);
+  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);
 }
 
 /* Return an rtx for a value that would result
@@ -674,7 +675,8 @@ convert_to_mode (machine_mode mode, rtx x, int unsignedp)
    You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */
 
 rtx
-convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
+convert_modes (machine_mode mode, machine_mode oldmode, rtx x,
+	       int unsignedp, unsigned int reg_align)
 {
   rtx temp;
   scalar_int_mode int_mode;
@@ -734,7 +736,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
       return simplify_gen_subreg (mode, x, oldmode, 0);
     }
 
-  temp = gen_reg_rtx (mode);
+  temp = gen_reg_rtx (mode, reg_align);
   convert_move (temp, x, unsignedp);
   return temp;
 }
diff --git a/gcc/expr.h b/gcc/expr.h
index 9a2736f69fa..2b06da1a889 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -66,10 +66,12 @@ extern void init_expr (void);
 extern void convert_move (rtx, rtx, int);
 
 /* Convert an rtx to specified machine mode and return the result.  */
-extern rtx convert_to_mode (machine_mode, rtx, int);
+extern rtx convert_to_mode (machine_mode, rtx, int,
+			    unsigned int reg_align = 0);
 
 /* Convert an rtx to MODE from OLDMODE and return the result.  */
-extern rtx convert_modes (machine_mode, machine_mode, rtx, int);
+extern rtx convert_modes (machine_mode, machine_mode, rtx, int,
+			  unsigned int reg_align = 0);
 
 /* Expand a call to memcpy or memmove or memcmp, and return the result.  */
 extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,
diff --git a/gcc/rtl.h b/gcc/rtl.h
index 398d745aff5..c72f7fd59b9 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -3125,7 +3125,7 @@ subreg_promoted_mode (rtx x)
 /* In emit-rtl.c */
 extern rtvec gen_rtvec_v (int, rtx *);
 extern rtvec gen_rtvec_v (int, rtx_insn **);
-extern rtx gen_reg_rtx (machine_mode);
+extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);
 extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);
 extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);
 extern rtx gen_reg_rtx_and_attrs (rtx);
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 03/12] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
  2021-04-29 12:54 ` [PATCH 01/12] Update alignment_for_piecewise_move H.J. Lu
  2021-04-29 12:54 ` [PATCH 02/12] Allow generating pseudo register with specific alignment H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 04/12] x86: Avoid stack realignment when copying data H.J. Lu
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.

gcc/

	PR middle-end/90773
	* builtins.c (builtin_memset_read_str): Call
	targetm.read_memset_value.
	(builtin_memset_gen_str): Call targetm.gen_memset_value.
	* target.def (read_memset_value): New hook.
	(gen_memset_value): Likewise.
	* targhooks.c: Include "builtins.h".
	(default_read_memset_value): New function.
	(default_gen_memset_value): Likewise.
	* targhooks.h (default_read_memset_value): New prototype.
	(default_gen_memset_value): Likewise.
	* config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
	Make it global.
	* config/i386/i386-protos.h (ix86_expand_vector_init_duplicate):
	New.
	* config/i386/i386.c (ix86_gen_memset_value_from_prev): New
	function.
	(ix86_gen_memset_value): Likewise.
	(ix86_read_memset_value): Likewise.
	(TARGET_GEN_MEMSET_VALUE): New.
	(TARGET_READ_MEMSET_VALUE): Likewise.
	* doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
	TARGET_GEN_MEMSET_VALUE hooks.
	* doc/tm.texi: Regenerated.

gcc/testsuite/

	PR middle-end/90773
	* gcc.target/i386/pr90773-15.c: New test.
	* gcc.target/i386/pr90773-16.c: Likewise.
	* gcc.target/i386/pr90773-17.c: Likewise.
	* gcc.target/i386/pr90773-18.c: Likewise.
	* gcc.target/i386/pr90773-19.c: Likewise.
---
 gcc/builtins.c                             |  45 +---
 gcc/config/i386/i386-expand.c              |   2 +-
 gcc/config/i386/i386-protos.h              |   2 +
 gcc/config/i386/i386.c                     | 236 +++++++++++++++++++++
 gcc/doc/tm.texi                            |  16 ++
 gcc/doc/tm.texi.in                         |   4 +
 gcc/expr.c                                 |   1 -
 gcc/target.def                             |  20 ++
 gcc/targhooks.c                            |  54 +++++
 gcc/targhooks.h                            |   4 +
 gcc/testsuite/gcc.target/i386/pr90773-15.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-16.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-17.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-18.c |  15 ++
 gcc/testsuite/gcc.target/i386/pr90773-19.c |  14 ++
 15 files changed, 412 insertions(+), 43 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 2d6bf4a65b4..c5610795eec 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -6586,24 +6586,11 @@ expand_builtin_strncpy (tree exp, rtx target)
    previous iteration.  */
 
 rtx
-builtin_memset_read_str (void *data, void *prevp,
+builtin_memset_read_str (void *data, void *prev,
 			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
 			 scalar_int_mode mode)
 {
-  by_pieces_prev *prev = (by_pieces_prev *) prevp;
-  if (prev != nullptr && prev->data != nullptr)
-    {
-      /* Use the previous data in the same mode.  */
-      if (prev->mode == mode)
-	return prev->data;
-    }
-
-  const char *c = (const char *) data;
-  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
-
-  memset (p, *c, GET_MODE_SIZE (mode));
-
-  return c_readstr (p, mode);
+  return targetm.read_memset_value ((const char *) data, prev, mode);
 }
 
 /* Callback routine for store_by_pieces.  Return the RTL of a register
@@ -6613,35 +6600,11 @@ builtin_memset_read_str (void *data, void *prevp,
    nullptr, it has the RTL info from the previous iteration.  */
 
 static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
 			HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
 			scalar_int_mode mode)
 {
-  rtx target, coeff;
-  size_t size;
-  char *p;
-
-  by_pieces_prev *prev = (by_pieces_prev *) prevp;
-  if (prev != nullptr && prev->data != nullptr)
-    {
-      /* Use the previous data in the same mode.  */
-      if (prev->mode == mode)
-	return prev->data;
-
-      return simplify_gen_subreg (mode, prev->data, prev->mode, 0);
-    }
-
-  size = GET_MODE_SIZE (mode);
-  if (size == 1)
-    return (rtx) data;
-
-  p = XALLOCAVEC (char, size);
-  memset (p, 1, size);
-  coeff = c_readstr (p, mode);
-
-  target = convert_to_mode (mode, (rtx) data, 1);
-  target = expand_mult (mode, target, coeff, NULL_RTX, 1);
-  return force_reg (mode, target);
+  return targetm.gen_memset_value ((rtx) data, prev, mode);
 }
 
 /* Expand expression EXP, which is a call to the memset builtin.  Return
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 516440eb5c1..1942b46efbf 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13586,7 +13586,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
    with all elements equal to VAR.  Return true if successful.  */
 
-static bool
+bool
 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 				   rtx target, rtx val)
 {
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7782cf1163f..eae28acbc8d 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -257,6 +257,8 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
+extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
+					       rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 68f33f96f5a..e6ee3ef630a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -23008,6 +23008,236 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
     }
 }
 
+/* Return the RTL for memset in MODE from PREV.  */
+
+static rtx
+ix86_gen_memset_value_from_prev (by_pieces_prev *prevp,
+				 scalar_int_mode mode)
+{
+  rtx prev = prevp->data;
+
+  /* Use the previous data in the same mode.  */
+  if (prevp->mode == mode)
+    return prev;
+
+  machine_mode prev_mode = prevp->mode;
+  size_t size = GET_MODE_SIZE (prev_mode);
+
+  /* NB: Skip if the previous value is 1 byte or less.  CONST_WIDE_INT
+     is in VOIDmode whose size is 0.  */
+  if (size <= 1)
+    return nullptr;
+
+  rtx reg, reg_ti;
+  switch (size)
+    {
+    default:
+      gcc_unreachable ();
+
+    case 2:
+    case 4:
+      return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+    case 8:
+      /* In 64-bit mode, use SUBREG since word size is 8 bytes.  */
+      if (TARGET_64BIT)
+	return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	case 4:
+do_hi_si_mode:
+	  /* In 32-bit mode, extract the value from an 8-byte
+	     register into an integer register first.  */
+	  reg = gen_reg_rtx (SImode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (SImode, prev,
+					       prev_mode, 0));
+	  return simplify_gen_subreg (mode, reg, SImode, 0);
+	}
+      break;
+
+    case 16:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	case 4:
+	  /* Extract the value from a 16-byte vector register into
+	     an integer register first.  */
+	  goto do_hi_si_mode;
+	case 8:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 16:
+	  return prev;
+	}
+      break;
+
+    case 32:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+do_himode:
+	  /* Extract the value from a 32-byte vector register into
+	     a 16-byte vector register first.  */
+	  reg_ti = gen_reg_rtx (TImode);
+	  emit_move_insn (reg_ti,
+			  simplify_gen_subreg (TImode, prev,
+					       prev_mode, 0));
+	  /* Then extract the value from a 16-byte vector register
+	     into an integer register.  */
+	  reg = gen_reg_rtx (SImode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (SImode, reg_ti,
+					       TImode, 0));
+	  return simplify_gen_subreg (mode, reg, SImode, 0);
+
+	case 4:
+	case 8:
+do_si_di_mode:
+	  /* Extract the value from a 32-byte vector register into
+	     a 16-byte vector register first.  */
+	  reg_ti = gen_reg_rtx (TImode);
+	  emit_move_insn (reg_ti,
+			  simplify_gen_subreg (TImode, prev,
+					       prev_mode, 0));
+	  /* Generate 4/8-byte SSE -> INT move instruction.  */
+	  reg = gen_reg_rtx (mode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (mode, reg_ti,
+					       TImode, 0));
+	  return reg;
+	case 16:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 32:
+	  return prev;
+	}
+
+    case 64:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	  /* Extract the value from a 64-byte vector register into
+	     a 16-byte vector register first.  */
+	  goto do_himode;
+	case 4:
+	case 8:
+	  /* Extract the value from a 64-byte vector register into
+	     a 16-byte vector register first.  */
+	  goto do_si_di_mode;
+	case 16:
+	case 32:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 64:
+	  return prev;
+	}
+    }
+
+  return nullptr;
+}
+
+/* Implement the TARGET_GEN_MEMSET_VALUE hook.  */
+
+static rtx
+ix86_gen_memset_value (rtx data, void *prevp, scalar_int_mode mode)
+{
+  /* Don't use the previous value if size is 1.  */
+  if (GET_MODE_SIZE (mode) == 1)
+    return data;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      rtx value = ix86_gen_memset_value_from_prev (prev, mode);
+      if (value)
+	return value;
+    }
+
+  /* Use default_gen_memset_value if vector store won't be used.  */
+  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode))
+    return default_gen_memset_value (data, prevp, mode);
+
+  rtx one, target;
+  scalar_mode one_mode;
+
+  switch (GET_MODE_SIZE (mode))
+    {
+    default:
+      gcc_unreachable ();
+
+    case 64:
+      if (!TARGET_AVX512BW)
+	{
+	  rtx tmp = gen_reg_rtx (V32QImode);
+	  if (!ix86_expand_vector_init_duplicate (false, V32QImode,
+						  tmp, data))
+	    gcc_unreachable ();
+	  target = gen_rtx_VEC_CONCAT (V64QImode, tmp, tmp);
+	  return convert_to_mode (mode, target, 1);
+	}
+      /* FALLTHRU */
+    case 16:
+    case 32:
+      one_mode = QImode;
+      one = data;
+      break;
+    }
+
+  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (one_mode);
+  machine_mode vector_mode;
+  if (!mode_for_vector (one_mode, nunits).exists (&vector_mode))
+    gcc_unreachable ();
+
+  target = gen_reg_rtx (vector_mode, UNITS_PER_WORD * BITS_PER_UNIT);
+  if (!ix86_expand_vector_init_duplicate (false, vector_mode, target,
+					  one))
+    gcc_unreachable ();
+
+  return convert_to_mode (mode, target, 1,
+			  UNITS_PER_WORD * BITS_PER_UNIT);
+}
+
+/* Implement the TARGET_READ_MEMSET_VALUE hook.  */
+
+static rtx
+ix86_read_memset_value (const char *str, void *prevp,
+			scalar_int_mode mode)
+{
+  rtx value;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      /* Don't use the previous value if size is 1.  */
+      if (GET_MODE_SIZE (mode) == 1)
+	return default_read_memset_value (str, nullptr, mode);
+
+      value = ix86_gen_memset_value_from_prev (prev, mode);
+      if (value)
+	return value;
+
+      return default_read_memset_value (str, nullptr, mode);
+    }
+
+  /* Use default_read_memset_value if vector store can't be used.
+     NB: Need AVX2 for fast vector duplication and gen_reg_rtx.  */
+  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode)
+      || !TARGET_AVX2
+      || !reg_rtx_no)
+   return default_read_memset_value (str, nullptr, mode);
+
+  value = default_read_memset_value (str, nullptr, QImode);
+  return ix86_gen_memset_value (value, nullptr, mode);
+}
+
 /* Address space support.
 
    This is not "far pointers" in the 16-bit sense, but an easy way
@@ -23909,6 +24139,12 @@ static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
 #undef TARGET_LIBC_HAS_FAST_FUNCTION
 #define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function
 
+#undef TARGET_GEN_MEMSET_VALUE
+#define TARGET_GEN_MEMSET_VALUE ix86_gen_memset_value
+
+#undef TARGET_READ_MEMSET_VALUE
+#define TARGET_READ_MEMSET_VALUE ix86_read_memset_value
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 7e8fb8b6ee8..2861d60ff28 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11944,6 +11944,22 @@ This function prepares to emit a conditional comparison within a sequence
  @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_READ_MEMSET_VALUE (const char *@var{c}, void *@var{prev}, scalar_int_mode @var{mode})
+This function returns the RTL of a constant integer corresponding to
+target reading @code{GET_MODE_SIZE (@var{mode})} bytes from the stringn
+constant @var{str}.  If @var{prev} is not @samp{nullptr}, it contains
+the RTL information from the previous interation.
+@end deftypefn
+
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_VALUE (rtx @var{data}, void *@var{prev}, scalar_int_mode @var{mode})
+This function returns the RTL of a register containing
+@code{GET_MODE_SIZE (@var{mode})} consecutive copies of the unsigned
+char value given in the RTL register @var{data}.  For example, if
+@var{mode} is 4 bytes wide, return the RTL for 0x01010101*@var{data}.
+If @var{PREV} is not @samp{nullptr}, it is the RTL information from
+the previous iteration.
+@end deftypefn
+
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 20acf363ed9..3fabf2b6181 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -8032,6 +8032,10 @@ lists.
 
 @hook TARGET_GEN_CCMP_NEXT
 
+@hook TARGET_READ_MEMSET_VALUE
+
+@hook TARGET_GEN_MEMSET_VALUE
+
 @hook TARGET_LOOP_UNROLL_ADJUST
 
 @defmac POWI_MAX_MULTS
diff --git a/gcc/expr.c b/gcc/expr.c
index 42db4ddbe0a..56e845a40da 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1171,7 +1171,6 @@ op_by_pieces_d::run ()
   /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */
   scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
   mode = get_usable_mode (mode, m_len);
-
   by_pieces_prev to_prev = { nullptr, mode };
   by_pieces_prev from_prev = { nullptr, mode };
 
diff --git a/gcc/target.def b/gcc/target.def
index c3a4280b655..25dc1850e0c 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2692,6 +2692,26 @@ DEFHOOK
  rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
  NULL)
 
+DEFHOOK
+(read_memset_value,
+ "This function returns the RTL of a constant integer corresponding to\n\
+target reading @code{GET_MODE_SIZE (@var{mode})} bytes from the stringn\n\
+constant @var{str}.  If @var{prev} is not @samp{nullptr}, it contains\n\
+the RTL information from the previous interation.",
+ rtx, (const char *c, void *prev, scalar_int_mode mode),
+ default_read_memset_value)
+
+DEFHOOK
+(gen_memset_value,
+ "This function returns the RTL of a register containing\n\
+@code{GET_MODE_SIZE (@var{mode})} consecutive copies of the unsigned\n\
+char value given in the RTL register @var{data}.  For example, if\n\
+@var{mode} is 4 bytes wide, return the RTL for 0x01010101*@var{data}.\n\
+If @var{PREV} is not @samp{nullptr}, it is the RTL information from\n\
+the previous iteration.",
+ rtx, (rtx data, void *prev, scalar_int_mode mode),
+ default_gen_memset_value)
+
 /* Return a new value for loop unroll size.  */
 DEFHOOK
 (loop_unroll_adjust,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 952fad422eb..e4766be6683 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -90,6 +90,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "attribs.h"
 #include "asan.h"
 #include "emit-rtl.h"
+#include "builtins.h"
 
 bool
 default_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED,
@@ -2547,4 +2548,57 @@ default_memtag_untagged_pointer (rtx tagged_pointer, rtx target)
   return untagged_base;
 }
 
+/* Default implementation of TARGET_READ_MEMSET_VALUE.  */
+
+rtx
+default_read_memset_value (const char *c, void *prevp,
+			   scalar_int_mode mode)
+{
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      /* Use the previous data in the same mode.  */
+      if (prev->mode == mode)
+	return prev->data;
+    }
+
+  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+
+  memset (p, *c, GET_MODE_SIZE (mode));
+
+  return c_readstr (p, mode);
+}
+
+/* Default implementation of TARGET_GEN_MEMSET_VALUE.  */
+
+rtx
+default_gen_memset_value (rtx data, void *prevp, scalar_int_mode mode)
+{
+  rtx target, coeff;
+  size_t size;
+  char *p;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      /* Use the previous data in the same mode.  */
+      if (prev->mode == mode)
+	return prev->data;
+
+      return simplify_gen_subreg (mode, prev->data, prev->mode, 0);
+    }
+
+  size = GET_MODE_SIZE (mode);
+  if (size == 1)
+    return data;
+
+  p = XALLOCAVEC (char, size);
+  memset (p, 1, size);
+  coeff = c_readstr (p, mode);
+
+  target = convert_to_mode (mode, data, 1);
+  target = expand_mult (mode, target, coeff, NULL_RTX, 1);
+  return force_reg (mode, target);
+}
+
 #include "gt-targhooks.h"
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 9928d064abd..c34f3f9480e 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -300,4 +300,8 @@ extern rtx default_memtag_set_tag (rtx, rtx, rtx);
 extern rtx default_memtag_extract_tag (rtx, rtx);
 extern rtx default_memtag_untagged_pointer (rtx, rtx);
 
+extern rtx default_read_memset_value (const char *, void *,
+				      scalar_int_mode);
+extern rtx default_gen_memset_value (rtx, void *, scalar_int_mode);
+
 #endif /* GCC_TARGHOOKS_H */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
new file mode 100644
index 00000000000..c0a96fed892
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+  __builtin_memset (dst, c, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
new file mode 100644
index 00000000000..d2d1ec6141c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$-1, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
new file mode 100644
index 00000000000..6c8da7d24ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 19);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovd\[\\t \]+%xmm\[0-9\]+, 15\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-18.c b/gcc/testsuite/gcc.target/i386/pr90773-18.c
new file mode 100644
index 00000000000..b0687abbe01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-18.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$12, 8\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-19.c b/gcc/testsuite/gcc.target/i386/pr90773-19.c
new file mode 100644
index 00000000000..8aa5540bacc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-19.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 04/12] x86: Avoid stack realignment when copying data
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (2 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 03/12] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 05/12] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

Pass UNITS_PER_WORD * BITS_PER_UNIT to force_reg, when copying data from
one memory location to another with vector registers, to avoid stack
realignment.

	* config/i386/i386-expand.c (ix86_expand_vector_move): Pass
	UNITS_PER_WORD * BITS_PER_UNIT to force_reg when copying data
	from one memory location to another.
---
 gcc/config/i386/i386-expand.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 1942b46efbf..b3c9b94f717 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -431,7 +431,12 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
       && !register_operand (op0, mode)
       && !register_operand (op1, mode))
     {
-      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
+      /* NB: Don't increase stack alignment requirement when forcing
+	 operand1 into a pseudo register to copy data from one memory
+	 location to another since it doesn't require spill.  */
+      emit_move_insn (op0,
+		      force_reg (GET_MODE (op0), op1,
+				 (UNITS_PER_WORD * BITS_PER_UNIT)));
       return;
     }
 
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 05/12] Remove MAX_BITSIZE_MODE_ANY_INT
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (3 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 04/12] x86: Avoid stack realignment when copying data H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 06/12] x86: Update piecewise move and store H.J. Lu
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

It is only defined for i386; every other target uses the default:

 #define MAX_BITSIZE_MODE_ANY_INT (64*BITS_PER_UNIT)

Whatever problems we had before, they have been fixed now.

	* config/i386/i386-modes.def (MAX_BITSIZE_MODE_ANY_INT): Removed.
---
 gcc/config/i386/i386-modes.def | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index dbddfd8e48f..4e7014be034 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -107,19 +107,10 @@ INT_MODE (XI, 64);
 PARTIAL_INT_MODE (HI, 16, P2QI);
 PARTIAL_INT_MODE (SI, 32, P2HI);
 
-/* Mode used for signed overflow checking of TImode.  As
-   MAX_BITSIZE_MODE_ANY_INT is only 160, wide-int.h reserves only that
-   rounded up to multiple of HOST_BITS_PER_WIDE_INT bits in wide_int etc.,
-   so OImode is too large.  For the overflow checking we actually need
-   just 1 or 2 bits beyond TImode precision.  Use 160 bits to have
-   a multiple of 32.  */
+/* Mode used for signed overflow checking of TImode.  For the overflow
+   checking we actually need just 1 or 2 bits beyond TImode precision.
+   Use 160 bits to have a multiple of 32.  */
 PARTIAL_INT_MODE (OI, 160, POI);
 
-/* Keep the OI and XI modes from confusing the compiler into thinking
-   that these modes could actually be used for computation.  They are
-   only holders for vectors during data movement.  Include POImode precision
-   though.  */
-#define MAX_BITSIZE_MODE_ANY_INT (160)
-
 /* The symbol Pmode stands for one of the above machine modes (usually SImode).
    The tm.h file specifies which one.  It is not a distinct mode.  */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 06/12] x86: Update piecewise move and store
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (4 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 05/12] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 07/12] x86: Add AVX2 tests for PR middle-end/90773 H.J. Lu
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

We can use TImode/OImode/XImode integers for piecewise move and store.
When a vector register is used for piecewise move and store, we don't
increase stack_alignment_needed, since no vector register spill is
required for piecewise move and store.  Since stack_realign_needed is
set to true based on stack_alignment_estimated, which is updated by
pseudo vector register usage, we also need to check stack_realign_needed
in order to eliminate the frame pointer.

gcc/

	* config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
	check stack_realign_needed for stack realignment.
	(ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller
	than the largest integer supported by vector register.
	* config/i386/i386.h (MOVE_MAX): Set to 64.
	(MOVE_MAX_PIECES): Set to bytes of the largest integer supported
	by vector register.
	(STORE_MAX_PIECES): New.

gcc/testsuite/

	* gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
	* gcc.target/i386/pr90773-4.c: Also run for 32-bit.
	* gcc.target/i386/pr90773-14.c: Likewise.
	* gcc.target/i386/pr90773-15.c: Likewise.
	* gcc.target/i386/pr90773-16.c: Likewise.
	* gcc.target/i386/pr90773-17.c: Likewise.
---
 gcc/config/i386/i386.c                     | 21 ++++++++++++---
 gcc/config/i386/i386.h                     | 31 +++++++++++++++++-----
 gcc/testsuite/gcc.target/i386/pr90773-1.c  | 10 +++----
 gcc/testsuite/gcc.target/i386/pr90773-14.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c |  6 ++---
 gcc/testsuite/gcc.target/i386/pr90773-16.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-17.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-4.c  |  2 +-
 8 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e6ee3ef630a..8ae0fa764f6 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -7925,8 +7925,17 @@ ix86_finalize_stack_frame_flags (void)
      assumed stack realignment might be needed or -fno-omit-frame-pointer
      is used, but in the end nothing that needed the stack alignment had
      been spilled nor stack access, clear frame_pointer_needed and say we
-     don't need stack realignment.  */
-  if ((stack_realign || (!flag_omit_frame_pointer && optimize))
+     don't need stack realignment.
+
+     When vector register is used for piecewise move and store, we don't
+     increase stack_alignment_needed as there is no register spill for
+     piecewise move and store.  Since stack_realign_needed is set to true
+     by checking stack_alignment_estimated which is updated by pseudo
+     vector register usage, we also need to check stack_realign_needed to
+     eliminate frame pointer.  */
+  if ((stack_realign
+       || (!flag_omit_frame_pointer && optimize)
+       || crtl->stack_realign_needed)
       && frame_pointer_needed
       && crtl->is_leaf
       && crtl->sp_is_unchanging
@@ -10385,7 +10394,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
 	  /* FALLTHRU */
 	case E_OImode:
 	case E_XImode:
-	  if (!standard_sse_constant_p (x, mode))
+	  if (!standard_sse_constant_p (x, mode)
+	      && GET_MODE_SIZE (TARGET_AVX512F
+				? XImode
+				: (TARGET_AVX
+				   ? OImode
+				   : (TARGET_SSE2
+				      ? TImode : DImode))) < GET_MODE_SIZE (mode))
 	    return false;
 	default:
 	  break;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 96b46bac238..b3213f85698 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1750,7 +1750,7 @@ typedef struct ix86_args {
 
 /* Max number of bytes we can move from memory to memory
    in one reasonably fast instruction.  */
-#define MOVE_MAX 16
+#define MOVE_MAX 64
 
 /* MOVE_MAX_PIECES is the number of bytes at a time which we can
    move efficiently, as opposed to  MOVE_MAX which is the maximum
@@ -1761,11 +1761,30 @@ typedef struct ix86_args {
    widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
    64-bit mode.  */
 #define MOVE_MAX_PIECES \
-  ((TARGET_64BIT \
-    && TARGET_SSE2 \
-    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
-    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
-   ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+	  && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
+	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	 ? 16 : UNITS_PER_WORD)))
+
+/* STORE_MAX_PIECES is the number of bytes at a time that we can
+   store efficiently.  */
+#define STORE_MAX_PIECES \
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	 ? 16 : UNITS_PER_WORD)))
 
 /* If a memory-to-memory move would take MOVE_RATIO or more simple
    move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c
index 1d9f282dc0d..4fd5a40d99d 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-options "-O2 -msse2 -mtune=generic" } */
 
 extern char *dst, *src;
 
@@ -9,9 +9,5 @@ foo (void)
   __builtin_memcpy (dst, src, 15);
 }
 
-/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c
index 6364916ecac..74ba5055960 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
index c0a96fed892..880f71d1567 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
@@ -9,6 +9,6 @@ foo (int c)
   __builtin_memset (dst, c, 17);
 }
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
-/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
index d2d1ec6141c..32a976b10df 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
index 6c8da7d24ef..2d6fbf22a8b 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c
index ec0bc0100ae..ee4c04678d1 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 07/12] x86: Add AVX2 tests for PR middle-end/90773
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (5 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 06/12] x86: Update piecewise move and store H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 08/12] x86: Add tests for piecewise move and store H.J. Lu
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

	PR middle-end/90773
	* gcc.target/i386/pr90773-20.c: New test.
	* gcc.target/i386/pr90773-21.c: Likewise.
	* gcc.target/i386/pr90773-22.c: Likewise.
	* gcc.target/i386/pr90773-23.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr90773-20.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-21.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-22.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-23.c | 13 +++++++++++++
 4 files changed, 52 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c

diff --git a/gcc/testsuite/gcc.target/i386/pr90773-20.c b/gcc/testsuite/gcc.target/i386/pr90773-20.c
new file mode 100644
index 00000000000..e61e405f2b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-20.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+  __builtin_memset (dst, c, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+.+, 32\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-21.c b/gcc/testsuite/gcc.target/i386/pr90773-21.c
new file mode 100644
index 00000000000..16ad17f3cbb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-21.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+  __builtin_memset (dst, c, 34);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movw\[\\t \]%.*, 32\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-22.c b/gcc/testsuite/gcc.target/i386/pr90773-22.c
new file mode 100644
index 00000000000..45a8ff65a84
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-22.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+.+, 32\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-23.c b/gcc/testsuite/gcc.target/i386/pr90773-23.c
new file mode 100644
index 00000000000..9256ce10ff0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-23.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 34);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movw\[\\t \]+.+, 32\\(%\[\^,\]+\\)" 1 } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 08/12] x86: Add tests for piecewise move and store
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (6 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 07/12] x86: Add AVX2 tests for PR middle-end/90773 H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 09/12] x86: Also pass -mno-avx to pr72839.c H.J. Lu
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

	* gcc.target/i386/pieces-memcpy-7.c: New test.
	* gcc.target/i386/pieces-memcpy-8.c: Likewise.
	* gcc.target/i386/pieces-memcpy-9.c: Likewise.
	* gcc.target/i386/pieces-memcpy-10.c: Likewise.
	* gcc.target/i386/pieces-memcpy-11.c: Likewise.
	* gcc.target/i386/pieces-memcpy-12.c: Likewise.
	* gcc.target/i386/pieces-memcpy-13.c: Likewise.
	* gcc.target/i386/pieces-memcpy-14.c: Likewise.
	* gcc.target/i386/pieces-memcpy-15.c: Likewise.
	* gcc.target/i386/pieces-memcpy-16.c: Likewise.
	* gcc.target/i386/pieces-memset-1.c: Likewise.
	* gcc.target/i386/pieces-memset-2.c: Likewise.
	* gcc.target/i386/pieces-memset-3.c: Likewise.
	* gcc.target/i386/pieces-memset-4.c: Likewise.
	* gcc.target/i386/pieces-memset-5.c: Likewise.
	* gcc.target/i386/pieces-memset-6.c: Likewise.
	* gcc.target/i386/pieces-memset-7.c: Likewise.
	* gcc.target/i386/pieces-memset-8.c: Likewise.
	* gcc.target/i386/pieces-memset-9.c: Likewise.
	* gcc.target/i386/pieces-memset-10.c: Likewise.
	* gcc.target/i386/pieces-memset-11.c: Likewise.
	* gcc.target/i386/pieces-memset-12.c: Likewise.
	* gcc.target/i386/pieces-memset-13.c: Likewise.
	* gcc.target/i386/pieces-memset-14.c: Likewise.
	* gcc.target/i386/pieces-memset-15.c: Likewise.
	* gcc.target/i386/pieces-memset-16.c: Likewise.
	* gcc.target/i386/pieces-memset-17.c: Likewise.
	* gcc.target/i386/pieces-memset-18.c: Likewise.
	* gcc.target/i386/pieces-memset-19.c: Likewise.
	* gcc.target/i386/pieces-memset-20.c: Likewise.
	* gcc.target/i386/pieces-memset-21.c: Likewise.
	* gcc.target/i386/pieces-memset-22.c: Likewise.
	* gcc.target/i386/pieces-memset-23.c: Likewise.
	* gcc.target/i386/pieces-memset-24.c: Likewise.
	* gcc.target/i386/pieces-memset-25.c: Likewise.
	* gcc.target/i386/pieces-memset-26.c: Likewise.
	* gcc.target/i386/pieces-memset-27.c: Likewise.
	* gcc.target/i386/pieces-memset-28.c: Likewise.
	* gcc.target/i386/pieces-memset-29.c: Likewise.
	* gcc.target/i386/pieces-memset-30.c: Likewise.
	* gcc.target/i386/pieces-memset-31.c: Likewise.
	* gcc.target/i386/pieces-memset-32.c: Likewise.
	* gcc.target/i386/pieces-memset-33.c: Likewise.
	* gcc.target/i386/pieces-memset-34.c: Likewise.
	* gcc.target/i386/pieces-memset-35.c: Likewise.
	* gcc.target/i386/pieces-memset-36.c: Likewise.
	* gcc.target/i386/pieces-memset-37.c: Likewise.
	* gcc.target/i386/pieces-memset-38.c: Likewise.
	* gcc.target/i386/pieces-memset-39.c: Likewise.
	* gcc.target/i386/pieces-memset-40.c: Likewise.
	* gcc.target/i386/pieces-memset-41.c: Likewise.
	* gcc.target/i386/pieces-memset-42.c: Likewise.
	* gcc.target/i386/pieces-memset-43.c: Likewise.
---
 .../gcc.target/i386/pieces-memcpy-10.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-11.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-12.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-13.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-14.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-15.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-16.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-7.c          | 15 +++++++++++++++
 .../gcc.target/i386/pieces-memcpy-8.c          | 14 ++++++++++++++
 .../gcc.target/i386/pieces-memcpy-9.c          | 14 ++++++++++++++
 .../gcc.target/i386/pieces-memset-1.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-10.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-11.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-12.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-13.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-14.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-15.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-16.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-17.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-18.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-19.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-2.c          | 12 ++++++++++++
 .../gcc.target/i386/pieces-memset-20.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-21.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-22.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-23.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-24.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-25.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-26.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-27.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-28.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-29.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-3.c          | 18 ++++++++++++++++++
 .../gcc.target/i386/pieces-memset-30.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-31.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-32.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-33.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-34.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-35.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-36.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-37.c         | 15 +++++++++++++++
 .../gcc.target/i386/pieces-memset-38.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-39.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-4.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-40.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-41.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-42.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-43.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-5.c          | 12 ++++++++++++
 .../gcc.target/i386/pieces-memset-6.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-7.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-8.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-9.c          | 16 ++++++++++++++++
 53 files changed, 860 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c

diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
new file mode 100644
index 00000000000..5faee21f9b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
new file mode 100644
index 00000000000..b8917a7f917
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 64);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
new file mode 100644
index 00000000000..f1432ebe517
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 64);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
new file mode 100644
index 00000000000..97e6067fec9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
new file mode 100644
index 00000000000..7addc4c0a28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
new file mode 100644
index 00000000000..695e8c3fa67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
new file mode 100644
index 00000000000..b0643d05ee7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 34);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
new file mode 100644
index 00000000000..3d248d447ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, char *dst, char *src)
+{
+  __builtin_memcpy (dst, src, 17);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
new file mode 100644
index 00000000000..c13a2beb2f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, char *dst, char *src)
+{
+  __builtin_memcpy (dst, src, 18);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
new file mode 100644
index 00000000000..238f88b275e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, char *dst, char *src)
+{
+  __builtin_memcpy (dst, src, 19);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-1.c b/gcc/testsuite/gcc.target/i386/pieces-memset-1.c
new file mode 100644
index 00000000000..2b8032684b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 64);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-10.c b/gcc/testsuite/gcc.target/i386/pieces-memset-10.c
new file mode 100644
index 00000000000..a6390d1bd8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-10.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 64);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-11.c b/gcc/testsuite/gcc.target/i386/pieces-memset-11.c
new file mode 100644
index 00000000000..3fb9038b04f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-11.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 64);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-12.c b/gcc/testsuite/gcc.target/i386/pieces-memset-12.c
new file mode 100644
index 00000000000..fa834566097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-12.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-13.c b/gcc/testsuite/gcc.target/i386/pieces-memset-13.c
new file mode 100644
index 00000000000..7f2cd3f58ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-13.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 33);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-14.c b/gcc/testsuite/gcc.target/i386/pieces-memset-14.c
new file mode 100644
index 00000000000..45ece482464
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-14.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-15.c b/gcc/testsuite/gcc.target/i386/pieces-memset-15.c
new file mode 100644
index 00000000000..bddf47d728e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-15.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-16.c b/gcc/testsuite/gcc.target/i386/pieces-memset-16.c
new file mode 100644
index 00000000000..1c5d124cecc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-16.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 17);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-17.c b/gcc/testsuite/gcc.target/i386/pieces-memset-17.c
new file mode 100644
index 00000000000..6cdb33557c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-17.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 17);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-18.c b/gcc/testsuite/gcc.target/i386/pieces-memset-18.c
new file mode 100644
index 00000000000..adbd201b4e7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-18.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 18);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-19.c b/gcc/testsuite/gcc.target/i386/pieces-memset-19.c
new file mode 100644
index 00000000000..7e9cf2e26d8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-19.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 64);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-2.c b/gcc/testsuite/gcc.target/i386/pieces-memset-2.c
new file mode 100644
index 00000000000..649f344e8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 64);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-20.c b/gcc/testsuite/gcc.target/i386/pieces-memset-20.c
new file mode 100644
index 00000000000..b8747e669e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-20.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 64);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-21.c b/gcc/testsuite/gcc.target/i386/pieces-memset-21.c
new file mode 100644
index 00000000000..4f001c6d06c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-21.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-22.c b/gcc/testsuite/gcc.target/i386/pieces-memset-22.c
new file mode 100644
index 00000000000..5f3c454ef8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-22.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-23.c b/gcc/testsuite/gcc.target/i386/pieces-memset-23.c
new file mode 100644
index 00000000000..a3b4ffc18e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-23.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-24.c b/gcc/testsuite/gcc.target/i386/pieces-memset-24.c
new file mode 100644
index 00000000000..e222787b541
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-24.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-25.c b/gcc/testsuite/gcc.target/i386/pieces-memset-25.c
new file mode 100644
index 00000000000..195ddb635eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-25.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 17);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-26.c b/gcc/testsuite/gcc.target/i386/pieces-memset-26.c
new file mode 100644
index 00000000000..13606b2da54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-26.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 17);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-27.c b/gcc/testsuite/gcc.target/i386/pieces-memset-27.c
new file mode 100644
index 00000000000..54a672b6015
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-27.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 17);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-28.c b/gcc/testsuite/gcc.target/i386/pieces-memset-28.c
new file mode 100644
index 00000000000..83c2d3f0fde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-28.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 64);
+}
+
+/* { dg-final { scan-assembler-times "pcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-29.c b/gcc/testsuite/gcc.target/i386/pieces-memset-29.c
new file mode 100644
index 00000000000..650e6fe66a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-29.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 64);
+}
+
+/* { dg-final { scan-assembler-not "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-3.c b/gcc/testsuite/gcc.target/i386/pieces-memset-3.c
new file mode 100644
index 00000000000..2aed6dbc68e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-3.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512bw -mno-avx512vl -mavx512f -mtune=intel" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vinserti64x4\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-30.c b/gcc/testsuite/gcc.target/i386/pieces-memset-30.c
new file mode 100644
index 00000000000..dcec2c700fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-30.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 64);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-31.c b/gcc/testsuite/gcc.target/i386/pieces-memset-31.c
new file mode 100644
index 00000000000..5d20af0938d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-31.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-32.c b/gcc/testsuite/gcc.target/i386/pieces-memset-32.c
new file mode 100644
index 00000000000..c5ca0bd17ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "pcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-33.c b/gcc/testsuite/gcc.target/i386/pieces-memset-33.c
new file mode 100644
index 00000000000..a87d1b80ae6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-33.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-not "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-34.c b/gcc/testsuite/gcc.target/i386/pieces-memset-34.c
new file mode 100644
index 00000000000..0c2f1ee6049
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-34.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-35.c b/gcc/testsuite/gcc.target/i386/pieces-memset-35.c
new file mode 100644
index 00000000000..b0f4a8b898e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-35.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 34);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-36.c b/gcc/testsuite/gcc.target/i386/pieces-memset-36.c
new file mode 100644
index 00000000000..d1f1263c7b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-36.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-37.c b/gcc/testsuite/gcc.target/i386/pieces-memset-37.c
new file mode 100644
index 00000000000..ec59497b116
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-37.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, int x, char *dst)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
new file mode 100644
index 00000000000..ed4a24a54fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-39.c b/gcc/testsuite/gcc.target/i386/pieces-memset-39.c
new file mode 100644
index 00000000000..a330bff5f3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-39.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, int x, char *dst)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-not "vinserti64x4" } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-4.c b/gcc/testsuite/gcc.target/i386/pieces-memset-4.c
new file mode 100644
index 00000000000..9256919bfdf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-4.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
new file mode 100644
index 00000000000..4eda73ead59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
new file mode 100644
index 00000000000..f86b6986da9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
new file mode 100644
index 00000000000..df0c122aae7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
new file mode 100644
index 00000000000..2f2179c2df9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-5.c b/gcc/testsuite/gcc.target/i386/pieces-memset-5.c
new file mode 100644
index 00000000000..3e95db5efef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-5.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-6.c b/gcc/testsuite/gcc.target/i386/pieces-memset-6.c
new file mode 100644
index 00000000000..571113c3a33
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-6.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=intel" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-7.c b/gcc/testsuite/gcc.target/i386/pieces-memset-7.c
new file mode 100644
index 00000000000..fd159869817
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-7.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-8.c b/gcc/testsuite/gcc.target/i386/pieces-memset-8.c
new file mode 100644
index 00000000000..7df0019ef63
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-9.c b/gcc/testsuite/gcc.target/i386/pieces-memset-9.c
new file mode 100644
index 00000000000..ed45d590875
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-9.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 09/12] x86: Also pass -mno-avx to pr72839.c
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (7 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 08/12] x86: Add tests for piecewise move and store H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 10/12] x86: Also pass -mno-avx to cold-attribute-1.c H.J. Lu
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

Also pass -mno-avx to pr72839.c to avoid copying data with YMM or ZMM
registers.

	* gcc.target/i386/pr72839.c: Also pass -mno-avx.
---
 gcc/testsuite/gcc.target/i386/pr72839.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr72839.c b/gcc/testsuite/gcc.target/i386/pr72839.c
index ea724f70377..6888d9d0a55 100644
--- a/gcc/testsuite/gcc.target/i386/pr72839.c
+++ b/gcc/testsuite/gcc.target/i386/pr72839.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target ia32 } */
-/* { dg-options "-O2 -mtune=lakemont" } */
+/* { dg-options "-O2 -mtune=lakemont -mno-avx" } */
 
 extern char *strcpy (char *, const char *);
 
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 10/12] x86: Also pass -mno-avx to cold-attribute-1.c
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (8 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 09/12] x86: Also pass -mno-avx to pr72839.c H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 11/12] x86: Also pass -mno-avx to sw-1.c for ia32 H.J. Lu
  2021-04-29 12:54 ` [PATCH 12/12] x86: Update gcc.target/i386/incoming-11.c H.J. Lu
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

Also pass -mno-avx to cold-attribute-1.c to avoid copying data with YMM
or ZMM registers.

	* gcc.target/i386/cold-attribute-1.c: Also pass -mno-avx.
---
 gcc/testsuite/gcc.target/i386/cold-attribute-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/cold-attribute-1.c b/gcc/testsuite/gcc.target/i386/cold-attribute-1.c
index 57666ac60b6..658eb3e25bb 100644
--- a/gcc/testsuite/gcc.target/i386/cold-attribute-1.c
+++ b/gcc/testsuite/gcc.target/i386/cold-attribute-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-avx" } */
 #include <string.h>
 static inline
 __attribute__ ((cold)) void
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 11/12] x86: Also pass -mno-avx to sw-1.c for ia32
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (9 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 10/12] x86: Also pass -mno-avx to cold-attribute-1.c H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  2021-04-29 12:54 ` [PATCH 12/12] x86: Update gcc.target/i386/incoming-11.c H.J. Lu
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

Also pass -mno-avx to sw-1.c for ia32 since copying data with YMM or ZMM
registers disables shrink-wrapping when the second argument is passed on
the stack.

	* gcc.target/i386/sw-1.c: Also pass -mno-avx for ia32.
---
 gcc/testsuite/gcc.target/i386/sw-1.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c b/gcc/testsuite/gcc.target/i386/sw-1.c
index aec095eda62..a9c89fca4ec 100644
--- a/gcc/testsuite/gcc.target/i386/sw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sw-1.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mtune=generic -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
+/* { dg-additional-options "-mno-avx" { target ia32 } } */
 /* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */
 
 #include <string.h>
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 12/12] x86: Update gcc.target/i386/incoming-11.c
  2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (10 preceding siblings ...)
  2021-04-29 12:54 ` [PATCH 11/12] x86: Also pass -mno-avx to sw-1.c for ia32 H.J. Lu
@ 2021-04-29 12:54 ` H.J. Lu
  11 siblings, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

Expect no stack realignment since we no longer realign the stack when
copying data.

	* gcc.target/i386/incoming-11.c: Expect no stack realignment.
---
 gcc/testsuite/gcc.target/i386/incoming-11.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c
index a830c96f7d1..4b822684b88 100644
--- a/gcc/testsuite/gcc.target/i386/incoming-11.c
+++ b/gcc/testsuite/gcc.target/i386/incoming-11.c
@@ -15,4 +15,4 @@ void f()
 	for (i = 0; i < 100; i++) q[i] = 1;
 }
 
-/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 01/12] Update alignment_for_piecewise_move
  2021-04-29 12:54 ` [PATCH 01/12] Update alignment_for_piecewise_move H.J. Lu
@ 2021-04-30  8:59   ` Richard Sandiford
  0 siblings, 0 replies; 28+ messages in thread
From: Richard Sandiford @ 2021-04-30  8:59 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches

"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> alignment_for_piecewise_move is called only with MOVE_MAX_PIECES or
> STORE_MAX_PIECES, which are the number of bytes at a time that we
> can move or store efficiently.  We should call mode_for_size without
> limit to MAX_FIXED_MODE_SIZE, which is an integer expression for the
> size in bits of the largest integer machine mode that should actually
> be used, may be smaller than MOVE_MAX_PIECES or STORE_MAX_PIECES, which
> may use vector.
>
> 	* expr.c (alignment_for_piecewise_move): Call mode_for_size
> 	without limit to MAX_FIXED_MODE_SIZE.

OK.  I agree it doesn't make sense to apply the limit here, given
that the size is entirely under the target's control anyway.
And it should be safe, since any target that sets MOVE_MAX_PIECES
or STORE_MAX_PIECES higher than MAX_FIXED_MODE_SIZE would trigger
the require () assert.

Thanks,
Richard

> ---
>  gcc/expr.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/expr.c b/gcc/expr.c
> index e0167b77410..b4c110f8c17 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -746,7 +746,7 @@ static unsigned int
>  alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
>  {
>    scalar_int_mode tmode
> -    = int_mode_for_size (max_pieces * BITS_PER_UNIT, 1).require ();
> +    = int_mode_for_size (max_pieces * BITS_PER_UNIT, 0).require ();
>  
>    if (align >= GET_MODE_ALIGNMENT (tmode))
>      align = GET_MODE_ALIGNMENT (tmode);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-29 12:54 ` [PATCH 02/12] Allow generating pseudo register with specific alignment H.J. Lu
@ 2021-04-30  9:06   ` Richard Sandiford
  2021-04-30 12:06     ` H.J. Lu
  0 siblings, 1 reply; 28+ messages in thread
From: Richard Sandiford @ 2021-04-30  9:06 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches

"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> associated hard registers can be properly spilled onto stack.  But there
> are cases where associated hard registers will never be spilled onto
> stack.  gen_reg_rtx is changed to take an argument for register alignment
> so that stack realignment can be avoided when not needed.

How is it guaranteed that they will never be spilled though?
I don't think that that guarantee exists for any kind of pseudo,
except perhaps for the temporary pseudos that the RA creates to
replace (match_scratch …)es.

Thanks,
Richard

> 	* emit-rtl.c (gen_reg_rtx): Add an argument for register
> 	alignment and use it if it isn't zero.
> 	* explow.c (force_reg): Add an argument for register alignment
> 	and pass it to gen_reg_rtx.
> 	* explow.h (force_reg): Add an argument for register alignment
> 	and default it to 0.
> 	* expr.h (convert_to_mode): Likewise.
> 	(convert_modes): Likewise.
> 	* expr.c (convert_to_mode): Add an argument for register
> 	alignment and pass it to convert_modes.
> 	(convert_modes): Add an argument for register alignment and
> 	pass it to gen_reg_rtx.
> ---
>  gcc/emit-rtl.c |  5 +++--
>  gcc/explow.c   |  6 +++---
>  gcc/explow.h   |  2 +-
>  gcc/expr.c     | 10 ++++++----
>  gcc/expr.h     |  6 ++++--
>  gcc/rtl.h      |  2 +-
>  6 files changed, 18 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
> index 07e908624a0..4accf851d23 100644
> --- a/gcc/emit-rtl.c
> +++ b/gcc/emit-rtl.c
> @@ -1160,10 +1160,11 @@ subreg_memory_offset (const_rtx x)
>     This pseudo is assigned the next sequential register number.  */
>  
>  rtx
> -gen_reg_rtx (machine_mode mode)
> +gen_reg_rtx (machine_mode mode, unsigned int align)
>  {
>    rtx val;
> -  unsigned int align = GET_MODE_ALIGNMENT (mode);
> +  if (align == 0)
> +    align = GET_MODE_ALIGNMENT (mode);
>  
>    gcc_assert (can_create_pseudo_p ());
>  
> diff --git a/gcc/explow.c b/gcc/explow.c
> index b6da277f689..c8673ce512d 100644
> --- a/gcc/explow.c
> +++ b/gcc/explow.c
> @@ -663,7 +663,7 @@ copy_to_mode_reg (machine_mode mode, rtx x)
>     since we mark it as a "constant" register.  */
>  
>  rtx
> -force_reg (machine_mode mode, rtx x)
> +force_reg (machine_mode mode, rtx x, unsigned int reg_align)
>  {
>    rtx temp, set;
>    rtx_insn *insn;
> @@ -673,7 +673,7 @@ force_reg (machine_mode mode, rtx x)
>  
>    if (general_operand (x, mode))
>      {
> -      temp = gen_reg_rtx (mode);
> +      temp = gen_reg_rtx (mode, reg_align);
>        insn = emit_move_insn (temp, x);
>      }
>    else
> @@ -683,7 +683,7 @@ force_reg (machine_mode mode, rtx x)
>  	insn = get_last_insn ();
>        else
>  	{
> -	  rtx temp2 = gen_reg_rtx (mode);
> +	  rtx temp2 = gen_reg_rtx (mode, reg_align);
>  	  insn = emit_move_insn (temp2, temp);
>  	  temp = temp2;
>  	}
> diff --git a/gcc/explow.h b/gcc/explow.h
> index 698f2a2a21c..621cdd7d356 100644
> --- a/gcc/explow.h
> +++ b/gcc/explow.h
> @@ -40,7 +40,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
>  
>  /* Copy a value to a register if it isn't already a register.
>     Args are mode (in case value is a constant) and the value.  */
> -extern rtx force_reg (machine_mode, rtx);
> +extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);
>  
>  /* Return given rtx, copied into a new temp reg if it was in memory.  */
>  extern rtx force_not_mem (rtx);
> diff --git a/gcc/expr.c b/gcc/expr.c
> index b4c110f8c17..42db4ddbe0a 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -658,9 +658,10 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
>     or by copying to a new temporary with conversion.  */
>  
>  rtx
> -convert_to_mode (machine_mode mode, rtx x, int unsignedp)
> +convert_to_mode (machine_mode mode, rtx x, int unsignedp,
> +		 unsigned int reg_align)
>  {
> -  return convert_modes (mode, VOIDmode, x, unsignedp);
> +  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);
>  }
>  
>  /* Return an rtx for a value that would result
> @@ -674,7 +675,8 @@ convert_to_mode (machine_mode mode, rtx x, int unsignedp)
>     You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */
>  
>  rtx
> -convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
> +convert_modes (machine_mode mode, machine_mode oldmode, rtx x,
> +	       int unsignedp, unsigned int reg_align)
>  {
>    rtx temp;
>    scalar_int_mode int_mode;
> @@ -734,7 +736,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
>        return simplify_gen_subreg (mode, x, oldmode, 0);
>      }
>  
> -  temp = gen_reg_rtx (mode);
> +  temp = gen_reg_rtx (mode, reg_align);
>    convert_move (temp, x, unsignedp);
>    return temp;
>  }
> diff --git a/gcc/expr.h b/gcc/expr.h
> index 9a2736f69fa..2b06da1a889 100644
> --- a/gcc/expr.h
> +++ b/gcc/expr.h
> @@ -66,10 +66,12 @@ extern void init_expr (void);
>  extern void convert_move (rtx, rtx, int);
>  
>  /* Convert an rtx to specified machine mode and return the result.  */
> -extern rtx convert_to_mode (machine_mode, rtx, int);
> +extern rtx convert_to_mode (machine_mode, rtx, int,
> +			    unsigned int reg_align = 0);
>  
>  /* Convert an rtx to MODE from OLDMODE and return the result.  */
> -extern rtx convert_modes (machine_mode, machine_mode, rtx, int);
> +extern rtx convert_modes (machine_mode, machine_mode, rtx, int,
> +			  unsigned int reg_align = 0);
>  
>  /* Expand a call to memcpy or memmove or memcmp, and return the result.  */
>  extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,
> diff --git a/gcc/rtl.h b/gcc/rtl.h
> index 398d745aff5..c72f7fd59b9 100644
> --- a/gcc/rtl.h
> +++ b/gcc/rtl.h
> @@ -3125,7 +3125,7 @@ subreg_promoted_mode (rtx x)
>  /* In emit-rtl.c */
>  extern rtvec gen_rtvec_v (int, rtx *);
>  extern rtvec gen_rtvec_v (int, rtx_insn **);
> -extern rtx gen_reg_rtx (machine_mode);
> +extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);
>  extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);
>  extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);
>  extern rtx gen_reg_rtx_and_attrs (rtx);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-30  9:06   ` Richard Sandiford
@ 2021-04-30 12:06     ` H.J. Lu
  2021-04-30 12:42       ` Richard Sandiford
  0 siblings, 1 reply; 28+ messages in thread
From: H.J. Lu @ 2021-04-30 12:06 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches, Richard Sandiford

On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> > associated hard registers can be properly spilled onto stack.  But there
> > are cases where associated hard registers will never be spilled onto
> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> > so that stack realignment can be avoided when not needed.
>
> How is it guaranteed that they will never be spilled though?
> I don't think that that guarantee exists for any kind of pseudo,
> except perhaps for the temporary pseudos that the RA creates to
> replace (match_scratch …)es.
>

The caller creating pseudo registers with a specific alignment must
guarantee that they will never be spilled.   I am only using it in

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      /* NB: Don't increase stack alignment requirement when forcing
         operand1 into a pseudo register to copy data from one memory
         location to another since it doesn't require a spill.  */
      emit_move_insn (op0,
                      force_reg (GET_MODE (op0), op1,
                                 (UNITS_PER_WORD * BITS_PER_UNIT)));
      return;
    }

for vector moves.  RA shouldn't spill it.

> Thanks,
> Richard
>
> >       * emit-rtl.c (gen_reg_rtx): Add an argument for register
> >       alignment and use it if it isn't zero.
> >       * explow.c (force_reg): Add an argument for register alignment
> >       and pass it to gen_reg_rtx.
> >       * explow.h (force_reg): Add an argument for register alignment
> >       and default it to 0.
> >       * expr.h (convert_to_mode): Likewise.
> >       (convert_modes): Likewise.
> >       * expr.c (convert_to_mode): Add an argument for register
> >       alignment and pass it to convert_modes.
> >       (convert_modes): Add an argument for register alignment and
> >       pass it to gen_reg_rtx.
> > ---
> >  gcc/emit-rtl.c |  5 +++--
> >  gcc/explow.c   |  6 +++---
> >  gcc/explow.h   |  2 +-
> >  gcc/expr.c     | 10 ++++++----
> >  gcc/expr.h     |  6 ++++--
> >  gcc/rtl.h      |  2 +-
> >  6 files changed, 18 insertions(+), 13 deletions(-)
> >
> > diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
> > index 07e908624a0..4accf851d23 100644
> > --- a/gcc/emit-rtl.c
> > +++ b/gcc/emit-rtl.c
> > @@ -1160,10 +1160,11 @@ subreg_memory_offset (const_rtx x)
> >     This pseudo is assigned the next sequential register number.  */
> >
> >  rtx
> > -gen_reg_rtx (machine_mode mode)
> > +gen_reg_rtx (machine_mode mode, unsigned int align)
> >  {
> >    rtx val;
> > -  unsigned int align = GET_MODE_ALIGNMENT (mode);
> > +  if (align == 0)
> > +    align = GET_MODE_ALIGNMENT (mode);
> >
> >    gcc_assert (can_create_pseudo_p ());
> >
> > diff --git a/gcc/explow.c b/gcc/explow.c
> > index b6da277f689..c8673ce512d 100644
> > --- a/gcc/explow.c
> > +++ b/gcc/explow.c
> > @@ -663,7 +663,7 @@ copy_to_mode_reg (machine_mode mode, rtx x)
> >     since we mark it as a "constant" register.  */
> >
> >  rtx
> > -force_reg (machine_mode mode, rtx x)
> > +force_reg (machine_mode mode, rtx x, unsigned int reg_align)
> >  {
> >    rtx temp, set;
> >    rtx_insn *insn;
> > @@ -673,7 +673,7 @@ force_reg (machine_mode mode, rtx x)
> >
> >    if (general_operand (x, mode))
> >      {
> > -      temp = gen_reg_rtx (mode);
> > +      temp = gen_reg_rtx (mode, reg_align);
> >        insn = emit_move_insn (temp, x);
> >      }
> >    else
> > @@ -683,7 +683,7 @@ force_reg (machine_mode mode, rtx x)
> >       insn = get_last_insn ();
> >        else
> >       {
> > -       rtx temp2 = gen_reg_rtx (mode);
> > +       rtx temp2 = gen_reg_rtx (mode, reg_align);
> >         insn = emit_move_insn (temp2, temp);
> >         temp = temp2;
> >       }
> > diff --git a/gcc/explow.h b/gcc/explow.h
> > index 698f2a2a21c..621cdd7d356 100644
> > --- a/gcc/explow.h
> > +++ b/gcc/explow.h
> > @@ -40,7 +40,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
> >
> >  /* Copy a value to a register if it isn't already a register.
> >     Args are mode (in case value is a constant) and the value.  */
> > -extern rtx force_reg (machine_mode, rtx);
> > +extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);
> >
> >  /* Return given rtx, copied into a new temp reg if it was in memory.  */
> >  extern rtx force_not_mem (rtx);
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index b4c110f8c17..42db4ddbe0a 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -658,9 +658,10 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
> >     or by copying to a new temporary with conversion.  */
> >
> >  rtx
> > -convert_to_mode (machine_mode mode, rtx x, int unsignedp)
> > +convert_to_mode (machine_mode mode, rtx x, int unsignedp,
> > +              unsigned int reg_align)
> >  {
> > -  return convert_modes (mode, VOIDmode, x, unsignedp);
> > +  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);
> >  }
> >
> >  /* Return an rtx for a value that would result
> > @@ -674,7 +675,8 @@ convert_to_mode (machine_mode mode, rtx x, int unsignedp)
> >     You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */
> >
> >  rtx
> > -convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
> > +convert_modes (machine_mode mode, machine_mode oldmode, rtx x,
> > +            int unsignedp, unsigned int reg_align)
> >  {
> >    rtx temp;
> >    scalar_int_mode int_mode;
> > @@ -734,7 +736,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
> >        return simplify_gen_subreg (mode, x, oldmode, 0);
> >      }
> >
> > -  temp = gen_reg_rtx (mode);
> > +  temp = gen_reg_rtx (mode, reg_align);
> >    convert_move (temp, x, unsignedp);
> >    return temp;
> >  }
> > diff --git a/gcc/expr.h b/gcc/expr.h
> > index 9a2736f69fa..2b06da1a889 100644
> > --- a/gcc/expr.h
> > +++ b/gcc/expr.h
> > @@ -66,10 +66,12 @@ extern void init_expr (void);
> >  extern void convert_move (rtx, rtx, int);
> >
> >  /* Convert an rtx to specified machine mode and return the result.  */
> > -extern rtx convert_to_mode (machine_mode, rtx, int);
> > +extern rtx convert_to_mode (machine_mode, rtx, int,
> > +                         unsigned int reg_align = 0);
> >
> >  /* Convert an rtx to MODE from OLDMODE and return the result.  */
> > -extern rtx convert_modes (machine_mode, machine_mode, rtx, int);
> > +extern rtx convert_modes (machine_mode, machine_mode, rtx, int,
> > +                       unsigned int reg_align = 0);
> >
> >  /* Expand a call to memcpy or memmove or memcmp, and return the result.  */
> >  extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,
> > diff --git a/gcc/rtl.h b/gcc/rtl.h
> > index 398d745aff5..c72f7fd59b9 100644
> > --- a/gcc/rtl.h
> > +++ b/gcc/rtl.h
> > @@ -3125,7 +3125,7 @@ subreg_promoted_mode (rtx x)
> >  /* In emit-rtl.c */
> >  extern rtvec gen_rtvec_v (int, rtx *);
> >  extern rtvec gen_rtvec_v (int, rtx_insn **);
> > -extern rtx gen_reg_rtx (machine_mode);
> > +extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);
> >  extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);
> >  extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);
> >  extern rtx gen_reg_rtx_and_attrs (rtx);



-- 
H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-30 12:06     ` H.J. Lu
@ 2021-04-30 12:42       ` Richard Sandiford
  2021-04-30 12:49         ` H.J. Lu
  0 siblings, 1 reply; 28+ messages in thread
From: Richard Sandiford @ 2021-04-30 12:42 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches

"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
>> > associated hard registers can be properly spilled onto stack.  But there
>> > are cases where associated hard registers will never be spilled onto
>> > stack.  gen_reg_rtx is changed to take an argument for register alignment
>> > so that stack realignment can be avoided when not needed.
>>
>> How is it guaranteed that they will never be spilled though?
>> I don't think that that guarantee exists for any kind of pseudo,
>> except perhaps for the temporary pseudos that the RA creates to
>> replace (match_scratch …)es.
>>
>
> The caller of creating pseudo registers with specific alignment must
> guarantee that they will never be spilled.   I am only using it in
>
>   /* Make operand1 a register if it isn't already.  */
>   if (can_create_pseudo_p ()
>       && !register_operand (op0, mode)
>       && !register_operand (op1, mode))
>     {
>       /* NB: Don't increase stack alignment requirement when forcing
>          operand1 into a pseudo register to copy data from one memory
>          location to another since it doesn't require a spill.  */
>       emit_move_insn (op0,
>                       force_reg (GET_MODE (op0), op1,
>                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
>       return;
>     }
>
> for vector moves.  RA shouldn't spill it.

But this is the point: it's a case of hoping that the RA won't spill it,
rather than having a guarantee that it won't.

Even if the moves start out adjacent, they could be separated by later
RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
isn't enabled by default for x86, but it can still be enabled explicitly.)
Or if the same data is being copied to two locations, we might reuse
values loaded by the first copy for the second copy as well.

The only way to guarantee that the temporary won't be spilled is to hide
it until after RA.

Thanks,
Richard

>
>> Thanks,
>> Richard
>>
>> >       * emit-rtl.c (gen_reg_rtx): Add an argument for register
>> >       alignment and use it if it isn't zero.
>> >       * explow.c (force_reg): Add an argument for register alignment
>> >       and pass it to gen_reg_rtx.
>> >       * explow.h (force_reg): Add an argument for register alignment
>> >       and default it to 0.
>> >       * expr.h (convert_to_mode): Likewise.
>> >       (convert_modes): Likewise.
>> >       * expr.c (convert_to_mode): Add an argument for register
>> >       alignment and pass it to convert_modes.
>> >       (convert_modes): Add an argument for register alignment and
>> >       pass it to gen_reg_rtx.
>> > ---
>> >  gcc/emit-rtl.c |  5 +++--
>> >  gcc/explow.c   |  6 +++---
>> >  gcc/explow.h   |  2 +-
>> >  gcc/expr.c     | 10 ++++++----
>> >  gcc/expr.h     |  6 ++++--
>> >  gcc/rtl.h      |  2 +-
>> >  6 files changed, 18 insertions(+), 13 deletions(-)
>> >
>> > diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
>> > index 07e908624a0..4accf851d23 100644
>> > --- a/gcc/emit-rtl.c
>> > +++ b/gcc/emit-rtl.c
>> > @@ -1160,10 +1160,11 @@ subreg_memory_offset (const_rtx x)
>> >     This pseudo is assigned the next sequential register number.  */
>> >
>> >  rtx
>> > -gen_reg_rtx (machine_mode mode)
>> > +gen_reg_rtx (machine_mode mode, unsigned int align)
>> >  {
>> >    rtx val;
>> > -  unsigned int align = GET_MODE_ALIGNMENT (mode);
>> > +  if (align == 0)
>> > +    align = GET_MODE_ALIGNMENT (mode);
>> >
>> >    gcc_assert (can_create_pseudo_p ());
>> >
>> > diff --git a/gcc/explow.c b/gcc/explow.c
>> > index b6da277f689..c8673ce512d 100644
>> > --- a/gcc/explow.c
>> > +++ b/gcc/explow.c
>> > @@ -663,7 +663,7 @@ copy_to_mode_reg (machine_mode mode, rtx x)
>> >     since we mark it as a "constant" register.  */
>> >
>> >  rtx
>> > -force_reg (machine_mode mode, rtx x)
>> > +force_reg (machine_mode mode, rtx x, unsigned int reg_align)
>> >  {
>> >    rtx temp, set;
>> >    rtx_insn *insn;
>> > @@ -673,7 +673,7 @@ force_reg (machine_mode mode, rtx x)
>> >
>> >    if (general_operand (x, mode))
>> >      {
>> > -      temp = gen_reg_rtx (mode);
>> > +      temp = gen_reg_rtx (mode, reg_align);
>> >        insn = emit_move_insn (temp, x);
>> >      }
>> >    else
>> > @@ -683,7 +683,7 @@ force_reg (machine_mode mode, rtx x)
>> >       insn = get_last_insn ();
>> >        else
>> >       {
>> > -       rtx temp2 = gen_reg_rtx (mode);
>> > +       rtx temp2 = gen_reg_rtx (mode, reg_align);
>> >         insn = emit_move_insn (temp2, temp);
>> >         temp = temp2;
>> >       }
>> > diff --git a/gcc/explow.h b/gcc/explow.h
>> > index 698f2a2a21c..621cdd7d356 100644
>> > --- a/gcc/explow.h
>> > +++ b/gcc/explow.h
>> > @@ -40,7 +40,7 @@ extern rtx copy_to_suggested_reg (rtx, rtx, machine_mode);
>> >
>> >  /* Copy a value to a register if it isn't already a register.
>> >     Args are mode (in case value is a constant) and the value.  */
>> > -extern rtx force_reg (machine_mode, rtx);
>> > +extern rtx force_reg (machine_mode, rtx, unsigned int reg_align = 0);
>> >
>> >  /* Return given rtx, copied into a new temp reg if it was in memory.  */
>> >  extern rtx force_not_mem (rtx);
>> > diff --git a/gcc/expr.c b/gcc/expr.c
>> > index b4c110f8c17..42db4ddbe0a 100644
>> > --- a/gcc/expr.c
>> > +++ b/gcc/expr.c
>> > @@ -658,9 +658,10 @@ convert_mode_scalar (rtx to, rtx from, int unsignedp)
>> >     or by copying to a new temporary with conversion.  */
>> >
>> >  rtx
>> > -convert_to_mode (machine_mode mode, rtx x, int unsignedp)
>> > +convert_to_mode (machine_mode mode, rtx x, int unsignedp,
>> > +              unsigned int reg_align)
>> >  {
>> > -  return convert_modes (mode, VOIDmode, x, unsignedp);
>> > +  return convert_modes (mode, VOIDmode, x, unsignedp, reg_align);
>> >  }
>> >
>> >  /* Return an rtx for a value that would result
>> > @@ -674,7 +675,8 @@ convert_to_mode (machine_mode mode, rtx x, int unsignedp)
>> >     You can give VOIDmode for OLDMODE, if you are sure X has a nonvoid mode.  */
>> >
>> >  rtx
>> > -convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
>> > +convert_modes (machine_mode mode, machine_mode oldmode, rtx x,
>> > +            int unsignedp, unsigned int reg_align)
>> >  {
>> >    rtx temp;
>> >    scalar_int_mode int_mode;
>> > @@ -734,7 +736,7 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
>> >        return simplify_gen_subreg (mode, x, oldmode, 0);
>> >      }
>> >
>> > -  temp = gen_reg_rtx (mode);
>> > +  temp = gen_reg_rtx (mode, reg_align);
>> >    convert_move (temp, x, unsignedp);
>> >    return temp;
>> >  }
>> > diff --git a/gcc/expr.h b/gcc/expr.h
>> > index 9a2736f69fa..2b06da1a889 100644
>> > --- a/gcc/expr.h
>> > +++ b/gcc/expr.h
>> > @@ -66,10 +66,12 @@ extern void init_expr (void);
>> >  extern void convert_move (rtx, rtx, int);
>> >
>> >  /* Convert an rtx to specified machine mode and return the result.  */
>> > -extern rtx convert_to_mode (machine_mode, rtx, int);
>> > +extern rtx convert_to_mode (machine_mode, rtx, int,
>> > +                         unsigned int reg_align = 0);
>> >
>> >  /* Convert an rtx to MODE from OLDMODE and return the result.  */
>> > -extern rtx convert_modes (machine_mode, machine_mode, rtx, int);
>> > +extern rtx convert_modes (machine_mode, machine_mode, rtx, int,
>> > +                       unsigned int reg_align = 0);
>> >
>> >  /* Expand a call to memcpy or memmove or memcmp, and return the result.  */
>> >  extern rtx emit_block_op_via_libcall (enum built_in_function, rtx, rtx, rtx,
>> > diff --git a/gcc/rtl.h b/gcc/rtl.h
>> > index 398d745aff5..c72f7fd59b9 100644
>> > --- a/gcc/rtl.h
>> > +++ b/gcc/rtl.h
>> > @@ -3125,7 +3125,7 @@ subreg_promoted_mode (rtx x)
>> >  /* In emit-rtl.c */
>> >  extern rtvec gen_rtvec_v (int, rtx *);
>> >  extern rtvec gen_rtvec_v (int, rtx_insn **);
>> > -extern rtx gen_reg_rtx (machine_mode);
>> > +extern rtx gen_reg_rtx (machine_mode, unsigned int align = 0);
>> >  extern rtx gen_rtx_REG_offset (rtx, machine_mode, unsigned int, poly_int64);
>> >  extern rtx gen_reg_rtx_offset (rtx, machine_mode, int);
>> >  extern rtx gen_reg_rtx_and_attrs (rtx);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-30 12:42       ` Richard Sandiford
@ 2021-04-30 12:49         ` H.J. Lu
  2021-04-30 13:34           ` H.J. Lu
  0 siblings, 1 reply; 28+ messages in thread
From: H.J. Lu @ 2021-04-30 12:49 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches, Richard Sandiford

On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> >> > associated hard registers can be properly spilled onto stack.  But there
> >> > are cases where associated hard registers will never be spilled onto
> >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> >> > so that stack realignment can be avoided when not needed.
> >>
> >> How is it guaranteed that they will never be spilled though?
> >> I don't think that that guarantee exists for any kind of pseudo,
> >> except perhaps for the temporary pseudos that the RA creates to
> >> replace (match_scratch …)es.
> >>
> >
> > The caller of creating pseudo registers with specific alignment must
> > guarantee that they will never be spilled.   I am only using it in
> >
> >   /* Make operand1 a register if it isn't already.  */
> >   if (can_create_pseudo_p ()
> >       && !register_operand (op0, mode)
> >       && !register_operand (op1, mode))
> >     {
> >       /* NB: Don't increase stack alignment requirement when forcing
> >          operand1 into a pseudo register to copy data from one memory
> >          location to another since it doesn't require a spill.  */
> >       emit_move_insn (op0,
> >                       force_reg (GET_MODE (op0), op1,
> >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> >       return;
> >     }
> >
> > for vector moves.  RA shouldn't spill it.
>
> But this is the point: it's a case of hoping that the RA won't spill it,
> rather than having a guarantee that it won't.
>
> Even if the moves start out adjacent, they could be separated by later
> RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> isn't enabled by default for x86, but it can still be enabled explicitly.)
> Or if the same data is being copied to two locations, we might reuse
> values loaded by the first copy for the second copy as well.
>
> The only way to guarantee that the temporary won't be spilled is to hide
> it until after RA.
>

Let me think about it.

-- 
H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-30 12:49         ` H.J. Lu
@ 2021-04-30 13:34           ` H.J. Lu
  2021-04-30 15:56             ` Richard Sandiford
  0 siblings, 1 reply; 28+ messages in thread
From: H.J. Lu @ 2021-04-30 13:34 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches, Richard Sandiford

On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> > > <richard.sandiford@arm.com> wrote:
> > >>
> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> > >> > associated hard registers can be properly spilled onto stack.  But there
> > >> > are cases where associated hard registers will never be spilled onto
> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> > >> > so that stack realignment can be avoided when not needed.
> > >>
> > >> How is it guaranteed that they will never be spilled though?
> > >> I don't think that that guarantee exists for any kind of pseudo,
> > >> except perhaps for the temporary pseudos that the RA creates to
> > >> replace (match_scratch …)es.
> > >>
> > >
> > > The caller of creating pseudo registers with specific alignment must
> > > guarantee that they will never be spilled.   I am only using it in
> > >
> > >   /* Make operand1 a register if it isn't already.  */
> > >   if (can_create_pseudo_p ()
> > >       && !register_operand (op0, mode)
> > >       && !register_operand (op1, mode))
> > >     {
> > >       /* NB: Don't increase stack alignment requirement when forcing
> > >          operand1 into a pseudo register to copy data from one memory
> > >          location to another since it doesn't require a spill.  */
> > >       emit_move_insn (op0,
> > >                       force_reg (GET_MODE (op0), op1,
> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> > >       return;
> > >     }
> > >
> > > for vector moves.  RA shouldn't spill it.
> >
> > But this is the point: it's a case of hoping that the RA won't spill it,
> > rather than having a guarantee that it won't.
> >
> > Even if the moves start out adjacent, they could be separated by later
> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> > Or if the same data is being copied to two locations, we might reuse
> > values loaded by the first copy for the second copy as well.

There are cases where pseudo vector registers are created as pure
temporary registers in the backend and they shouldn't ever be spilled
to stack.   They will be spilled to stack only if there is other non-temporary
vector register usage, in which case the stack will be properly re-aligned.
The caller creating pseudo registers with a specific alignment guarantees
that they are used only as pure temporary registers.

> > The only way to guarantee that the temporary won't be spilled is to hide
> > it until after RA.
> >
>
> Let me think about it.
>
> --
> H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-30 13:34           ` H.J. Lu
@ 2021-04-30 15:56             ` Richard Sandiford
  2021-04-30 17:33               ` H.J. Lu
  2021-05-03  8:18               ` Richard Biener
  0 siblings, 2 replies; 28+ messages in thread
From: Richard Sandiford @ 2021-04-30 15:56 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches

"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>> >
>> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
>> > > <richard.sandiford@arm.com> wrote:
>> > >>
>> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
>> > >> > associated hard registers can be properly spilled onto stack.  But there
>> > >> > are cases where associated hard registers will never be spilled onto
>> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
>> > >> > so that stack realignment can be avoided when not needed.
>> > >>
>> > >> How is it guaranteed that they will never be spilled though?
>> > >> I don't think that that guarantee exists for any kind of pseudo,
>> > >> except perhaps for the temporary pseudos that the RA creates to
>> > >> replace (match_scratch …)es.
>> > >>
>> > >
>> > > The caller of creating pseudo registers with specific alignment must
>> > > guarantee that they will never be spilled.   I am only using it in
>> > >
>> > >   /* Make operand1 a register if it isn't already.  */
>> > >   if (can_create_pseudo_p ()
>> > >       && !register_operand (op0, mode)
>> > >       && !register_operand (op1, mode))
>> > >     {
>> > >       /* NB: Don't increase stack alignment requirement when forcing
>> > >          operand1 into a pseudo register to copy data from one memory
>> > >          location to another since it doesn't require a spill.  */
>> > >       emit_move_insn (op0,
>> > >                       force_reg (GET_MODE (op0), op1,
>> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
>> > >       return;
>> > >     }
>> > >
>> > > for vector moves.  RA shouldn't spill it.
>> >
>> > But this is the point: it's a case of hoping that the RA won't spill it,
>> > rather than having a guarantee that it won't.
>> >
>> > Even if the moves start out adjacent, they could be separated by later
>> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
>> > isn't enabled by default for x86, but it can still be enabled explicitly.)
>> > Or if the same data is being copied to two locations, we might reuse
>> > values loaded by the first copy for the second copy as well.
>
> There are cases where pseudo vector registers are created as pure
> temporary registers in the backend and they shouldn't ever be spilled
> to stack.   They will be spilled to stack only if there are other non-temporary
> vector register usage in which case stack will be properly re-aligned.
> Caller of creating pseudo registers with specific alignment guarantees
> that they are used only as pure temporary registers.

I don't think there's really a distinct category of pure temporary
registers though.  The things I mentioned above can happen for any
kind of pseudo register.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-30 15:56             ` Richard Sandiford
@ 2021-04-30 17:33               ` H.J. Lu
  2021-05-03  8:18               ` Richard Biener
  1 sibling, 0 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-30 17:33 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches, richard.sandiford

On Fri, Apr 30, 2021 at 04:56:30PM +0100, Richard Sandiford wrote:
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> >> <richard.sandiford@arm.com> wrote:
> >> >
> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> >> > > <richard.sandiford@arm.com> wrote:
> >> > >>
> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> >> > >> > associated hard registers can be properly spilled onto stack.  But there
> >> > >> > are cases where associated hard registers will never be spilled onto
> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> >> > >> > so that stack realignment can be avoided when not needed.
> >> > >>
> >> > >> How is it guaranteed that they will never be spilled though?
> >> > >> I don't think that that guarantee exists for any kind of pseudo,
> >> > >> except perhaps for the temporary pseudos that the RA creates to
> >> > >> replace (match_scratch …)es.
> >> > >>
> >> > >
> >> > > The caller of creating pseudo registers with specific alignment must
> >> > > guarantee that they will never be spilled.   I am only using it in
> >> > >
> >> > >   /* Make operand1 a register if it isn't already.  */
> >> > >   if (can_create_pseudo_p ()
> >> > >       && !register_operand (op0, mode)
> >> > >       && !register_operand (op1, mode))
> >> > >     {
> >> > >       /* NB: Don't increase stack alignment requirement when forcing
> >> > >          operand1 into a pseudo register to copy data from one memory
> >> > >          location to another since it doesn't require a spill.  */
> >> > >       emit_move_insn (op0,
> >> > >                       force_reg (GET_MODE (op0), op1,
> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> >> > >       return;
> >> > >     }
> >> > >
> >> > > for vector moves.  RA shouldn't spill it.
> >> >
> >> > But this is the point: it's a case of hoping that the RA won't spill it,
> >> > rather than having a guarantee that it won't.
> >> >
> >> > Even if the moves start out adjacent, they could be separated by later
> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> >> > Or if the same data is being copied to two locations, we might reuse
> >> > values loaded by the first copy for the second copy as well.
> >
> > There are cases where pseudo vector registers are created as pure
> > temporary registers in the backend and they shouldn't ever be spilled
> > to stack.   They will be spilled to stack only if there are other non-temporary
> > vector register usage in which case stack will be properly re-aligned.
> > Caller of creating pseudo registers with specific alignment guarantees
> > that they are used only as pure temporary registers.
> 
> I don't think there's really a distinct category of pure temporary
> registers though.  The things I mentioned above can happen for any
> kind of pseudo register.
> 

This special pseudo register is only generated when inlining memcpy and
memset.  For memcpy, there is no need to spill:

[hjl@gnu-cfl-2 pieces]$ cat spill1.i
extern void *ops1;
extern void *ops2;

extern void bar (void);

void
foo (void)
{
  __builtin_memcpy (ops1, ops2, 32);
  bar ();
  __builtin_memcpy (ops1, ops2, 32);
}
[hjl@gnu-cfl-2 pieces]$ make spill1.s
/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/ -O2 -march=haswell -S spill1.i
[hjl@gnu-cfl-2 pieces]$ cat spill1.s
	.file	"spill1.i"
	.text
	.p2align 4
	.globl	foo
	.type	foo, @function
foo:
.LFB0:
	.cfi_startproc
	subq	$8, %rsp
	.cfi_def_cfa_offset 16
	movq	ops2(%rip), %rax
	vmovdqu	(%rax), %ymm0
	movq	ops1(%rip), %rax
	vmovdqu	%ymm0, (%rax)
	vzeroupper
	call	bar
	movq	ops2(%rip), %rax
	vmovdqu	(%rax), %ymm0
	movq	ops1(%rip), %rax
	vmovdqu	%ymm0, (%rax)
	vzeroupper
	addq	$8, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	foo, .-foo
	.ident	"GCC: (GNU) 12.0.0 20210430 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pieces]$

For memset, the x86 backend supports unaligned spill:

[hjl@gnu-cfl-2 pieces]$ cat spill2.i
extern void *ops1;
extern void *ops2;

extern void bar (void);

void
foo (int c)
{
  __builtin_memset (ops1, c, 32);
  bar ();
  __builtin_memset (ops2, c, 32);
}
[hjl@gnu-cfl-2 pieces]$ make spill2.s
/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/ -O2 -march=haswell -S spill2.i
[hjl@gnu-cfl-2 pieces]$ cat spill2.s
	.file	"spill2.i"
	.text
	.p2align 4
	.globl	foo
	.type	foo, @function
foo:
.LFB0:
	.cfi_startproc
	subq	$40, %rsp
	.cfi_def_cfa_offset 48
	vmovd	%edi, %xmm0
	movq	ops1(%rip), %rax
	vpbroadcastb	%xmm0, %ymm0
	vmovdqu	%ymm0, (%rax)
	vmovdqu	%ymm0, (%rsp)
	vzeroupper
	call	bar
	movq	ops2(%rip), %rax
	vmovdqu	(%rsp), %ymm0
	vmovdqu	%ymm0, (%rax)
	vzeroupper
	addq	$40, %rsp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE0:
	.size	foo, .-foo
	.ident	"GCC: (GNU) 12.0.0 20210430 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pieces]$


H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-04-30 15:56             ` Richard Sandiford
  2021-04-30 17:33               ` H.J. Lu
@ 2021-05-03  8:18               ` Richard Biener
  2021-05-10  9:39                 ` Richard Sandiford
  1 sibling, 1 reply; 28+ messages in thread
From: Richard Biener @ 2021-05-03  8:18 UTC (permalink / raw)
  To: Richard Sandiford, H.J. Lu via Gcc-patches, H.J. Lu

On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> >> <richard.sandiford@arm.com> wrote:
> >> >
> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> >> > > <richard.sandiford@arm.com> wrote:
> >> > >>
> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> >> > >> > associated hard registers can be properly spilled onto stack.  But there
> >> > >> > are cases where associated hard registers will never be spilled onto
> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> >> > >> > so that stack realignment can be avoided when not needed.
> >> > >>
> >> > >> How is it guaranteed that they will never be spilled though?
> >> > >> I don't think that that guarantee exists for any kind of pseudo,
> >> > >> except perhaps for the temporary pseudos that the RA creates to
> >> > >> replace (match_scratch …)es.
> >> > >>
> >> > >
> >> > > The caller of creating pseudo registers with specific alignment must
> >> > > guarantee that they will never be spilled.   I am only using it in
> >> > >
> >> > >   /* Make operand1 a register if it isn't already.  */
> >> > >   if (can_create_pseudo_p ()
> >> > >       && !register_operand (op0, mode)
> >> > >       && !register_operand (op1, mode))
> >> > >     {
> >> > >       /* NB: Don't increase stack alignment requirement when forcing
> >> > >          operand1 into a pseudo register to copy data from one memory
> >> > >          location to another since it doesn't require a spill.  */
> >> > >       emit_move_insn (op0,
> >> > >                       force_reg (GET_MODE (op0), op1,
> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> >> > >       return;
> >> > >     }
> >> > >
> >> > > for vector moves.  RA shouldn't spill it.
> >> >
> >> > But this is the point: it's a case of hoping that the RA won't spill it,
> >> > rather than having a guarantee that it won't.
> >> >
> >> > Even if the moves start out adjacent, they could be separated by later
> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> >> > Or if the same data is being copied to two locations, we might reuse
> >> > values loaded by the first copy for the second copy as well.
> >
> > There are cases where pseudo vector registers are created as pure
> > temporary registers in the backend and they shouldn't ever be spilled
> > to stack.   They will be spilled to stack only if there are other non-temporary
> > vector register usage in which case stack will be properly re-aligned.
> > Caller of creating pseudo registers with specific alignment guarantees
> > that they are used only as pure temporary registers.
>
> I don't think there's really a distinct category of pure temporary
> registers though.  The things I mentioned above can happen for any
> kind of pseudo register.

I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
Do we generally handle those well?  That is, are they again subject
to be allocated by RA when no longer live?

Richard.

> Thanks,
> Richard

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-05-03  8:18               ` Richard Biener
@ 2021-05-10  9:39                 ` Richard Sandiford
  2021-05-10 13:29                   ` H.J. Lu
  0 siblings, 1 reply; 28+ messages in thread
From: Richard Sandiford @ 2021-05-10  9:39 UTC (permalink / raw)
  To: Richard Biener via Gcc-patches

Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >>
>> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
>> >> <richard.sandiford@arm.com> wrote:
>> >> >
>> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
>> >> > > <richard.sandiford@arm.com> wrote:
>> >> > >>
>> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
>> >> > >> > associated hard registers can be properly spilled onto stack.  But there
>> >> > >> > are cases where associated hard registers will never be spilled onto
>> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
>> >> > >> > so that stack realignment can be avoided when not needed.
>> >> > >>
>> >> > >> How is it guaranteed that they will never be spilled though?
>> >> > >> I don't think that that guarantee exists for any kind of pseudo,
>> >> > >> except perhaps for the temporary pseudos that the RA creates to
>> >> > >> replace (match_scratch …)es.
>> >> > >>
>> >> > >
>> >> > > The caller of creating pseudo registers with specific alignment must
>> >> > > guarantee that they will never be spilled.   I am only using it in
>> >> > >
>> >> > >   /* Make operand1 a register if it isn't already.  */
>> >> > >   if (can_create_pseudo_p ()
>> >> > >       && !register_operand (op0, mode)
>> >> > >       && !register_operand (op1, mode))
>> >> > >     {
>> >> > >       /* NB: Don't increase stack alignment requirement when forcing
>> >> > >          operand1 into a pseudo register to copy data from one memory
>> >> > >          location to another since it doesn't require a spill.  */
>> >> > >       emit_move_insn (op0,
>> >> > >                       force_reg (GET_MODE (op0), op1,
>> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
>> >> > >       return;
>> >> > >     }
>> >> > >
>> >> > > for vector moves.  RA shouldn't spill it.
>> >> >
>> >> > But this is the point: it's a case of hoping that the RA won't spill it,
>> >> > rather than having a guarantee that it won't.
>> >> >
>> >> > Even if the moves start out adjacent, they could be separated by later
>> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
>> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
>> >> > Or if the same data is being copied to two locations, we might reuse
>> >> > values loaded by the first copy for the second copy as well.
>> >
>> > There are cases where pseudo vector registers are created as pure
>> > temporary registers in the backend and they shouldn't ever be spilled
>> > to stack.   They will be spilled to stack only if there are other non-temporary
>> > vector register usage in which case stack will be properly re-aligned.
>> > Caller of creating pseudo registers with specific alignment guarantees
>> > that they are used only as pure temporary registers.
>>
>> I don't think there's really a distinct category of pure temporary
>> registers though.  The things I mentioned above can happen for any
>> kind of pseudo register.
>
> I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
> Do we generally handle those well?  That is, are they again subject
> to be allocated by RA when no longer live?

Yeah, using hard registers should work.  Of course, any given fixed choice
of hard register has the potential to be suboptimal in some situation,
but it should be safe.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-05-10  9:39                 ` Richard Sandiford
@ 2021-05-10 13:29                   ` H.J. Lu
  2021-05-10 13:59                     ` Richard Biener
  0 siblings, 1 reply; 28+ messages in thread
From: H.J. Lu @ 2021-05-10 13:29 UTC (permalink / raw)
  To: Richard Biener via Gcc-patches, H.J. Lu, Richard Biener,
	Richard Sandiford

On Mon, May 10, 2021 at 2:39 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >> >>
> >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> >> >> <richard.sandiford@arm.com> wrote:
> >> >> >
> >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> >> >> > > <richard.sandiford@arm.com> wrote:
> >> >> > >>
> >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> >> >> > >> > associated hard registers can be properly spilled onto stack.  But there
> >> >> > >> > are cases where associated hard registers will never be spilled onto
> >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> >> >> > >> > so that stack realignment can be avoided when not needed.
> >> >> > >>
> >> >> > >> How is it guaranteed that they will never be spilled though?
> >> >> > >> I don't think that that guarantee exists for any kind of pseudo,
> >> >> > >> except perhaps for the temporary pseudos that the RA creates to
> >> >> > >> replace (match_scratch …)es.
> >> >> > >>
> >> >> > >
> >> >> > > The caller of creating pseudo registers with specific alignment must
> >> >> > > guarantee that they will never be spilled.   I am only using it in
> >> >> > >
> >> >> > >   /* Make operand1 a register if it isn't already.  */
> >> >> > >   if (can_create_pseudo_p ()
> >> >> > >       && !register_operand (op0, mode)
> >> >> > >       && !register_operand (op1, mode))
> >> >> > >     {
> >> >> > >       /* NB: Don't increase stack alignment requirement when forcing
> >> >> > >          operand1 into a pseudo register to copy data from one memory
> >> >> > >          location to another since it doesn't require a spill.  */
> >> >> > >       emit_move_insn (op0,
> >> >> > >                       force_reg (GET_MODE (op0), op1,
> >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> >> >> > >       return;
> >> >> > >     }
> >> >> > >
> >> >> > > for vector moves.  RA shouldn't spill it.
> >> >> >
> >> >> > But this is the point: it's a case of hoping that the RA won't spill it,
> >> >> > rather than having a guarantee that it won't.
> >> >> >
> >> >> > Even if the moves start out adjacent, they could be separated by later
> >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> >> >> > Or if the same data is being copied to two locations, we might reuse
> >> >> > values loaded by the first copy for the second copy as well.
> >> >
> >> > There are cases where pseudo vector registers are created as pure
> >> > temporary registers in the backend and they shouldn't ever be spilled
> >> > to stack.   They will be spilled to stack only if there are other non-temporary
> >> > vector register usage in which case stack will be properly re-aligned.
> >> > Caller of creating pseudo registers with specific alignment guarantees
> >> > that they are used only as pure temporary registers.
> >>
> >> I don't think there's really a distinct category of pure temporary
> >> registers though.  The things I mentioned above can happen for any
> >> kind of pseudo register.
> >
> > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
> > Do we generally handle those well?  That is, are they again subject
> > to be allocated by RA when no longer live?
>
> Yeah, using hard registers should work.  Of course, any given fixed choice
> of hard register has the potential to be suboptimal in some situation,
> but it should be safe.

I tried hard registers.  The generated code isn't as good as pseudo registers.
But I want to avoid aligning the stack when YMM registers are only used to
inline memcpy/memset.  Any suggestions?

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-05-10 13:29                   ` H.J. Lu
@ 2021-05-10 13:59                     ` Richard Biener
  2021-05-10 14:11                       ` H.J. Lu
  0 siblings, 1 reply; 28+ messages in thread
From: Richard Biener @ 2021-05-10 13:59 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Richard Biener via Gcc-patches, Richard Sandiford

On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 10, 2021 at 2:39 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >> >>
> > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> > >> >> <richard.sandiford@arm.com> wrote:
> > >> >> >
> > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> > >> >> > > <richard.sandiford@arm.com> wrote:
> > >> >> > >>
> > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there
> > >> >> > >> > are cases where associated hard registers will never be spilled onto
> > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> > >> >> > >> > so that stack realignment can be avoided when not needed.
> > >> >> > >>
> > >> >> > >> How is it guaranteed that they will never be spilled though?
> > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,
> > >> >> > >> except perhaps for the temporary pseudos that the RA creates to
> > >> >> > >> replace (match_scratch …)es.
> > >> >> > >>
> > >> >> > >
> > >> >> > > The caller of creating pseudo registers with specific alignment must
> > >> >> > > guarantee that they will never be spilled.   I am only using it in
> > >> >> > >
> > >> >> > >   /* Make operand1 a register if it isn't already.  */
> > >> >> > >   if (can_create_pseudo_p ()
> > >> >> > >       && !register_operand (op0, mode)
> > >> >> > >       && !register_operand (op1, mode))
> > >> >> > >     {
> > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing
> > >> >> > >          operand1 into a pseudo register to copy data from one memory
> > >> >> > >          location to another since it doesn't require a spill.  */
> > >> >> > >       emit_move_insn (op0,
> > >> >> > >                       force_reg (GET_MODE (op0), op1,
> > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> > >> >> > >       return;
> > >> >> > >     }
> > >> >> > >
> > >> >> > > for vector moves.  RA shouldn't spill it.
> > >> >> >
> > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,
> > >> >> > rather than having a guarantee that it won't.
> > >> >> >
> > >> >> > Even if the moves start out adjacent, they could be separated by later
> > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> > >> >> > Or if the same data is being copied to two locations, we might reuse
> > >> >> > values loaded by the first copy for the second copy as well.
> > >> >
> > >> > There are cases where pseudo vector registers are created as pure
> > >> > temporary registers in the backend and they shouldn't ever be spilled
> > >> > to stack.   They will be spilled to stack only if there are other non-temporary
> > >> > vector register usage in which case stack will be properly re-aligned.
> > >> > Caller of creating pseudo registers with specific alignment guarantees
> > >> > that they are used only as pure temporary registers.
> > >>
> > >> I don't think there's really a distinct category of pure temporary
> > >> registers though.  The things I mentioned above can happen for any
> > >> kind of pseudo register.
> > >
> > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
> > > Do we generally handle those well?  That is, are they again subject
> > > to be allocated by RA when no longer live?
> >
> > Yeah, using hard registers should work.  Of course, any given fixed choice
> > of hard register has the potential to be suboptimal in some situation,
> > but it should be safe.
>
> I tried hard registers.  The generated code isn't as good as pseudo registers.
> But I want to avoid aligning the stack when YMM registers are only used to
> inline memcpy/memset.  Any suggestions?

I wonder if we can mark pseudos with a new reg flag, like 'nospill' and
enforce this in LRA or ICE if we can't?  That said, we should be able
to verify our assumption holds.  Now, we then of course need to avoid
CSE re-using such pseudo in ways that could lead to spilling
(not sure how that could happen, but ...).

Did you investigate closer what made the hardreg case generate worse
code?  Can we hide the copies behind UNSPECs and split them late
after reload?  Or is that too awkward to support when generating the
sequence from the middle-end (I suppose it's not going via the optabs?)

Richard.

> Thanks.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-05-10 13:59                     ` Richard Biener
@ 2021-05-10 14:11                       ` H.J. Lu
  2021-05-10 16:23                         ` Richard Sandiford
  2021-05-11  6:06                         ` Richard Biener
  0 siblings, 2 replies; 28+ messages in thread
From: H.J. Lu @ 2021-05-10 14:11 UTC (permalink / raw)
  To: Richard Biener; +Cc: Richard Biener via Gcc-patches, Richard Sandiford

On Mon, May 10, 2021 at 6:59 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, May 10, 2021 at 2:39 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> > >
> > > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
> > > > <gcc-patches@gcc.gnu.org> wrote:
> > > >>
> > > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >> >>
> > > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> > > >> >> <richard.sandiford@arm.com> wrote:
> > > >> >> >
> > > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> > > >> >> > > <richard.sandiford@arm.com> wrote:
> > > >> >> > >>
> > > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> > > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there
> > > >> >> > >> > are cases where associated hard registers will never be spilled onto
> > > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> > > >> >> > >> > so that stack realignment can be avoided when not needed.
> > > >> >> > >>
> > > >> >> > >> How is it guaranteed that they will never be spilled though?
> > > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,
> > > >> >> > >> except perhaps for the temporary pseudos that the RA creates to
> > > >> >> > >> replace (match_scratch …)es.
> > > >> >> > >>
> > > >> >> > >
> > > >> >> > > The caller of creating pseudo registers with specific alignment must
> > > >> >> > > guarantee that they will never be spilled.   I am only using it in
> > > >> >> > >
> > > >> >> > >   /* Make operand1 a register if it isn't already.  */
> > > >> >> > >   if (can_create_pseudo_p ()
> > > >> >> > >       && !register_operand (op0, mode)
> > > >> >> > >       && !register_operand (op1, mode))
> > > >> >> > >     {
> > > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing
> > > >> >> > >          operand1 into a pseudo register to copy data from one memory
> > > >> >> > >          location to another since it doesn't require a spill.  */
> > > >> >> > >       emit_move_insn (op0,
> > > >> >> > >                       force_reg (GET_MODE (op0), op1,
> > > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> > > >> >> > >       return;
> > > >> >> > >     }
> > > >> >> > >
> > > >> >> > > for vector moves.  RA shouldn't spill it.
> > > >> >> >
> > > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,
> > > >> >> > rather than having a guarantee that it won't.
> > > >> >> >
> > > >> >> > Even if the moves start out adjacent, they could be separated by later
> > > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> > > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> > > >> >> > Or if the same data is being copied to two locations, we might reuse
> > > >> >> > values loaded by the first copy for the second copy as well.
> > > >> >
> > > >> > There are cases where pseudo vector registers are created as pure
> > > >> > temporary registers in the backend and they shouldn't ever be spilled
> > > >> > to stack.   They will be spilled to stack only if there are other non-temporary
> > > >> > vector register usage in which case stack will be properly re-aligned.
> > > >> > Caller of creating pseudo registers with specific alignment guarantees
> > > >> > that they are used only as pure temporary registers.
> > > >>
> > > >> I don't think there's really a distinct category of pure temporary
> > > >> registers though.  The things I mentioned above can happen for any
> > > >> kind of pseudo register.
> > > >
> > > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
> > > > Do we generally handle those well?  That is, are they again subject
> > > > to be allocated by RA when no longer live?
> > >
> > > Yeah, using hard registers should work.  Of course, any given fixed choice
> > > of hard register has the potential to be suboptimal in some situation,
> > > but it should be safe.
> >
> > I tried hard registers.  The generated code isn't as good as pseudo registers.
> > But I want to avoid aligning the stack when YMM registers are only used to
> > inline memcpy/memset.  Any suggestions?
>
> I wonder if we can mark pseudos with a new reg flag, like 'nospill' and
> enforce this in LRA or ICE if we can't?  That said, we should be able
> to verify our assumption holds.  Now, we then of course need to avoid
> CSE re-using such pseudo in ways that could lead to spilling
> (not sure how that could happen, but ...).

Spill should be rare.  It is up to backends to decide if unaligned spill
should be used when spill does happen.

> Did you investigate closer what made the hardreg case generate worse
> code?  Can we hide the copies behind UNSPECs and split them late

I chose XMM7 for memcpy/memset.  With the hard register, only XMM7 is used
for memcpy, whereas with pseudo registers XMM0/XMM1/... are used.

> after reload?  Or is that too awkward to support when generating the
> sequence from the middle-end (I suppose it's not going via the optabs?)

That is correct.

> Richard.
>
> > Thanks.
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-05-10 14:11                       ` H.J. Lu
@ 2021-05-10 16:23                         ` Richard Sandiford
  2021-05-11  6:06                         ` Richard Biener
  1 sibling, 0 replies; 28+ messages in thread
From: Richard Sandiford @ 2021-05-10 16:23 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Richard Biener, Richard Biener via Gcc-patches

"H.J. Lu" <hjl.tools@gmail.com> writes:
> On Mon, May 10, 2021 at 6:59 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
>>
>> On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> >
>> > On Mon, May 10, 2021 at 2:39 AM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> > >
>> > > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> > > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
>> > > > <gcc-patches@gcc.gnu.org> wrote:
>> > > >>
>> > > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>> > > >> >>
>> > > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
>> > > >> >> <richard.sandiford@arm.com> wrote:
>> > > >> >> >
>> > > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
>> > > >> >> > > <richard.sandiford@arm.com> wrote:
>> > > >> >> > >>
>> > > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
>> > > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there
>> > > >> >> > >> > are cases where associated hard registers will never be spilled onto
>> > > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
>> > > >> >> > >> > so that stack realignment can be avoided when not needed.
>> > > >> >> > >>
>> > > >> >> > >> How is it guaranteed that they will never be spilled though?
>> > > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,
>> > > >> >> > >> except perhaps for the temporary pseudos that the RA creates to
>> > > >> >> > >> replace (match_scratch …)es.
>> > > >> >> > >>
>> > > >> >> > >
>> > > >> >> > > The caller of creating pseudo registers with specific alignment must
>> > > >> >> > > guarantee that they will never be spilled.   I am only using it in
>> > > >> >> > >
>> > > >> >> > >   /* Make operand1 a register if it isn't already.  */
>> > > >> >> > >   if (can_create_pseudo_p ()
>> > > >> >> > >       && !register_operand (op0, mode)
>> > > >> >> > >       && !register_operand (op1, mode))
>> > > >> >> > >     {
>> > > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing
>> > > >> >> > >          operand1 into a pseudo register to copy data from one memory
>> > > >> >> > >          location to another since it doesn't require a spill.  */
>> > > >> >> > >       emit_move_insn (op0,
>> > > >> >> > >                       force_reg (GET_MODE (op0), op1,
>> > > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
>> > > >> >> > >       return;
>> > > >> >> > >     }
>> > > >> >> > >
>> > > >> >> > > for vector moves.  RA shouldn't spill it.
>> > > >> >> >
>> > > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,
>> > > >> >> > rather than having a guarantee that it won't.
>> > > >> >> >
>> > > >> >> > Even if the moves start out adjacent, they could be separated by later
>> > > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
>> > > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
>> > > >> >> > Or if the same data is being copied to two locations, we might reuse
>> > > >> >> > values loaded by the first copy for the second copy as well.
>> > > >> >
>> > > >> > There are cases where pseudo vector registers are created as pure
>> > > >> > temporary registers in the backend and they shouldn't ever be spilled
>> > > >> > to stack.   They will be spilled to stack only if there are other non-temporary
>> > > >> > vector register usage in which case stack will be properly re-aligned.
>> > > >> > Caller of creating pseudo registers with specific alignment guarantees
>> > > >> > that they are used only as pure temporary registers.
>> > > >>
>> > > >> I don't think there's really a distinct category of pure temporary
>> > > >> registers though.  The things I mentioned above can happen for any
>> > > >> kind of pseudo register.
>> > > >
>> > > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
>> > > > Do we generally handle those well?  That is, are they again subject
>> > > > to be allocated by RA when no longer live?
>> > >
>> > > Yeah, using hard registers should work.  Of course, any given fixed choice
>> > > of hard register has the potential to be suboptimal in some situation,
>> > > but it should be safe.
>> >
>> > I tried hard registers.  The generated code isn't as good as pseudo registers.
>> > But I want to avoid aligning the stack when YMM registers are only used to
>> > inline memcpy/memset.  Any suggestions?
>>
>> I wonder if we can mark pseudos with a new reg flag, like 'nospill' and
>> enforce this in LRA or ICE if we can't?  That said, we should be able
>> to verify our assumption holds.  Now, we then of course need to avoid
>> CSE re-using such pseudo in ways that could lead to spilling
>> (not sure how that could happen, but ...).
>
> Spill should be rare.  It is up to backends to decide if unaligned spill
> should be used when spill does happen.
>
>> Did you investigate closer what made the hardreg case generate worse
>> code?  Can we hide the copies behind UNSPECs and split them late
>
> I chose XMM7 for memcpy/memset.   Only XMM7 is used for memcpy
> vs XMM0/XMM1/.....

Could you show the kind of code you'd like to generate with multiple
registers?  Also, why doesn't register renaming hide the difference?

One option might be to:

(a) have a pass that:
    - determines which pseudos P might force stack realignment
    - tries to do very simple early RA for P, replacing the pseudos
      with hard registers
    - bails out if it can't handle all P this way, or if something
      else forces stack realignment anyway

(b) only force stack realignment for pseudos after this pass has run

E.g. the pass could be restricted to pseudos that are never live
across block boundaries.

This might help in other situations, not just the memcpy one.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 02/12] Allow generating pseudo register with specific alignment
  2021-05-10 14:11                       ` H.J. Lu
  2021-05-10 16:23                         ` Richard Sandiford
@ 2021-05-11  6:06                         ` Richard Biener
  1 sibling, 0 replies; 28+ messages in thread
From: Richard Biener @ 2021-05-11  6:06 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Richard Biener via Gcc-patches, Richard Sandiford

On Mon, May 10, 2021 at 4:12 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 10, 2021 at 6:59 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, May 10, 2021 at 3:29 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, May 10, 2021 at 2:39 AM Richard Sandiford
> > > <richard.sandiford@arm.com> wrote:
> > > >
> > > > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > > > On Fri, Apr 30, 2021 at 8:30 PM Richard Sandiford via Gcc-patches
> > > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > >>
> > > > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > > >> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >> >>
> > > > >> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> > > > >> >> <richard.sandiford@arm.com> wrote:
> > > > >> >> >
> > > > >> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > > >> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> > > > >> >> > > <richard.sandiford@arm.com> wrote:
> > > > >> >> > >>
> > > > >> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > > >> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> > > > >> >> > >> > associated hard registers can be properly spilled onto stack.  But there
> > > > >> >> > >> > are cases where associated hard registers will never be spilled onto
> > > > >> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> > > > >> >> > >> > so that stack realignment can be avoided when not needed.
> > > > >> >> > >>
> > > > >> >> > >> How is it guaranteed that they will never be spilled though?
> > > > >> >> > >> I don't think that that guarantee exists for any kind of pseudo,
> > > > >> >> > >> except perhaps for the temporary pseudos that the RA creates to
> > > > >> >> > >> replace (match_scratch …)es.
> > > > >> >> > >>
> > > > >> >> > >
> > > > >> >> > > The caller of creating pseudo registers with specific alignment must
> > > > >> >> > > guarantee that they will never be spilled.   I am only using it in
> > > > >> >> > >
> > > > >> >> > >   /* Make operand1 a register if it isn't already.  */
> > > > >> >> > >   if (can_create_pseudo_p ()
> > > > >> >> > >       && !register_operand (op0, mode)
> > > > >> >> > >       && !register_operand (op1, mode))
> > > > >> >> > >     {
> > > > >> >> > >       /* NB: Don't increase stack alignment requirement when forcing
> > > > >> >> > >          operand1 into a pseudo register to copy data from one memory
> > > > >> >> > >          location to another since it doesn't require a spill.  */
> > > > >> >> > >       emit_move_insn (op0,
> > > > >> >> > >                       force_reg (GET_MODE (op0), op1,
> > > > >> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> > > > >> >> > >       return;
> > > > >> >> > >     }
> > > > >> >> > >
> > > > >> >> > > for vector moves.  RA shouldn't spill it.
> > > > >> >> >
> > > > >> >> > But this is the point: it's a case of hoping that the RA won't spill it,
> > > > >> >> > rather than having a guarantee that it won't.
> > > > >> >> >
> > > > >> >> > Even if the moves start out adjacent, they could be separated by later
> > > > >> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> > > > >> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> > > > >> >> > Or if the same data is being copied to two locations, we might reuse
> > > > >> >> > values loaded by the first copy for the second copy as well.
> > > > >> >
> > > > >> > There are cases where pseudo vector registers are created as pure
> > > > >> > temporary registers in the backend and they shouldn't ever be spilled
> > > > >> > to stack.   They will be spilled to stack only if there are other non-temporary
> > > > >> > vector register uses, in which case the stack will be properly re-aligned.
> > > > >> > Caller of creating pseudo registers with specific alignment guarantees
> > > > >> > that they are used only as pure temporary registers.
> > > > >>
> > > > >> I don't think there's really a distinct category of pure temporary
> > > > >> registers though.  The things I mentioned above can happen for any
> > > > >> kind of pseudo register.
> > > > >
> > > > > I wonder if for the cases HJ thinks of it is appropriate to use hardregs?
> > > > > Do we generally handle those well?  That is, are they again subject
> > > > > to be allocated by RA when no longer live?
> > > >
> > > > Yeah, using hard registers should work.  Of course, any given fixed choice
> > > > of hard register has the potential to be suboptimal in some situation,
> > > > but it should be safe.
> > >
> > > I tried hard registers.  The generated code isn't as good as pseudo registers.
> > > But I want to avoid aligning the stack when YMM registers are only used to
> > > inline memcpy/memset.  Any suggestions?
> >
> > I wonder if we can mark pseudos with a new reg flag, like 'nospill' and
> > enforce this in LRA or ICE if we can't?  That said, we should be able
> > to verify our assumption holds.  Now, we then of course need to avoid
> > CSE re-using such pseudo in ways that could lead to spilling
> > (not sure how that could happen, but ...).
>
> Spill should be rare.  It is up to backends to decide if unaligned spill
> should be used when spill does happen.

Can we transparently decide this somehow?  That is, when we didn't
do stack re-alignment, force unaligned spills?

> > Did you investigate closer what made the hardreg case generate worse
> > code?  Can we hide the copies behind UNSPECs and split them late
>
> I chose XMM7 for memcpy/memset.   Only XMM7 is used for memcpy
> vs XMM0/XMM1/.....
>
> > after reload?  Or is that too awkward to support when generating the
> > sequence from the middle-end (I suppose it's not going via the optabs?)
>
> That is correct.
>
> > Richard.
> >
> > > Thanks.
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-05-11  6:06 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
2021-04-29 12:54 ` [PATCH 01/12] Update alignment_for_piecewise_move H.J. Lu
2021-04-30  8:59   ` Richard Sandiford
2021-04-29 12:54 ` [PATCH 02/12] Allow generating pseudo register with specific alignment H.J. Lu
2021-04-30  9:06   ` Richard Sandiford
2021-04-30 12:06     ` H.J. Lu
2021-04-30 12:42       ` Richard Sandiford
2021-04-30 12:49         ` H.J. Lu
2021-04-30 13:34           ` H.J. Lu
2021-04-30 15:56             ` Richard Sandiford
2021-04-30 17:33               ` H.J. Lu
2021-05-03  8:18               ` Richard Biener
2021-05-10  9:39                 ` Richard Sandiford
2021-05-10 13:29                   ` H.J. Lu
2021-05-10 13:59                     ` Richard Biener
2021-05-10 14:11                       ` H.J. Lu
2021-05-10 16:23                         ` Richard Sandiford
2021-05-11  6:06                         ` Richard Biener
2021-04-29 12:54 ` [PATCH 03/12] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
2021-04-29 12:54 ` [PATCH 04/12] x86: Avoid stack realignment when copying data H.J. Lu
2021-04-29 12:54 ` [PATCH 05/12] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
2021-04-29 12:54 ` [PATCH 06/12] x86: Update piecewise move and store H.J. Lu
2021-04-29 12:54 ` [PATCH 07/12] x86: Add AVX2 tests for PR middle-end/90773 H.J. Lu
2021-04-29 12:54 ` [PATCH 08/12] x86: Add tests for piecewise move and store H.J. Lu
2021-04-29 12:54 ` [PATCH 09/12] x86: Also pass -mno-avx to pr72839.c H.J. Lu
2021-04-29 12:54 ` [PATCH 10/12] x86: Also pass -mno-avx to cold-attribute-1.c H.J. Lu
2021-04-29 12:54 ` [PATCH 11/12] x86: Also pass -mno-avx to sw-1.c for ia32 H.J. Lu
2021-04-29 12:54 ` [PATCH 12/12] x86: Update gcc.target/i386/incoming-11.c H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).