public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations
@ 2021-05-11 23:35 H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
                   ` (10 more replies)
  0 siblings, 11 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

1. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.
2. x86: Avoid stack realignment when copying data
3. x86: Remove MAX_BITSIZE_MODE_ANY_INT.  Only x86 backend defines it.
4. x86: Use TImode/OImode/XImode integers for piecewise move and store.
5. x86: Add tests for TImode/OImode/XImode for piecewise move and store.
6. x86: Adjust existing tests.

On x86-64, SPEC CPU 2017 performance impact is neutral.  Glibc code size
differences with -O2 build are:

             Before         After
libc.so     1906572        1906444

Some code sequence differences in libc.so are:

<svcudp_bufcreate@GLIBC_2.2.5>:
	...
	jne    <svcudp_bufcreate@GLIBC_2.2.5+0x318>	      |		jne    <svcudp_bufcreate@GLIBC_2.2.5+0x2a8>
	test   %r15,%r15						test   %r15,%r15
	je     <svcudp_bufcreate@GLIBC_2.2.5+0x318>	      |		je     <svcudp_bufcreate@GLIBC_2.2.5+0x2a8>
	mov    %r13d,(%r14)						mov    %r13d,(%r14)
	lea    0x10(%r14),%rdi						lea    0x10(%r14),%rdi
	mov    $0x1,%ecx						mov    $0x1,%ecx
	mov    %r13d,%edx						mov    %r13d,%edx
	mov    %r15,0x40(%r12)						mov    %r15,0x40(%r12)
	mov    %r15,%rsi						mov    %r15,%rsi
	call   <xdrmem_create@GLIBC_2.2.5>				call   <xdrmem_create@GLIBC_2.2.5>
	lea    0xa2f9b(%rip),%rax        # <svcudp_op>	      |		lea    0xa2fab(%rip),%rax        # <svcudp_op>
	xor    %esi,%esi						xor    %esi,%esi
	mov    %ebp,%edi						mov    %ebp,%edi
	mov    %rax,0x8(%r12)						mov    %rax,0x8(%r12)
	movzwl 0x12(%rsp),%eax						movzwl 0x12(%rsp),%eax
	mov    $0x8,%edx				      <
	lea    0xc(%rsp),%rcx						lea    0xc(%rsp),%rcx
	mov    %r14,0x48(%r12)				      <
	add    $0x40,%r14				      <
	mov    $0x4,%r8d						mov    $0x4,%r8d
							      >		movq   $0x0,0x1d0(%r14)
							      >		mov    $0x8,%edx
	rol    $0x8,%ax							rol    $0x8,%ax
	mov    %ebp,(%r12)				      |		mov    %r14,0x48(%r12)
	movq   $0x0,0x190(%r14)				      |		add    $0x40,%r14
	mov    %ax,0x4(%r12)				      <
	mov    %r14,0x30(%r12)						mov    %r14,0x30(%r12)
							      >		mov    %ax,0x4(%r12)
							      >		mov    %ebp,(%r12)
	movl   $0x1,0xc(%rsp)						movl   $0x1,0xc(%rsp)
	call   <setsockopt>						call   <setsockopt>
	mov    %r12,%rdi						mov    %r12,%rdi
	movabs $0x101010101010101,%rdx			      <
	test   %eax,%eax						test   %eax,%eax
	mov    $0xff,%eax						mov    $0xff,%eax
	cmove  %eax,%ebx						cmove  %eax,%ebx
	movzbl %bl,%eax					      |		movd   %ebx,%xmm0
	mov    %ebx,0xc(%rsp)						mov    %ebx,0xc(%rsp)
	mov    %rax,%rsi				      |		punpcklbw %xmm0,%xmm0
	imul   %rdx,%rsi				      |		punpcklwd %xmm0,%xmm0
	mul    %rdx					      |		pshufd $0x0,%xmm0,%xmm0
	add    %rsi,%rdx				      |		movups %xmm0,0x50(%r12)
	mov    %rax,0x50(%r12)				      |		movups %xmm0,0x60(%r12)
	mov    %rdx,0x58(%r12)				      |		movups %xmm0,0x70(%r12)
	mov    %rax,0x60(%r12)				      |		movups %xmm0,0x80(%r12)
	mov    %rdx,0x68(%r12)				      |		movups %xmm0,0x90(%r12)
	mov    %rax,0x70(%r12)				      |		movups %xmm0,0xa0(%r12)
	mov    %rdx,0x78(%r12)				      |		movups %xmm0,0xb0(%r12)
	mov    %rax,0x80(%r12)				      |		movups %xmm0,0xc0(%r12)
	mov    %rdx,0x88(%r12)				      |		movups %xmm0,0xd0(%r12)
	mov    %rax,0x90(%r12)				      |		movups %xmm0,0xe0(%r12)
	mov    %rdx,0x98(%r12)				      |		movups %xmm0,0xf0(%r12)
	mov    %rax,0xa0(%r12)				      |		movups %xmm0,0x100(%r12)
	mov    %rdx,0xa8(%r12)				      |		movups %xmm0,0x110(%r12)
	mov    %rax,0xb0(%r12)				      |		movups %xmm0,0x120(%r12)
	mov    %rdx,0xb8(%r12)				      |		movups %xmm0,0x130(%r12)
	mov    %rax,0xc0(%r12)				      |		movups %xmm0,0x140(%r12)
	mov    %rdx,0xc8(%r12)				      <
	mov    %rax,0xd0(%r12)				      <
	mov    %rdx,0xd8(%r12)				      <
	mov    %rax,0xe0(%r12)				      <
	mov    %rdx,0xe8(%r12)				      <
	mov    %rax,0xf0(%r12)				      <
	mov    %rdx,0xf8(%r12)				      <
	mov    %rax,0x100(%r12)				      <
	mov    %rdx,0x108(%r12)				      <
	mov    %rax,0x110(%r12)				      <
	mov    %rdx,0x118(%r12)				      <
	mov    %rax,0x120(%r12)				      <
	mov    %rdx,0x128(%r12)				      <
	mov    %rax,0x130(%r12)				      <
	mov    %rdx,0x138(%r12)				      <
	mov    %rax,0x140(%r12)				      <
	mov    %rdx,0x148(%r12)				      <
	call   <xprt_register@GLIBC_2.2.5>				call   <xprt_register@GLIBC_2.2.5>
	add    $0x28,%rsp						add    $0x28,%rsp
	mov    %r12,%rax						mov    %r12,%rax
	pop    %rbx							pop    %rbx
	pop    %rbp							pop    %rbp
	pop    %r12							pop    %r12
	pop    %r13							pop    %r13
	pop    %r14							pop    %r14
	pop    %r15							pop    %r15
	ret    								ret    

H.J. Lu (11):
  Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  x86: Avoid stack realignment when copying data
  Remove MAX_BITSIZE_MODE_ANY_INT
  x86: Update piecewise move and store
  x86: Add AVX2 tests for PR middle-end/90773
  x86: Add tests for piecewise move and store
  x86: Also pass -mno-avx to pr72839.c
  x86: Also pass -mno-avx to cold-attribute-1.c
  x86: Also pass -mno-avx to sw-1.c for ia32
  x86: Update gcc.target/i386/incoming-11.c
  constructor: Check if it is faster to load constant from memory

 gcc/builtins.c                                |  47 +--
 gcc/config/i386/i386-expand.c                 |  18 +-
 gcc/config/i386/i386-modes.def                |  15 +-
 gcc/config/i386/i386-protos.h                 |   5 +
 gcc/config/i386/i386.c                        | 289 +++++++++++++++++-
 gcc/config/i386/i386.h                        |  35 ++-
 gcc/doc/tm.texi                               |  16 +
 gcc/doc/tm.texi.in                            |   4 +
 gcc/expr.c                                    |  11 +-
 gcc/target.def                                |  20 ++
 gcc/targhooks.c                               |  56 ++++
 gcc/targhooks.h                               |   4 +
 .../gcc.target/i386/cold-attribute-1.c        |   2 +-
 gcc/testsuite/gcc.target/i386/eh_return-1.c   |  26 ++
 gcc/testsuite/gcc.target/i386/incoming-11.c   |   2 +-
 .../gcc.target/i386/pieces-memcpy-10.c        |  16 +
 .../gcc.target/i386/pieces-memcpy-11.c        |  17 ++
 .../gcc.target/i386/pieces-memcpy-12.c        |  16 +
 .../gcc.target/i386/pieces-memcpy-13.c        |  16 +
 .../gcc.target/i386/pieces-memcpy-14.c        |  17 ++
 .../gcc.target/i386/pieces-memcpy-15.c        |  16 +
 .../gcc.target/i386/pieces-memcpy-16.c        |  16 +
 .../gcc.target/i386/pieces-memcpy-7.c         |  15 +
 .../gcc.target/i386/pieces-memcpy-8.c         |  14 +
 .../gcc.target/i386/pieces-memcpy-9.c         |  14 +
 .../gcc.target/i386/pieces-memset-1.c         |  16 +
 .../gcc.target/i386/pieces-memset-10.c        |  16 +
 .../gcc.target/i386/pieces-memset-11.c        |  16 +
 .../gcc.target/i386/pieces-memset-12.c        |  16 +
 .../gcc.target/i386/pieces-memset-13.c        |  16 +
 .../gcc.target/i386/pieces-memset-14.c        |  16 +
 .../gcc.target/i386/pieces-memset-15.c        |  16 +
 .../gcc.target/i386/pieces-memset-16.c        |  16 +
 .../gcc.target/i386/pieces-memset-17.c        |  16 +
 .../gcc.target/i386/pieces-memset-18.c        |  16 +
 .../gcc.target/i386/pieces-memset-19.c        |  17 ++
 .../gcc.target/i386/pieces-memset-2.c         |  12 +
 .../gcc.target/i386/pieces-memset-20.c        |  17 ++
 .../gcc.target/i386/pieces-memset-21.c        |  17 ++
 .../gcc.target/i386/pieces-memset-22.c        |  17 ++
 .../gcc.target/i386/pieces-memset-23.c        |  17 ++
 .../gcc.target/i386/pieces-memset-24.c        |  17 ++
 .../gcc.target/i386/pieces-memset-25.c        |  17 ++
 .../gcc.target/i386/pieces-memset-26.c        |  17 ++
 .../gcc.target/i386/pieces-memset-27.c        |  17 ++
 .../gcc.target/i386/pieces-memset-28.c        |  17 ++
 .../gcc.target/i386/pieces-memset-29.c        |  17 ++
 .../gcc.target/i386/pieces-memset-3.c         |  18 ++
 .../gcc.target/i386/pieces-memset-30.c        |  17 ++
 .../gcc.target/i386/pieces-memset-31.c        |  17 ++
 .../gcc.target/i386/pieces-memset-32.c        |  17 ++
 .../gcc.target/i386/pieces-memset-33.c        |  17 ++
 .../gcc.target/i386/pieces-memset-34.c        |  17 ++
 .../gcc.target/i386/pieces-memset-35.c        |  17 ++
 .../gcc.target/i386/pieces-memset-36.c        |  17 ++
 .../gcc.target/i386/pieces-memset-37.c        |  15 +
 .../gcc.target/i386/pieces-memset-38.c        |  17 ++
 .../gcc.target/i386/pieces-memset-39.c        |  16 +
 .../gcc.target/i386/pieces-memset-4.c         |  16 +
 .../gcc.target/i386/pieces-memset-40.c        |  17 ++
 .../gcc.target/i386/pieces-memset-41.c        |  16 +
 .../gcc.target/i386/pieces-memset-42.c        |  17 ++
 .../gcc.target/i386/pieces-memset-43.c        |  17 ++
 .../gcc.target/i386/pieces-memset-5.c         |  12 +
 .../gcc.target/i386/pieces-memset-6.c         |  16 +
 .../gcc.target/i386/pieces-memset-7.c         |  16 +
 .../gcc.target/i386/pieces-memset-8.c         |  16 +
 .../gcc.target/i386/pieces-memset-9.c         |  16 +
 gcc/testsuite/gcc.target/i386/pr72839.c       |   2 +-
 gcc/testsuite/gcc.target/i386/pr90773-1.c     |  10 +-
 gcc/testsuite/gcc.target/i386/pr90773-14.c    |   2 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-16.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-17.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-18.c    |  15 +
 gcc/testsuite/gcc.target/i386/pr90773-19.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-20.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-21.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-22.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-23.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-24.c    |  22 ++
 gcc/testsuite/gcc.target/i386/pr90773-25.c    |  20 ++
 gcc/testsuite/gcc.target/i386/pr90773-4.c     |   2 +-
 gcc/testsuite/gcc.target/i386/sw-1.c          |   1 +
 84 files changed, 1509 insertions(+), 83 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/eh_return-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-25.c

-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-30 18:49   ` Jeff Law
  2021-05-11 23:35 ` [PATCH v2 02/11] x86: Avoid stack realignment when copying data H.J. Lu
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.  Define SCRATCH_SSE_REG as a scratch register for
ix86_gen_memset_value.

gcc/

	PR middle-end/90773
	* builtins.c (builtin_memset_read_str): Call
	targetm.read_memset_value.
	(builtin_memset_gen_str): Call targetm.gen_memset_value.
	* target.def (read_memset_value): New hook.
	(gen_memset_value): Likewise.
	* targhooks.c: Include "builtins.h".
	(default_read_memset_value): New function.
	(default_gen_memset_value): Likewise.
	* targhooks.h (default_read_memset_value): New prototype.
	(default_gen_memset_value): Likewise.
	* config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
	Make it global.
	* config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
	New.
	(ix86_expand_vector_init_duplicate): Likewise.
	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
	an argument to ignore stack_alignment_estimated.  It is passed
	as false by default.
	(ix86_gen_memset_value_from_prev): New function.
	(ix86_gen_memset_value): Likewise.
	(ix86_read_memset_value): Likewise.
	(TARGET_GEN_MEMSET_VALUE): New.
	(TARGET_READ_MEMSET_VALUE): Likewise.
	* config/i386/i386.h (SCRATCH_SSE_REG): New.
	* doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
	TARGET_GEN_MEMSET_VALUE hooks.
	* doc/tm.texi: Regenerated.

gcc/testsuite/

	PR middle-end/90773
	* gcc.target/i386/pr90773-15.c: New test.
	* gcc.target/i386/pr90773-16.c: Likewise.
	* gcc.target/i386/pr90773-17.c: Likewise.
	* gcc.target/i386/pr90773-18.c: Likewise.
	* gcc.target/i386/pr90773-19.c: Likewise.
---
 gcc/builtins.c                             |  47 +---
 gcc/config/i386/i386-expand.c              |   2 +-
 gcc/config/i386/i386-protos.h              |   5 +
 gcc/config/i386/i386.c                     | 268 ++++++++++++++++++++-
 gcc/config/i386/i386.h                     |   4 +
 gcc/doc/tm.texi                            |  16 ++
 gcc/doc/tm.texi.in                         |   4 +
 gcc/expr.c                                 |   1 -
 gcc/target.def                             |  20 ++
 gcc/targhooks.c                            |  56 +++++
 gcc/targhooks.h                            |   4 +
 gcc/testsuite/gcc.target/i386/pr90773-15.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-16.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-17.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr90773-18.c |  15 ++
 gcc/testsuite/gcc.target/i386/pr90773-19.c |  14 ++
 16 files changed, 449 insertions(+), 49 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 2f0efae11e8..6951f2d3633 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -6584,24 +6584,11 @@ expand_builtin_strncpy (tree exp, rtx target)
    previous iteration.  */
 
 rtx
-builtin_memset_read_str (void *data, void *prevp,
+builtin_memset_read_str (void *data, void *prev,
 			 HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
 			 scalar_int_mode mode)
 {
-  by_pieces_prev *prev = (by_pieces_prev *) prevp;
-  if (prev != nullptr && prev->data != nullptr)
-    {
-      /* Use the previous data in the same mode.  */
-      if (prev->mode == mode)
-	return prev->data;
-    }
-
-  const char *c = (const char *) data;
-  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
-
-  memset (p, *c, GET_MODE_SIZE (mode));
-
-  return c_readstr (p, mode);
+  return targetm.read_memset_value ((const char *) data, prev, mode);
 }
 
 /* Callback routine for store_by_pieces.  Return the RTL of a register
@@ -6611,37 +6598,11 @@ builtin_memset_read_str (void *data, void *prevp,
    nullptr, it has the RTL info from the previous iteration.  */
 
 static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
 			HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
 			scalar_int_mode mode)
 {
-  rtx target, coeff;
-  size_t size;
-  char *p;
-
-  by_pieces_prev *prev = (by_pieces_prev *) prevp;
-  if (prev != nullptr && prev->data != nullptr)
-    {
-      /* Use the previous data in the same mode.  */
-      if (prev->mode == mode)
-	return prev->data;
-
-      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
-      if (target != nullptr)
-	return target;
-    }
-
-  size = GET_MODE_SIZE (mode);
-  if (size == 1)
-    return (rtx) data;
-
-  p = XALLOCAVEC (char, size);
-  memset (p, 1, size);
-  coeff = c_readstr (p, mode);
-
-  target = convert_to_mode (mode, (rtx) data, 1);
-  target = expand_mult (mode, target, coeff, NULL_RTX, 1);
-  return force_reg (mode, target);
+  return targetm.gen_memset_value ((rtx) data, prev, mode);
 }
 
 /* Expand expression EXP, which is a call to the memset builtin.  Return
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 5cfde5b3d30..7f1dff6337c 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13640,7 +13640,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
    with all elements equal to VAR.  Return true if successful.  */
 
-static bool
+bool
 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 				   rtx target, rtx val)
 {
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7782cf1163f..c4896c2da74 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -50,6 +50,9 @@ extern void ix86_reset_previous_fndecl (void);
 
 extern bool ix86_using_red_zone (void);
 
+extern unsigned int ix86_minimum_incoming_stack_boundary (bool,
+							  bool = false);
+
 extern unsigned int ix86_regmode_natural_size (machine_mode);
 #ifdef RTX_CODE
 extern int standard_80387_constant_p (rtx);
@@ -257,6 +260,8 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
+extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
+					       rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 915f89f571a..f9cbc1d10eb 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -415,7 +415,6 @@ static unsigned int split_stack_prologue_scratch_regno (void);
 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
 
 static bool ix86_can_inline_p (tree, tree);
-static unsigned int ix86_minimum_incoming_stack_boundary (bool);
 
 \f
 /* Whether -mtune= or -march= were specified */
@@ -7232,8 +7231,9 @@ find_drap_reg (void)
 
 /* Return minimum incoming stack alignment.  */
 
-static unsigned int
-ix86_minimum_incoming_stack_boundary (bool sibcall)
+unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall,
+				      bool ignore_estimated)
 {
   unsigned int incoming_stack_boundary;
 
@@ -7248,7 +7248,8 @@ ix86_minimum_incoming_stack_boundary (bool sibcall)
      estimated stack alignment is 128bit.  */
   else if (!sibcall
 	   && ix86_force_align_arg_pointer
-	   && crtl->stack_alignment_estimated == 128)
+	   && (ignore_estimated
+	       || crtl->stack_alignment_estimated == 128))
     incoming_stack_boundary = MIN_STACK_BOUNDARY;
   else
     incoming_stack_boundary = ix86_default_incoming_stack_boundary;
@@ -23031,6 +23032,259 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
     }
 }
 
+/* Return the RTL for memset in MODE from PREV.  */
+
+static rtx
+ix86_gen_memset_value_from_prev (by_pieces_prev *prevp,
+				 scalar_int_mode mode)
+{
+  rtx prev = prevp->data;
+
+  /* Use the previous data in the same mode.  */
+  if (prevp->mode == mode)
+    return prev;
+
+  machine_mode prev_mode = prevp->mode;
+  size_t size = GET_MODE_SIZE (prev_mode);
+
+  /* NB: Skip if the previous value is 1 byte or less.  CONST_WIDE_INT
+     is in VOIDmode whose size is 0.  */
+  if (size <= 1)
+    return nullptr;
+
+  rtx reg, reg_ti;
+  switch (size)
+    {
+    default:
+      gcc_unreachable ();
+
+    case 2:
+    case 4:
+      return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+    case 8:
+      /* In 64-bit mode, use SUBREG since word size is 8 bytes.  */
+      if (TARGET_64BIT)
+	return simplify_gen_subreg (mode, prev, prev_mode, 0);
+
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	case 4:
+do_hi_si_mode:
+	  /* In 32-bit mode, extract the value from an 8-byte
+	     register into an integer register first.  */
+	  reg = gen_reg_rtx (SImode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (SImode, prev,
+					       prev_mode, 0));
+	  return simplify_gen_subreg (mode, reg, SImode, 0);
+	}
+      break;
+
+    case 16:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	case 4:
+	  /* Extract the value from a 16-byte vector register into
+	     an integer register first.  */
+	  goto do_hi_si_mode;
+	case 8:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 16:
+	  return prev;
+	}
+      break;
+
+    case 32:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+do_himode:
+	  /* Extract the value from a 32-byte vector register into
+	     a 16-byte vector register first.  */
+	  reg_ti = gen_reg_rtx (TImode);
+	  emit_move_insn (reg_ti,
+			  simplify_gen_subreg (TImode, prev,
+					       prev_mode, 0));
+	  /* Then extract the value from a 16-byte vector register
+	     into an integer register.  */
+	  reg = gen_reg_rtx (SImode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (SImode, reg_ti,
+					       TImode, 0));
+	  return simplify_gen_subreg (mode, reg, SImode, 0);
+
+	case 4:
+	case 8:
+do_si_di_mode:
+	  /* Extract the value from a 32-byte vector register into
+	     a 16-byte vector register first.  */
+	  reg_ti = gen_reg_rtx (TImode);
+	  emit_move_insn (reg_ti,
+			  simplify_gen_subreg (TImode, prev,
+					       prev_mode, 0));
+	  /* Generate 4/8-byte SSE -> INT move instruction.  */
+	  reg = gen_reg_rtx (mode);
+	  emit_move_insn (reg,
+			  simplify_gen_subreg (mode, reg_ti,
+					       TImode, 0));
+	  return reg;
+	case 16:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 32:
+	  return prev;
+	}
+
+    case 64:
+      switch (GET_MODE_SIZE (mode))
+	{
+	default:
+	  gcc_unreachable ();
+	case 2:
+	  /* Extract the value from a 64-byte vector register into
+	     a 16-byte vector register first.  */
+	  goto do_himode;
+	case 4:
+	case 8:
+	  /* Extract the value from a 64-byte vector register into
+	     a 16-byte vector register first.  */
+	  goto do_si_di_mode;
+	case 16:
+	case 32:
+	  return simplify_gen_subreg (mode, prev, prev_mode, 0);
+	case 64:
+	  return prev;
+	}
+    }
+
+  return nullptr;
+}
+
+/* Implement the TARGET_GEN_MEMSET_VALUE hook.  */
+
+static rtx
+ix86_gen_memset_value (rtx data, void *prevp, scalar_int_mode mode)
+{
+  /* Don't use the previous value if size is 1.  */
+  if (GET_MODE_SIZE (mode) == 1)
+    return data;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      rtx value = ix86_gen_memset_value_from_prev (prev, mode);
+      if (value)
+	return value;
+    }
+
+  /* Use default_gen_memset_value if vector store won't be used.  */
+  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode))
+    return default_gen_memset_value (data, prevp, mode);
+
+  rtx one, target;
+  scalar_mode one_mode;
+
+  unsigned int incoming_stack_boundary
+    = ix86_minimum_incoming_stack_boundary (false, true);
+
+  switch (GET_MODE_SIZE (mode))
+    {
+    default:
+      gcc_unreachable ();
+
+    case 64:
+      if (!TARGET_AVX512BW)
+	{
+	  rtx tmp;
+	  /* NB: Don't increase stack alignment requirement by using a
+	     scratch SSE register.  */
+	  if (GET_MODE_ALIGNMENT (V32QImode) > incoming_stack_boundary)
+	    tmp = gen_rtx_REG (V32QImode, SCRATCH_SSE_REG);
+	  else
+	    tmp = gen_reg_rtx (V32QImode);
+	  if (!ix86_expand_vector_init_duplicate (false, V32QImode,
+						  tmp, data))
+	    gcc_unreachable ();
+	  target = gen_rtx_VEC_CONCAT (V64QImode, tmp, tmp);
+	  if (REGNO (tmp) == SCRATCH_SSE_REG)
+	    {
+	      tmp = gen_rtx_REG (V64QImode, SCRATCH_SSE_REG);
+	      emit_move_insn (tmp, target);
+	      return gen_rtx_REG (mode, SCRATCH_SSE_REG);
+	    }
+	  else
+	    return convert_to_mode (mode, target, 1);
+	}
+      /* FALLTHRU */
+    case 16:
+    case 32:
+      one_mode = QImode;
+      one = data;
+      break;
+    }
+
+  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (one_mode);
+  machine_mode vector_mode;
+  if (!mode_for_vector (one_mode, nunits).exists (&vector_mode))
+    gcc_unreachable ();
+
+  /* NB: Don't increase stack alignment requirement by using a scratch
+     SSE register.  */
+  if (GET_MODE_ALIGNMENT (vector_mode) > incoming_stack_boundary)
+    target = gen_rtx_REG (vector_mode, SCRATCH_SSE_REG);
+  else
+    target = gen_reg_rtx (vector_mode);
+  if (!ix86_expand_vector_init_duplicate (false, vector_mode, target,
+					  one))
+    gcc_unreachable ();
+
+  if (REGNO (target) == SCRATCH_SSE_REG)
+    return gen_rtx_REG (mode, SCRATCH_SSE_REG);
+  else
+    return convert_to_mode (mode, target, 1);
+}
+
+/* Implement the TARGET_READ_MEMSET_VALUE hook.  */
+
+static rtx
+ix86_read_memset_value (const char *str, void *prevp,
+			scalar_int_mode mode)
+{
+  rtx value;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      /* Don't use the previous value if size is 1.  */
+      if (GET_MODE_SIZE (mode) == 1)
+	return default_read_memset_value (str, nullptr, mode);
+
+      value = ix86_gen_memset_value_from_prev (prev, mode);
+      if (value)
+	return value;
+
+      return default_read_memset_value (str, nullptr, mode);
+    }
+
+  /* Use default_read_memset_value if vector store can't be used.
+     NB: Need AVX2 for fast vector duplication and gen_reg_rtx.  */
+  if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (DImode)
+      || !TARGET_AVX2
+      || !reg_rtx_no)
+   return default_read_memset_value (str, nullptr, mode);
+
+  value = default_read_memset_value (str, nullptr, QImode);
+  return ix86_gen_memset_value (value, nullptr, mode);
+}
+
 /* Address space support.
 
    This is not "far pointers" in the 16-bit sense, but an easy way
@@ -23932,6 +24186,12 @@ static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
 #undef TARGET_LIBC_HAS_FAST_FUNCTION
 #define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function
 
+#undef TARGET_GEN_MEMSET_VALUE
+#define TARGET_GEN_MEMSET_VALUE ix86_gen_memset_value
+
+#undef TARGET_READ_MEMSET_VALUE
+#define TARGET_READ_MEMSET_VALUE ix86_read_memset_value
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 97d6f3863cb..45d86802c51 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1131,6 +1131,10 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 #define FIRST_MASK_REG  MASK0_REG
 #define LAST_MASK_REG   MASK7_REG
 
+/* A scratch vector reg.  */
+#define SCRATCH_SSE_REG \
+  (TARGET_64BIT ? LAST_REX_SSE_REG : LAST_SSE_REG)
+
 /* Override this in other tm.h files to cope with various OS lossage
    requiring a frame pointer.  */
 #ifndef SUBTARGET_FRAME_POINTER_REQUIRED
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 85ea9395560..51385044e76 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11868,6 +11868,22 @@ This function prepares to emit a conditional comparison within a sequence
  @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_READ_MEMSET_VALUE (const char *@var{c}, void *@var{prev}, scalar_int_mode @var{mode})
+This function returns the RTL of a constant integer corresponding to
+target reading @code{GET_MODE_SIZE (@var{mode})} bytes from the string
+constant @var{c}.  If @var{prev} is not @samp{nullptr}, it contains
+the RTL information from the previous iteration.
+@end deftypefn
+
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_VALUE (rtx @var{data}, void *@var{prev}, scalar_int_mode @var{mode})
+This function returns the RTL of a register containing
+@code{GET_MODE_SIZE (@var{mode})} consecutive copies of the unsigned
+char value given in the RTL register @var{data}.  For example, if
+@var{mode} is 4 bytes wide, return the RTL for 0x01010101*@var{data}.
+If @var{prev} is not @samp{nullptr}, it is the RTL information from
+the previous iteration.
+@end deftypefn
+
 @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
 This target hook returns a new value for the number of times @var{loop}
 should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index d8e3de14af1..8d4c3949fbf 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7956,6 +7956,10 @@ lists.
 
 @hook TARGET_GEN_CCMP_NEXT
 
+@hook TARGET_READ_MEMSET_VALUE
+
+@hook TARGET_GEN_MEMSET_VALUE
+
 @hook TARGET_LOOP_UNROLL_ADJUST
 
 @defmac POWI_MAX_MULTS
diff --git a/gcc/expr.c b/gcc/expr.c
index 1b65f6b3245..42ef5bdf5d5 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1169,7 +1169,6 @@ op_by_pieces_d::run ()
   /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */
   scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
   mode = get_usable_mode (mode, m_len);
-
   by_pieces_prev to_prev = { nullptr, mode };
   by_pieces_prev from_prev = { nullptr, mode };
 
diff --git a/gcc/target.def b/gcc/target.def
index bbaf6b4f3a0..c9aca40fa88 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2694,6 +2694,26 @@ DEFHOOK
  rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
  NULL)
 
+DEFHOOK
+(read_memset_value,
+ "This function returns the RTL of a constant integer corresponding to\n\
+target reading @code{GET_MODE_SIZE (@var{mode})} bytes from the string\n\
+constant @var{c}.  If @var{prev} is not @samp{nullptr}, it contains\n\
+the RTL information from the previous iteration.",
+ rtx, (const char *c, void *prev, scalar_int_mode mode),
+ default_read_memset_value)
+
+DEFHOOK
+(gen_memset_value,
+ "This function returns the RTL of a register containing\n\
+@code{GET_MODE_SIZE (@var{mode})} consecutive copies of the unsigned\n\
+char value given in the RTL register @var{data}.  For example, if\n\
+@var{mode} is 4 bytes wide, return the RTL for 0x01010101*@var{data}.\n\
+If @var{prev} is not @samp{nullptr}, it is the RTL information from\n\
+the previous iteration.",
+ rtx, (rtx data, void *prev, scalar_int_mode mode),
+ default_gen_memset_value)
+
 /* Return a new value for loop unroll size.  */
 DEFHOOK
 (loop_unroll_adjust,
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 2e0fdb797e0..287907c72d7 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -90,6 +90,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "attribs.h"
 #include "asan.h"
 #include "emit-rtl.h"
+#include "builtins.h"
 
 bool
 default_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED,
@@ -2548,4 +2549,59 @@ default_memtag_untagged_pointer (rtx tagged_pointer, rtx target)
   return untagged_base;
 }
 
+/* Default implementation of TARGET_READ_MEMSET_VALUE.  */
+
+rtx
+default_read_memset_value (const char *c, void *prevp,
+			   scalar_int_mode mode)
+{
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      /* Use the previous data in the same mode.  */
+      if (prev->mode == mode)
+	return prev->data;
+    }
+
+  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+
+  memset (p, *c, GET_MODE_SIZE (mode));
+
+  return c_readstr (p, mode);
+}
+
+/* Default implementation of TARGET_GEN_MEMSET_VALUE.  */
+
+rtx
+default_gen_memset_value (rtx data, void *prevp, scalar_int_mode mode)
+{
+  rtx target, coeff;
+  size_t size;
+  char *p;
+
+  by_pieces_prev *prev = (by_pieces_prev *) prevp;
+  if (prev != nullptr && prev->data != nullptr)
+    {
+      /* Use the previous data in the same mode.  */
+      if (prev->mode == mode)
+	return prev->data;
+
+      target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
+      if (target != nullptr)
+	return target;
+    }
+
+  size = GET_MODE_SIZE (mode);
+  if (size == 1)
+    return data;
+
+  p = XALLOCAVEC (char, size);
+  memset (p, 1, size);
+  coeff = c_readstr (p, mode);
+
+  target = convert_to_mode (mode, data, 1);
+  target = expand_mult (mode, target, coeff, NULL_RTX, 1);
+  return force_reg (mode, target);
+}
+
 #include "gt-targhooks.h"
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index b537038c0aa..3c00927e196 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -300,4 +300,8 @@ extern rtx default_memtag_set_tag (rtx, rtx, rtx);
 extern rtx default_memtag_extract_tag (rtx, rtx);
 extern rtx default_memtag_untagged_pointer (rtx, rtx);
 
+extern rtx default_read_memset_value (const char *, void *,
+				      scalar_int_mode);
+extern rtx default_gen_memset_value (rtx, void *, scalar_int_mode);
+
 #endif /* GCC_TARGHOOKS_H */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
new file mode 100644
index 00000000000..c0a96fed892
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+  __builtin_memset (dst, c, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
new file mode 100644
index 00000000000..d2d1ec6141c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$-1, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
new file mode 100644
index 00000000000..6c8da7d24ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 19);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovd\[\\t \]+%xmm\[0-9\]+, 15\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-18.c b/gcc/testsuite/gcc.target/i386/pr90773-18.c
new file mode 100644
index 00000000000..b0687abbe01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-18.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake-avx512" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+\\\$12, 8\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-19.c b/gcc/testsuite/gcc.target/i386/pr90773-19.c
new file mode 100644
index 00000000000..8aa5540bacc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-19.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 12, 9);
+}
+
+/* { dg-final { scan-assembler-times "movabsq\[\\t \]+\\\$868082074056920076, %r" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, \\(%\[\^,\]+\\)" 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$202116108, 4\\(%\[\^,\]+\\)" 1 { target ia32 } } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 02/11] x86: Avoid stack realignment when copying data
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

To avoid stack realignment, use SCRATCH_SSE_REG to copy data from one
memory location to another.

gcc/

	* config/i386/i386-expand.c (ix86_expand_vector_move): Use
	SCRATCH_SSE_REG to copy data from one memory location to
	another.

gcc/testsuite/

	* gcc.target/i386/eh_return-1.c: New test.
---
 gcc/config/i386/i386-expand.c               | 16 ++++++++++++-
 gcc/testsuite/gcc.target/i386/eh_return-1.c | 26 +++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/eh_return-1.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 7f1dff6337c..09d5e5d88af 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -431,7 +431,21 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
       && !register_operand (op0, mode)
       && !register_operand (op1, mode))
     {
-      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
+      rtx tmp;
+      mode = GET_MODE (op0);
+      if (TARGET_SSE
+	  && (GET_MODE_ALIGNMENT (mode)
+	      > ix86_minimum_incoming_stack_boundary (false, true)))
+	{
+	  /* NB: Don't increase stack alignment requirement by using
+	     a scratch SSE register to copy data from one memory
+	     location to another since it doesn't require a spill.  */
+	  tmp = gen_rtx_REG (mode, SCRATCH_SSE_REG);
+	  emit_move_insn (tmp, op1);
+	}
+      else
+	tmp = force_reg (mode, op1);
+      emit_move_insn (op0, tmp);
       return;
     }
 
diff --git a/gcc/testsuite/gcc.target/i386/eh_return-1.c b/gcc/testsuite/gcc.target/i386/eh_return-1.c
new file mode 100644
index 00000000000..671ba635e88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/eh_return-1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=haswell -mno-avx512f" } */
+
+struct _Unwind_Context
+{
+  void *ra;
+  char array[48];
+};
+
+extern long uw_install_context_1 (struct _Unwind_Context *);
+
+void
+_Unwind_RaiseException (void)
+{
+  struct _Unwind_Context this_context, cur_context;
+  long offset = uw_install_context_1 (&this_context);
+  __builtin_memcpy (&this_context, &cur_context,
+		    sizeof (struct _Unwind_Context));
+  void *handler = __builtin_frob_return_addr ((&cur_context)->ra);
+  uw_install_context_1 (&cur_context);
+  __builtin_eh_return (offset, handler);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 02/11] x86: Avoid stack realignment when copying data H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-12  9:00   ` Richard Biener
  2021-05-11 23:35 ` [PATCH v2 04/11] x86: Update piecewise move and store H.J. Lu
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

It is only defined for i386; every other target uses the default:

 #define MAX_BITSIZE_MODE_ANY_INT (64*BITS_PER_UNIT)

Whatever problems we had before, they have been fixed now.

	* config/i386/i386-modes.def (MAX_BITSIZE_MODE_ANY_INT): Removed.
---
 gcc/config/i386/i386-modes.def | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index dbddfd8e48f..4e7014be034 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -107,19 +107,10 @@ INT_MODE (XI, 64);
 PARTIAL_INT_MODE (HI, 16, P2QI);
 PARTIAL_INT_MODE (SI, 32, P2HI);
 
-/* Mode used for signed overflow checking of TImode.  As
-   MAX_BITSIZE_MODE_ANY_INT is only 160, wide-int.h reserves only that
-   rounded up to multiple of HOST_BITS_PER_WIDE_INT bits in wide_int etc.,
-   so OImode is too large.  For the overflow checking we actually need
-   just 1 or 2 bits beyond TImode precision.  Use 160 bits to have
-   a multiple of 32.  */
+/* Mode used for signed overflow checking of TImode.  For the overflow
+   checking we actually need just 1 or 2 bits beyond TImode precision.
+   Use 160 bits to have a multiple of 32.  */
 PARTIAL_INT_MODE (OI, 160, POI);
 
-/* Keep the OI and XI modes from confusing the compiler into thinking
-   that these modes could actually be used for computation.  They are
-   only holders for vectors during data movement.  Include POImode precision
-   though.  */
-#define MAX_BITSIZE_MODE_ANY_INT (160)
-
 /* The symbol Pmode stands for one of the above machine modes (usually SImode).
    The tm.h file specifies which one.  It is not a distinct mode.  */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 04/11] x86: Update piecewise move and store
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (2 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 05/11] x86: Add AVX2 tests for PR middle-end/90773 H.J. Lu
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

We can use TImode/OImode/XImode integers for piecewise move and store.
When a vector register is used for piecewise move and store, we don't
increase stack_alignment_needed since a vector register spill isn't
required for piecewise move and store.  Since stack_realign_needed is
set to true by checking stack_alignment_estimated, which is set by pseudo
vector register usage, we also need to check stack_realign_needed to
eliminate the frame pointer.

gcc/

	* config/i386/i386.c (ix86_finalize_stack_frame_flags): Also
	check stack_realign_needed for stack realignment.
	(ix86_legitimate_constant_p): Always allow CONST_WIDE_INT smaller
	than the largest integer supported by vector register.
	* config/i386/i386.h (MOVE_MAX): Set to 64.
	(MOVE_MAX_PIECES): Set to bytes of the largest integer supported
	by vector register.
	(STORE_MAX_PIECES): New.

gcc/testsuite/

	* gcc.target/i386/pr90773-1.c: Adjust to expect movq for 32-bit.
	* gcc.target/i386/pr90773-4.c: Also run for 32-bit.
	* gcc.target/i386/pr90773-14.c: Likewise.
	* gcc.target/i386/pr90773-15.c: Likewise.
	* gcc.target/i386/pr90773-16.c: Likewise.
	* gcc.target/i386/pr90773-17.c: Likewise.
---
 gcc/config/i386/i386.c                     | 21 ++++++++++++---
 gcc/config/i386/i386.h                     | 31 +++++++++++++++++-----
 gcc/testsuite/gcc.target/i386/pr90773-1.c  | 10 +++----
 gcc/testsuite/gcc.target/i386/pr90773-14.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c |  6 ++---
 gcc/testsuite/gcc.target/i386/pr90773-16.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-17.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-4.c  |  2 +-
 8 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f9cbc1d10eb..98bf08b854b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -7943,8 +7943,17 @@ ix86_finalize_stack_frame_flags (void)
      assumed stack realignment might be needed or -fno-omit-frame-pointer
      is used, but in the end nothing that needed the stack alignment had
      been spilled nor stack access, clear frame_pointer_needed and say we
-     don't need stack realignment.  */
-  if ((stack_realign || (!flag_omit_frame_pointer && optimize))
+     don't need stack realignment.
+
+     When vector register is used for piecewise move and store, we don't
+     increase stack_alignment_needed as there is no register spill for
+     piecewise move and store.  Since stack_realign_needed is set to true
+     by checking stack_alignment_estimated which is updated by pseudo
+     vector register usage, we also need to check stack_realign_needed to
+     eliminate frame pointer.  */
+  if ((stack_realign
+       || (!flag_omit_frame_pointer && optimize)
+       || crtl->stack_realign_needed)
       && frame_pointer_needed
       && crtl->is_leaf
       && crtl->sp_is_unchanging
@@ -10403,7 +10412,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
 	  /* FALLTHRU */
 	case E_OImode:
 	case E_XImode:
-	  if (!standard_sse_constant_p (x, mode))
+	  if (!standard_sse_constant_p (x, mode)
+	      && GET_MODE_SIZE (TARGET_AVX512F
+				? XImode
+				: (TARGET_AVX
+				   ? OImode
+				   : (TARGET_SSE2
+				      ? TImode : DImode))) < GET_MODE_SIZE (mode))
 	    return false;
 	default:
 	  break;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 45d86802c51..677afbf7031 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1754,7 +1754,7 @@ typedef struct ix86_args {
 
 /* Max number of bytes we can move from memory to memory
    in one reasonably fast instruction.  */
-#define MOVE_MAX 16
+#define MOVE_MAX 64
 
 /* MOVE_MAX_PIECES is the number of bytes at a time which we can
    move efficiently, as opposed to  MOVE_MAX which is the maximum
@@ -1765,11 +1765,30 @@ typedef struct ix86_args {
    widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
    64-bit mode.  */
 #define MOVE_MAX_PIECES \
-  ((TARGET_64BIT \
-    && TARGET_SSE2 \
-    && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
-    && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
-   ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+	  && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
+	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	 ? 16 : UNITS_PER_WORD)))
+
+/* STORE_MAX_PIECES is the number of bytes at a time that we can
+   store efficiently.  */
+#define STORE_MAX_PIECES \
+  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+   ? 64 \
+   : ((TARGET_AVX \
+       && !TARGET_PREFER_AVX128 \
+       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+      ? 32 \
+      : ((TARGET_SSE2 \
+	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	 ? 16 : UNITS_PER_WORD)))
 
 /* If a memory-to-memory move would take MOVE_RATIO or more simple
    move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-1.c b/gcc/testsuite/gcc.target/i386/pr90773-1.c
index 1d9f282dc0d..4fd5a40d99d 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-options "-O2 -msse2 -mtune=generic" } */
 
 extern char *dst, *src;
 
@@ -9,9 +9,5 @@ foo (void)
   __builtin_memcpy (dst, src, 15);
 }
 
-/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
-/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-14.c b/gcc/testsuite/gcc.target/i386/pr90773-14.c
index 6364916ecac..74ba5055960 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-14.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-14.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-15.c b/gcc/testsuite/gcc.target/i386/pr90773-15.c
index c0a96fed892..880f71d1567 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-15.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-15.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
@@ -9,6 +9,6 @@ foo (int c)
   __builtin_memset (dst, c, 17);
 }
 
-/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%edi, %xmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%.*, %xmm\[0-9\]+" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
-/* { dg-final { scan-assembler-times "movb\[\\t \]+%dil, 16\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+%.*, 16\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-16.c b/gcc/testsuite/gcc.target/i386/pr90773-16.c
index d2d1ec6141c..32a976b10df 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-16.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-16.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-17.c b/gcc/testsuite/gcc.target/i386/pr90773-17.c
index 6c8da7d24ef..2d6fbf22a8b 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-17.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-17.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern char *dst;
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-4.c b/gcc/testsuite/gcc.target/i386/pr90773-4.c
index ec0bc0100ae..ee4c04678d1 100644
--- a/gcc/testsuite/gcc.target/i386/pr90773-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr90773-4.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do compile } */
 /* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
 
 extern char *dst;
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 05/11] x86: Add AVX2 tests for PR middle-end/90773
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (3 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 04/11] x86: Update piecewise move and store H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 06/11] x86: Add tests for piecewise move and store H.J. Lu
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

	PR middle-end/90773
	* gcc.target/i386/pr90773-20.c: New test.
	* gcc.target/i386/pr90773-21.c: Likewise.
	* gcc.target/i386/pr90773-22.c: Likewise.
	* gcc.target/i386/pr90773-23.c: Likewise.
---
 gcc/testsuite/gcc.target/i386/pr90773-20.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-21.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-22.c | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-23.c | 13 +++++++++++++
 4 files changed, 52 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c

diff --git a/gcc/testsuite/gcc.target/i386/pr90773-20.c b/gcc/testsuite/gcc.target/i386/pr90773-20.c
new file mode 100644
index 00000000000..e61e405f2b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-20.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+  __builtin_memset (dst, c, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+.+, 32\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-21.c b/gcc/testsuite/gcc.target/i386/pr90773-21.c
new file mode 100644
index 00000000000..16ad17f3cbb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-21.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+  __builtin_memset (dst, c, 34);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movw\[\\t \]%.*, 32\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-22.c b/gcc/testsuite/gcc.target/i386/pr90773-22.c
new file mode 100644
index 00000000000..45a8ff65a84
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-22.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+.+, 32\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-23.c b/gcc/testsuite/gcc.target/i386/pr90773-23.c
new file mode 100644
index 00000000000..9256ce10ff0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-23.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 34);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movw\[\\t \]+.+, 32\\(%\[\^,\]+\\)" 1 } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 06/11] x86: Add tests for piecewise move and store
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (4 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 05/11] x86: Add AVX2 tests for PR middle-end/90773 H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 07/11] x86: Also pass -mno-avx to pr72839.c H.J. Lu
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

	* gcc.target/i386/pieces-memcpy-7.c: New test.
	* gcc.target/i386/pieces-memcpy-8.c: Likewise.
	* gcc.target/i386/pieces-memcpy-9.c: Likewise.
	* gcc.target/i386/pieces-memcpy-10.c: Likewise.
	* gcc.target/i386/pieces-memcpy-11.c: Likewise.
	* gcc.target/i386/pieces-memcpy-12.c: Likewise.
	* gcc.target/i386/pieces-memcpy-13.c: Likewise.
	* gcc.target/i386/pieces-memcpy-14.c: Likewise.
	* gcc.target/i386/pieces-memcpy-15.c: Likewise.
	* gcc.target/i386/pieces-memcpy-16.c: Likewise.
	* gcc.target/i386/pieces-memset-1.c: Likewise.
	* gcc.target/i386/pieces-memset-2.c: Likewise.
	* gcc.target/i386/pieces-memset-3.c: Likewise.
	* gcc.target/i386/pieces-memset-4.c: Likewise.
	* gcc.target/i386/pieces-memset-5.c: Likewise.
	* gcc.target/i386/pieces-memset-6.c: Likewise.
	* gcc.target/i386/pieces-memset-7.c: Likewise.
	* gcc.target/i386/pieces-memset-8.c: Likewise.
	* gcc.target/i386/pieces-memset-9.c: Likewise.
	* gcc.target/i386/pieces-memset-10.c: Likewise.
	* gcc.target/i386/pieces-memset-11.c: Likewise.
	* gcc.target/i386/pieces-memset-12.c: Likewise.
	* gcc.target/i386/pieces-memset-13.c: Likewise.
	* gcc.target/i386/pieces-memset-14.c: Likewise.
	* gcc.target/i386/pieces-memset-15.c: Likewise.
	* gcc.target/i386/pieces-memset-16.c: Likewise.
	* gcc.target/i386/pieces-memset-17.c: Likewise.
	* gcc.target/i386/pieces-memset-18.c: Likewise.
	* gcc.target/i386/pieces-memset-19.c: Likewise.
	* gcc.target/i386/pieces-memset-20.c: Likewise.
	* gcc.target/i386/pieces-memset-21.c: Likewise.
	* gcc.target/i386/pieces-memset-22.c: Likewise.
	* gcc.target/i386/pieces-memset-23.c: Likewise.
	* gcc.target/i386/pieces-memset-24.c: Likewise.
	* gcc.target/i386/pieces-memset-25.c: Likewise.
	* gcc.target/i386/pieces-memset-26.c: Likewise.
	* gcc.target/i386/pieces-memset-27.c: Likewise.
	* gcc.target/i386/pieces-memset-28.c: Likewise.
	* gcc.target/i386/pieces-memset-29.c: Likewise.
	* gcc.target/i386/pieces-memset-30.c: Likewise.
	* gcc.target/i386/pieces-memset-31.c: Likewise.
	* gcc.target/i386/pieces-memset-32.c: Likewise.
	* gcc.target/i386/pieces-memset-33.c: Likewise.
	* gcc.target/i386/pieces-memset-34.c: Likewise.
	* gcc.target/i386/pieces-memset-35.c: Likewise.
	* gcc.target/i386/pieces-memset-36.c: Likewise.
	* gcc.target/i386/pieces-memset-37.c: Likewise.
	* gcc.target/i386/pieces-memset-38.c: Likewise.
	* gcc.target/i386/pieces-memset-39.c: Likewise.
	* gcc.target/i386/pieces-memset-40.c: Likewise.
	* gcc.target/i386/pieces-memset-41.c: Likewise.
	* gcc.target/i386/pieces-memset-42.c: Likewise.
	* gcc.target/i386/pieces-memset-43.c: Likewise.
---
 .../gcc.target/i386/pieces-memcpy-10.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-11.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-12.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-13.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-14.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-15.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-16.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memcpy-7.c          | 15 +++++++++++++++
 .../gcc.target/i386/pieces-memcpy-8.c          | 14 ++++++++++++++
 .../gcc.target/i386/pieces-memcpy-9.c          | 14 ++++++++++++++
 .../gcc.target/i386/pieces-memset-1.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-10.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-11.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-12.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-13.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-14.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-15.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-16.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-17.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-18.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-19.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-2.c          | 12 ++++++++++++
 .../gcc.target/i386/pieces-memset-20.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-21.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-22.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-23.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-24.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-25.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-26.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-27.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-28.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-29.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-3.c          | 18 ++++++++++++++++++
 .../gcc.target/i386/pieces-memset-30.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-31.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-32.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-33.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-34.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-35.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-36.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-37.c         | 15 +++++++++++++++
 .../gcc.target/i386/pieces-memset-38.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-39.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-4.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-40.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-41.c         | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-42.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-43.c         | 17 +++++++++++++++++
 .../gcc.target/i386/pieces-memset-5.c          | 12 ++++++++++++
 .../gcc.target/i386/pieces-memset-6.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-7.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-8.c          | 16 ++++++++++++++++
 .../gcc.target/i386/pieces-memset-9.c          | 16 ++++++++++++++++
 53 files changed, 860 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c

diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
new file mode 100644
index 00000000000..5faee21f9b9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
new file mode 100644
index 00000000000..b8917a7f917
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 64);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
new file mode 100644
index 00000000000..f1432ebe517
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 64);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
new file mode 100644
index 00000000000..97e6067fec9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
new file mode 100644
index 00000000000..7addc4c0a28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
new file mode 100644
index 00000000000..695e8c3fa67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
new file mode 100644
index 00000000000..b0643d05ee7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 34);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
new file mode 100644
index 00000000000..3d248d447ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, char *dst, char *src)
+{
+  __builtin_memcpy (dst, src, 17);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
new file mode 100644
index 00000000000..c13a2beb2f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, char *dst, char *src)
+{
+  __builtin_memcpy (dst, src, 18);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
new file mode 100644
index 00000000000..238f88b275e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, char *dst, char *src)
+{
+  __builtin_memcpy (dst, src, 19);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-1.c b/gcc/testsuite/gcc.target/i386/pieces-memset-1.c
new file mode 100644
index 00000000000..2b8032684b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 64);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-10.c b/gcc/testsuite/gcc.target/i386/pieces-memset-10.c
new file mode 100644
index 00000000000..a6390d1bd8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-10.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 64);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-11.c b/gcc/testsuite/gcc.target/i386/pieces-memset-11.c
new file mode 100644
index 00000000000..3fb9038b04f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-11.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 64);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-12.c b/gcc/testsuite/gcc.target/i386/pieces-memset-12.c
new file mode 100644
index 00000000000..fa834566097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-12.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 66);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-13.c b/gcc/testsuite/gcc.target/i386/pieces-memset-13.c
new file mode 100644
index 00000000000..7f2cd3f58ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-13.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 33);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-14.c b/gcc/testsuite/gcc.target/i386/pieces-memset-14.c
new file mode 100644
index 00000000000..45ece482464
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-14.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-15.c b/gcc/testsuite/gcc.target/i386/pieces-memset-15.c
new file mode 100644
index 00000000000..bddf47d728e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-15.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-16.c b/gcc/testsuite/gcc.target/i386/pieces-memset-16.c
new file mode 100644
index 00000000000..1c5d124cecc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-16.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 17);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-17.c b/gcc/testsuite/gcc.target/i386/pieces-memset-17.c
new file mode 100644
index 00000000000..6cdb33557c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-17.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 17);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-18.c b/gcc/testsuite/gcc.target/i386/pieces-memset-18.c
new file mode 100644
index 00000000000..adbd201b4e7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-18.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 3, 18);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-19.c b/gcc/testsuite/gcc.target/i386/pieces-memset-19.c
new file mode 100644
index 00000000000..7e9cf2e26d8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-19.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 64);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-2.c b/gcc/testsuite/gcc.target/i386/pieces-memset-2.c
new file mode 100644
index 00000000000..649f344e8f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 64);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-20.c b/gcc/testsuite/gcc.target/i386/pieces-memset-20.c
new file mode 100644
index 00000000000..b8747e669e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-20.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 64);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-21.c b/gcc/testsuite/gcc.target/i386/pieces-memset-21.c
new file mode 100644
index 00000000000..4f001c6d06c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-21.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-22.c b/gcc/testsuite/gcc.target/i386/pieces-memset-22.c
new file mode 100644
index 00000000000..5f3c454ef8f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-22.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-23.c b/gcc/testsuite/gcc.target/i386/pieces-memset-23.c
new file mode 100644
index 00000000000..a3b4ffc18e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-23.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-24.c b/gcc/testsuite/gcc.target/i386/pieces-memset-24.c
new file mode 100644
index 00000000000..e222787b541
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-24.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-25.c b/gcc/testsuite/gcc.target/i386/pieces-memset-25.c
new file mode 100644
index 00000000000..195ddb635eb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-25.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 17);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-26.c b/gcc/testsuite/gcc.target/i386/pieces-memset-26.c
new file mode 100644
index 00000000000..13606b2da54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-26.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 17);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-27.c b/gcc/testsuite/gcc.target/i386/pieces-memset-27.c
new file mode 100644
index 00000000000..54a672b6015
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-27.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 17);
+}
+
+/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-28.c b/gcc/testsuite/gcc.target/i386/pieces-memset-28.c
new file mode 100644
index 00000000000..83c2d3f0fde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-28.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 64);
+}
+
+/* { dg-final { scan-assembler-times "pcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-29.c b/gcc/testsuite/gcc.target/i386/pieces-memset-29.c
new file mode 100644
index 00000000000..650e6fe66a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-29.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 64);
+}
+
+/* { dg-final { scan-assembler-not "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-3.c b/gcc/testsuite/gcc.target/i386/pieces-memset-3.c
new file mode 100644
index 00000000000..2aed6dbc68e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-3.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512bw -mno-avx512vl -mavx512f -mtune=intel" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vinserti64x4\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-30.c b/gcc/testsuite/gcc.target/i386/pieces-memset-30.c
new file mode 100644
index 00000000000..dcec2c700fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-30.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 64);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-31.c b/gcc/testsuite/gcc.target/i386/pieces-memset-31.c
new file mode 100644
index 00000000000..5d20af0938d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-31.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-32.c b/gcc/testsuite/gcc.target/i386/pieces-memset-32.c
new file mode 100644
index 00000000000..c5ca0bd17ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-32.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "pcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-33.c b/gcc/testsuite/gcc.target/i386/pieces-memset-33.c
new file mode 100644
index 00000000000..a87d1b80ae6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-33.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-not "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-34.c b/gcc/testsuite/gcc.target/i386/pieces-memset-34.c
new file mode 100644
index 00000000000..0c2f1ee6049
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-34.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-35.c b/gcc/testsuite/gcc.target/i386/pieces-memset-35.c
new file mode 100644
index 00000000000..b0f4a8b898e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-35.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 34);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-36.c b/gcc/testsuite/gcc.target/i386/pieces-memset-36.c
new file mode 100644
index 00000000000..d1f1263c7b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-36.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-37.c b/gcc/testsuite/gcc.target/i386/pieces-memset-37.c
new file mode 100644
index 00000000000..ec59497b116
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-37.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, int x, char *dst)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
new file mode 100644
index 00000000000..ed4a24a54fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-39.c b/gcc/testsuite/gcc.target/i386/pieces-memset-39.c
new file mode 100644
index 00000000000..a330bff5f3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-39.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mtune=generic" } */
+
+void
+foo (int a1, int a2, int a3, int a4, int a5, int a6, int x, char *dst)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-not "vinserti64x4" } } */
+/* { dg-final { scan-assembler-times "vmovdqu64\[ \\t\]+\[^\n\]*%zmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-4.c b/gcc/testsuite/gcc.target/i386/pieces-memset-4.c
new file mode 100644
index 00000000000..9256919bfdf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-4.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
new file mode 100644
index 00000000000..4eda73ead59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 66);
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastb\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 4 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
new file mode 100644
index 00000000000..f86b6986da9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
new file mode 100644
index 00000000000..df0c122aae7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, 0, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpxor\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
new file mode 100644
index 00000000000..2f2179c2df9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+  __builtin_memset (dst, -1, 33);
+}
+
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 2 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-5.c b/gcc/testsuite/gcc.target/i386/pieces-memset-5.c
new file mode 100644
index 00000000000..3e95db5efef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-5.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=haswell" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-6.c b/gcc/testsuite/gcc.target/i386/pieces-memset-6.c
new file mode 100644
index 00000000000..571113c3a33
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-6.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=intel" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-7.c b/gcc/testsuite/gcc.target/i386/pieces-memset-7.c
new file mode 100644
index 00000000000..fd159869817
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-7.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "movups\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-8.c b/gcc/testsuite/gcc.target/i386/pieces-memset-8.c
new file mode 100644
index 00000000000..7df0019ef63
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-8.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-9.c b/gcc/testsuite/gcc.target/i386/pieces-memset-9.c
new file mode 100644
index 00000000000..ed45d590875
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-9.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int x)
+{
+  __builtin_memset (dst, x, 17);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%xmm" 1 } } */
+/* No need to dynamically realign the stack here.  */
+/* { dg-final { scan-assembler-not "and\[^\n\r]*%\[re\]sp" } } */
+/* Nor use a frame pointer.  */
+/* { dg-final { scan-assembler-not "%\[re\]bp" } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 07/11] x86: Also pass -mno-avx to pr72839.c
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (5 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 06/11] x86: Add tests for piecewise move and store H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 08/11] x86: Also pass -mno-avx to cold-attribute-1.c H.J. Lu
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

Also pass -mno-avx to pr72839.c to avoid copying data with YMM or ZMM
registers.

	* gcc.target/i386/pr72839.c: Also pass -mno-avx.
---
 gcc/testsuite/gcc.target/i386/pr72839.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr72839.c b/gcc/testsuite/gcc.target/i386/pr72839.c
index ea724f70377..6888d9d0a55 100644
--- a/gcc/testsuite/gcc.target/i386/pr72839.c
+++ b/gcc/testsuite/gcc.target/i386/pr72839.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target ia32 } */
-/* { dg-options "-O2 -mtune=lakemont" } */
+/* { dg-options "-O2 -mtune=lakemont -mno-avx" } */
 
 extern char *strcpy (char *, const char *);
 
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 08/11] x86: Also pass -mno-avx to cold-attribute-1.c
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (6 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 07/11] x86: Also pass -mno-avx to pr72839.c H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 09/11] x86: Also pass -mno-avx to sw-1.c for ia32 H.J. Lu
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

Also pass -mno-avx to cold-attribute-1.c to avoid copying data with YMM
or ZMM registers.

	* gcc.target/i386/cold-attribute-1.c: Also pass -mno-avx.
---
 gcc/testsuite/gcc.target/i386/cold-attribute-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/cold-attribute-1.c b/gcc/testsuite/gcc.target/i386/cold-attribute-1.c
index 57666ac60b6..658eb3e25bb 100644
--- a/gcc/testsuite/gcc.target/i386/cold-attribute-1.c
+++ b/gcc/testsuite/gcc.target/i386/cold-attribute-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-avx" } */
 #include <string.h>
 static inline
 __attribute__ ((cold)) void
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 09/11] x86: Also pass -mno-avx to sw-1.c for ia32
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (7 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 08/11] x86: Also pass -mno-avx to cold-attribute-1.c H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 10/11] x86: Update gcc.target/i386/incoming-11.c H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 11/11] constructor: Check if it is faster to load constant from memory H.J. Lu
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

Also pass -mno-avx to sw-1.c for ia32, since copying data with YMM or ZMM
registers disables shrink-wrapping when the second argument is passed on
the stack.

	* gcc.target/i386/sw-1.c: Also pass -mno-avx for ia32.
---
 gcc/testsuite/gcc.target/i386/sw-1.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c b/gcc/testsuite/gcc.target/i386/sw-1.c
index aec095eda62..a9c89fca4ec 100644
--- a/gcc/testsuite/gcc.target/i386/sw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sw-1.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mtune=generic -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
+/* { dg-additional-options "-mno-avx" { target ia32 } } */
 /* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */
 
 #include <string.h>
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 10/11] x86: Update gcc.target/i386/incoming-11.c
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (8 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 09/11] x86: Also pass -mno-avx to sw-1.c for ia32 H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  2021-05-11 23:35 ` [PATCH v2 11/11] constructor: Check if it is faster to load constant from memory H.J. Lu
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

Expect no stack realignment since we no longer realign the stack when
copying data.

	* gcc.target/i386/incoming-11.c: Expect no stack realignment.
---
 gcc/testsuite/gcc.target/i386/incoming-11.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c
index a830c96f7d1..4b822684b88 100644
--- a/gcc/testsuite/gcc.target/i386/incoming-11.c
+++ b/gcc/testsuite/gcc.target/i386/incoming-11.c
@@ -15,4 +15,4 @@ void f()
 	for (i = 0; i < 100; i++) q[i] = 1;
 }
 
-/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v2 11/11] constructor: Check if it is faster to load constant from memory
  2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
                   ` (9 preceding siblings ...)
  2021-05-11 23:35 ` [PATCH v2 10/11] x86: Update gcc.target/i386/incoming-11.c H.J. Lu
@ 2021-05-11 23:35 ` H.J. Lu
  10 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2021-05-11 23:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener, Richard Sandiford, Uros Bizjak

When expanding a constant constructor, don't call expand_constructor if
it is more efficient to load the data from the memory via move by pieces.

gcc/

	PR middle-end/90773
	* expr.c (expand_expr_real_1): Don't call expand_constructor if
	it is more efficient to load the data from the memory.

gcc/testsuite/

	PR middle-end/90773
	* gcc.target/i386/pr90773-24.c: New test.
	* gcc.target/i386/pr90773-25.c: Likewise.
---
 gcc/expr.c                                 | 10 ++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-24.c | 22 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr90773-25.c | 20 ++++++++++++++++++++
 3 files changed, 52 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-25.c

diff --git a/gcc/expr.c b/gcc/expr.c
index 42ef5bdf5d5..6ad7265702e 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -10885,6 +10885,16 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,
 		unsigned HOST_WIDE_INT ix;
 		tree field, value;
 
+		/* Check if it is more efficient to load the data from
+		   the memory directly.  FIXME: How many stores do we
+		   need here if not moved by pieces?  */
+		unsigned HOST_WIDE_INT bytes
+		  = tree_to_uhwi (TYPE_SIZE_UNIT (type));
+		if ((bytes / UNITS_PER_WORD) > 2
+		    && MOVE_MAX_PIECES > UNITS_PER_WORD
+		    && can_move_by_pieces (bytes, TYPE_ALIGN (type)))
+		  goto normal_inner_ref;
+
 		FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (init), ix,
 					  field, value)
 		  if (tree_int_cst_equal (field, index))
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-24.c b/gcc/testsuite/gcc.target/i386/pr90773-24.c
new file mode 100644
index 00000000000..4a4b62533dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-24.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64" } */
+
+struct S
+{
+  long long s1 __attribute__ ((aligned (8)));
+  unsigned s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+};
+
+const struct S array[] = {
+  { 0, 60, 640, 2112543726, 39682, 48, 16, 33, 10, 96, 2, 0, 0, 4 }
+};
+
+void
+foo (struct S *x)
+{
+  x[0] = array[0];
+}
+/* { dg-final { scan-assembler-times "movups\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[\\t \]%xmm\[0-9\]+, 16\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[\\t \]%xmm\[0-9\]+, 32\\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[\\t \]%xmm\[0-9\]+, 48\\(%\[\^,\]+\\)" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr90773-25.c b/gcc/testsuite/gcc.target/i386/pr90773-25.c
new file mode 100644
index 00000000000..2520b670989
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90773-25.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+struct S
+{
+  long long s1 __attribute__ ((aligned (8)));
+  unsigned s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+};
+
+const struct S array[] = {
+  { 0, 60, 640, 2112543726, 39682, 48, 16, 33, 10, 96, 2, 0, 0, 4 }
+};
+
+void
+foo (struct S *x)
+{
+  x[0] = array[0];
+}
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, 32\\(%\[\^,\]+\\)" 1 } } */
-- 
2.31.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT
  2021-05-11 23:35 ` [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
@ 2021-05-12  9:00   ` Richard Biener
  2021-05-12 12:06     ` H.J. Lu
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Biener @ 2021-05-12  9:00 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Patches, Richard Sandiford, Uros Bizjak

On Wed, May 12, 2021 at 1:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> It is only defined for i386 and everyone uses the default:
>
>  #define MAX_BITSIZE_MODE_ANY_INT (64*BITS_PER_UNIT)

The default is determined by genmodes.c which looks at the maximum
size of any [partial] integer mode.  Since we have

INT_MODE (OI, 32);
INT_MODE (XI, 64);

this will increase the size of wide_int.  aarch64 and arm are the only
other targets with XImode, OImode is also used by s390 and ia64.

Keeping wide_int small is desirable.

Richard.

> Whatever problems we had before, they have been fixed now.
>
>         * config/i386/i386-modes.def (MAX_BITSIZE_MODE_ANY_INT): Removed.
> ---
>  gcc/config/i386/i386-modes.def | 15 +++------------
>  1 file changed, 3 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
> index dbddfd8e48f..4e7014be034 100644
> --- a/gcc/config/i386/i386-modes.def
> +++ b/gcc/config/i386/i386-modes.def
> @@ -107,19 +107,10 @@ INT_MODE (XI, 64);
>  PARTIAL_INT_MODE (HI, 16, P2QI);
>  PARTIAL_INT_MODE (SI, 32, P2HI);
>
> -/* Mode used for signed overflow checking of TImode.  As
> -   MAX_BITSIZE_MODE_ANY_INT is only 160, wide-int.h reserves only that
> -   rounded up to multiple of HOST_BITS_PER_WIDE_INT bits in wide_int etc.,
> -   so OImode is too large.  For the overflow checking we actually need
> -   just 1 or 2 bits beyond TImode precision.  Use 160 bits to have
> -   a multiple of 32.  */
> +/* Mode used for signed overflow checking of TImode.  For the overflow
> +   checking we actually need just 1 or 2 bits beyond TImode precision.
> +   Use 160 bits to have a multiple of 32.  */
>  PARTIAL_INT_MODE (OI, 160, POI);
>
> -/* Keep the OI and XI modes from confusing the compiler into thinking
> -   that these modes could actually be used for computation.  They are
> -   only holders for vectors during data movement.  Include POImode precision
> -   though.  */
> -#define MAX_BITSIZE_MODE_ANY_INT (160)
> -
>  /* The symbol Pmode stands for one of the above machine modes (usually SImode).
>     The tm.h file specifies which one.  It is not a distinct mode.  */
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT
  2021-05-12  9:00   ` Richard Biener
@ 2021-05-12 12:06     ` H.J. Lu
  2021-05-12 12:15       ` Richard Biener
  0 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2021-05-12 12:06 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Richard Sandiford, Uros Bizjak

On Wed, May 12, 2021 at 2:00 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, May 12, 2021 at 1:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > It is only defined for i386 and everyone uses the default:
> >
> >  #define MAX_BITSIZE_MODE_ANY_INT (64*BITS_PER_UNIT)
>
> The default is determined by genmodes.c which looks at the maximum
> size of any [partial] integer mode.  Since we have
>
> INT_MODE (OI, 32);
> INT_MODE (XI, 64);
>
> this will increase the size of wide_int.  aarch64 and arm are the only
> other targets with XImode, OImode is also used by s390 and ia64.
>
> Keeping wide_int small is desirable.

Since I want to use OImode and XImode in op_by_pieces operations,
wide_int needs to support OImode and XImode.

> Richard.
>
> > Whatever problems we had before, they have been fixed now.
> >
> >         * config/i386/i386-modes.def (MAX_BITSIZE_MODE_ANY_INT): Removed.
> > ---
> >  gcc/config/i386/i386-modes.def | 15 +++------------
> >  1 file changed, 3 insertions(+), 12 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
> > index dbddfd8e48f..4e7014be034 100644
> > --- a/gcc/config/i386/i386-modes.def
> > +++ b/gcc/config/i386/i386-modes.def
> > @@ -107,19 +107,10 @@ INT_MODE (XI, 64);
> >  PARTIAL_INT_MODE (HI, 16, P2QI);
> >  PARTIAL_INT_MODE (SI, 32, P2HI);
> >
> > -/* Mode used for signed overflow checking of TImode.  As
> > -   MAX_BITSIZE_MODE_ANY_INT is only 160, wide-int.h reserves only that
> > -   rounded up to multiple of HOST_BITS_PER_WIDE_INT bits in wide_int etc.,
> > -   so OImode is too large.  For the overflow checking we actually need
> > -   just 1 or 2 bits beyond TImode precision.  Use 160 bits to have
> > -   a multiple of 32.  */
> > +/* Mode used for signed overflow checking of TImode.  For the overflow
> > +   checking we actually need just 1 or 2 bits beyond TImode precision.
> > +   Use 160 bits to have a multiple of 32.  */
> >  PARTIAL_INT_MODE (OI, 160, POI);
> >
> > -/* Keep the OI and XI modes from confusing the compiler into thinking
> > -   that these modes could actually be used for computation.  They are
> > -   only holders for vectors during data movement.  Include POImode precision
> > -   though.  */
> > -#define MAX_BITSIZE_MODE_ANY_INT (160)
> > -
> >  /* The symbol Pmode stands for one of the above machine modes (usually SImode).
> >     The tm.h file specifies which one.  It is not a distinct mode.  */
> > --
> > 2.31.1
> >



-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT
  2021-05-12 12:06     ` H.J. Lu
@ 2021-05-12 12:15       ` Richard Biener
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Biener @ 2021-05-12 12:15 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Patches, Richard Sandiford, Uros Bizjak

On Wed, May 12, 2021 at 2:06 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, May 12, 2021 at 2:00 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, May 12, 2021 at 1:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > It is only defined for i386 and everyone uses the default:
> > >
> > >  #define MAX_BITSIZE_MODE_ANY_INT (64*BITS_PER_UNIT)
> >
> > The default is determined by genmodes.c which looks at the maximum
> > size of any [partial] integer mode.  Since we have
> >
> > INT_MODE (OI, 32);
> > INT_MODE (XI, 64);
> >
> > this will increase the size of wide_int.  aarch64 and arm are the only
> > other targets with XImode, OImode is also used by s390 and ia64.
> >
> > Keeping wide_int small is desirable.
>
> Since I want to use OImode and XImode in op_by_pieces operations,
> wide_int needs to support OImode and XImode.

OK, I see.

Richard.

> > Richard.
> >
> > > Whatever problems we had before, they have been fixed now.
> > >
> > >         * config/i386/i386-modes.def (MAX_BITSIZE_MODE_ANY_INT): Removed.
> > > ---
> > >  gcc/config/i386/i386-modes.def | 15 +++------------
> > >  1 file changed, 3 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
> > > index dbddfd8e48f..4e7014be034 100644
> > > --- a/gcc/config/i386/i386-modes.def
> > > +++ b/gcc/config/i386/i386-modes.def
> > > @@ -107,19 +107,10 @@ INT_MODE (XI, 64);
> > >  PARTIAL_INT_MODE (HI, 16, P2QI);
> > >  PARTIAL_INT_MODE (SI, 32, P2HI);
> > >
> > > -/* Mode used for signed overflow checking of TImode.  As
> > > -   MAX_BITSIZE_MODE_ANY_INT is only 160, wide-int.h reserves only that
> > > -   rounded up to multiple of HOST_BITS_PER_WIDE_INT bits in wide_int etc.,
> > > -   so OImode is too large.  For the overflow checking we actually need
> > > -   just 1 or 2 bits beyond TImode precision.  Use 160 bits to have
> > > -   a multiple of 32.  */
> > > +/* Mode used for signed overflow checking of TImode.  For the overflow
> > > +   checking we actually need just 1 or 2 bits beyond TImode precision.
> > > +   Use 160 bits to have a multiple of 32.  */
> > >  PARTIAL_INT_MODE (OI, 160, POI);
> > >
> > > -/* Keep the OI and XI modes from confusing the compiler into thinking
> > > -   that these modes could actually be used for computation.  They are
> > > -   only holders for vectors during data movement.  Include POImode precision
> > > -   though.  */
> > > -#define MAX_BITSIZE_MODE_ANY_INT (160)
> > > -
> > >  /* The symbol Pmode stands for one of the above machine modes (usually SImode).
> > >     The tm.h file specifies which one.  It is not a distinct mode.  */
> > > --
> > > 2.31.1
> > >
>
>
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-05-11 23:35 ` [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
@ 2021-05-30 18:49   ` Jeff Law
  2021-05-31 12:04     ` H.J. Lu
  0 siblings, 1 reply; 21+ messages in thread
From: Jeff Law @ 2021-05-30 18:49 UTC (permalink / raw)
  To: H.J. Lu, gcc-patches; +Cc: Richard Sandiford



On 5/11/2021 5:35 PM, H.J. Lu via Gcc-patches wrote:
> Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
> target instructions to duplicate QImode value to TImode/OImode/XImode
> value for memset.  Define SCRATCH_SSE_REG as a scratch register for
> ix86_gen_memset_value.
>
> gcc/
>
> 	PR middle-end/90773
> 	* builtins.c (builtin_memset_read_str): Call
> 	targetm.read_memset_value.
> 	(builtin_memset_gen_str): Call targetm.gen_memset_value.
> 	* target.def (read_memset_value): New hook.
> 	(gen_memset_value): Likewise.
> 	* targhooks.c: Include "builtins.h".
> 	(default_read_memset_value): New function.
> 	(default_gen_memset_value): Likewise.
> 	* targhooks.h (default_read_memset_value): New prototype.
> 	(default_gen_memset_value): Likewise.
> 	* config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
> 	Make it global.
> 	* config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
> 	New.
> 	(ix86_expand_vector_init_duplicate): Likewise.
> 	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
> 	an argument to ignore stack_alignment_estimated.  It is passed
> 	as false by default.
> 	(ix86_gen_memset_value_from_prev): New function.
> 	(ix86_gen_memset_value): Likewise.
> 	(ix86_read_memset_value): Likewise.
> 	(TARGET_GEN_MEMSET_VALUE): New.
> 	(TARGET_READ_MEMSET_VALUE): Likewise.
> 	* config/i386/i386.h (SCRATCH_SSE_REG): New.
> 	* doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
> 	TARGET_GEN_MEMSET_VALUE hooks.
> 	* doc/tm.texi: Regenerated.
>
> gcc/testsuite/
>
> 	PR middle-end/90773
> 	* gcc.target/i386/pr90773-15.c: New test.
> 	* gcc.target/i386/pr90773-16.c: Likewise.
> 	* gcc.target/i386/pr90773-17.c: Likewise.
> 	* gcc.target/i386/pr90773-18.c: Likewise.
> 	* gcc.target/i386/pr90773-19.c: Likewise.
Why does this need target hooks?  ISTM the right way to go here is to 
just emit the constant load to the target register and let the target 
figure out how best to construct the constant into the register.  If 
that means load it via QImode and broadcast, that's fine, but I'm not 
sure why that's not all implemented in the target files.

jeff


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-05-30 18:49   ` Jeff Law
@ 2021-05-31 12:04     ` H.J. Lu
  2021-05-31 18:07       ` Jeff Law
  0 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2021-05-31 12:04 UTC (permalink / raw)
  To: Jeff Law; +Cc: GCC Patches, Richard Sandiford

On Sun, May 30, 2021 at 11:49 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/11/2021 5:35 PM, H.J. Lu via Gcc-patches wrote:
> > Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
> > target instructions to duplicate QImode value to TImode/OImode/XImode
> > value for memset.  Define SCRATCH_SSE_REG as a scratch register for
> > ix86_gen_memset_value.
> >
> > gcc/
> >
> >       PR middle-end/90773
> >       * builtins.c (builtin_memset_read_str): Call
> >       targetm.read_memset_value.
> >       (builtin_memset_gen_str): Call targetm.gen_memset_value.
> >       * target.def (read_memset_value): New hook.
> >       (gen_memset_value): Likewise.
> >       * targhooks.c: Include "builtins.h".
> >       (default_read_memset_value): New function.
> >       (default_gen_memset_value): Likewise.
> >       * targhooks.h (default_read_memset_value): New prototype.
> >       (default_gen_memset_value): Likewise.
> >       * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
> >       Make it global.
> >       * config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
> >       New.
> >       (ix86_expand_vector_init_duplicate): Likewise.
> >       * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
> >       an argument to ignore stack_alignment_estimated.  It is passed
> >       as false by default.
> >       (ix86_gen_memset_value_from_prev): New function.
> >       (ix86_gen_memset_value): Likewise.
> >       (ix86_read_memset_value): Likewise.
> >       (TARGET_GEN_MEMSET_VALUE): New.
> >       (TARGET_READ_MEMSET_VALUE): Likewise.
> >       * config/i386/i386.h (SCRATCH_SSE_REG): New.
> >       * doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
> >       TARGET_GEN_MEMSET_VALUE hooks.
> >       * doc/tm.texi: Regenerated.
> >
> > gcc/testsuite/
> >
> >       PR middle-end/90773
> >       * gcc.target/i386/pr90773-15.c: New test.
> >       * gcc.target/i386/pr90773-16.c: Likewise.
> >       * gcc.target/i386/pr90773-17.c: Likewise.
> >       * gcc.target/i386/pr90773-18.c: Likewise.
> >       * gcc.target/i386/pr90773-19.c: Likewise.
> Why does this need target hooks?  ISTM the right way to go here is to
> just emit the constant load to the target register and let the target
> figure out how best to construct the constant into the register.  If
> that means load it via QImode and broadcast, that's fine, but I'm not
> sure why that's not all implemented in the target files.
>

I will submit a patch to add optabs instead.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-05-31 12:04     ` H.J. Lu
@ 2021-05-31 18:07       ` Jeff Law
  2021-05-31 18:13         ` H.J. Lu
  0 siblings, 1 reply; 21+ messages in thread
From: Jeff Law @ 2021-05-31 18:07 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Patches, Richard Sandiford



On 5/31/2021 6:04 AM, H.J. Lu wrote:
> On Sun, May 30, 2021 at 11:49 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>>
>>
>> On 5/11/2021 5:35 PM, H.J. Lu via Gcc-patches wrote:
>>> Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
>>> target instructions to duplicate QImode value to TImode/OImode/XImode
>>> value for memset.  Define SCRATCH_SSE_REG as a scratch register for
>>> ix86_gen_memset_value.
>>>
>>> gcc/
>>>
>>>        PR middle-end/90773
>>>        * builtins.c (builtin_memset_read_str): Call
>>>        targetm.read_memset_value.
>>>        (builtin_memset_gen_str): Call targetm.gen_memset_value.
>>>        * target.def (read_memset_value): New hook.
>>>        (gen_memset_value): Likewise.
>>>        * targhooks.c: Include "builtins.h".
>>>        (default_read_memset_value): New function.
>>>        (default_gen_memset_value): Likewise.
>>>        * targhooks.h (default_read_memset_value): New prototype.
>>>        (default_gen_memset_value): Likewise.
>>>        * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
>>>        Make it global.
>>>        * config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
>>>        New.
>>>        (ix86_expand_vector_init_duplicate): Likewise.
>>>        * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
>>>        an argument to ignore stack_alignment_estimated.  It is passed
>>>        as false by default.
>>>        (ix86_gen_memset_value_from_prev): New function.
>>>        (ix86_gen_memset_value): Likewise.
>>>        (ix86_read_memset_value): Likewise.
>>>        (TARGET_GEN_MEMSET_VALUE): New.
>>>        (TARGET_READ_MEMSET_VALUE): Likewise.
>>>        * config/i386/i386.h (SCRATCH_SSE_REG): New.
>>>        * doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
>>>        TARGET_GEN_MEMSET_VALUE hooks.
>>>        * doc/tm.texi: Regenerated.
>>>
>>> gcc/testsuite/
>>>
>>>        PR middle-end/90773
>>>        * gcc.target/i386/pr90773-15.c: New test.
>>>        * gcc.target/i386/pr90773-16.c: Likewise.
>>>        * gcc.target/i386/pr90773-17.c: Likewise.
>>>        * gcc.target/i386/pr90773-18.c: Likewise.
>>>        * gcc.target/i386/pr90773-19.c: Likewise.
>> Why does this need target hooks?  ISTM the right way to go here is to
>> just emit the constant load to the target register and let the target
>> figure out how best to construct the constant into the register.  If
>> that means load it via QImode and broadcast, that's fine, but I'm not
>> sure why that's not all implemented in the target files.
>>
> I will submit a patch to add optabs instead.
I may be missing something, but I'm not even sure why we need special 
optabs.

Aren't you just trying to efficiently get a constant element broadcast 
across an entire vector?

jeff

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-05-31 18:07       ` Jeff Law
@ 2021-05-31 18:13         ` H.J. Lu
  2021-05-31 18:20           ` H.J. Lu
  0 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2021-05-31 18:13 UTC (permalink / raw)
  To: Jeff Law; +Cc: GCC Patches, Richard Sandiford

On Mon, May 31, 2021 at 11:07 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 5/31/2021 6:04 AM, H.J. Lu wrote:
> > On Sun, May 30, 2021 at 11:49 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> >>
> >>
> >> On 5/11/2021 5:35 PM, H.J. Lu via Gcc-patches wrote:
> >>> Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
> >>> target instructions to duplicate QImode value to TImode/OImode/XImode
> >>> value for memset.  Define SCRATCH_SSE_REG as a scratch register for
> >>> ix86_gen_memset_value.
> >>>
> >>> gcc/
> >>>
> >>>        PR middle-end/90773
> >>>        * builtins.c (builtin_memset_read_str): Call
> >>>        targetm.read_memset_value.
> >>>        (builtin_memset_gen_str): Call targetm.gen_memset_value.
> >>>        * target.def (read_memset_value): New hook.
> >>>        (gen_memset_value): Likewise.
> >>>        * targhooks.c: Include "builtins.h".
> >>>        (default_read_memset_value): New function.
> >>>        (default_gen_memset_value): Likewise.
> >>>        * targhooks.h (default_read_memset_value): New prototype.
> >>>        (default_gen_memset_value): Likewise.
> >>>        * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
> >>>        Make it global.
> >>>        * config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
> >>>        New.
> >>>        (ix86_expand_vector_init_duplicate): Likewise.
> >>>        * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
> >>>        an argument to ignore stack_alignment_estimated.  It is passed
> >>>        as false by default.
> >>>        (ix86_gen_memset_value_from_prev): New function.
> >>>        (ix86_gen_memset_value): Likewise.
> >>>        (ix86_read_memset_value): Likewise.
> >>>        (TARGET_GEN_MEMSET_VALUE): New.
> >>>        (TARGET_READ_MEMSET_VALUE): Likewise.
> >>>        * config/i386/i386.h (SCRATCH_SSE_REG): New.
> >>>        * doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
> >>>        TARGET_GEN_MEMSET_VALUE hooks.
> >>>        * doc/tm.texi: Regenerated.
> >>>
> >>> gcc/testsuite/
> >>>
> >>>        PR middle-end/90773
> >>>        * gcc.target/i386/pr90773-15.c: New test.
> >>>        * gcc.target/i386/pr90773-16.c: Likewise.
> >>>        * gcc.target/i386/pr90773-17.c: Likewise.
> >>>        * gcc.target/i386/pr90773-18.c: Likewise.
> >>>        * gcc.target/i386/pr90773-19.c: Likewise.
> >> Why does this need target hooks?  ISTM the right way to go here is to
> >> just emit the constant load to the target register and let the target
> >> figure out how best to construct the constant into the register.  If
> >> that means load it via QImode and broadcast, that's fine, but I'm not
> >> sure why that's not all implemented in the target files.
> >>
> > I will submit a patch to add optabs instead.
> I may be missing something, but I'm not even sure why we need special
> optabs.
>
> Aren't you just trying to efficiently get a constant element broadcast
> across an entire vector?

Since vec_duplicate must not fail, and for a broadcast from a constant QImode
value vec_duplicate may not be faster than a compile-time constant, I am
adding vec_const_duplicate.  If vec_duplicate could fail, I wouldn't need
vec_const_duplicate.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-05-31 18:13         ` H.J. Lu
@ 2021-05-31 18:20           ` H.J. Lu
  2021-06-01  7:24             ` Richard Biener
  0 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2021-05-31 18:20 UTC (permalink / raw)
  To: Jeff Law; +Cc: GCC Patches, Richard Sandiford

On Mon, May 31, 2021 at 11:13 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 31, 2021 at 11:07 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> >
> >
> >
> > On 5/31/2021 6:04 AM, H.J. Lu wrote:
> > > On Sun, May 30, 2021 at 11:49 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> > >>
> > >>
> > >> On 5/11/2021 5:35 PM, H.J. Lu via Gcc-patches wrote:
> > >>> Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
> > >>> target instructions to duplicate QImode value to TImode/OImode/XImode
> > >>> value for memset.  Define SCRATCH_SSE_REG as a scratch register for
> > >>> ix86_gen_memset_value.
> > >>>
> > >>> gcc/
> > >>>
> > >>>        PR middle-end/90773
> > >>>        * builtins.c (builtin_memset_read_str): Call
> > >>>        targetm.read_memset_value.
> > >>>        (builtin_memset_gen_str): Call targetm.gen_memset_value.
> > >>>        * target.def (read_memset_value): New hook.
> > >>>        (gen_memset_value): Likewise.
> > >>>        * targhooks.c: Include "builtins.h".
> > >>>        (default_read_memset_value): New function.
> > >>>        (default_gen_memset_value): Likewise.
> > >>>        * targhooks.h (default_read_memset_value): New prototype.
> > >>>        (default_gen_memset_value): Likewise.
> > >>>        * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
> > >>>        Make it global.
> > >>>        * config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
> > >>>        New.
> > >>>        (ix86_expand_vector_init_duplicate): Likewise.
> > >>>        * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
> > >>>        an argument to ignore stack_alignment_estimated.  It is passed
> > >>>        as false by default.
> > >>>        (ix86_gen_memset_value_from_prev): New function.
> > >>>        (ix86_gen_memset_value): Likewise.
> > >>>        (ix86_read_memset_value): Likewise.
> > >>>        (TARGET_GEN_MEMSET_VALUE): New.
> > >>>        (TARGET_READ_MEMSET_VALUE): Likewise.
> > >>>        * config/i386/i386.h (SCRATCH_SSE_REG): New.
> > >>>        * doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
> > >>>        TARGET_GEN_MEMSET_VALUE hooks.
> > >>>        * doc/tm.texi: Regenerated.
> > >>>
> > >>> gcc/testsuite/
> > >>>
> > >>>        PR middle-end/90773
> > >>>        * gcc.target/i386/pr90773-15.c: New test.
> > >>>        * gcc.target/i386/pr90773-16.c: Likewise.
> > >>>        * gcc.target/i386/pr90773-17.c: Likewise.
> > >>>        * gcc.target/i386/pr90773-18.c: Likewise.
> > >>>        * gcc.target/i386/pr90773-19.c: Likewise.
> > >> Why does this need target hooks?  ISTM the right way to go here is to
> > >> just emit the constant load to the target register and let the target
> > >> figure out how best to construct the constant into the register.  If
> > >> that means load it via QImode and broadcast, that's fine, but I'm not
> > >> sure why that's not all implemented in the target files.
> > >>
> > > I will submit a patch to add optabs instead.
> > I may be missing something, but I'm not even sure why we need special
> > optabs.
> >
> > Aren't you just trying to efficiently get a constant element broadcast
> > across an entire vector?
>
> Since vec_duplicate must not fail and for broadcast from a constant QImode
> value, vec_duplicate may not be faster than a compile-time constant, I am
> adding vec_const_duplicate.   If vec_duplicate can fail, I don't need
> vec_const_duplicate.
>
> --
> H.J.


For

extern void *ops;

void
foo (int c)
{
  __builtin_memset (ops, 4, 32);
}

without  vec_const_duplicate, I got

movl $4, %eax
movq ops(%rip), %rdx
movd %eax, %xmm0
punpcklbw %xmm0, %xmm0
punpcklwd %xmm0, %xmm0
pshufd $0, %xmm0, %xmm0
movups %xmm0, (%rdx)
movups %xmm0, 16(%rdx)
ret

with vec_const_duplicate, I got

movq ops(%rip), %rax
movdqa .LC0(%rip), %xmm0
movups %xmm0, (%rax)
movups %xmm0, 16(%rax)
ret

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  2021-05-31 18:20           ` H.J. Lu
@ 2021-06-01  7:24             ` Richard Biener
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Biener @ 2021-06-01  7:24 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jeff Law, Richard Sandiford, GCC Patches

On Mon, May 31, 2021 at 8:33 PM H.J. Lu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, May 31, 2021 at 11:13 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, May 31, 2021 at 11:07 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> > >
> > >
> > >
> > > On 5/31/2021 6:04 AM, H.J. Lu wrote:
> > > > On Sun, May 30, 2021 at 11:49 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> > > >>
> > > >>
> > > >> On 5/11/2021 5:35 PM, H.J. Lu via Gcc-patches wrote:
> > > >>> Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
> > > >>> target instructions to duplicate QImode value to TImode/OImode/XImode
> > > >>> value for memset.  Define SCRATCH_SSE_REG as a scratch register for
> > > >>> ix86_gen_memset_value.
> > > >>>
> > > >>> gcc/
> > > >>>
> > > >>>        PR middle-end/90773
> > > >>>        * builtins.c (builtin_memset_read_str): Call
> > > >>>        targetm.read_memset_value.
> > > >>>        (builtin_memset_gen_str): Call targetm.gen_memset_value.
> > > >>>        * target.def (read_memset_value): New hook.
> > > >>>        (gen_memset_value): Likewise.
> > > >>>        * targhooks.c: Include "builtins.h".
> > > >>>        (default_read_memset_value): New function.
> > > >>>        (default_gen_memset_value): Likewise.
> > > >>>        * targhooks.h (default_read_memset_value): New prototype.
> > > >>>        (default_gen_memset_value): Likewise.
> > > >>>        * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
> > > >>>        Make it global.
> > > >>>        * config/i386/i386-protos.h (ix86_minimum_incoming_stack_boundary):
> > > >>>        New.
> > > >>>        (ix86_expand_vector_init_duplicate): Likewise.
> > > >>>        * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): Add
> > > >>>        an argument to ignore stack_alignment_estimated.  It is passed
> > > >>>        as false by default.
> > > >>>        (ix86_gen_memset_value_from_prev): New function.
> > > >>>        (ix86_gen_memset_value): Likewise.
> > > >>>        (ix86_read_memset_value): Likewise.
> > > >>>        (TARGET_GEN_MEMSET_VALUE): New.
> > > >>>        (TARGET_READ_MEMSET_VALUE): Likewise.
> > > >>>        * config/i386/i386.h (SCRATCH_SSE_REG): New.
> > > >>>        * doc/tm.texi.in: Add TARGET_READ_MEMSET_VALUE and
> > > >>>        TARGET_GEN_MEMSET_VALUE hooks.
> > > >>>        * doc/tm.texi: Regenerated.
> > > >>>
> > > >>> gcc/testsuite/
> > > >>>
> > > >>>        PR middle-end/90773
> > > >>>        * gcc.target/i386/pr90773-15.c: New test.
> > > >>>        * gcc.target/i386/pr90773-16.c: Likewise.
> > > >>>        * gcc.target/i386/pr90773-17.c: Likewise.
> > > >>>        * gcc.target/i386/pr90773-18.c: Likewise.
> > > >>>        * gcc.target/i386/pr90773-19.c: Likewise.
> > > >> Why does this need target hooks?  ISTM the right way to go here is to
> > > >> just emit the constant load to the target register and let the target
> > > >> figure out how best to construct the constant into the register.  If
> > > >> that means load it via QImode and broadcast, that's fine, but I'm not
> > > >> sure why that's not all implemented in the target files.
> > > >>
> > > > I will submit a patch to add optabs instead.
> > > I may be missing something, but I'm not even sure why we need special
> > > optabs.
> > >
> > > Aren't you just trying to efficiently get a constant element broadcast
> > > across an entire vector?
> >
> > Since vec_duplicate must not fail and for broadcast from a constant QImode
> > value, vec_duplicate may not be faster than a compile-time constant, I am
> > adding vec_const_duplicate.   If vec_duplicate can fail, I don't need
> > vec_const_duplicate.
> >
> > --
> > H.J.
>
>
> For
>
> extern void *ops;
>
> void
> foo (int c)
> {
>   __builtin_memset (ops, 4, 32);
> }
>
> without  vec_const_duplicate, I got
>
> movl $4, %eax
> movq ops(%rip), %rdx
> movd %eax, %xmm0
> punpcklbw %xmm0, %xmm0
> punpcklwd %xmm0, %xmm0
> pshufd $0, %xmm0, %xmm0
> movups %xmm0, (%rdx)
> movups %xmm0, 16(%rdx)
> ret
>
> with vec_const_duplicate, I got
>
> movq ops(%rip), %rax
> movdqa .LC0(%rip), %xmm0
> movups %xmm0, (%rax)
> movups %xmm0, 16(%rax)
> ret

But you can construct the duplicated constant at compile-time?
I thought the issue was that a constant pool load is _not_ the
most efficient variant?

>
> --
> H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2021-06-01  7:24 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-11 23:35 [PATCH v2 00/11] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
2021-05-11 23:35 ` [PATCH v2 01/11] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
2021-05-30 18:49   ` Jeff Law
2021-05-31 12:04     ` H.J. Lu
2021-05-31 18:07       ` Jeff Law
2021-05-31 18:13         ` H.J. Lu
2021-05-31 18:20           ` H.J. Lu
2021-06-01  7:24             ` Richard Biener
2021-05-11 23:35 ` [PATCH v2 02/11] x86: Avoid stack realignment when copying data H.J. Lu
2021-05-11 23:35 ` [PATCH v2 03/11] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
2021-05-12  9:00   ` Richard Biener
2021-05-12 12:06     ` H.J. Lu
2021-05-12 12:15       ` Richard Biener
2021-05-11 23:35 ` [PATCH v2 04/11] x86: Update piecewise move and store H.J. Lu
2021-05-11 23:35 ` [PATCH v2 05/11] x86: Add AVX2 tests for PR middle-end/90773 H.J. Lu
2021-05-11 23:35 ` [PATCH v2 06/11] x86: Add tests for piecewise move and store H.J. Lu
2021-05-11 23:35 ` [PATCH v2 07/11] x86: Also pass -mno-avx to pr72839.c H.J. Lu
2021-05-11 23:35 ` [PATCH v2 08/11] x86: Also pass -mno-avx to cold-attribute-1.c H.J. Lu
2021-05-11 23:35 ` [PATCH v2 09/11] x86: Also pass -mno-avx to sw-1.c for ia32 H.J. Lu
2021-05-11 23:35 ` [PATCH v2 10/11] x86: Update gcc.target/i386/incoming-11.c H.J. Lu
2021-05-11 23:35 ` [PATCH v2 11/11] constructor: Check if it is faster to load constant from memory H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).