public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations
@ 2021-04-29 12:54 H.J. Lu
  2021-04-29 12:54 ` [PATCH 01/12] Update alignment_for_piecewise_move H.J. Lu
                   ` (11 more replies)
  0 siblings, 12 replies; 28+ messages in thread
From: H.J. Lu @ 2021-04-29 12:54 UTC (permalink / raw)
  To: gcc-patches

The maximum size of the current op_by_pieces operations is limited by
MAX_FIXED_MODE_SIZE, which is an integer expression for the size in bits
of the largest integer machine mode that should actually be used.  But
a target can use TImode/OImode/XImode, which can be larger than
MAX_FIXED_MODE_SIZE, to perform op_by_pieces operations.  Here is a
set of patches to remove this limitation so that TImode/OImode/XImode
can be used for piecewise move and store:

1. Remove MAX_FIXED_MODE_SIZE limit in alignment_for_piecewise_move.
2. Allow generating a pseudo register with a specific alignment for hard
registers which will never be spilled onto the stack, to avoid re-aligning
the stack.
3. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.
4. x86: Avoid stack realignment when copying data
5. x86: Remove MAX_BITSIZE_MODE_ANY_INT.  Only the x86 backend defines it.
6. x86: Use TImode/OImode/XImode integers for piecewise move and store.
7. x86: Add tests for TImode/OImode/XImode for piecewise move and store.
8. x86: Adjust existing tests.

On x86-64, SPEC CPU 2017 performance impact is neutral.  Glibc code size
differences with -O2 build are:

             Before         After
libc.so     1870718        1870222
ld.so        185120         184984

Some code sequence differences in libc.so are:

            Before                                                           After
	mov    0x10(%rsp),%edx						mov    0x10(%rsp),%edx
	mov    %edx,(%rax)						mov    %edx,(%rax)
	movzwl 0x14(%rsp),%edx				      |		mov    0x13(%rsp),%edx
	mov    %dx,0x4(%rax)				      |		mov    %edx,0x3(%rax)
	movzbl 0x16(%rsp),%edx				      <
	mov    %dl,0x6(%rax)				      <
	add    %rcx,%rax						add    %rcx,%rax
	ret    								ret    

	movdqu (%rsi),%xmm1				      |		movdqu (%rcx),%xmm1
	mov    %rdi,0x20(%rsp)						mov    %rdi,0x20(%rsp)
	movups %xmm1,(%rax)						movups %xmm1,(%rax)
	mov    0x10(%rsi),%rdx				      |		movdqu 0xc(%rcx),%xmm2
	mov    %rdx,0x10(%rax)				      |		movups %xmm2,0xc(%rax)
	mov    0x18(%rsi),%edx				      |		mov    %rax,(%r14,%rdx,8)
	mov    %edx,0x18(%rax)				      |		add    $0x1,%rdx
	mov    %rax,(%r14,%rcx,8)			      |		cmp    %r8,%rdx
	add    $0x1,%rcx				      |		je     <__resolv_conf_allocate+0x22d>
	cmp    %r8,%rcx					      |		mov    0x20(%rsp),%rsi
	je     <__resolv_conf_allocate+0x22f>		      |		mov    (%r9,%rdx,8),%rcx

	test   %eax,%eax						test   %eax,%eax
	mov    $0xff,%eax						mov    $0xff,%eax
	cmove  %eax,%ebx						cmove  %eax,%ebx
	movzbl %bl,%ecx					      |		movd   %ebx,%xmm0
	mov    %ebx,0xc(%rsp)						mov    %ebx,0xc(%rsp)
	mov    %rcx,%rax				      |		punpcklbw %xmm0,%xmm0
	mov    %rcx,%rsi				      |		punpcklwd %xmm0,%xmm0
	mul    %rdi					      |		pshufd $0x0,%xmm0,%xmm0
	imul   %rdi,%rsi				      |		movups %xmm0,0x50(%r12)
	mov    %rax,0x50(%r12)				      |		movups %xmm0,0x60(%r12)
	mov    %rcx,%rax				      |		movups %xmm0,0x70(%r12)
	add    %rdx,%rsi				      |		movups %xmm0,0x80(%r12)
	mul    %rdi					      |		movups %xmm0,0x90(%r12)
	mov    %rsi,0x58(%r12)				      |		movups %xmm0,0xa0(%r12)
	mov    %rsi,0x68(%r12)				      |		movups %xmm0,0xb0(%r12)
	mov    %rax,0x60(%r12)				      |		movups %xmm0,0xc0(%r12)
	mov    %rcx,%rax				      |		movups %xmm0,0xd0(%r12)
	mul    %rdi					      |		movups %xmm0,0xe0(%r12)
	mov    %rsi,0x78(%r12)				      |		movups %xmm0,0xf0(%r12)
	mov    %rsi,0x88(%r12)				      |		movups %xmm0,0x100(%r12)
	mov    %rsi,0x98(%r12)				      |		movups %xmm0,0x110(%r12)
	mov    %rax,0x70(%r12)				      |		movups %xmm0,0x120(%r12)
	mov    %rcx,%rax				      |		movups %xmm0,0x130(%r12)
	mul    %rdi					      |		movups %xmm0,0x140(%r12)
	mov    %rsi,0xa8(%r12)				      <
	mov    %rsi,0xb8(%r12)				      <
	mov    %rsi,0xc8(%r12)				      <
	mov    %rax,0x80(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rsi,0xd8(%r12)				      <
	mov    %rsi,0xe8(%r12)				      <
	mov    %rsi,0xf8(%r12)				      <
	mov    %rax,0x90(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rsi,0x108(%r12)				      <
	mov    %rsi,0x118(%r12)				      <
	mov    %rsi,0x128(%r12)				      <
	mov    %rax,0xa0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rsi,0x138(%r12)				      <
	mov    %rax,0xb0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xc0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xd0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xe0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0xf0(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x100(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x110(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x120(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %rax,0x130(%r12)				      <
	mov    %rcx,%rax				      <
	mul    %rdi					      <
	mov    %r12,%rdi				      <
	mov    %rax,0x140(%r12)				      <
	mov    %rsi,0x148(%r12)				      <
	call   <xprt_register@GLIBC_2.2.5>				call   <xprt_register@GLIBC_2.2.5>
	add    $0x28,%rsp						add    $0x28,%rsp
	mov    %r12,%rax						mov    %r12,%rax
	pop    %rbx							pop    %rbx
	pop    %rbp							pop    %rbp
	pop    %r12							pop    %r12
	pop    %r13							pop    %r13
	pop    %r14							pop    %r14
	pop    %r15							pop    %r15
	ret    								ret    

H.J. Lu (12):
  Update alignment_for_piecewise_move
  Allow generating pseudo register with specific alignment
  Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
  x86: Avoid stack realignment when copying data
  Remove MAX_BITSIZE_MODE_ANY_INT
  x86: Update piecewise move and store
  x86: Add AVX2 tests for PR middle-end/90773
  x86: Add tests for piecewise move and store
  x86: Also pass -mno-avx to pr72839.c
  x86: Also pass -mno-avx to cold-attribute-1.c
  x86: Also pass -mno-avx to sw-1.c for ia32
  x86: Update gcc.target/i386/incoming-11.c

 gcc/builtins.c                                |  45 +--
 gcc/config/i386/i386-expand.c                 |   9 +-
 gcc/config/i386/i386-modes.def                |  15 +-
 gcc/config/i386/i386-protos.h                 |   2 +
 gcc/config/i386/i386.c                        | 257 +++++++++++++++++-
 gcc/config/i386/i386.h                        |  31 ++-
 gcc/doc/tm.texi                               |  16 ++
 gcc/doc/tm.texi.in                            |   4 +
 gcc/emit-rtl.c                                |   5 +-
 gcc/explow.c                                  |   6 +-
 gcc/explow.h                                  |   2 +-
 gcc/expr.c                                    |  13 +-
 gcc/expr.h                                    |   6 +-
 gcc/rtl.h                                     |   2 +-
 gcc/target.def                                |  20 ++
 gcc/targhooks.c                               |  54 ++++
 gcc/targhooks.h                               |   4 +
 .../gcc.target/i386/cold-attribute-1.c        |   2 +-
 gcc/testsuite/gcc.target/i386/incoming-11.c   |   2 +-
 .../gcc.target/i386/pieces-memcpy-10.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-11.c        |  17 ++
 .../gcc.target/i386/pieces-memcpy-12.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-13.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-14.c        |  17 ++
 .../gcc.target/i386/pieces-memcpy-15.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-16.c        |  16 ++
 .../gcc.target/i386/pieces-memcpy-7.c         |  15 +
 .../gcc.target/i386/pieces-memcpy-8.c         |  14 +
 .../gcc.target/i386/pieces-memcpy-9.c         |  14 +
 .../gcc.target/i386/pieces-memset-1.c         |  16 ++
 .../gcc.target/i386/pieces-memset-10.c        |  16 ++
 .../gcc.target/i386/pieces-memset-11.c        |  16 ++
 .../gcc.target/i386/pieces-memset-12.c        |  16 ++
 .../gcc.target/i386/pieces-memset-13.c        |  16 ++
 .../gcc.target/i386/pieces-memset-14.c        |  16 ++
 .../gcc.target/i386/pieces-memset-15.c        |  16 ++
 .../gcc.target/i386/pieces-memset-16.c        |  16 ++
 .../gcc.target/i386/pieces-memset-17.c        |  16 ++
 .../gcc.target/i386/pieces-memset-18.c        |  16 ++
 .../gcc.target/i386/pieces-memset-19.c        |  17 ++
 .../gcc.target/i386/pieces-memset-2.c         |  12 +
 .../gcc.target/i386/pieces-memset-20.c        |  17 ++
 .../gcc.target/i386/pieces-memset-21.c        |  17 ++
 .../gcc.target/i386/pieces-memset-22.c        |  17 ++
 .../gcc.target/i386/pieces-memset-23.c        |  17 ++
 .../gcc.target/i386/pieces-memset-24.c        |  17 ++
 .../gcc.target/i386/pieces-memset-25.c        |  17 ++
 .../gcc.target/i386/pieces-memset-26.c        |  17 ++
 .../gcc.target/i386/pieces-memset-27.c        |  17 ++
 .../gcc.target/i386/pieces-memset-28.c        |  17 ++
 .../gcc.target/i386/pieces-memset-29.c        |  17 ++
 .../gcc.target/i386/pieces-memset-3.c         |  18 ++
 .../gcc.target/i386/pieces-memset-30.c        |  17 ++
 .../gcc.target/i386/pieces-memset-31.c        |  17 ++
 .../gcc.target/i386/pieces-memset-32.c        |  17 ++
 .../gcc.target/i386/pieces-memset-33.c        |  17 ++
 .../gcc.target/i386/pieces-memset-34.c        |  17 ++
 .../gcc.target/i386/pieces-memset-35.c        |  17 ++
 .../gcc.target/i386/pieces-memset-36.c        |  17 ++
 .../gcc.target/i386/pieces-memset-37.c        |  15 +
 .../gcc.target/i386/pieces-memset-38.c        |  17 ++
 .../gcc.target/i386/pieces-memset-39.c        |  16 ++
 .../gcc.target/i386/pieces-memset-4.c         |  16 ++
 .../gcc.target/i386/pieces-memset-40.c        |  17 ++
 .../gcc.target/i386/pieces-memset-41.c        |  16 ++
 .../gcc.target/i386/pieces-memset-42.c        |  17 ++
 .../gcc.target/i386/pieces-memset-43.c        |  17 ++
 .../gcc.target/i386/pieces-memset-5.c         |  12 +
 .../gcc.target/i386/pieces-memset-6.c         |  16 ++
 .../gcc.target/i386/pieces-memset-7.c         |  16 ++
 .../gcc.target/i386/pieces-memset-8.c         |  16 ++
 .../gcc.target/i386/pieces-memset-9.c         |  16 ++
 gcc/testsuite/gcc.target/i386/pr72839.c       |   2 +-
 gcc/testsuite/gcc.target/i386/pr90773-1.c     |  10 +-
 gcc/testsuite/gcc.target/i386/pr90773-14.c    |   2 +-
 gcc/testsuite/gcc.target/i386/pr90773-15.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-16.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-17.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-18.c    |  15 +
 gcc/testsuite/gcc.target/i386/pr90773-19.c    |  14 +
 gcc/testsuite/gcc.target/i386/pr90773-20.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-21.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-22.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-23.c    |  13 +
 gcc/testsuite/gcc.target/i386/pr90773-4.c     |   2 +-
 gcc/testsuite/gcc.target/i386/sw-1.c          |   1 +
 86 files changed, 1404 insertions(+), 91 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c

-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-05-11  6:06 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-29 12:54 [PATCH 00/12] Allow TImode/OImode/XImode in op_by_pieces operations H.J. Lu
2021-04-29 12:54 ` [PATCH 01/12] Update alignment_for_piecewise_move H.J. Lu
2021-04-30  8:59   ` Richard Sandiford
2021-04-29 12:54 ` [PATCH 02/12] Allow generating pseudo register with specific alignment H.J. Lu
2021-04-30  9:06   ` Richard Sandiford
2021-04-30 12:06     ` H.J. Lu
2021-04-30 12:42       ` Richard Sandiford
2021-04-30 12:49         ` H.J. Lu
2021-04-30 13:34           ` H.J. Lu
2021-04-30 15:56             ` Richard Sandiford
2021-04-30 17:33               ` H.J. Lu
2021-05-03  8:18               ` Richard Biener
2021-05-10  9:39                 ` Richard Sandiford
2021-05-10 13:29                   ` H.J. Lu
2021-05-10 13:59                     ` Richard Biener
2021-05-10 14:11                       ` H.J. Lu
2021-05-10 16:23                         ` Richard Sandiford
2021-05-11  6:06                         ` Richard Biener
2021-04-29 12:54 ` [PATCH 03/12] Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE H.J. Lu
2021-04-29 12:54 ` [PATCH 04/12] x86: Avoid stack realignment when copying data H.J. Lu
2021-04-29 12:54 ` [PATCH 05/12] Remove MAX_BITSIZE_MODE_ANY_INT H.J. Lu
2021-04-29 12:54 ` [PATCH 06/12] x86: Update piecewise move and store H.J. Lu
2021-04-29 12:54 ` [PATCH 07/12] x86: Add AVX2 tests for PR middle-end/90773 H.J. Lu
2021-04-29 12:54 ` [PATCH 08/12] x86: Add tests for piecewise move and store H.J. Lu
2021-04-29 12:54 ` [PATCH 09/12] x86: Also pass -mno-avx to pr72839.c H.J. Lu
2021-04-29 12:54 ` [PATCH 10/12] x86: Also pass -mno-avx to cold-attribute-1.c H.J. Lu
2021-04-29 12:54 ` [PATCH 11/12] x86: Also pass -mno-avx to sw-1.c for ia32 H.J. Lu
2021-04-29 12:54 ` [PATCH 12/12] x86: Update gcc.target/i386/incoming-11.c H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).