public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/109537] New: Improve code generation for dynamic loop unrolling
@ 2023-04-17 13:24 helijia.i at foxmail dot com
2023-04-17 13:32 ` [Bug middle-end/109537] " helijia.i at foxmail dot com
2023-04-17 16:40 ` pinskia at gcc dot gnu.org
0 siblings, 2 replies; 3+ messages in thread
From: helijia.i at foxmail dot com @ 2023-04-17 13:24 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109537
Bug ID: 109537
Summary: Improve code generation for dynamic loop unrolling
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: helijia.i at foxmail dot com
Target Milestone: ---
For the current dynamic loop unrolling implementation, we will try to do the
following transform:
'''
for (i = 0; i < n; i++)
body;
==> (LOOP->LPT_DECISION.TIMES == 3)
i = 0;
mod = n % 4;
switch (mod)
{
case 3:
body; i++;
case 2:
body; i++;
case 1:
body; i++;
case 0: ;
}
while (i < n)
{
body; i++;
body; i++;
body; i++;
body; i++;
}
'''
It would be better if we could carry out loop unrolling in the following way
(where R is the number of unrolled copies of the body):
'''
i=0; if (i > n-R-1) goto remain (not needed with loop bounds as shown)
for(; i< n-R-1; i+= R)
{
body;
body;
...
body; // R times
}
remain:
if (i < n)
for(; i < n; i++)
body
'''
For the following sample code:
'''
void matrix_add_const(int N1, int* A, int val)
{
int i, j;
for (j = 0; j < N1; j++)
A[j] += val;
}
'''
gcc generates more jump instructions than clang.
gcc's assembly
'''
$ cat unroll.gcc.s
.file "unroll.c"
.text
.p2align 4
.globl matrix_add_const
.type matrix_add_const, @function
matrix_add_const:
.LFB0:
.cfi_startproc
testl %edi, %edi
jle .L1
movslq %edi, %rdi
leaq -4(,%rdi,4), %rax
leaq (%rsi,%rdi,4), %rcx
shrq $2, %rax
addq $1, %rax
andl $7, %eax
je .L3
cmpq $1, %rax
je .L26
cmpq $2, %rax
je .L27
cmpq $3, %rax
je .L28
cmpq $4, %rax
je .L29
cmpq $5, %rax
je .L30
cmpq $6, %rax
jne .L41
.L31:
addl %edx, (%rsi)
addq $4, %rsi
.L30:
addl %edx, (%rsi)
addq $4, %rsi
.L29:
addl %edx, (%rsi)
addq $4, %rsi
.L28:
addl %edx, (%rsi)
addq $4, %rsi
.L27:
addl %edx, (%rsi)
addq $4, %rsi
.L26:
addl %edx, (%rsi)
addq $4, %rsi
cmpq %rcx, %rsi
je .L42
.L3:
addl %edx, (%rsi)
addl %edx, 4(%rsi)
addl %edx, 8(%rsi)
addl %edx, 12(%rsi)
addl %edx, 16(%rsi)
addl %edx, 20(%rsi)
addl %edx, 24(%rsi)
addl %edx, 28(%rsi)
addq $32, %rsi
cmpq %rcx, %rsi
jne .L3
.L1:
ret
.p2align 4,,10
.p2align 3
.L41:
addl %edx, (%rsi)
addq $4, %rsi
jmp .L31
.L42:
ret
.cfi_endproc
.LFE0:
.size matrix_add_const, .-matrix_add_const
.ident "GCC: (GNU) 13.0.0 20221022 (experimental)"
.section .note.GNU-stack,"",@progbits
'''
clang's assembly
'''
$ cat unroll.clang.s
.text
.file "unroll.c"
.globl matrix_add_const # -- Begin function matrix_add_const
.p2align 4, 0x90
.type matrix_add_const,@function
matrix_add_const: # @matrix_add_const
.cfi_startproc
# %bb.0:
testl %edi, %edi
jle .LBB0_11
# %bb.1:
movl %edi, %r9d
cmpl $8, %edi
jae .LBB0_3
# %bb.2:
xorl %ecx, %ecx
jmp .LBB0_10
.LBB0_3:
movl %r9d, %ecx
andl $-8, %ecx
movd %edx, %xmm0
pshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
leaq -8(%rcx), %rax
movq %rax, %rdi
shrq $3, %rdi
addq $1, %rdi
movl %edi, %r8d
andl $1, %r8d
testq %rax, %rax
je .LBB0_4
# %bb.5:
movq %r8, %rax
subq %rdi, %rax
xorl %edi, %edi
.p2align 4, 0x90
.LBB0_6: # =>This Inner Loop Header: Depth=1
movdqu (%rsi,%rdi,4), %xmm1
movdqu 16(%rsi,%rdi,4), %xmm2
movdqu 32(%rsi,%rdi,4), %xmm3
movdqu 48(%rsi,%rdi,4), %xmm4
paddd %xmm0, %xmm1
paddd %xmm0, %xmm2
movdqu %xmm1, (%rsi,%rdi,4)
movdqu %xmm2, 16(%rsi,%rdi,4)
paddd %xmm0, %xmm3
paddd %xmm0, %xmm4
movdqu %xmm3, 32(%rsi,%rdi,4)
movdqu %xmm4, 48(%rsi,%rdi,4)
addq $16, %rdi
addq $2, %rax
jne .LBB0_6
# %bb.7:
testq %r8, %r8
je .LBB0_9
.LBB0_8:
movdqu (%rsi,%rdi,4), %xmm1
movdqu 16(%rsi,%rdi,4), %xmm2
paddd %xmm0, %xmm1
paddd %xmm0, %xmm2
movdqu %xmm1, (%rsi,%rdi,4)
movdqu %xmm2, 16(%rsi,%rdi,4)
.LBB0_9:
cmpq %r9, %rcx
je .LBB0_11
.p2align 4, 0x90
.LBB0_10: # =>This Inner Loop Header: Depth=1
addl %edx, (%rsi,%rcx,4)
addq $1, %rcx
cmpq %rcx, %r9
jne .LBB0_10
.LBB0_11:
retq
.LBB0_4:
xorl %edi, %edi
testq %r8, %r8
jne .LBB0_8
jmp .LBB0_9
.Lfunc_end0:
.size matrix_add_const, .Lfunc_end0-matrix_add_const
.cfi_endproc
# -- End function
.ident "clang version 10.0.0-4ubuntu1 "
.section ".note.GNU-stack","",@progbits
.addrsig
'''
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug middle-end/109537] Improve code generation for dynamic loop unrolling
2023-04-17 13:24 [Bug middle-end/109537] New: Improve code generation for dynamic loop unrolling helijia.i at foxmail dot com
@ 2023-04-17 13:32 ` helijia.i at foxmail dot com
2023-04-17 16:40 ` pinskia at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: helijia.i at foxmail dot com @ 2023-04-17 13:32 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109537
--- Comment #1 from Li Jia He <helijia.i at foxmail dot com> ---
Updating gcc's assembly, because gcc did not turn on auto-vectorization at -O2.
The compilation command for gcc is 'cc1 unroll.c -O3 -funroll-loops'.
The compilation command for clang is 'clang unroll.c -O3 -S'.
'''
$ cat unroll.gcc.s
.file "unroll.c"
.text
.p2align 4
.globl matrix_add_const
.type matrix_add_const, @function
matrix_add_const:
.LFB0:
.cfi_startproc
movl %edi, %ecx
movl %edx, %edi
testl %ecx, %ecx
jle .L1
leal -1(%rcx), %eax
cmpl $2, %eax
jbe .L6
movl %ecx, %r9d
movd %edx, %xmm2
movq %rsi, %r10
shrl $2, %r9d
pshufd $0, %xmm2, %xmm0
salq $4, %r9
leaq (%r9,%rsi), %r8
subq $16, %r9
shrq $4, %r9
addq $1, %r9
andl $7, %r9d
je .L4
cmpq $1, %r9
je .L29
cmpq $2, %r9
je .L30
cmpq $3, %r9
je .L31
cmpq $4, %r9
je .L32
cmpq $5, %r9
je .L33
cmpq $6, %r9
jne .L45
.L34:
movdqu (%r10), %xmm3
addq $16, %r10
paddd %xmm0, %xmm3
movups %xmm3, -16(%r10)
.L33:
movdqu (%r10), %xmm4
addq $16, %r10
paddd %xmm0, %xmm4
movups %xmm4, -16(%r10)
.L32:
movdqu (%r10), %xmm5
addq $16, %r10
paddd %xmm0, %xmm5
movups %xmm5, -16(%r10)
.L31:
movdqu (%r10), %xmm6
addq $16, %r10
paddd %xmm0, %xmm6
movups %xmm6, -16(%r10)
.L30:
movdqu (%r10), %xmm7
addq $16, %r10
paddd %xmm0, %xmm7
movups %xmm7, -16(%r10)
.L29:
movdqu (%r10), %xmm8
addq $16, %r10
paddd %xmm0, %xmm8
movups %xmm8, -16(%r10)
cmpq %r8, %r10
je .L43
.L4:
movdqu (%r10), %xmm9
movdqu 16(%r10), %xmm10
subq $-128, %r10
movdqu -96(%r10), %xmm11
movdqu -80(%r10), %xmm12
movdqu -64(%r10), %xmm13
paddd %xmm0, %xmm9
paddd %xmm0, %xmm10
movdqu -48(%r10), %xmm14
movdqu -32(%r10), %xmm15
movdqu -16(%r10), %xmm2
paddd %xmm0, %xmm11
paddd %xmm0, %xmm12
paddd %xmm0, %xmm13
paddd %xmm0, %xmm14
movups %xmm9, -128(%r10)
paddd %xmm0, %xmm15
paddd %xmm0, %xmm2
movups %xmm10, -112(%r10)
movups %xmm11, -96(%r10)
movups %xmm12, -80(%r10)
movups %xmm13, -64(%r10)
movups %xmm14, -48(%r10)
movups %xmm15, -32(%r10)
movups %xmm2, -16(%r10)
cmpq %r8, %r10
jne .L4
.L43:
movl %ecx, %edx
andl $-4, %edx
testb $3, %cl
je .L46
.L3:
movslq %edx, %r11
leal 1(%rdx), %eax
salq $2, %r11
addl %edi, (%rsi,%r11)
cmpl %eax, %ecx
jle .L1
addl $2, %edx
addl %edi, 4(%rsi,%r11)
cmpl %edx, %ecx
jle .L1
addl %edi, 8(%rsi,%r11)
.L1:
ret
.p2align 4,,10
.p2align 3
.L46:
ret
.p2align 4,,10
.p2align 3
.L45:
movdqu (%rsi), %xmm1
leaq 16(%rsi), %r10
paddd %xmm0, %xmm1
movups %xmm1, (%rsi)
jmp .L34
.L6:
xorl %edx, %edx
jmp .L3
.cfi_endproc
.LFE0:
.size matrix_add_const, .-matrix_add_const
.ident "GCC: (GNU) 13.0.0 20221022 (experimental)"
.section .note.GNU-stack,"",@progbits
'''
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug middle-end/109537] Improve code generation for dynamic loop unrolling
2023-04-17 13:24 [Bug middle-end/109537] New: Improve code generation for dynamic loop unrolling helijia.i at foxmail dot com
2023-04-17 13:32 ` [Bug middle-end/109537] " helijia.i at foxmail dot com
@ 2023-04-17 16:40 ` pinskia at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-04-17 16:40 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109537
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Keywords| |missed-optimization
Severity|normal |enhancement
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2023-04-17 16:40 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-17 13:24 [Bug middle-end/109537] New: Improve code generation for dynamic loop unrolling helijia.i at foxmail dot com
2023-04-17 13:32 ` [Bug middle-end/109537] " helijia.i at foxmail dot com
2023-04-17 16:40 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).