[ARM]Extra load store/instructions compared to gcc-3.4

public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed

* [ARM]Extra load store/instructions compared to gcc-3.4
@ 2012-04-25 12:17 Alexey Kravets
  2012-04-25 13:00 ` Alexander Monakov
  0 siblings, 1 reply; 3+ messages in thread
From: Alexey Kravets @ 2012-04-25 12:17 UTC (permalink / raw)
  To: gcc

[-- Attachment #1: Type: text/plain, Size: 761 bytes --]

Hi guys,
I have a test case (shell sort, see attached) compiled with different
ARM compilers:
GCC-4.6.3, GCC-3.4.6, and ARMCC.

Both ARMCC and GCC-3.4.6  generate quite optimal assembly while GCC-4.6.3
inserts extra load/store instructions compared to the other compilers.

Could the use of the SSA representation in modern GCC be the reason for this?

If so, has anyone tried to do something about it?

The generated assembly listings are attached:
sort-3.4.s: Assembly generated by GCC-3.4.6
sort-4.6.3.s: Assembly generated by GCC-4.6.3
sort-armcc.s: Assembly generated by ARMCC

% armcc
ARM C/C++ Compiler, 4.1 [Build 713]

The file has been compiled with the following options:
for GCC:
-O3
for ARMCC:
-O3 -Otime


-- 
Alexey Kravets
mr.kayrick@gmail.com

[-- Attachment #2: sort-3.4.s --]
[-- Type: text/plain, Size: 924 bytes --]

	@ shell_sort(char *strings[], int n) as emitted by GCC 3.4.6 (see .ident).
	@ On entry r0 = strings and r1 = n.  Register roles, matched against the
	@ C source of shell_sort:
	@   fp = n          r7 = strings - 4 bytes (base for 1-origin indexing)
	@   r8 = h          r9 = i
	@   r4 = j          r6 = j - h
	@   sl = v          r5 = strings[j-h]
	@ Apart from the saved registers, no loop state is kept on the stack.
	.file	"sort.i"
	.global	__divsi3
	.text
	.align	2
	.global	shell_sort
	.type	shell_sort, %function
shell_sort:
	@ args = 0, pretend = 0, frame = 0
	@ frame_needed = 0, uses_anonymous_args = 0
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
	@ fp = n; r7 = strings - 1 element ("strings--"); h = 1.
	mov	fp, r1
	sub	r7, r0, #4
	mov	r8, #1
.L2:
	@ do { h = h * 3 + 1; } while (h <= n);   (r3 = h + 2*h is a temporary)
	add	r3, r8, r8, asl #1
	add	r8, r3, #1
	cmp	r8, fp
	ble	.L2
.L17:
	@ Outer do-loop: h = h / 3 through the __divsi3 helper (r0 = h, r1 = 3),
	@ then i = h + 1.  Branch past the for-loop when i > n.
	mov	r0, r8
	mov	r1, #3
	bl	__divsi3
	add	r9, r0, #1
	cmp	r9, fp
	mov	r8, r0
	bgt	.L16
.L26:
	@ for-loop body: v = strings[i]; j = i; enter the while-loop at its test.
	ldr	sl, [r7, r9, asl #2]
	mov	r4, r9
	b	.L11
.L25:
	@ while-loop body: r5 = strings[j-h]; call strcmp(r5, v) (r1 = v was set
	@ at .L11).  Leave the loop when the result is <= 0, otherwise
	@ strings[j] = strings[j-h] and j = j - h.
	ldr	r5, [r7, r6, asl #2]
	mov	r0, r5
	bl	strcmp
	cmp	r0, #0
	ble	.L12
	str	r5, [r7, r4, asl #2]
	mov	r4, r6
.L11:
	@ while-loop test: compute r6 = j - h and r1 = v, continue while j > h.
	cmp	r4, r8
	rsb	r6, r8, r4
	mov	r1, sl
	bgt	.L25
.L12:
	@ strings[j] = v; i++; repeat the for-loop while i <= n.
	add	r9, r9, #1
	cmp	r9, fp
	str	sl, [r7, r4, asl #2]
	ble	.L26
.L16:
	@ while (h > 1): repeat the outer loop, otherwise restore and return.
	cmp	r8, #1
	bgt	.L17
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
	.size	shell_sort, .-shell_sort
	.ident	"GCC: (GNU) 3.4.6"

[-- Attachment #3: shell_sort.c --]
[-- Type: text/x-csrc, Size: 535 bytes --]

#include <string.h>
/*
 * Sort the first n entries of strings[] into ascending strcmp() order
 * using Shell sort with the gap sequence ..., 40, 13, 4, 1.
 *
 * strings: array of at least n pointers to NUL-terminated strings; the
 *          pointers are reordered in place, the strings are not modified.
 * n:       number of entries to sort; values <= 1 leave the array untouched.
 *
 * The array is indexed from 0 throughout.  An earlier form of this routine
 * decremented `strings` to get 1-origin indexing, but forming a pointer one
 * element before the start of an array is undefined behaviour in ISO C, so
 * the indices are shifted instead.  The comparisons and moves performed are
 * the same.
 *
 * NOTE: the gap computation h * 3 + 1 is done in int and is only safe
 * while the first gap above n still fits in an int.
 */
void shell_sort(char *strings[], int n)
{
    int h, i, j;
    char *v;

    /* Grow the gap until it is the first sequence value greater than n. */
    h = 1;
    do {h = h * 3 + 1;} while (h <= n);
    do {
        /* Step down to the next smaller gap; the final pass uses h == 1. */
        h = h / 3;
        /* Gapped insertion sort: slot i is inserted among i-h, i-2h, ... */
        for (i = h; i < n; i++) {
            v = strings[i];
            j = i;
            /* Shift larger entries up by one gap until v's slot is found. */
            while (j >= h && strcmp(strings[j-h], v) > 0) {
                strings[j] = strings[j-h];
                j = j-h;
            }
            strings[j] = v;
        }
    }
    while (h > 1);
}

[-- Attachment #4: sort-4.6.3.s --]
[-- Type: text/plain, Size: 2033 bytes --]

	@ shell_sort(char *strings[], int n) as emitted by GCC 4.6.3 (see .ident).
	@ On entry r0 = strings and r1 = n.  Unlike the GCC 3.4.6 listing, much of
	@ the loop state is kept in stack slots and reloaded inside the loops.
	@
	@ Stack slots after "sub sp, sp, #44" (names refer to the C source, with
	@ 1-origin indexing as in "strings--"):
	@   [sp, #0]   4 * h  (byte offset of one gap)
	@   [sp, #4]   -4 * h
	@   [sp, #8]   i
	@   [sp, #12]  address of strings[i], advanced by 4 per for-loop iteration
	@   [sp, #16]  address of strings[i-h], advanced by 4 per for-loop iteration
	@   [sp, #20]  n + 1  (for-loop end value for i)
	@   [sp, #24]  strings - 4 bytes (1-origin base)
	@   [sp, #28]  strings as passed in
	@   [sp, #32]  n
	@   [sp, #36]  0x55555556 (multiplier used for the division by 3)
	@
	@ Registers: r9 = h, fp = -h, sl = v, r7 = j, r6 = j - h,
	@ r4 = address of strings[j-h], r5 = strings[j-h],
	@ r8 = address where v is finally stored.
	.cpu cortex-a9
	.eabi_attribute 27, 3
	.fpu vfp3
	.eabi_attribute 20, 1
	.eabi_attribute 21, 1
	.eabi_attribute 23, 3
	.eabi_attribute 24, 1
	.eabi_attribute 25, 1
	.eabi_attribute 26, 2
	.eabi_attribute 30, 2
	.eabi_attribute 34, 1
	.eabi_attribute 18, 2
	.file	"shell_sort.c"
	.text
	.align	2
	.global	shell_sort
	.type	shell_sort, %function
shell_sort:
	@ args = 0, pretend = 0, frame = 40
	@ frame_needed = 0, uses_anonymous_args = 0
	stmfd	sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
	@ r3 = n, h = 1; spill strings, n and strings - 4 bytes to the frame.
	mov	r3, r1
	mov	r9, #1
	sub	sp, sp, #44
	sub	r2, r0, #4
	str	r0, [sp, #28]
	str	r1, [sp, #32]
	str	r2, [sp, #24]
.L2:
	@ do { h = h * 3 + 1; } while (h <= n);
	add	r9, r9, r9, asl #1
	add	r9, r9, #1
	cmp	r9, r3
	ble	.L2
	@ Build 0x55555556 (movw low half 21846, movt high half 21845) and
	@ spill it; also spill n + 1.
	ldr	r2, [sp, #32]
	movw	r3, #21846
	movt	r3, 21845
	str	r3, [sp, #36]
	add	r2, r2, #1
	str	r2, [sp, #20]
.L9:
	@ Outer do-loop: h = h / 3, computed as the high word of
	@ h * 0x55555556 minus (h asr 31); no division helper is called.
	@ Then i = h + 1 is spilled and the for-loop is skipped when n < i.
	ldr	r2, [sp, #36]
	smull	r2, r3, r2, r9
	ldr	r2, [sp, #32]
	sub	r9, r3, r9, asr #31
	add	r3, r9, #1
	cmp	r2, r3
	str	r3, [sp, #8]
	blt	.L3
	@ Per-gap setup: r2 = 4*h; r3 = ((h << 30) - h) << 2, which is -4*h in
	@ 32-bit arithmetic; fp = -h.  Spill both byte offsets, then the two
	@ running pointers: strings + 4*h and strings.
	rsb	r3, r9, r9, asl #30
	mov	r2, r9, asl #2
	mov	r3, r3, asl #2
	str	r2, [sp, #0]
	rsb	fp, r9, #0
	str	r3, [sp, #4]
	ldr	r3, [sp, #28]
	add	r3, r3, r2
	ldr	r2, [sp, #28]
	str	r3, [sp, #12]
	str	r2, [sp, #16]
.L7:
	@ for-loop body: r8 = address of strings[i]; v = strings[i] via a
	@ post-incremented load, and the advanced pointer is spilled back.
	@ If h >= i the while-loop is skipped and v is stored back through r8.
	ldr	r8, [sp, #12]
	ldr	r3, [sp, #8]
	mov	r2, r8
	ldr	sl, [r2], #4
	cmp	r9, r3
	str	r2, [sp, #12]
	bge	.L4
	@ r4 = address of strings[j-h] for j = i; r7 = j.
	ldr	r4, [sp, #16]
	mov	r7, r3
	b	.L5
.L6:
	@ strcmp() > 0: strings[j] = strings[j-h] (store at r4 + 4*h), j = j - h,
	@ r4 -= 4*h.  Exit to .L4 when h >= j; r8 then still holds the previous
	@ r4, which is the address of the new strings[j].
	ldr	r3, [sp, #0]
	cmp	r9, r6
	ldr	r2, [sp, #4]
	mov	r7, r6
	str	r5, [r4, r3]
	add	r4, r4, r2
	bge	.L4
.L5:
	@ r5 = strings[j-h]; r6 = j - h; r8 = r4; call strcmp(r5, v).
	ldr	r5, [r4, #0]
	mov	r1, sl
	add	r6, r7, fp
	mov	r8, r4
	mov	r0, r5
	bl	strcmp
	cmp	r0, #0
	bgt	.L6
	@ strcmp() <= 0: r8 = address of strings[j], rebuilt from the spilled
	@ 1-origin base.
	ldr	r3, [sp, #24]
	add	r8, r3, r7, asl #2
.L4:
	@ strings[j] = v; i++ (reloaded and re-spilled); advance the strings[i-h]
	@ pointer by 4; repeat the for-loop until i == n + 1.
	ldr	r2, [sp, #8]
	ldr	r3, [sp, #20]
	str	sl, [r8, #0]
	add	r2, r2, #1
	str	r2, [sp, #8]
	cmp	r2, r3
	ldr	r2, [sp, #16]
	add	r2, r2, #4
	str	r2, [sp, #16]
	bne	.L7
.L3:
	@ while (h > 1): repeat the outer loop, otherwise release the frame,
	@ restore registers and return.
	cmp	r9, #1
	bgt	.L9
	add	sp, sp, #44
	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
	.size	shell_sort, .-shell_sort
	.ident	"GCC: (VDLinux.RC1) 4.6.3 20120105 (prerelease)"
	.section	.note.GNU-stack,"",%progbits

[-- Attachment #5: sort-armcc.s --]
[-- Type: text/plain, Size: 2083 bytes --]

; generated by ARM C/C++ Compiler, 4.1 [Build 713]
; commandline armcc [-S --cpu=Cortex-A9 --fpu=VFPv3 -O3 -Otime shell_sort.c]
;
; shell_sort(char *strings[], int n).  On entry r0 = strings and r1 = n.
; Register roles, matched against the C source of shell_sort:
;   r10 = n          r6 = strings - 4 bytes (base for 1-origin indexing)
;   r5  = h          r8 = i
;   r4  = j          r7 = j - h
;   r9  = v          r11 = 0x55555556 (multiplier for the division by 3)
; Apart from the saved registers, no loop state is kept on the stack.
        ARM
        REQUIRE8
        PRESERVE8

        AREA ||.text||, CODE, READONLY, ALIGN=2

shell_sort PROC
        PUSH     {r4-r12,lr}
        ; r10 = n; r6 = strings - 1 element ("strings--"); h = 1.
        MOV      r10,r1
        SUB      r6,r0,#4
        MOV      r5,#1
|L0.16|
        ; do { h = h * 3 + 1; } while (h <= n);   (r0 = h + 2*h is a temporary)
        ADD      r0,r5,r5,LSL #1
        ADD      r5,r0,#1
        CMP      r5,r10
        BLE      |L0.16|
        ; Load the multiplier from the literal at |L0.140| once, outside the loops.
        LDR      r11,|L0.140|
|L0.36|
        ; Outer do-loop: h = h / 3, computed from the high word (r0) of
        ; h * 0x55555556 minus (r0 ASR 31).  Then i = h + 1; skip the
        ; for-loop when i > n.
        SMULL    r1,r0,r11,r5
        SUB      r5,r0,r0,ASR #31
        ADD      r8,r5,#1
        CMP      r8,r10
        BGT      |L0.128|
|L0.56|
        ; for-loop body: v = strings[i]; j = i; enter the while-loop at its test.
        LDR      r9,[r6,r8,LSL #2]
        MOV      r4,r8
        B        |L0.80|
|L0.68|
        ; strcmp() > 0: reload strings[j-h], store it to strings[j], j = j - h.
        LDR      r0,[r6,r7,LSL #2]
        STR      r0,[r6,r4,LSL #2]
        MOV      r4,r7
|L0.80|
        ; while-loop test: stop when j <= h; otherwise r7 = j - h and call
        ; strcmp(strings[j-h], v), continuing while the result is > 0.
        CMP      r4,r5
        BLE      |L0.112|
        SUB      r7,r4,r5
        MOV      r1,r9
        LDR      r0,[r6,r7,LSL #2]
        BL       strcmp
        CMP      r0,#0
        BGT      |L0.68|
|L0.112|
        ; strings[j] = v; i++; repeat the for-loop while i <= n.
        ADD      r8,r8,#1
        CMP      r8,r10
        STR      r9,[r6,r4,LSL #2]
        BLE      |L0.56|
|L0.128|
        ; while (h > 1): repeat the outer loop, otherwise restore and return.
        CMP      r5,#1
        BGT      |L0.36|
        POP      {r4-r12,pc}
        ENDP

        ; Literal used by the LDR above: multiplier for the division by 3.
|L0.140|
        DCD      0x55555556

        AREA ||.arm_vfe_header||, DATA, READONLY, NOALLOC, ALIGN=2

        DCD      0x00000000

        EXPORT shell_sort [CODE]

        IMPORT ||Lib$$Request$$armlib|| [CODE,WEAK]
        IMPORT strcmp [CODE]

        ATTR FILESCOPE
        ATTR SETVALUE Tag_ABI_PCS_wchar_t,2
        ATTR SETVALUE Tag_ABI_enum_size,1
        ATTR SETVALUE Tag_ABI_optimization_goals,2
        ATTR SETSTRING Tag_conformance,"2.06"
        ATTR SETVALUE AV,18,1

        ASSERT {ENDIAN} = "little"
        ASSERT {INTER} = {TRUE}
        ASSERT {ROPI} = {FALSE}
        ASSERT {RWPI} = {FALSE}
        ASSERT {IEEE_FULL} = {FALSE}
        ASSERT {IEEE_PART} = {FALSE}
        ASSERT {IEEE_JAVA} = {FALSE}
        END

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [ARM]Extra load store/instructions compared to gcc-3.4
  2012-04-25 12:17 [ARM]Extra load store/instructions compared to gcc-3.4 Alexey Kravets
@ 2012-04-25 13:00 ` Alexander Monakov
  2012-04-25 13:22   ` Alexey Kravets
  0 siblings, 1 reply; 3+ messages in thread
From: Alexander Monakov @ 2012-04-25 13:00 UTC (permalink / raw)
  To: Alexey Kravets; +Cc: gcc

Hi,

This is a case when ivopts pass makes too many induction variables, exceeding
the number of available registers.  If you want to debug it, see
ivopts_global_cost_for_size in tree-ssa-loop-ivopts.c and its callers;
perhaps, the issue is that it fails to account for IVs created in inner loops
when processing outer loops.

In any case, I recommend to create a bug report in the GCC bugzilla for this
problem.

Hope that helps,
Alexander

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [ARM]Extra load store/instructions compared to gcc-3.4
  2012-04-25 13:00 ` Alexander Monakov
@ 2012-04-25 13:22   ` Alexey Kravets
  0 siblings, 0 replies; 3+ messages in thread
From: Alexey Kravets @ 2012-04-25 13:22 UTC (permalink / raw)
  To: Alexander Monakov; +Cc: gcc

Hi,
On Wed, Apr 25, 2012 at 05:00:40PM +0400, Alexander Monakov wrote:
> Hi,
>
> This is a case when ivopts pass makes too many induction variables, exceeding
> the number of available registers.  If you want to debug it, see
> ivopts_global_cost_for_size in tree-ssa-loop-ivopts.c and its callers;
> perhaps, the issue is that it fails to account for IVs created in inner loops
> when processing outer loops.
Thank you, I will check it.
>
> In any case, I recommend to create a bug report in the GCC bugzilla for this
> problem.
Yep, will do it.
>
> Hope that helps,
> Alexander
>

-- 
Alexey Kravets
a.kravets@samsung.com
local ext. 3964

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2012-04-25 13:22 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-25 12:17 [ARM]Extra load store/instructions compared to gcc-3.4 Alexey Kravets
2012-04-25 13:00 ` Alexander Monakov
2012-04-25 13:22   ` Alexey Kravets

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).