From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-patches-return-430864-listarch-gcc-patches=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 6987 invoked by alias); 1 Jul 2016 13:41:08 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Received: (qmail 6978 invoked by uid 89); 1 Jul 2016 13:41:07 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-1.4 required=5.0 tests=BAYES_00,KAM_ASCII_DIVIDERS,KAM_LAZY_DOMAIN_SECURITY,RP_MATCHES_RCVD autolearn=no version=3.3.2 spammy=aug, Aug, lsl, adr
X-HELO: foss.arm.com
Received: from foss.arm.com (HELO foss.arm.com) (217.140.101.70) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 01 Jul 2016 13:40:57 +0000
Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.72.51.249])	by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 7A6602F;	Fri,  1 Jul 2016 06:41:49 -0700 (PDT)
Received: from [10.2.206.27] (e105545-lin.cambridge.arm.com [10.2.206.27])	by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 4899C3F445;	Fri,  1 Jul 2016 06:40:54 -0700 (PDT)
Subject: Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
To: Andre Vieira <Andre.SimoesDiasVieira@arm.com>
References: <561D38F7.809@arm.com>
Cc: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>
From: Ramana Radhakrishnan <ramana.radhakrishnan@foss.arm.com>
Message-ID: <577672E3.6080002@foss.arm.com>
Date: Fri, 01 Jul 2016 13:41:00 -0000
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.8.0
MIME-Version: 1.0
In-Reply-To: <561D38F7.809@arm.com>
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
X-IsSubscribed: yes
X-SW-Source: 2016-07/txt/msg00036.txt.bz2


On 13/10/15 18:01, Andre Vieira wrote:
> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
> 
> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.

This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.

> 
> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
> 
> Make check passed for armv6-m.
> 
> libgcc/ChangeLog:
> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
> 
>   * config/arm/lib1funcs.S: Add new wrapper.
> 
> 0001-integer-division.patch
> 
> 
> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
> From: Andre Simoes Dias Vieira <andsim01@arm.com>
> Date: Fri, 21 Aug 2015 14:23:28 +0100
> Subject: [PATCH] new wrapper idivmod
> 
> ---
>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>  1 file changed, 217 insertions(+), 33 deletions(-)
> 
> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
> --- a/libgcc/config/arm/lib1funcs.S
> +++ b/libgcc/config/arm/lib1funcs.S
> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>  #ifdef __ARM_EABI__
>  .macro THUMB_LDIV0 name signed
>  #if defined(__ARM_ARCH_6M__)
> -	.ifc \signed, unsigned
> -	cmp	r0, #0
> -	beq	1f
> -	mov	r0, #0
> -	mvn	r0, r0		@ 0xffffffff
> -1:
> -	.else
> -	cmp	r0, #0
> -	beq	2f
> -	blt	3f
> +
> +	push	{r0, lr}
>  	mov	r0, #0
> -	mvn	r0, r0
> -	lsr	r0, r0, #1	@ 0x7fffffff
> -	b	2f
> -3:	mov	r0, #0x80
> -	lsl	r0, r0, #24	@ 0x80000000
> -2:
> -	.endif
> -	push	{r0, r1, r2}
> -	ldr	r0, 4f
> -	adr	r1, 4f
> -	add	r0, r1
> -	str	r0, [sp, #8]
> -	@ We know we are not on armv4t, so pop pc is safe.
> -	pop	{r0, r1, pc}
> -	.align	2
> -4:
> -	.word	__aeabi_idiv0 - 4b
> +	bl	SYM(__aeabi_idiv0)
> +	pop	{r1, pc}
> +

I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.

>  #elif defined(__thumb2__)
>  	.syntax unified
>  	.ifc \signed, unsigned
> @@ -945,7 +923,170 @@ LSYM(Lover7):
>  	add	dividend, work
>    .endif
>  LSYM(Lgot_result):
> -.endm	
> +.endm
> +
> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
> +/* If performance is preferred, the following functions are provided.  */
> +

Comment above #if please and also check elsewhere in patch.

> +/* Branch to div(n), and jump to label if curbit is lo than divisior.  */
> +.macro BranchToDiv n, label
> +	lsr	curbit, dividend, \n
> +	cmp	curbit, divisor
> +	blo	\label
> +.endm
> +
> +/* Body of div(n).  Shift the divisor in n bits and compare the divisor
> +   and dividend.  Update the dividend as the substruction result.  */
> +.macro DoDiv n
> +	lsr	curbit, dividend, \n
> +	cmp	curbit, divisor
> +	bcc	1f
> +	lsl	curbit, divisor, \n
> +	sub	dividend, dividend, curbit
> +
> +1:	adc	result, result
> +.endm
> +
> +/* The body of division with positive divisor.  Unless the divisor is very
> +   big, shift it up in multiples of four bits, since this is the amount of
> +   unwinding in the main division loop.  Continue shifting until the divisor
> +   is larger than the dividend.  */
> +.macro THUMB1_Div_Positive
> +	mov	result, #0
> +	BranchToDiv #1, LSYM(Lthumb1_div1)
> +	BranchToDiv #4, LSYM(Lthumb1_div4)
> +	BranchToDiv #8, LSYM(Lthumb1_div8)
> +	BranchToDiv #12, LSYM(Lthumb1_div12)
> +	BranchToDiv #16, LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_large_positive):
> +	mov	result, #0xff
> +	lsl	divisor, divisor, #8
> +	rev	result, result
> +	lsr	curbit, dividend, #16
> +	cmp	curbit, divisor
> +	blo	1f
> +	asr	result, #8
> +	lsl	divisor, divisor, #8
> +	beq	LSYM(Ldivbyzero_waypoint)
> +
> +1:	lsr	curbit, dividend, #12
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div12)
> +	b	LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_loop):
> +	lsr	divisor, divisor, #8
> +LSYM(Lthumb1_div16):
> +	Dodiv	#15
> +	Dodiv	#14
> +	Dodiv	#13
> +	Dodiv	#12
> +LSYM(Lthumb1_div12):
> +	Dodiv	#11
> +	Dodiv	#10
> +	Dodiv	#9
> +	Dodiv	#8
> +	bcs	LSYM(Lthumb1_div_loop)
> +LSYM(Lthumb1_div8):
> +	Dodiv	#7
> +	Dodiv	#6
> +	Dodiv	#5
> +LSYM(Lthumb1_div5):
> +	Dodiv	#4
> +LSYM(Lthumb1_div4):
> +	Dodiv	#3
> +LSYM(Lthumb1_div3):
> +	Dodiv	#2
> +LSYM(Lthumb1_div2):
> +	Dodiv	#1
> +LSYM(Lthumb1_div1):
> +	sub	divisor, dividend, divisor
> +	bcs	1f
> +	cpy	divisor, dividend
> +
> +1:	adc	result, result
> +	cpy	dividend, result
> +	RET
> +
> +LSYM(Ldivbyzero_waypoint):
> +	b	LSYM(Ldiv0)
> +.endm
> +
> +/* The body of division with negative divisor.  Similar with
> +   THUMB1_Div_Positive except that the shift steps are in multiples
> +   of six bits.  */
> +.macro THUMB1_Div_Negative
> +	lsr	result, divisor, #31
> +	beq	1f
> +	neg	divisor, divisor
> +
> +1:	asr	curbit, dividend, #32
> +	bcc	2f
> +	neg	dividend, dividend
> +
> +2:	eor	curbit, result
> +	mov	result, #0
> +	cpy	ip, curbit
> +	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
> +	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_large):
> +	mov	result, #0xfc
> +	lsl	divisor, divisor, #6
> +	rev	result, result
> +	lsr	curbit, dividend, #8
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div_negative8)
> +
> +	lsl	divisor, divisor, #6
> +	asr	result, result, #6
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div_negative8)
> +
> +	lsl	divisor, divisor, #6
> +	asr	result, result, #6
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div_negative8)
> +
> +	lsl	divisor, divisor, #6
> +	beq	LSYM(Ldivbyzero_negative)
> +	asr	result, result, #6
> +	b	LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_negative_loop):
> +	lsr	divisor, divisor, #6
> +LSYM(Lthumb1_div_negative8):
> +	DoDiv	#7
> +	DoDiv	#6
> +	DoDiv	#5
> +	DoDiv	#4
> +LSYM(Lthumb1_div_negative4):
> +	DoDiv	#3
> +	DoDiv	#2
> +	bcs	LSYM(Lthumb1_div_negative_loop)
> +	DoDiv	#1
> +	sub	divisor, dividend, divisor
> +	bcs	1f
> +	cpy	divisor, dividend
> +
> +1:	cpy	curbit, ip
> +	adc	result, result
> +	asr	curbit, curbit, #1
> +	cpy	dividend, result
> +	bcc	2f
> +	neg	dividend, dividend
> +	cmp	curbit, #0
> +
> +2:	bpl	3f
> +	neg	divisor, divisor
> +
> +3:	RET
> +
> +LSYM(Ldivbyzero_negative):
> +	cpy	curbit, ip
> +	asr	curbit, curbit, #1
> +	bcc	LSYM(Ldiv0)
> +	neg	dividend, dividend
> +.endm
> +#endif /* ARM Thumb version.  */
> +
>  /* ------------------------------------------------------------------------ */
>  /*		Start of the Real Functions				    */
>  /* ------------------------------------------------------------------------ */
> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>  
>  	FUNC_START udivsi3
>  	FUNC_ALIAS aeabi_uidiv udivsi3
> +#if defined(__OPTIMIZE_SIZE__)
>  
>  	cmp	divisor, #0
>  	beq	LSYM(Ldiv0)
> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>  	pop	{ work }
>  	RET
>  
> +#else
> +	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
> +	   used in ARMv6-M when we need an efficient implementation.  */
> +LSYM(udivsi3_skip_div0_test):
> +	THUMB1_Div_Positive
> +
> +#endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  
>  	ARM_FUNC_START udivsi3
> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>  FUNC_START aeabi_uidivmod
>  	cmp	r1, #0
>  	beq	LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
>  	push	{r0, r1, lr}
>  	bl	LSYM(udivsi3_skip_div0_test)
>  	POP	{r1, r2, r3}
>  	mul	r2, r0
>  	sub	r1, r1, r2
>  	bx	r3
> +# else
> +	/* Both the quotient and remainder are calculated simultaneously
> +	   in THUMB1_Div_Positive.  There is no need to calculate the
> +	   remainder again here.  */
> +	b	LSYM(udivsi3_skip_div0_test)
> +	RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  ARM_FUNC_START aeabi_uidivmod
>  	cmp	r1, #0
> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>  	RET
>  	
>  #else  /* ARM version.  */
> -	
> +
>  	FUNC_START umodsi3
>  
>  	subs	r2, r1, #1			@ compare divisor with 1
> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>  
>  #if defined(__prefer_thumb__)
>  
> -	FUNC_START divsi3	
> +	FUNC_START divsi3
>  	FUNC_ALIAS aeabi_idiv divsi3
> +#if defined(__OPTIMIZE_SIZE__)
>  
>  	cmp	divisor, #0
>  	beq	LSYM(Ldiv0)
> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>  	blo	LSYM(Lgot_result)
>  
>  	THUMB_DIV_MOD_BODY 0
> -	
> +
>  	mov	r0, result
>  	mov	work, ip
>  	cmp	work, #0
> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>  LSYM(Lover12):
>  	pop	{ work }
>  	RET
> +#else
> +	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
> +	   used in ARMv6-M when we need an efficient implementation.  */
> +LSYM(divsi3_skip_div0_test):
> +	cpy	curbit, dividend
> +	orr	curbit, divisor
> +	bmi	LSYM(Lthumb1_div_negative)
> +
> +LSYM(Lthumb1_div_positive):
> +	THUMB1_Div_Positive
> +
> +LSYM(Lthumb1_div_negative):
> +	THUMB1_Div_Negative
> +
> +#endif /* __OPTIMIZE_SIZE__ */
>  
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  
> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>  	RET
>  
>  #else /* ARM/Thumb-2 version.  */
> -	
> -	ARM_FUNC_START divsi3	
> +
> +	ARM_FUNC_START divsi3
>  	ARM_FUNC_ALIAS aeabi_idiv divsi3
>  
>  	cmp	r1, #0
> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>  FUNC_START aeabi_idivmod
>  	cmp	r1, #0
>  	beq	LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
>  	push	{r0, r1, lr}
>  	bl	LSYM(divsi3_skip_div0_test)
>  	POP	{r1, r2, r3}
>  	mul	r2, r0
>  	sub	r1, r1, r2
>  	bx	r3
> +# else
> +	/* Both the quotient and remainder are calculated simultaneously
> +	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
> +	   need to calculate the remainder again here.  */
> +	b	LSYM(divsi3_skip_div0_test)
> +	RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  ARM_FUNC_START aeabi_idivmod
>  	cmp 	r1, #0
> -- 1.9.1
> 

Otherwise OK if no regressions and the following request passes.

Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ? 

regards
Ramana