public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
@ 2015-10-13 17:01 Andre Vieira
  2015-10-27 17:28 ` [PING][PATCHv2, " Andre Vieira
  2016-07-01 13:41 ` [PATCHv2, " Ramana Radhakrishnan
  0 siblings, 2 replies; 10+ messages in thread
From: Andre Vieira @ 2015-10-13 17:01 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 808 bytes --]

This patch ports the aeabi_idiv routine from Linaro Cortex-Strings 
(https://git.linaro.org/toolchain/cortex-strings.git), which was 
contributed by ARM under Free BSD license.

The new aeabi_idiv routine is used to replace the one in 
libgcc/config/arm/lib1funcs.S. This replacement happens within the 
Thumb1 wrapper. The new routine is under LGPLv3 license.

The main advantage of this version is that it can improve the 
performance of the aeabi_idiv function for Thumb1. This solution will 
also increase the code size. So it will only be used if 
__OPTIMIZE_SIZE__ is not defined.

Make check passed for armv6-m.

libgcc/ChangeLog:
2015-08-10  Hale Wang  <hale.wang@arm.com>
             Andre Vieira  <andre.simoesdiasvieira@arm.com>

   * config/arm/lib1funcs.S: Add new wrapper.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-integer-division.patch --]
[-- Type: text/x-patch; name=0001-integer-division.patch, Size: 8544 bytes --]

From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Fri, 21 Aug 2015 14:23:28 +0100
Subject: [PATCH] new wrapper idivmod

---
 libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 217 insertions(+), 33 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -306,34 +306,12 @@ LSYM(Lend_fde):
 #ifdef __ARM_EABI__
 .macro THUMB_LDIV0 name signed
 #if defined(__ARM_ARCH_6M__)
-	.ifc \signed, unsigned
-	cmp	r0, #0
-	beq	1f
-	mov	r0, #0
-	mvn	r0, r0		@ 0xffffffff
-1:
-	.else
-	cmp	r0, #0
-	beq	2f
-	blt	3f
+
+	push	{r0, lr}
 	mov	r0, #0
-	mvn	r0, r0
-	lsr	r0, r0, #1	@ 0x7fffffff
-	b	2f
-3:	mov	r0, #0x80
-	lsl	r0, r0, #24	@ 0x80000000
-2:
-	.endif
-	push	{r0, r1, r2}
-	ldr	r0, 4f
-	adr	r1, 4f
-	add	r0, r1
-	str	r0, [sp, #8]
-	@ We know we are not on armv4t, so pop pc is safe.
-	pop	{r0, r1, pc}
-	.align	2
-4:
-	.word	__aeabi_idiv0 - 4b
+	bl	SYM(__aeabi_idiv0)
+	pop	{r1, pc}
+
 #elif defined(__thumb2__)
 	.syntax unified
 	.ifc \signed, unsigned
@@ -945,7 +923,170 @@ LSYM(Lover7):
 	add	dividend, work
   .endif
 LSYM(Lgot_result):
-.endm	
+.endm
+
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+/* If performance is preferred, the following functions are provided.  */
+
+/* Branch to div(n), and jump to label if curbit is lower than divisor.  */
+.macro BranchToDiv n, label
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	blo	\label
+.endm
+
+/* Body of div(n).  Shift the divisor in n bits and compare the divisor
+   and dividend.  Update the dividend as the subtraction result.  */
+.macro DoDiv n
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	1f
+	lsl	curbit, divisor, \n
+	sub	dividend, dividend, curbit
+
+1:	adc	result, result
+.endm
+
+/* The body of division with positive divisor.  Unless the divisor is very
+   big, shift it up in multiples of four bits, since this is the amount of
+   unwinding in the main division loop.  Continue shifting until the divisor
+   is larger than the dividend.  */
+.macro THUMB1_Div_Positive
+	mov	result, #0
+	BranchToDiv #1, LSYM(Lthumb1_div1)
+	BranchToDiv #4, LSYM(Lthumb1_div4)
+	BranchToDiv #8, LSYM(Lthumb1_div8)
+	BranchToDiv #12, LSYM(Lthumb1_div12)
+	BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+	mov	result, #0xff
+	lsl	divisor, divisor, #8
+	rev	result, result
+	lsr	curbit, dividend, #16
+	cmp	curbit, divisor
+	blo	1f
+	asr	result, #8
+	lsl	divisor, divisor, #8
+	beq	LSYM(Ldivbyzero_waypoint)
+
+1:	lsr	curbit, dividend, #12
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div12)
+	b	LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+	lsr	divisor, divisor, #8
+LSYM(Lthumb1_div16):
+	Dodiv	#15
+	Dodiv	#14
+	Dodiv	#13
+	Dodiv	#12
+LSYM(Lthumb1_div12):
+	Dodiv	#11
+	Dodiv	#10
+	Dodiv	#9
+	Dodiv	#8
+	bcs	LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+	Dodiv	#7
+	Dodiv	#6
+	Dodiv	#5
+LSYM(Lthumb1_div5):
+	Dodiv	#4
+LSYM(Lthumb1_div4):
+	Dodiv	#3
+LSYM(Lthumb1_div3):
+	Dodiv	#2
+LSYM(Lthumb1_div2):
+	Dodiv	#1
+LSYM(Lthumb1_div1):
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	adc	result, result
+	cpy	dividend, result
+	RET
+
+LSYM(Ldivbyzero_waypoint):
+	b	LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor.  Similar to
+   THUMB1_Div_Positive except that the shift steps are in multiples
+   of six bits.  */
+.macro THUMB1_Div_Negative
+	lsr	result, divisor, #31
+	beq	1f
+	neg	divisor, divisor
+
+1:	asr	curbit, dividend, #32
+	bcc	2f
+	neg	dividend, dividend
+
+2:	eor	curbit, result
+	mov	result, #0
+	cpy	ip, curbit
+	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+	mov	result, #0xfc
+	lsl	divisor, divisor, #6
+	rev	result, result
+	lsr	curbit, dividend, #8
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	beq	LSYM(Ldivbyzero_negative)
+	asr	result, result, #6
+	b	LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+	lsr	divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+	DoDiv	#4
+LSYM(Lthumb1_div_negative4):
+	DoDiv	#3
+	DoDiv	#2
+	bcs	LSYM(Lthumb1_div_negative_loop)
+	DoDiv	#1
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	cpy	curbit, ip
+	adc	result, result
+	asr	curbit, curbit, #1
+	cpy	dividend, result
+	bcc	2f
+	neg	dividend, dividend
+	cmp	curbit, #0
+
+2:	bpl	3f
+	neg	divisor, divisor
+
+3:	RET
+
+LSYM(Ldivbyzero_negative):
+	cpy	curbit, ip
+	asr	curbit, curbit, #1
+	bcc	LSYM(Ldiv0)
+	neg	dividend, dividend
+.endm
+#endif /* ARM Thumb version.  */
+
 /* ------------------------------------------------------------------------ */
 /*		Start of the Real Functions				    */
 /* ------------------------------------------------------------------------ */
@@ -955,6 +1096,7 @@ LSYM(Lgot_result):
 
 	FUNC_START udivsi3
 	FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
 	pop	{ work }
 	RET
 
+#else
+	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
+	   used in ARMv6-M when we need an efficient implementation.  */
+LSYM(udivsi3_skip_div0_test):
+	THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START udivsi3
@@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
 FUNC_START aeabi_uidivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(udivsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive.  There is no need to calculate the
+	   remainder again here.  */
+	b	LSYM(udivsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_uidivmod
 	cmp	r1, #0
@@ -1084,7 +1243,7 @@ LSYM(Lover10):
 	RET
 	
 #else  /* ARM version.  */
-	
+
 	FUNC_START umodsi3
 
 	subs	r2, r1, #1			@ compare divisor with 1
@@ -1109,8 +1268,9 @@ LSYM(Lover10):
 
 #if defined(__prefer_thumb__)
 
-	FUNC_START divsi3	
+	FUNC_START divsi3
 	FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -1133,7 +1293,7 @@ LSYM(Lover11):
 	blo	LSYM(Lgot_result)
 
 	THUMB_DIV_MOD_BODY 0
-	
+
 	mov	r0, result
 	mov	work, ip
 	cmp	work, #0
@@ -1142,6 +1302,21 @@ LSYM(Lover11):
 LSYM(Lover12):
 	pop	{ work }
 	RET
+#else
+	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
+	   used in ARMv6-M when we need an efficient implementation.  */
+LSYM(divsi3_skip_div0_test):
+	cpy	curbit, dividend
+	orr	curbit, divisor
+	bmi	LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+	THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+	THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
 
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
@@ -1154,8 +1329,8 @@ LSYM(Lover12):
 	RET
 
 #else /* ARM/Thumb-2 version.  */
-	
-	ARM_FUNC_START divsi3	
+
+	ARM_FUNC_START divsi3
 	ARM_FUNC_ALIAS aeabi_idiv divsi3
 
 	cmp	r1, #0
@@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
 FUNC_START aeabi_idivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(divsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
+	   need to calculate the remainder again here.  */
+	b	LSYM(divsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_idivmod
 	cmp 	r1, #0
-- 
1.9.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PING][PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2015-10-13 17:01 [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m Andre Vieira
@ 2015-10-27 17:28 ` Andre Vieira
  2016-01-25 17:15   ` [PING^2][PATCHv2, " Andre Vieira (lists)
  2016-07-01 13:41 ` [PATCHv2, " Ramana Radhakrishnan
  1 sibling, 1 reply; 10+ messages in thread
From: Andre Vieira @ 2015-10-27 17:28 UTC (permalink / raw)
  To: gcc-patches

Ping.

BR,
Andre

On 13/10/15 18:01, Andre Vieira wrote:
> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
> (https://git.linaro.org/toolchain/cortex-strings.git), which was
> contributed by ARM under Free BSD license.
>
> The new aeabi_idiv routine is used to replace the one in
> libgcc/config/arm/lib1funcs.S. This replacement happens within the
> Thumb1 wrapper. The new routine is under LGPLv3 license.
>
> The main advantage of this version is that it can improve the
> performance of the aeabi_idiv function for Thumb1. This solution will
> also increase the code size. So it will only be used if
> __OPTIMIZE_SIZE__ is not defined.
>
> Make check passed for armv6-m.
>
> libgcc/ChangeLog:
> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>               Andre Vieira  <andre.simoesdiasvieira@arm.com>
>
>     * config/arm/lib1funcs.S: Add new wrapper.
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PING^2][PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2015-10-27 17:28 ` [PING][PATCHv2, " Andre Vieira
@ 2016-01-25 17:15   ` Andre Vieira (lists)
  2016-05-19 10:19     ` [PING^3][PATCHv2, " Andre Vieira (lists)
  0 siblings, 1 reply; 10+ messages in thread
From: Andre Vieira (lists) @ 2016-01-25 17:15 UTC (permalink / raw)
  To: gcc-patches

Ping.

On 27/10/15 17:03, Andre Vieira wrote:
> Ping.
>
> BR,
> Andre
>
> On 13/10/15 18:01, Andre Vieira wrote:
>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
>> (https://git.linaro.org/toolchain/cortex-strings.git), which was
>> contributed by ARM under Free BSD license.
>>
>> The new aeabi_idiv routine is used to replace the one in
>> libgcc/config/arm/lib1funcs.S. This replacement happens within the
>> Thumb1 wrapper. The new routine is under LGPLv3 license.
>>
>> The main advantage of this version is that it can improve the
>> performance of the aeabi_idiv function for Thumb1. This solution will
>> also increase the code size. So it will only be used if
>> __OPTIMIZE_SIZE__ is not defined.
>>
>> Make check passed for armv6-m.
>>
>> libgcc/ChangeLog:
>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>               Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>
>>     * config/arm/lib1funcs.S: Add new wrapper.
>>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PING^3][PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2016-01-25 17:15   ` [PING^2][PATCHv2, " Andre Vieira (lists)
@ 2016-05-19 10:19     ` Andre Vieira (lists)
  2016-06-08 14:36       ` [PING^4][PATCHv2, " Andre Vieira (lists)
  0 siblings, 1 reply; 10+ messages in thread
From: Andre Vieira (lists) @ 2016-05-19 10:19 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Earnshaw

Ping for GCC-7, patch applies cleanly, passed make check for cortex-m0.

Might be worth mentioning that this patch has been used in three
releases of the GNU ARM embedded toolchain, using GCC versions 4.9 and
5, and no issues have been reported so far.

On 25/01/16 17:15, Andre Vieira (lists) wrote:
> Ping.
> 
> On 27/10/15 17:03, Andre Vieira wrote:
>> Ping.
>>
>> BR,
>> Andre
>>
>> On 13/10/15 18:01, Andre Vieira wrote:
>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
>>> (https://git.linaro.org/toolchain/cortex-strings.git), which was
>>> contributed by ARM under Free BSD license.
>>>
>>> The new aeabi_idiv routine is used to replace the one in
>>> libgcc/config/arm/lib1funcs.S. This replacement happens within the
>>> Thumb1 wrapper. The new routine is under LGPLv3 license.
>>>
>>> The main advantage of this version is that it can improve the
>>> performance of the aeabi_idiv function for Thumb1. This solution will
>>> also increase the code size. So it will only be used if
>>> __OPTIMIZE_SIZE__ is not defined.
>>>
>>> Make check passed for armv6-m.
>>>
>>> libgcc/ChangeLog:
>>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>>               Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>>
>>>     * config/arm/lib1funcs.S: Add new wrapper.
>>>
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PING^4][PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2016-05-19 10:19     ` [PING^3][PATCHv2, " Andre Vieira (lists)
@ 2016-06-08 14:36       ` Andre Vieira (lists)
  2016-06-23 16:52         ` [PING^5][PATCHv2, " Andre Vieira (lists)
  0 siblings, 1 reply; 10+ messages in thread
From: Andre Vieira (lists) @ 2016-06-08 14:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Earnshaw

Ping.

On 19/05/16 11:19, Andre Vieira (lists) wrote:
> Ping for GCC-7, patch applies cleanly, passed make check for cortex-m0.
> 
> Might be worth mentioning that this patch has been used in three
> releases of the GNU ARM embedded toolchain, using GCC versions 4.9 and
> 5, and no issues have been reported so far.
> 
> On 25/01/16 17:15, Andre Vieira (lists) wrote:
>> Ping.
>>
>> On 27/10/15 17:03, Andre Vieira wrote:
>>> Ping.
>>>
>>> BR,
>>> Andre
>>>
>>> On 13/10/15 18:01, Andre Vieira wrote:
>>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
>>>> (https://git.linaro.org/toolchain/cortex-strings.git), which was
>>>> contributed by ARM under Free BSD license.
>>>>
>>>> The new aeabi_idiv routine is used to replace the one in
>>>> libgcc/config/arm/lib1funcs.S. This replacement happens within the
>>>> Thumb1 wrapper. The new routine is under LGPLv3 license.
>>>>
>>>> The main advantage of this version is that it can improve the
>>>> performance of the aeabi_idiv function for Thumb1. This solution will
>>>> also increase the code size. So it will only be used if
>>>> __OPTIMIZE_SIZE__ is not defined.
>>>>
>>>> Make check passed for armv6-m.
>>>>
>>>> libgcc/ChangeLog:
>>>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>>>               Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>>>
>>>>     * config/arm/lib1funcs.S: Add new wrapper.
>>>>
>>
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PING^5][PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2016-06-08 14:36       ` [PING^4][PATCHv2, " Andre Vieira (lists)
@ 2016-06-23 16:52         ` Andre Vieira (lists)
  0 siblings, 0 replies; 10+ messages in thread
From: Andre Vieira (lists) @ 2016-06-23 16:52 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Earnshaw

Ping.

On 08/06/16 15:35, Andre Vieira (lists) wrote:
> Ping.
> 
> On 19/05/16 11:19, Andre Vieira (lists) wrote:
>> Ping for GCC-7, patch applies cleanly, passed make check for cortex-m0.
>>
>> Might be worth mentioning that this patch has been used in three
>> releases of the GNU ARM embedded toolchain, using GCC versions 4.9 and
>> 5, and no issues have been reported so far.
>>
>> On 25/01/16 17:15, Andre Vieira (lists) wrote:
>>> Ping.
>>>
>>> On 27/10/15 17:03, Andre Vieira wrote:
>>>> Ping.
>>>>
>>>> BR,
>>>> Andre
>>>>
>>>> On 13/10/15 18:01, Andre Vieira wrote:
>>>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
>>>>> (https://git.linaro.org/toolchain/cortex-strings.git), which was
>>>>> contributed by ARM under Free BSD license.
>>>>>
>>>>> The new aeabi_idiv routine is used to replace the one in
>>>>> libgcc/config/arm/lib1funcs.S. This replacement happens within the
>>>>> Thumb1 wrapper. The new routine is under LGPLv3 license.
>>>>>
>>>>> The main advantage of this version is that it can improve the
>>>>> performance of the aeabi_idiv function for Thumb1. This solution will
>>>>> also increase the code size. So it will only be used if
>>>>> __OPTIMIZE_SIZE__ is not defined.
>>>>>
>>>>> Make check passed for armv6-m.
>>>>>
>>>>> libgcc/ChangeLog:
>>>>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>>>>               Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>>>>
>>>>>     * config/arm/lib1funcs.S: Add new wrapper.
>>>>>
>>>
>>
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2015-10-13 17:01 [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m Andre Vieira
  2015-10-27 17:28 ` [PING][PATCHv2, " Andre Vieira
@ 2016-07-01 13:41 ` Ramana Radhakrishnan
  2016-07-06 10:52   ` Andre Vieira (lists)
  1 sibling, 1 reply; 10+ messages in thread
From: Ramana Radhakrishnan @ 2016-07-01 13:41 UTC (permalink / raw)
  To: Andre Vieira; +Cc: gcc-patches



On 13/10/15 18:01, Andre Vieira wrote:
> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
> 
> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.

This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.

> 
> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
> 
> Make check passed for armv6-m.
> 
> libgcc/ChangeLog:
> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
> 
>   * config/arm/lib1funcs.S: Add new wrapper.
> 
> 0001-integer-division.patch
> 
> 
> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
> From: Andre Simoes Dias Vieira <andsim01@arm.com>
> Date: Fri, 21 Aug 2015 14:23:28 +0100
> Subject: [PATCH] new wrapper idivmod
> 
> ---
>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>  1 file changed, 217 insertions(+), 33 deletions(-)
> 
> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
> --- a/libgcc/config/arm/lib1funcs.S
> +++ b/libgcc/config/arm/lib1funcs.S
> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>  #ifdef __ARM_EABI__
>  .macro THUMB_LDIV0 name signed
>  #if defined(__ARM_ARCH_6M__)
> -	.ifc \signed, unsigned
> -	cmp	r0, #0
> -	beq	1f
> -	mov	r0, #0
> -	mvn	r0, r0		@ 0xffffffff
> -1:
> -	.else
> -	cmp	r0, #0
> -	beq	2f
> -	blt	3f
> +
> +	push	{r0, lr}
>  	mov	r0, #0
> -	mvn	r0, r0
> -	lsr	r0, r0, #1	@ 0x7fffffff
> -	b	2f
> -3:	mov	r0, #0x80
> -	lsl	r0, r0, #24	@ 0x80000000
> -2:
> -	.endif
> -	push	{r0, r1, r2}
> -	ldr	r0, 4f
> -	adr	r1, 4f
> -	add	r0, r1
> -	str	r0, [sp, #8]
> -	@ We know we are not on armv4t, so pop pc is safe.
> -	pop	{r0, r1, pc}
> -	.align	2
> -4:
> -	.word	__aeabi_idiv0 - 4b
> +	bl	SYM(__aeabi_idiv0)
> +	pop	{r1, pc}
> +

I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.

>  #elif defined(__thumb2__)
>  	.syntax unified
>  	.ifc \signed, unsigned
> @@ -945,7 +923,170 @@ LSYM(Lover7):
>  	add	dividend, work
>    .endif
>  LSYM(Lgot_result):
> -.endm	
> +.endm
> +
> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
> +/* If performance is preferred, the following functions are provided.  */
> +

Comment above #if please and also check elsewhere in patch.

> +/* Branch to div(n), and jump to label if curbit is lower than divisor.  */
> +.macro BranchToDiv n, label
> +	lsr	curbit, dividend, \n
> +	cmp	curbit, divisor
> +	blo	\label
> +.endm
> +
> +/* Body of div(n).  Shift the divisor in n bits and compare the divisor
> +   and dividend.  Update the dividend as the subtraction result.  */
> +.macro DoDiv n
> +	lsr	curbit, dividend, \n
> +	cmp	curbit, divisor
> +	bcc	1f
> +	lsl	curbit, divisor, \n
> +	sub	dividend, dividend, curbit
> +
> +1:	adc	result, result
> +.endm
> +
> +/* The body of division with positive divisor.  Unless the divisor is very
> +   big, shift it up in multiples of four bits, since this is the amount of
> +   unwinding in the main division loop.  Continue shifting until the divisor
> +   is larger than the dividend.  */
> +.macro THUMB1_Div_Positive
> +	mov	result, #0
> +	BranchToDiv #1, LSYM(Lthumb1_div1)
> +	BranchToDiv #4, LSYM(Lthumb1_div4)
> +	BranchToDiv #8, LSYM(Lthumb1_div8)
> +	BranchToDiv #12, LSYM(Lthumb1_div12)
> +	BranchToDiv #16, LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_large_positive):
> +	mov	result, #0xff
> +	lsl	divisor, divisor, #8
> +	rev	result, result
> +	lsr	curbit, dividend, #16
> +	cmp	curbit, divisor
> +	blo	1f
> +	asr	result, #8
> +	lsl	divisor, divisor, #8
> +	beq	LSYM(Ldivbyzero_waypoint)
> +
> +1:	lsr	curbit, dividend, #12
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div12)
> +	b	LSYM(Lthumb1_div16)
> +LSYM(Lthumb1_div_loop):
> +	lsr	divisor, divisor, #8
> +LSYM(Lthumb1_div16):
> +	Dodiv	#15
> +	Dodiv	#14
> +	Dodiv	#13
> +	Dodiv	#12
> +LSYM(Lthumb1_div12):
> +	Dodiv	#11
> +	Dodiv	#10
> +	Dodiv	#9
> +	Dodiv	#8
> +	bcs	LSYM(Lthumb1_div_loop)
> +LSYM(Lthumb1_div8):
> +	Dodiv	#7
> +	Dodiv	#6
> +	Dodiv	#5
> +LSYM(Lthumb1_div5):
> +	Dodiv	#4
> +LSYM(Lthumb1_div4):
> +	Dodiv	#3
> +LSYM(Lthumb1_div3):
> +	Dodiv	#2
> +LSYM(Lthumb1_div2):
> +	Dodiv	#1
> +LSYM(Lthumb1_div1):
> +	sub	divisor, dividend, divisor
> +	bcs	1f
> +	cpy	divisor, dividend
> +
> +1:	adc	result, result
> +	cpy	dividend, result
> +	RET
> +
> +LSYM(Ldivbyzero_waypoint):
> +	b	LSYM(Ldiv0)
> +.endm
> +
> +/* The body of division with negative divisor.  Similar to
> +   THUMB1_Div_Positive except that the shift steps are in multiples
> +   of six bits.  */
> +.macro THUMB1_Div_Negative
> +	lsr	result, divisor, #31
> +	beq	1f
> +	neg	divisor, divisor
> +
> +1:	asr	curbit, dividend, #32
> +	bcc	2f
> +	neg	dividend, dividend
> +
> +2:	eor	curbit, result
> +	mov	result, #0
> +	cpy	ip, curbit
> +	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
> +	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_large):
> +	mov	result, #0xfc
> +	lsl	divisor, divisor, #6
> +	rev	result, result
> +	lsr	curbit, dividend, #8
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div_negative8)
> +
> +	lsl	divisor, divisor, #6
> +	asr	result, result, #6
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div_negative8)
> +
> +	lsl	divisor, divisor, #6
> +	asr	result, result, #6
> +	cmp	curbit, divisor
> +	blo	LSYM(Lthumb1_div_negative8)
> +
> +	lsl	divisor, divisor, #6
> +	beq	LSYM(Ldivbyzero_negative)
> +	asr	result, result, #6
> +	b	LSYM(Lthumb1_div_negative8)
> +LSYM(Lthumb1_div_negative_loop):
> +	lsr	divisor, divisor, #6
> +LSYM(Lthumb1_div_negative8):
> +	DoDiv	#7
> +	DoDiv	#6
> +	DoDiv	#5
> +	DoDiv	#4
> +LSYM(Lthumb1_div_negative4):
> +	DoDiv	#3
> +	DoDiv	#2
> +	bcs	LSYM(Lthumb1_div_negative_loop)
> +	DoDiv	#1
> +	sub	divisor, dividend, divisor
> +	bcs	1f
> +	cpy	divisor, dividend
> +
> +1:	cpy	curbit, ip
> +	adc	result, result
> +	asr	curbit, curbit, #1
> +	cpy	dividend, result
> +	bcc	2f
> +	neg	dividend, dividend
> +	cmp	curbit, #0
> +
> +2:	bpl	3f
> +	neg	divisor, divisor
> +
> +3:	RET
> +
> +LSYM(Ldivbyzero_negative):
> +	cpy	curbit, ip
> +	asr	curbit, curbit, #1
> +	bcc	LSYM(Ldiv0)
> +	neg	dividend, dividend
> +.endm
> +#endif /* ARM Thumb version.  */
> +
>  /* ------------------------------------------------------------------------ */
>  /*		Start of the Real Functions				    */
>  /* ------------------------------------------------------------------------ */
> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>  
>  	FUNC_START udivsi3
>  	FUNC_ALIAS aeabi_uidiv udivsi3
> +#if defined(__OPTIMIZE_SIZE__)
>  
>  	cmp	divisor, #0
>  	beq	LSYM(Ldiv0)
> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>  	pop	{ work }
>  	RET
>  
> +#else
> +	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
> +	   used in ARMv6-M when we need an efficient implementation.  */
> +LSYM(udivsi3_skip_div0_test):
> +	THUMB1_Div_Positive
> +
> +#endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  
>  	ARM_FUNC_START udivsi3
> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>  FUNC_START aeabi_uidivmod
>  	cmp	r1, #0
>  	beq	LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
>  	push	{r0, r1, lr}
>  	bl	LSYM(udivsi3_skip_div0_test)
>  	POP	{r1, r2, r3}
>  	mul	r2, r0
>  	sub	r1, r1, r2
>  	bx	r3
> +# else
> +	/* Both the quotient and remainder are calculated simultaneously
> +	   in THUMB1_Div_Positive.  There is no need to calculate the
> +	   remainder again here.  */
> +	b	LSYM(udivsi3_skip_div0_test)
> +	RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  ARM_FUNC_START aeabi_uidivmod
>  	cmp	r1, #0
> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>  	RET
>  	
>  #else  /* ARM version.  */
> -	
> +
>  	FUNC_START umodsi3
>  
>  	subs	r2, r1, #1			@ compare divisor with 1
> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>  
>  #if defined(__prefer_thumb__)
>  
> -	FUNC_START divsi3	
> +	FUNC_START divsi3
>  	FUNC_ALIAS aeabi_idiv divsi3
> +#if defined(__OPTIMIZE_SIZE__)
>  
>  	cmp	divisor, #0
>  	beq	LSYM(Ldiv0)
> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>  	blo	LSYM(Lgot_result)
>  
>  	THUMB_DIV_MOD_BODY 0
> -	
> +
>  	mov	r0, result
>  	mov	work, ip
>  	cmp	work, #0
> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>  LSYM(Lover12):
>  	pop	{ work }
>  	RET
> +#else
> +	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
> +	   used in ARMv6-M when we need an efficient implementation.  */
> +LSYM(divsi3_skip_div0_test):
> +	cpy	curbit, dividend
> +	orr	curbit, divisor
> +	bmi	LSYM(Lthumb1_div_negative)
> +
> +LSYM(Lthumb1_div_positive):
> +	THUMB1_Div_Positive
> +
> +LSYM(Lthumb1_div_negative):
> +	THUMB1_Div_Negative
> +
> +#endif /* __OPTIMIZE_SIZE__ */
>  
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  
> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>  	RET
>  
>  #else /* ARM/Thumb-2 version.  */
> -	
> -	ARM_FUNC_START divsi3	
> +
> +	ARM_FUNC_START divsi3
>  	ARM_FUNC_ALIAS aeabi_idiv divsi3
>  
>  	cmp	r1, #0
> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>  FUNC_START aeabi_idivmod
>  	cmp	r1, #0
>  	beq	LSYM(Ldiv0)
> +# if defined(__OPTIMIZE_SIZE__)
>  	push	{r0, r1, lr}
>  	bl	LSYM(divsi3_skip_div0_test)
>  	POP	{r1, r2, r3}
>  	mul	r2, r0
>  	sub	r1, r1, r2
>  	bx	r3
> +# else
> +	/* Both the quotient and remainder are calculated simultaneously
> +	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
> +	   need to calculate the remainder again here.  */
> +	b	LSYM(divsi3_skip_div0_test)
> +	RET
> +# endif /* __OPTIMIZE_SIZE__ */
> +
>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>  ARM_FUNC_START aeabi_idivmod
>  	cmp 	r1, #0
> -- 1.9.1
> 

Otherwise OK if no regressions and the following request passes.

Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ? 

regards
Ramana

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2016-07-01 13:41 ` [PATCHv2, " Ramana Radhakrishnan
@ 2016-07-06 10:52   ` Andre Vieira (lists)
  2016-07-11 17:09     ` Andre Vieira (lists)
  0 siblings, 1 reply; 10+ messages in thread
From: Andre Vieira (lists) @ 2016-07-06 10:52 UTC (permalink / raw)
  To: gcc-patches; +Cc: Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 11527 bytes --]

On 01/07/16 14:40, Ramana Radhakrishnan wrote:
> 
> 
> On 13/10/15 18:01, Andre Vieira wrote:
>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>>
>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
> 
> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
> 
>>
>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>>
>> Make check passed for armv6-m.
>>
>> libgcc/ChangeLog:
>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>
>>   * config/arm/lib1funcs.S: Add new wrapper.
>>
>> 0001-integer-division.patch
>>
>>
>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>> From: Andre Simoes Dias Vieira <andsim01@arm.com>
>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>> Subject: [PATCH] new wrapper idivmod
>>
>> ---
>>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 217 insertions(+), 33 deletions(-)
>>
>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>> --- a/libgcc/config/arm/lib1funcs.S
>> +++ b/libgcc/config/arm/lib1funcs.S
>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>>  #ifdef __ARM_EABI__
>>  .macro THUMB_LDIV0 name signed
>>  #if defined(__ARM_ARCH_6M__)
>> -	.ifc \signed, unsigned
>> -	cmp	r0, #0
>> -	beq	1f
>> -	mov	r0, #0
>> -	mvn	r0, r0		@ 0xffffffff
>> -1:
>> -	.else
>> -	cmp	r0, #0
>> -	beq	2f
>> -	blt	3f
>> +
>> +	push	{r0, lr}
>>  	mov	r0, #0
>> -	mvn	r0, r0
>> -	lsr	r0, r0, #1	@ 0x7fffffff
>> -	b	2f
>> -3:	mov	r0, #0x80
>> -	lsl	r0, r0, #24	@ 0x80000000
>> -2:
>> -	.endif
>> -	push	{r0, r1, r2}
>> -	ldr	r0, 4f
>> -	adr	r1, 4f
>> -	add	r0, r1
>> -	str	r0, [sp, #8]
>> -	@ We know we are not on armv4t, so pop pc is safe.
>> -	pop	{r0, r1, pc}
>> -	.align	2
>> -4:
>> -	.word	__aeabi_idiv0 - 4b
>> +	bl	SYM(__aeabi_idiv0)
>> +	pop	{r1, pc}
>> +
> 
> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
> 
>>  #elif defined(__thumb2__)
>>  	.syntax unified
>>  	.ifc \signed, unsigned
>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>>  	add	dividend, work
>>    .endif
>>  LSYM(Lgot_result):
>> -.endm	
>> +.endm
>> +
>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>> +/* If performance is preferred, the following functions are provided.  */
>> +
> 
> Comment above #if please and also check elsewhere in patch.
> 
>> +/* Branch to div(n), and jump to label if curbit is lo than divisior.  */
>> +.macro BranchToDiv n, label
>> +	lsr	curbit, dividend, \n
>> +	cmp	curbit, divisor
>> +	blo	\label
>> +.endm
>> +
>> +/* Body of div(n).  Shift the divisor in n bits and compare the divisor
>> +   and dividend.  Update the dividend as the substruction result.  */
>> +.macro DoDiv n
>> +	lsr	curbit, dividend, \n
>> +	cmp	curbit, divisor
>> +	bcc	1f
>> +	lsl	curbit, divisor, \n
>> +	sub	dividend, dividend, curbit
>> +
>> +1:	adc	result, result
>> +.endm
>> +
>> +/* The body of division with positive divisor.  Unless the divisor is very
>> +   big, shift it up in multiples of four bits, since this is the amount of
>> +   unwinding in the main division loop.  Continue shifting until the divisor
>> +   is larger than the dividend.  */
>> +.macro THUMB1_Div_Positive
>> +	mov	result, #0
>> +	BranchToDiv #1, LSYM(Lthumb1_div1)
>> +	BranchToDiv #4, LSYM(Lthumb1_div4)
>> +	BranchToDiv #8, LSYM(Lthumb1_div8)
>> +	BranchToDiv #12, LSYM(Lthumb1_div12)
>> +	BranchToDiv #16, LSYM(Lthumb1_div16)
>> +LSYM(Lthumb1_div_large_positive):
>> +	mov	result, #0xff
>> +	lsl	divisor, divisor, #8
>> +	rev	result, result
>> +	lsr	curbit, dividend, #16
>> +	cmp	curbit, divisor
>> +	blo	1f
>> +	asr	result, #8
>> +	lsl	divisor, divisor, #8
>> +	beq	LSYM(Ldivbyzero_waypoint)
>> +
>> +1:	lsr	curbit, dividend, #12
>> +	cmp	curbit, divisor
>> +	blo	LSYM(Lthumb1_div12)
>> +	b	LSYM(Lthumb1_div16)
>> +LSYM(Lthumb1_div_loop):
>> +	lsr	divisor, divisor, #8
>> +LSYM(Lthumb1_div16):
>> +	Dodiv	#15
>> +	Dodiv	#14
>> +	Dodiv	#13
>> +	Dodiv	#12
>> +LSYM(Lthumb1_div12):
>> +	Dodiv	#11
>> +	Dodiv	#10
>> +	Dodiv	#9
>> +	Dodiv	#8
>> +	bcs	LSYM(Lthumb1_div_loop)
>> +LSYM(Lthumb1_div8):
>> +	Dodiv	#7
>> +	Dodiv	#6
>> +	Dodiv	#5
>> +LSYM(Lthumb1_div5):
>> +	Dodiv	#4
>> +LSYM(Lthumb1_div4):
>> +	Dodiv	#3
>> +LSYM(Lthumb1_div3):
>> +	Dodiv	#2
>> +LSYM(Lthumb1_div2):
>> +	Dodiv	#1
>> +LSYM(Lthumb1_div1):
>> +	sub	divisor, dividend, divisor
>> +	bcs	1f
>> +	cpy	divisor, dividend
>> +
>> +1:	adc	result, result
>> +	cpy	dividend, result
>> +	RET
>> +
>> +LSYM(Ldivbyzero_waypoint):
>> +	b	LSYM(Ldiv0)
>> +.endm
>> +
>> +/* The body of division with negative divisor.  Similar with
>> +   THUMB1_Div_Positive except that the shift steps are in multiples
>> +   of six bits.  */
>> +.macro THUMB1_Div_Negative
>> +	lsr	result, divisor, #31
>> +	beq	1f
>> +	neg	divisor, divisor
>> +
>> +1:	asr	curbit, dividend, #32
>> +	bcc	2f
>> +	neg	dividend, dividend
>> +
>> +2:	eor	curbit, result
>> +	mov	result, #0
>> +	cpy	ip, curbit
>> +	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>> +	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>> +LSYM(Lthumb1_div_large):
>> +	mov	result, #0xfc
>> +	lsl	divisor, divisor, #6
>> +	rev	result, result
>> +	lsr	curbit, dividend, #8
>> +	cmp	curbit, divisor
>> +	blo	LSYM(Lthumb1_div_negative8)
>> +
>> +	lsl	divisor, divisor, #6
>> +	asr	result, result, #6
>> +	cmp	curbit, divisor
>> +	blo	LSYM(Lthumb1_div_negative8)
>> +
>> +	lsl	divisor, divisor, #6
>> +	asr	result, result, #6
>> +	cmp	curbit, divisor
>> +	blo	LSYM(Lthumb1_div_negative8)
>> +
>> +	lsl	divisor, divisor, #6
>> +	beq	LSYM(Ldivbyzero_negative)
>> +	asr	result, result, #6
>> +	b	LSYM(Lthumb1_div_negative8)
>> +LSYM(Lthumb1_div_negative_loop):
>> +	lsr	divisor, divisor, #6
>> +LSYM(Lthumb1_div_negative8):
>> +	DoDiv	#7
>> +	DoDiv	#6
>> +	DoDiv	#5
>> +	DoDiv	#4
>> +LSYM(Lthumb1_div_negative4):
>> +	DoDiv	#3
>> +	DoDiv	#2
>> +	bcs	LSYM(Lthumb1_div_negative_loop)
>> +	DoDiv	#1
>> +	sub	divisor, dividend, divisor
>> +	bcs	1f
>> +	cpy	divisor, dividend
>> +
>> +1:	cpy	curbit, ip
>> +	adc	result, result
>> +	asr	curbit, curbit, #1
>> +	cpy	dividend, result
>> +	bcc	2f
>> +	neg	dividend, dividend
>> +	cmp	curbit, #0
>> +
>> +2:	bpl	3f
>> +	neg	divisor, divisor
>> +
>> +3:	RET
>> +
>> +LSYM(Ldivbyzero_negative):
>> +	cpy	curbit, ip
>> +	asr	curbit, curbit, #1
>> +	bcc	LSYM(Ldiv0)
>> +	neg	dividend, dividend
>> +.endm
>> +#endif /* ARM Thumb version.  */
>> +
>>  /* ------------------------------------------------------------------------ */
>>  /*		Start of the Real Functions				    */
>>  /* ------------------------------------------------------------------------ */
>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>  
>>  	FUNC_START udivsi3
>>  	FUNC_ALIAS aeabi_uidiv udivsi3
>> +#if defined(__OPTIMIZE_SIZE__)
>>  
>>  	cmp	divisor, #0
>>  	beq	LSYM(Ldiv0)
>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>>  	pop	{ work }
>>  	RET
>>  
>> +#else
>> +	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
>> +	   used in ARMv6-M when we need an efficient implementation.  */
>> +LSYM(udivsi3_skip_div0_test):
>> +	THUMB1_Div_Positive
>> +
>> +#endif /* __OPTIMIZE_SIZE__ */
>> +
>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>  
>>  	ARM_FUNC_START udivsi3
>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>>  FUNC_START aeabi_uidivmod
>>  	cmp	r1, #0
>>  	beq	LSYM(Ldiv0)
>> +# if defined(__OPTIMIZE_SIZE__)
>>  	push	{r0, r1, lr}
>>  	bl	LSYM(udivsi3_skip_div0_test)
>>  	POP	{r1, r2, r3}
>>  	mul	r2, r0
>>  	sub	r1, r1, r2
>>  	bx	r3
>> +# else
>> +	/* Both the quotient and remainder are calculated simultaneously
>> +	   in THUMB1_Div_Positive.  There is no need to calculate the
>> +	   remainder again here.  */
>> +	b	LSYM(udivsi3_skip_div0_test)
>> +	RET
>> +# endif /* __OPTIMIZE_SIZE__ */
>> +
>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>  ARM_FUNC_START aeabi_uidivmod
>>  	cmp	r1, #0
>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>>  	RET
>>  	
>>  #else  /* ARM version.  */
>> -	
>> +
>>  	FUNC_START umodsi3
>>  
>>  	subs	r2, r1, #1			@ compare divisor with 1
>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>  
>>  #if defined(__prefer_thumb__)
>>  
>> -	FUNC_START divsi3	
>> +	FUNC_START divsi3
>>  	FUNC_ALIAS aeabi_idiv divsi3
>> +#if defined(__OPTIMIZE_SIZE__)
>>  
>>  	cmp	divisor, #0
>>  	beq	LSYM(Ldiv0)
>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>>  	blo	LSYM(Lgot_result)
>>  
>>  	THUMB_DIV_MOD_BODY 0
>> -	
>> +
>>  	mov	r0, result
>>  	mov	work, ip
>>  	cmp	work, #0
>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>>  LSYM(Lover12):
>>  	pop	{ work }
>>  	RET
>> +#else
>> +	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
>> +	   used in ARMv6-M when we need an efficient implementation.  */
>> +LSYM(divsi3_skip_div0_test):
>> +	cpy	curbit, dividend
>> +	orr	curbit, divisor
>> +	bmi	LSYM(Lthumb1_div_negative)
>> +
>> +LSYM(Lthumb1_div_positive):
>> +	THUMB1_Div_Positive
>> +
>> +LSYM(Lthumb1_div_negative):
>> +	THUMB1_Div_Negative
>> +
>> +#endif /* __OPTIMIZE_SIZE__ */
>>  
>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>  
>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>>  	RET
>>  
>>  #else /* ARM/Thumb-2 version.  */
>> -	
>> -	ARM_FUNC_START divsi3	
>> +
>> +	ARM_FUNC_START divsi3
>>  	ARM_FUNC_ALIAS aeabi_idiv divsi3
>>  
>>  	cmp	r1, #0
>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>>  FUNC_START aeabi_idivmod
>>  	cmp	r1, #0
>>  	beq	LSYM(Ldiv0)
>> +# if defined(__OPTIMIZE_SIZE__)
>>  	push	{r0, r1, lr}
>>  	bl	LSYM(divsi3_skip_div0_test)
>>  	POP	{r1, r2, r3}
>>  	mul	r2, r0
>>  	sub	r1, r1, r2
>>  	bx	r3
>> +# else
>> +	/* Both the quotient and remainder are calculated simultaneously
>> +	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
>> +	   need to calculate the remainder again here.  */
>> +	b	LSYM(divsi3_skip_div0_test)
>> +	RET
>> +# endif /* __OPTIMIZE_SIZE__ */
>> +
>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>  ARM_FUNC_START aeabi_idivmod
>>  	cmp 	r1, #0
>> -- 1.9.1
>>
> 
> Otherwise OK if no regressions and the following request passes.
> 
> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ? 
> 
> regards
> Ramana
> 
Hi Ramana,

Thank you for the comments. Sorry about the license, must have been a
mixup somewhere.

I put back the 'pop pc is safe' assembly comment and I moved some
comments before the #if and #else as requested. I left some in place
because they did not apply to the whole block but simply to the first
assembly instruction after the #if/else.

I checked that the assembly generated for libgcc was the same with and
without the patch for armv7-a in arm mode and armv7-m in thumb mode.

Is this OK?

Cheers,
Andre

libgcc/ChangeLog:
2016-07-06  Hale Wang  <hale.wang@arm.com>
            Andre Vieira  <andre.simoesdiasvieira@arm.com>

   * config/arm/lib1funcs.S: Add new wrapper.

[-- Attachment #2: 0001-integer-division.patch --]
[-- Type: text/x-patch, Size: 8194 bytes --]

From b5b129e698b9e7446907d3da1fbce0236b09b67c Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Mon, 16 May 2016 18:34:52 +0100
Subject: [PATCH 1/2] integer division

---
 libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 218 insertions(+), 32 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 375a5135110895faa44267ebee045fd315515027..8a245b7a9333b249f120cd7e7ecc77248b150610 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -306,34 +306,13 @@ LSYM(Lend_fde):
 #ifdef __ARM_EABI__
 .macro THUMB_LDIV0 name signed
 #if defined(__ARM_ARCH_6M__)
-	.ifc \signed, unsigned
-	cmp	r0, #0
-	beq	1f
-	mov	r0, #0
-	mvn	r0, r0		@ 0xffffffff
-1:
-	.else
-	cmp	r0, #0
-	beq	2f
-	blt	3f
+
+	push	{r0, lr}
 	mov	r0, #0
-	mvn	r0, r0
-	lsr	r0, r0, #1	@ 0x7fffffff
-	b	2f
-3:	mov	r0, #0x80
-	lsl	r0, r0, #24	@ 0x80000000
-2:
-	.endif
-	push	{r0, r1, r2}
-	ldr	r0, 4f
-	adr	r1, 4f
-	add	r0, r1
-	str	r0, [sp, #8]
+	bl	SYM(__aeabi_idiv0)
 	@ We know we are not on armv4t, so pop pc is safe.
-	pop	{r0, r1, pc}
-	.align	2
-4:
-	.word	__aeabi_idiv0 - 4b
+	pop	{r1, pc}
+
 #elif defined(__thumb2__)
 	.syntax unified
 	.ifc \signed, unsigned
@@ -945,7 +924,170 @@ LSYM(Lover7):
 	add	dividend, work
   .endif
 LSYM(Lgot_result):
-.endm	
+.endm
+
+/* If performance is preferred, the following functions are provided.  */
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+
+/* Branch to div(n), and jump to label if curbit is lower than divisor.  */
+.macro BranchToDiv n, label
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	blo	\label
+.endm
+
+/* Body of div(n).  Shift the divisor in n bits and compare the divisor
+   and dividend.  Update the dividend as the subtraction result.  */
+.macro DoDiv n
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	1f
+	lsl	curbit, divisor, \n
+	sub	dividend, dividend, curbit
+
+1:	adc	result, result
+.endm
+
+/* The body of division with positive divisor.  Unless the divisor is very
+   big, shift it up in multiples of four bits, since this is the amount of
+   unwinding in the main division loop.  Continue shifting until the divisor
+   is larger than the dividend.  */
+.macro THUMB1_Div_Positive
+	mov	result, #0
+	BranchToDiv #1, LSYM(Lthumb1_div1)
+	BranchToDiv #4, LSYM(Lthumb1_div4)
+	BranchToDiv #8, LSYM(Lthumb1_div8)
+	BranchToDiv #12, LSYM(Lthumb1_div12)
+	BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+	mov	result, #0xff
+	lsl	divisor, divisor, #8
+	rev	result, result
+	lsr	curbit, dividend, #16
+	cmp	curbit, divisor
+	blo	1f
+	asr	result, #8
+	lsl	divisor, divisor, #8
+	beq	LSYM(Ldivbyzero_waypoint)
+
+1:	lsr	curbit, dividend, #12
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div12)
+	b	LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+	lsr	divisor, divisor, #8
+LSYM(Lthumb1_div16):
+	Dodiv	#15
+	Dodiv	#14
+	Dodiv	#13
+	Dodiv	#12
+LSYM(Lthumb1_div12):
+	Dodiv	#11
+	Dodiv	#10
+	Dodiv	#9
+	Dodiv	#8
+	bcs	LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+	Dodiv	#7
+	Dodiv	#6
+	Dodiv	#5
+LSYM(Lthumb1_div5):
+	Dodiv	#4
+LSYM(Lthumb1_div4):
+	Dodiv	#3
+LSYM(Lthumb1_div3):
+	Dodiv	#2
+LSYM(Lthumb1_div2):
+	Dodiv	#1
+LSYM(Lthumb1_div1):
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	adc	result, result
+	cpy	dividend, result
+	RET
+
+LSYM(Ldivbyzero_waypoint):
+	b	LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor.  Similar to
+   THUMB1_Div_Positive except that the shift steps are in multiples
+   of six bits.  */
+.macro THUMB1_Div_Negative
+	lsr	result, divisor, #31
+	beq	1f
+	neg	divisor, divisor
+
+1:	asr	curbit, dividend, #32
+	bcc	2f
+	neg	dividend, dividend
+
+2:	eor	curbit, result
+	mov	result, #0
+	cpy	ip, curbit
+	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+	mov	result, #0xfc
+	lsl	divisor, divisor, #6
+	rev	result, result
+	lsr	curbit, dividend, #8
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	beq	LSYM(Ldivbyzero_negative)
+	asr	result, result, #6
+	b	LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+	lsr	divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+	DoDiv	#4
+LSYM(Lthumb1_div_negative4):
+	DoDiv	#3
+	DoDiv	#2
+	bcs	LSYM(Lthumb1_div_negative_loop)
+	DoDiv	#1
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	cpy	curbit, ip
+	adc	result, result
+	asr	curbit, curbit, #1
+	cpy	dividend, result
+	bcc	2f
+	neg	dividend, dividend
+	cmp	curbit, #0
+
+2:	bpl	3f
+	neg	divisor, divisor
+
+3:	RET
+
+LSYM(Ldivbyzero_negative):
+	cpy	curbit, ip
+	asr	curbit, curbit, #1
+	bcc	LSYM(Ldiv0)
+	neg	dividend, dividend
+.endm
+#endif /* ARM Thumb version.  */
+
 /* ------------------------------------------------------------------------ */
 /*		Start of the Real Functions				    */
 /* ------------------------------------------------------------------------ */
@@ -955,6 +1097,7 @@ LSYM(Lgot_result):
 
 	FUNC_START udivsi3
 	FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -972,6 +1115,14 @@ LSYM(udivsi3_skip_div0_test):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(udivsi3_skip_div0_test):
+	THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START udivsi3
@@ -1023,12 +1174,21 @@ LSYM(udivsi3_skip_div0_test):
 FUNC_START aeabi_uidivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(udivsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive.  There is no need to calculate the
+	   remainder again here.  */
+	b	LSYM(udivsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_uidivmod
 	cmp	r1, #0
@@ -1084,7 +1244,7 @@ LSYM(Lover10):
 	RET
 	
 #else  /* ARM version.  */
-	
+
 	FUNC_START umodsi3
 
 	subs	r2, r1, #1			@ compare divisor with 1
@@ -1109,8 +1269,9 @@ LSYM(Lover10):
 
 #if defined(__prefer_thumb__)
 
-	FUNC_START divsi3	
+	FUNC_START divsi3
 	FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -1133,7 +1294,7 @@ LSYM(Lover11):
 	blo	LSYM(Lgot_result)
 
 	THUMB_DIV_MOD_BODY 0
-	
+
 	mov	r0, result
 	mov	work, ip
 	cmp	work, #0
@@ -1143,6 +1304,22 @@ LSYM(Lover12):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_idiv for ARMv6m.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(divsi3_skip_div0_test):
+	cpy	curbit, dividend
+	orr	curbit, divisor
+	bmi	LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+	THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+	THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START divsi3
@@ -1154,8 +1331,8 @@ LSYM(Lover12):
 	RET
 
 #else /* ARM/Thumb-2 version.  */
-	
-	ARM_FUNC_START divsi3	
+
+	ARM_FUNC_START divsi3
 	ARM_FUNC_ALIAS aeabi_idiv divsi3
 
 	cmp	r1, #0
@@ -1209,12 +1386,21 @@ LSYM(divsi3_skip_div0_test):
 FUNC_START aeabi_idivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(divsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
+	   need to calculate the remainder again here.  */
+	b	LSYM(divsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_idivmod
 	cmp 	r1, #0
-- 
1.9.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2016-07-06 10:52   ` Andre Vieira (lists)
@ 2016-07-11 17:09     ` Andre Vieira (lists)
  2016-07-12 10:41       ` Andre Vieira (lists)
  0 siblings, 1 reply; 10+ messages in thread
From: Andre Vieira (lists) @ 2016-07-11 17:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: Ramana Radhakrishnan

[-- Attachment #1: Type: text/plain, Size: 12402 bytes --]

On 06/07/16 11:52, Andre Vieira (lists) wrote:
> On 01/07/16 14:40, Ramana Radhakrishnan wrote:
>>
>>
>> On 13/10/15 18:01, Andre Vieira wrote:
>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>>>
>>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
>>
>> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
>>
>>>
>>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>>>
>>> Make check passed for armv6-m.
>>>
>>> libgcc/ChangeLog:
>>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>>
>>>   * config/arm/lib1funcs.S: Add new wrapper.
>>>
>>> 0001-integer-division.patch
>>>
>>>
>>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>>> From: Andre Simoes Dias Vieira <andsim01@arm.com>
>>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>>> Subject: [PATCH] new wrapper idivmod
>>>
>>> ---
>>>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>>>  1 file changed, 217 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>>> --- a/libgcc/config/arm/lib1funcs.S
>>> +++ b/libgcc/config/arm/lib1funcs.S
>>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>>>  #ifdef __ARM_EABI__
>>>  .macro THUMB_LDIV0 name signed
>>>  #if defined(__ARM_ARCH_6M__)
>>> -	.ifc \signed, unsigned
>>> -	cmp	r0, #0
>>> -	beq	1f
>>> -	mov	r0, #0
>>> -	mvn	r0, r0		@ 0xffffffff
>>> -1:
>>> -	.else
>>> -	cmp	r0, #0
>>> -	beq	2f
>>> -	blt	3f
>>> +
>>> +	push	{r0, lr}
>>>  	mov	r0, #0
>>> -	mvn	r0, r0
>>> -	lsr	r0, r0, #1	@ 0x7fffffff
>>> -	b	2f
>>> -3:	mov	r0, #0x80
>>> -	lsl	r0, r0, #24	@ 0x80000000
>>> -2:
>>> -	.endif
>>> -	push	{r0, r1, r2}
>>> -	ldr	r0, 4f
>>> -	adr	r1, 4f
>>> -	add	r0, r1
>>> -	str	r0, [sp, #8]
>>> -	@ We know we are not on armv4t, so pop pc is safe.
>>> -	pop	{r0, r1, pc}
>>> -	.align	2
>>> -4:
>>> -	.word	__aeabi_idiv0 - 4b
>>> +	bl	SYM(__aeabi_idiv0)
>>> +	pop	{r1, pc}
>>> +
>>
>> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
>>
>>>  #elif defined(__thumb2__)
>>>  	.syntax unified
>>>  	.ifc \signed, unsigned
>>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>>>  	add	dividend, work
>>>    .endif
>>>  LSYM(Lgot_result):
>>> -.endm	
>>> +.endm
>>> +
>>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>>> +/* If performance is preferred, the following functions are provided.  */
>>> +
>>
>> Comment above #if please and also check elsewhere in patch.
>>
>>> +/* Branch to div(n), and jump to label if curbit is lo than divisior.  */
>>> +.macro BranchToDiv n, label
>>> +	lsr	curbit, dividend, \n
>>> +	cmp	curbit, divisor
>>> +	blo	\label
>>> +.endm
>>> +
>>> +/* Body of div(n).  Shift the divisor in n bits and compare the divisor
>>> +   and dividend.  Update the dividend as the substruction result.  */
>>> +.macro DoDiv n
>>> +	lsr	curbit, dividend, \n
>>> +	cmp	curbit, divisor
>>> +	bcc	1f
>>> +	lsl	curbit, divisor, \n
>>> +	sub	dividend, dividend, curbit
>>> +
>>> +1:	adc	result, result
>>> +.endm
>>> +
>>> +/* The body of division with positive divisor.  Unless the divisor is very
>>> +   big, shift it up in multiples of four bits, since this is the amount of
>>> +   unwinding in the main division loop.  Continue shifting until the divisor
>>> +   is larger than the dividend.  */
>>> +.macro THUMB1_Div_Positive
>>> +	mov	result, #0
>>> +	BranchToDiv #1, LSYM(Lthumb1_div1)
>>> +	BranchToDiv #4, LSYM(Lthumb1_div4)
>>> +	BranchToDiv #8, LSYM(Lthumb1_div8)
>>> +	BranchToDiv #12, LSYM(Lthumb1_div12)
>>> +	BranchToDiv #16, LSYM(Lthumb1_div16)
>>> +LSYM(Lthumb1_div_large_positive):
>>> +	mov	result, #0xff
>>> +	lsl	divisor, divisor, #8
>>> +	rev	result, result
>>> +	lsr	curbit, dividend, #16
>>> +	cmp	curbit, divisor
>>> +	blo	1f
>>> +	asr	result, #8
>>> +	lsl	divisor, divisor, #8
>>> +	beq	LSYM(Ldivbyzero_waypoint)
>>> +
>>> +1:	lsr	curbit, dividend, #12
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div12)
>>> +	b	LSYM(Lthumb1_div16)
>>> +LSYM(Lthumb1_div_loop):
>>> +	lsr	divisor, divisor, #8
>>> +LSYM(Lthumb1_div16):
>>> +	Dodiv	#15
>>> +	Dodiv	#14
>>> +	Dodiv	#13
>>> +	Dodiv	#12
>>> +LSYM(Lthumb1_div12):
>>> +	Dodiv	#11
>>> +	Dodiv	#10
>>> +	Dodiv	#9
>>> +	Dodiv	#8
>>> +	bcs	LSYM(Lthumb1_div_loop)
>>> +LSYM(Lthumb1_div8):
>>> +	Dodiv	#7
>>> +	Dodiv	#6
>>> +	Dodiv	#5
>>> +LSYM(Lthumb1_div5):
>>> +	Dodiv	#4
>>> +LSYM(Lthumb1_div4):
>>> +	Dodiv	#3
>>> +LSYM(Lthumb1_div3):
>>> +	Dodiv	#2
>>> +LSYM(Lthumb1_div2):
>>> +	Dodiv	#1
>>> +LSYM(Lthumb1_div1):
>>> +	sub	divisor, dividend, divisor
>>> +	bcs	1f
>>> +	cpy	divisor, dividend
>>> +
>>> +1:	adc	result, result
>>> +	cpy	dividend, result
>>> +	RET
>>> +
>>> +LSYM(Ldivbyzero_waypoint):
>>> +	b	LSYM(Ldiv0)
>>> +.endm
>>> +
>>> +/* The body of division with negative divisor.  Similar with
>>> +   THUMB1_Div_Positive except that the shift steps are in multiples
>>> +   of six bits.  */
>>> +.macro THUMB1_Div_Negative
>>> +	lsr	result, divisor, #31
>>> +	beq	1f
>>> +	neg	divisor, divisor
>>> +
>>> +1:	asr	curbit, dividend, #32
>>> +	bcc	2f
>>> +	neg	dividend, dividend
>>> +
>>> +2:	eor	curbit, result
>>> +	mov	result, #0
>>> +	cpy	ip, curbit
>>> +	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>>> +	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>>> +LSYM(Lthumb1_div_large):
>>> +	mov	result, #0xfc
>>> +	lsl	divisor, divisor, #6
>>> +	rev	result, result
>>> +	lsr	curbit, dividend, #8
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div_negative8)
>>> +
>>> +	lsl	divisor, divisor, #6
>>> +	asr	result, result, #6
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div_negative8)
>>> +
>>> +	lsl	divisor, divisor, #6
>>> +	asr	result, result, #6
>>> +	cmp	curbit, divisor
>>> +	blo	LSYM(Lthumb1_div_negative8)
>>> +
>>> +	lsl	divisor, divisor, #6
>>> +	beq	LSYM(Ldivbyzero_negative)
>>> +	asr	result, result, #6
>>> +	b	LSYM(Lthumb1_div_negative8)
>>> +LSYM(Lthumb1_div_negative_loop):
>>> +	lsr	divisor, divisor, #6
>>> +LSYM(Lthumb1_div_negative8):
>>> +	DoDiv	#7
>>> +	DoDiv	#6
>>> +	DoDiv	#5
>>> +	DoDiv	#4
>>> +LSYM(Lthumb1_div_negative4):
>>> +	DoDiv	#3
>>> +	DoDiv	#2
>>> +	bcs	LSYM(Lthumb1_div_negative_loop)
>>> +	DoDiv	#1
>>> +	sub	divisor, dividend, divisor
>>> +	bcs	1f
>>> +	cpy	divisor, dividend
>>> +
>>> +1:	cpy	curbit, ip
>>> +	adc	result, result
>>> +	asr	curbit, curbit, #1
>>> +	cpy	dividend, result
>>> +	bcc	2f
>>> +	neg	dividend, dividend
>>> +	cmp	curbit, #0
>>> +
>>> +2:	bpl	3f
>>> +	neg	divisor, divisor
>>> +
>>> +3:	RET
>>> +
>>> +LSYM(Ldivbyzero_negative):
>>> +	cpy	curbit, ip
>>> +	asr	curbit, curbit, #1
>>> +	bcc	LSYM(Ldiv0)
>>> +	neg	dividend, dividend
>>> +.endm
>>> +#endif /* ARM Thumb version.  */
>>> +
>>>  /* ------------------------------------------------------------------------ */
>>>  /*		Start of the Real Functions				    */
>>>  /* ------------------------------------------------------------------------ */
>>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>>  
>>>  	FUNC_START udivsi3
>>>  	FUNC_ALIAS aeabi_uidiv udivsi3
>>> +#if defined(__OPTIMIZE_SIZE__)
>>>  
>>>  	cmp	divisor, #0
>>>  	beq	LSYM(Ldiv0)
>>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>>>  	pop	{ work }
>>>  	RET
>>>  
>>> +#else
>>> +	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>> +LSYM(udivsi3_skip_div0_test):
>>> +	THUMB1_Div_Positive
>>> +
>>> +#endif /* __OPTIMIZE_SIZE__ */
>>> +
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  
>>>  	ARM_FUNC_START udivsi3
>>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>>>  FUNC_START aeabi_uidivmod
>>>  	cmp	r1, #0
>>>  	beq	LSYM(Ldiv0)
>>> +# if defined(__OPTIMIZE_SIZE__)
>>>  	push	{r0, r1, lr}
>>>  	bl	LSYM(udivsi3_skip_div0_test)
>>>  	POP	{r1, r2, r3}
>>>  	mul	r2, r0
>>>  	sub	r1, r1, r2
>>>  	bx	r3
>>> +# else
>>> +	/* Both the quotient and remainder are calculated simultaneously
>>> +	   in THUMB1_Div_Positive.  There is no need to calculate the
>>> +	   remainder again here.  */
>>> +	b	LSYM(udivsi3_skip_div0_test)
>>> +	RET
>>> +# endif /* __OPTIMIZE_SIZE__ */
>>> +
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  ARM_FUNC_START aeabi_uidivmod
>>>  	cmp	r1, #0
>>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>>>  	RET
>>>  	
>>>  #else  /* ARM version.  */
>>> -	
>>> +
>>>  	FUNC_START umodsi3
>>>  
>>>  	subs	r2, r1, #1			@ compare divisor with 1
>>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>>  
>>>  #if defined(__prefer_thumb__)
>>>  
>>> -	FUNC_START divsi3	
>>> +	FUNC_START divsi3
>>>  	FUNC_ALIAS aeabi_idiv divsi3
>>> +#if defined(__OPTIMIZE_SIZE__)
>>>  
>>>  	cmp	divisor, #0
>>>  	beq	LSYM(Ldiv0)
>>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>>>  	blo	LSYM(Lgot_result)
>>>  
>>>  	THUMB_DIV_MOD_BODY 0
>>> -	
>>> +
>>>  	mov	r0, result
>>>  	mov	work, ip
>>>  	cmp	work, #0
>>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>>>  LSYM(Lover12):
>>>  	pop	{ work }
>>>  	RET
>>> +#else
>>> +	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>> +LSYM(divsi3_skip_div0_test):
>>> +	cpy	curbit, dividend
>>> +	orr	curbit, divisor
>>> +	bmi	LSYM(Lthumb1_div_negative)
>>> +
>>> +LSYM(Lthumb1_div_positive):
>>> +	THUMB1_Div_Positive
>>> +
>>> +LSYM(Lthumb1_div_negative):
>>> +	THUMB1_Div_Negative
>>> +
>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>  
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  
>>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>>>  	RET
>>>  
>>>  #else /* ARM/Thumb-2 version.  */
>>> -	
>>> -	ARM_FUNC_START divsi3	
>>> +
>>> +	ARM_FUNC_START divsi3
>>>  	ARM_FUNC_ALIAS aeabi_idiv divsi3
>>>  
>>>  	cmp	r1, #0
>>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>>>  FUNC_START aeabi_idivmod
>>>  	cmp	r1, #0
>>>  	beq	LSYM(Ldiv0)
>>> +# if defined(__OPTIMIZE_SIZE__)
>>>  	push	{r0, r1, lr}
>>>  	bl	LSYM(divsi3_skip_div0_test)
>>>  	POP	{r1, r2, r3}
>>>  	mul	r2, r0
>>>  	sub	r1, r1, r2
>>>  	bx	r3
>>> +# else
>>> +	/* Both the quotient and remainder are calculated simultaneously
>>> +	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
>>> +	   need to calculate the remainder again here.  */
>>> +	b	LSYM(divsi3_skip_div0_test)
>>> +	RET
>>> +# endif /* __OPTIMIZE_SIZE__ */
>>> +
>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>  ARM_FUNC_START aeabi_idivmod
>>>  	cmp 	r1, #0
>>> -- 1.9.1
>>>
>>
>> Otherwise OK if no regressions and the following request passes.
>>
>> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ? 
>>
>> regards
>> Ramana
>>
> Hi Ramana,
> 
> Thank you for the comments. Sorry about the license, must have been a
> mixup somewhere.
> 
> I put back the 'pop pc is safe' assembly comment and I moved some
> comments before the #if and #else as requested. I left some in place
> because they did not apply to the whole block but simply to the first
> assembly instruction after the #if/else.
> 
> I checked that the assembly generated for libgcc was the same with and
> without the patch for armv7-a in arm mode and armv7-m in thumb mode.
> 
> Is this OK?
> 
> Cheers,
> Andre
> 
> libgcc/ChangeLog:
> 2016-07-06  Hale Wang  <hale.wang@arm.com>
>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
> 
>    * config/arm/lib1funcs.S: Add new wrapper.
> 
I had to rebase the patch due to the ARMv8-M patches. This implied
changing a context line that changed due to the code for ARMv6-M being
reused for ARMv8-M Baseline.

I ran regression tests for both ARMv6-M and ARMv8-M Baseline and
compared the generated libgcc for ARMv7-A in ARM mode and ARMv7-M in
Thumb mode, observing no changes.

Applying patch, as it was previously OK'ed.

Cheers,
Andre

[-- Attachment #2: 0001-integer-division.patch --]
[-- Type: text/x-patch, Size: 8188 bytes --]

From ad757144fad2d9608ed840153071bb5d470193ef Mon Sep 17 00:00:00 2001
From: Andre Simoes Dias Vieira <andsim01@arm.com>
Date: Thu, 7 Jul 2016 10:49:19 +0100
Subject: [PATCH] integer division

---
 libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 218 insertions(+), 32 deletions(-)

diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index 96e206ee542126c5d68091087446afe9f01aa51f..ba52e7b762f5573445349a574a3878859a992f13 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -311,34 +311,13 @@ LSYM(Lend_fde):
 #ifdef __ARM_EABI__
 .macro THUMB_LDIV0 name signed
 #ifdef NOT_ISA_TARGET_32BIT
-	.ifc \signed, unsigned
-	cmp	r0, #0
-	beq	1f
-	mov	r0, #0
-	mvn	r0, r0		@ 0xffffffff
-1:
-	.else
-	cmp	r0, #0
-	beq	2f
-	blt	3f
+
+	push	{r0, lr}
 	mov	r0, #0
-	mvn	r0, r0
-	lsr	r0, r0, #1	@ 0x7fffffff
-	b	2f
-3:	mov	r0, #0x80
-	lsl	r0, r0, #24	@ 0x80000000
-2:
-	.endif
-	push	{r0, r1, r2}
-	ldr	r0, 4f
-	adr	r1, 4f
-	add	r0, r1
-	str	r0, [sp, #8]
+	bl	SYM(__aeabi_idiv0)
 	@ We know we are not on armv4t, so pop pc is safe.
-	pop	{r0, r1, pc}
-	.align	2
-4:
-	.word	__aeabi_idiv0 - 4b
+	pop	{r1, pc}
+
 #elif defined(__thumb2__)
 	.syntax unified
 	.ifc \signed, unsigned
@@ -950,7 +929,170 @@ LSYM(Lover7):
 	add	dividend, work
   .endif
 LSYM(Lgot_result):
-.endm	
+.endm
+
+/* If performance is preferred, the following functions are provided.  */
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+
+/* Branch to div(n), and jump to label if curbit is lower than the divisor.  */
+.macro BranchToDiv n, label
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	blo	\label
+.endm
+
+/* Body of div(n).  Shift the divisor in n bits and compare the divisor
+   and dividend.  Update the dividend as the subtraction result.  */
+.macro DoDiv n
+	lsr	curbit, dividend, \n
+	cmp	curbit, divisor
+	bcc	1f
+	lsl	curbit, divisor, \n
+	sub	dividend, dividend, curbit
+
+1:	adc	result, result
+.endm
+
+/* The body of division with positive divisor.  Unless the divisor is very
+   big, shift it up in multiples of four bits, since this is the amount of
+   unwinding in the main division loop.  Continue shifting until the divisor
+   is larger than the dividend.  */
+.macro THUMB1_Div_Positive
+	mov	result, #0
+	BranchToDiv #1, LSYM(Lthumb1_div1)
+	BranchToDiv #4, LSYM(Lthumb1_div4)
+	BranchToDiv #8, LSYM(Lthumb1_div8)
+	BranchToDiv #12, LSYM(Lthumb1_div12)
+	BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+	mov	result, #0xff
+	lsl	divisor, divisor, #8
+	rev	result, result
+	lsr	curbit, dividend, #16
+	cmp	curbit, divisor
+	blo	1f
+	asr	result, #8
+	lsl	divisor, divisor, #8
+	beq	LSYM(Ldivbyzero_waypoint)
+
+1:	lsr	curbit, dividend, #12
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div12)
+	b	LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+	lsr	divisor, divisor, #8
+LSYM(Lthumb1_div16):
+	Dodiv	#15
+	Dodiv	#14
+	Dodiv	#13
+	Dodiv	#12
+LSYM(Lthumb1_div12):
+	Dodiv	#11
+	Dodiv	#10
+	Dodiv	#9
+	Dodiv	#8
+	bcs	LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+	Dodiv	#7
+	Dodiv	#6
+	Dodiv	#5
+LSYM(Lthumb1_div5):
+	Dodiv	#4
+LSYM(Lthumb1_div4):
+	Dodiv	#3
+LSYM(Lthumb1_div3):
+	Dodiv	#2
+LSYM(Lthumb1_div2):
+	Dodiv	#1
+LSYM(Lthumb1_div1):
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	adc	result, result
+	cpy	dividend, result
+	RET
+
+LSYM(Ldivbyzero_waypoint):
+	b	LSYM(Ldiv0)
+.endm
+
+/* The body of division with negative divisor.  Similar to
+   THUMB1_Div_Positive except that the shift steps are in multiples
+   of six bits.  */
+.macro THUMB1_Div_Negative
+	lsr	result, divisor, #31
+	beq	1f
+	neg	divisor, divisor
+
+1:	asr	curbit, dividend, #32
+	bcc	2f
+	neg	dividend, dividend
+
+2:	eor	curbit, result
+	mov	result, #0
+	cpy	ip, curbit
+	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+	mov	result, #0xfc
+	lsl	divisor, divisor, #6
+	rev	result, result
+	lsr	curbit, dividend, #8
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	asr	result, result, #6
+	cmp	curbit, divisor
+	blo	LSYM(Lthumb1_div_negative8)
+
+	lsl	divisor, divisor, #6
+	beq	LSYM(Ldivbyzero_negative)
+	asr	result, result, #6
+	b	LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_negative_loop):
+	lsr	divisor, divisor, #6
+LSYM(Lthumb1_div_negative8):
+	DoDiv	#7
+	DoDiv	#6
+	DoDiv	#5
+	DoDiv	#4
+LSYM(Lthumb1_div_negative4):
+	DoDiv	#3
+	DoDiv	#2
+	bcs	LSYM(Lthumb1_div_negative_loop)
+	DoDiv	#1
+	sub	divisor, dividend, divisor
+	bcs	1f
+	cpy	divisor, dividend
+
+1:	cpy	curbit, ip
+	adc	result, result
+	asr	curbit, curbit, #1
+	cpy	dividend, result
+	bcc	2f
+	neg	dividend, dividend
+	cmp	curbit, #0
+
+2:	bpl	3f
+	neg	divisor, divisor
+
+3:	RET
+
+LSYM(Ldivbyzero_negative):
+	cpy	curbit, ip
+	asr	curbit, curbit, #1
+	bcc	LSYM(Ldiv0)
+	neg	dividend, dividend
+.endm
+#endif /* ARM Thumb version.  */
+
 /* ------------------------------------------------------------------------ */
 /*		Start of the Real Functions				    */
 /* ------------------------------------------------------------------------ */
@@ -960,6 +1102,7 @@ LSYM(Lgot_result):
 
 	FUNC_START udivsi3
 	FUNC_ALIAS aeabi_uidiv udivsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -977,6 +1120,14 @@ LSYM(udivsi3_skip_div0_test):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(udivsi3_skip_div0_test):
+	THUMB1_Div_Positive
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START udivsi3
@@ -1028,12 +1179,21 @@ LSYM(udivsi3_skip_div0_test):
 FUNC_START aeabi_uidivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(udivsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive.  There is no need to calculate the
+	   remainder again here.  */
+	b	LSYM(udivsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_uidivmod
 	cmp	r1, #0
@@ -1089,7 +1249,7 @@ LSYM(Lover10):
 	RET
 	
 #else  /* ARM version.  */
-	
+
 	FUNC_START umodsi3
 
 	subs	r2, r1, #1			@ compare divisor with 1
@@ -1114,8 +1274,9 @@ LSYM(Lover10):
 
 #if defined(__prefer_thumb__)
 
-	FUNC_START divsi3	
+	FUNC_START divsi3
 	FUNC_ALIAS aeabi_idiv divsi3
+#if defined(__OPTIMIZE_SIZE__)
 
 	cmp	divisor, #0
 	beq	LSYM(Ldiv0)
@@ -1138,7 +1299,7 @@ LSYM(Lover11):
 	blo	LSYM(Lgot_result)
 
 	THUMB_DIV_MOD_BODY 0
-	
+
 	mov	r0, result
 	mov	work, ip
 	cmp	work, #0
@@ -1148,6 +1309,22 @@ LSYM(Lover12):
 	pop	{ work }
 	RET
 
+/* Implementation of aeabi_idiv for ARMv6m.  This version is only
+   used in ARMv6-M when we need an efficient implementation.  */
+#else
+LSYM(divsi3_skip_div0_test):
+	cpy	curbit, dividend
+	orr	curbit, divisor
+	bmi	LSYM(Lthumb1_div_negative)
+
+LSYM(Lthumb1_div_positive):
+	THUMB1_Div_Positive
+
+LSYM(Lthumb1_div_negative):
+	THUMB1_Div_Negative
+
+#endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 
 	ARM_FUNC_START divsi3
@@ -1159,8 +1336,8 @@ LSYM(Lover12):
 	RET
 
 #else /* ARM/Thumb-2 version.  */
-	
-	ARM_FUNC_START divsi3	
+
+	ARM_FUNC_START divsi3
 	ARM_FUNC_ALIAS aeabi_idiv divsi3
 
 	cmp	r1, #0
@@ -1214,12 +1391,21 @@ LSYM(divsi3_skip_div0_test):
 FUNC_START aeabi_idivmod
 	cmp	r1, #0
 	beq	LSYM(Ldiv0)
+# if defined(__OPTIMIZE_SIZE__)
 	push	{r0, r1, lr}
 	bl	LSYM(divsi3_skip_div0_test)
 	POP	{r1, r2, r3}
 	mul	r2, r0
 	sub	r1, r1, r2
 	bx	r3
+# else
+	/* Both the quotient and remainder are calculated simultaneously
+	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
+	   need to calculate the remainder again here.  */
+	b	LSYM(divsi3_skip_div0_test)
+	RET
+# endif /* __OPTIMIZE_SIZE__ */
+
 #elif defined(__ARM_ARCH_EXT_IDIV__)
 ARM_FUNC_START aeabi_idivmod
 	cmp 	r1, #0
-- 
1.9.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m
  2016-07-11 17:09     ` Andre Vieira (lists)
@ 2016-07-12 10:41       ` Andre Vieira (lists)
  0 siblings, 0 replies; 10+ messages in thread
From: Andre Vieira (lists) @ 2016-07-12 10:41 UTC (permalink / raw)
  To: gcc-patches

On 11/07/16 18:09, Andre Vieira (lists) wrote:
> On 06/07/16 11:52, Andre Vieira (lists) wrote:
>> On 01/07/16 14:40, Ramana Radhakrishnan wrote:
>>>
>>>
>>> On 13/10/15 18:01, Andre Vieira wrote:
>>>> This patch ports the aeabi_idiv routine from Linaro Cortex-Strings (https://git.linaro.org/toolchain/cortex-strings.git), which was contributed by ARM under Free BSD license.
>>>>
>>>> The new aeabi_idiv routine is used to replace the one in libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1 wrapper. The new routine is under LGPLv3 license.
>>>
>>> This is not under LGPLv3 . It is under GPLv3 with the runtime library exception license, there's a difference. Assuming your licensing expectation is ok .... read on for more of a review.
>>>
>>>>
>>>> The main advantage of this version is that it can improve the performance of the aeabi_idiv function for Thumb1. This solution will also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.
>>>>
>>>> Make check passed for armv6-m.
>>>>
>>>> libgcc/ChangeLog:
>>>> 2015-08-10  Hale Wang  <hale.wang@arm.com>
>>>>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>>>
>>>>   * config/arm/lib1funcs.S: Add new wrapper.
>>>>
>>>> 0001-integer-division.patch
>>>>
>>>>
>>>> From 832a3d6af6f06399f70b5a4ac3727d55960c93b7 Mon Sep 17 00:00:00 2001
>>>> From: Andre Simoes Dias Vieira <andsim01@arm.com>
>>>> Date: Fri, 21 Aug 2015 14:23:28 +0100
>>>> Subject: [PATCH] new wrapper idivmod
>>>>
>>>> ---
>>>>  libgcc/config/arm/lib1funcs.S | 250 ++++++++++++++++++++++++++++++++++++------
>>>>  1 file changed, 217 insertions(+), 33 deletions(-)
>>>>
>>>> diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
>>>> index 252efcbd5385cc58a5ce1e48c6816d36a6f4c797..c9e544114590da8cde88382bea0f67206e593816 100644
>>>> --- a/libgcc/config/arm/lib1funcs.S
>>>> +++ b/libgcc/config/arm/lib1funcs.S
>>>> @@ -306,34 +306,12 @@ LSYM(Lend_fde):
>>>>  #ifdef __ARM_EABI__
>>>>  .macro THUMB_LDIV0 name signed
>>>>  #if defined(__ARM_ARCH_6M__)
>>>> -	.ifc \signed, unsigned
>>>> -	cmp	r0, #0
>>>> -	beq	1f
>>>> -	mov	r0, #0
>>>> -	mvn	r0, r0		@ 0xffffffff
>>>> -1:
>>>> -	.else
>>>> -	cmp	r0, #0
>>>> -	beq	2f
>>>> -	blt	3f
>>>> +
>>>> +	push	{r0, lr}
>>>>  	mov	r0, #0
>>>> -	mvn	r0, r0
>>>> -	lsr	r0, r0, #1	@ 0x7fffffff
>>>> -	b	2f
>>>> -3:	mov	r0, #0x80
>>>> -	lsl	r0, r0, #24	@ 0x80000000
>>>> -2:
>>>> -	.endif
>>>> -	push	{r0, r1, r2}
>>>> -	ldr	r0, 4f
>>>> -	adr	r1, 4f
>>>> -	add	r0, r1
>>>> -	str	r0, [sp, #8]
>>>> -	@ We know we are not on armv4t, so pop pc is safe.
>>>> -	pop	{r0, r1, pc}
>>>> -	.align	2
>>>> -4:
>>>> -	.word	__aeabi_idiv0 - 4b
>>>> +	bl	SYM(__aeabi_idiv0)
>>>> +	pop	{r1, pc}
>>>> +
>>>
>>> I'd still retain the comment about pop pc here because there's often a misconception of merging armv4t and armv6m code.
>>>
>>>>  #elif defined(__thumb2__)
>>>>  	.syntax unified
>>>>  	.ifc \signed, unsigned
>>>> @@ -945,7 +923,170 @@ LSYM(Lover7):
>>>>  	add	dividend, work
>>>>    .endif
>>>>  LSYM(Lgot_result):
>>>> -.endm	
>>>> +.endm
>>>> +
>>>> +#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
>>>> +/* If performance is preferred, the following functions are provided.  */
>>>> +
>>>
>>> Comment above #if please and also check elsewhere in patch.
>>>
>>>> +/* Branch to div(n), and jump to label if curbit is lower than the divisor.  */
>>>> +.macro BranchToDiv n, label
>>>> +	lsr	curbit, dividend, \n
>>>> +	cmp	curbit, divisor
>>>> +	blo	\label
>>>> +.endm
>>>> +
>>>> +/* Body of div(n).  Shift the divisor in n bits and compare the divisor
>>>> +   and dividend.  Update the dividend as the subtraction result.  */
>>>> +.macro DoDiv n
>>>> +	lsr	curbit, dividend, \n
>>>> +	cmp	curbit, divisor
>>>> +	bcc	1f
>>>> +	lsl	curbit, divisor, \n
>>>> +	sub	dividend, dividend, curbit
>>>> +
>>>> +1:	adc	result, result
>>>> +.endm
>>>> +
>>>> +/* The body of division with positive divisor.  Unless the divisor is very
>>>> +   big, shift it up in multiples of four bits, since this is the amount of
>>>> +   unwinding in the main division loop.  Continue shifting until the divisor
>>>> +   is larger than the dividend.  */
>>>> +.macro THUMB1_Div_Positive
>>>> +	mov	result, #0
>>>> +	BranchToDiv #1, LSYM(Lthumb1_div1)
>>>> +	BranchToDiv #4, LSYM(Lthumb1_div4)
>>>> +	BranchToDiv #8, LSYM(Lthumb1_div8)
>>>> +	BranchToDiv #12, LSYM(Lthumb1_div12)
>>>> +	BranchToDiv #16, LSYM(Lthumb1_div16)
>>>> +LSYM(Lthumb1_div_large_positive):
>>>> +	mov	result, #0xff
>>>> +	lsl	divisor, divisor, #8
>>>> +	rev	result, result
>>>> +	lsr	curbit, dividend, #16
>>>> +	cmp	curbit, divisor
>>>> +	blo	1f
>>>> +	asr	result, #8
>>>> +	lsl	divisor, divisor, #8
>>>> +	beq	LSYM(Ldivbyzero_waypoint)
>>>> +
>>>> +1:	lsr	curbit, dividend, #12
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div12)
>>>> +	b	LSYM(Lthumb1_div16)
>>>> +LSYM(Lthumb1_div_loop):
>>>> +	lsr	divisor, divisor, #8
>>>> +LSYM(Lthumb1_div16):
>>>> +	Dodiv	#15
>>>> +	Dodiv	#14
>>>> +	Dodiv	#13
>>>> +	Dodiv	#12
>>>> +LSYM(Lthumb1_div12):
>>>> +	Dodiv	#11
>>>> +	Dodiv	#10
>>>> +	Dodiv	#9
>>>> +	Dodiv	#8
>>>> +	bcs	LSYM(Lthumb1_div_loop)
>>>> +LSYM(Lthumb1_div8):
>>>> +	Dodiv	#7
>>>> +	Dodiv	#6
>>>> +	Dodiv	#5
>>>> +LSYM(Lthumb1_div5):
>>>> +	Dodiv	#4
>>>> +LSYM(Lthumb1_div4):
>>>> +	Dodiv	#3
>>>> +LSYM(Lthumb1_div3):
>>>> +	Dodiv	#2
>>>> +LSYM(Lthumb1_div2):
>>>> +	Dodiv	#1
>>>> +LSYM(Lthumb1_div1):
>>>> +	sub	divisor, dividend, divisor
>>>> +	bcs	1f
>>>> +	cpy	divisor, dividend
>>>> +
>>>> +1:	adc	result, result
>>>> +	cpy	dividend, result
>>>> +	RET
>>>> +
>>>> +LSYM(Ldivbyzero_waypoint):
>>>> +	b	LSYM(Ldiv0)
>>>> +.endm
>>>> +
>>>> +/* The body of division with negative divisor.  Similar to
>>>> +   THUMB1_Div_Positive except that the shift steps are in multiples
>>>> +   of six bits.  */
>>>> +.macro THUMB1_Div_Negative
>>>> +	lsr	result, divisor, #31
>>>> +	beq	1f
>>>> +	neg	divisor, divisor
>>>> +
>>>> +1:	asr	curbit, dividend, #32
>>>> +	bcc	2f
>>>> +	neg	dividend, dividend
>>>> +
>>>> +2:	eor	curbit, result
>>>> +	mov	result, #0
>>>> +	cpy	ip, curbit
>>>> +	BranchToDiv #4, LSYM(Lthumb1_div_negative4)
>>>> +	BranchToDiv #8, LSYM(Lthumb1_div_negative8)
>>>> +LSYM(Lthumb1_div_large):
>>>> +	mov	result, #0xfc
>>>> +	lsl	divisor, divisor, #6
>>>> +	rev	result, result
>>>> +	lsr	curbit, dividend, #8
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +	lsl	divisor, divisor, #6
>>>> +	asr	result, result, #6
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +	lsl	divisor, divisor, #6
>>>> +	asr	result, result, #6
>>>> +	cmp	curbit, divisor
>>>> +	blo	LSYM(Lthumb1_div_negative8)
>>>> +
>>>> +	lsl	divisor, divisor, #6
>>>> +	beq	LSYM(Ldivbyzero_negative)
>>>> +	asr	result, result, #6
>>>> +	b	LSYM(Lthumb1_div_negative8)
>>>> +LSYM(Lthumb1_div_negative_loop):
>>>> +	lsr	divisor, divisor, #6
>>>> +LSYM(Lthumb1_div_negative8):
>>>> +	DoDiv	#7
>>>> +	DoDiv	#6
>>>> +	DoDiv	#5
>>>> +	DoDiv	#4
>>>> +LSYM(Lthumb1_div_negative4):
>>>> +	DoDiv	#3
>>>> +	DoDiv	#2
>>>> +	bcs	LSYM(Lthumb1_div_negative_loop)
>>>> +	DoDiv	#1
>>>> +	sub	divisor, dividend, divisor
>>>> +	bcs	1f
>>>> +	cpy	divisor, dividend
>>>> +
>>>> +1:	cpy	curbit, ip
>>>> +	adc	result, result
>>>> +	asr	curbit, curbit, #1
>>>> +	cpy	dividend, result
>>>> +	bcc	2f
>>>> +	neg	dividend, dividend
>>>> +	cmp	curbit, #0
>>>> +
>>>> +2:	bpl	3f
>>>> +	neg	divisor, divisor
>>>> +
>>>> +3:	RET
>>>> +
>>>> +LSYM(Ldivbyzero_negative):
>>>> +	cpy	curbit, ip
>>>> +	asr	curbit, curbit, #1
>>>> +	bcc	LSYM(Ldiv0)
>>>> +	neg	dividend, dividend
>>>> +.endm
>>>> +#endif /* ARM Thumb version.  */
>>>> +
>>>>  /* ------------------------------------------------------------------------ */
>>>>  /*		Start of the Real Functions				    */
>>>>  /* ------------------------------------------------------------------------ */
>>>> @@ -955,6 +1096,7 @@ LSYM(Lgot_result):
>>>>  
>>>>  	FUNC_START udivsi3
>>>>  	FUNC_ALIAS aeabi_uidiv udivsi3
>>>> +#if defined(__OPTIMIZE_SIZE__)
>>>>  
>>>>  	cmp	divisor, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> @@ -972,6 +1114,14 @@ LSYM(udivsi3_skip_div0_test):
>>>>  	pop	{ work }
>>>>  	RET
>>>>  
>>>> +#else
>>>> +	/* Implementation of aeabi_uidiv for ARMv6m.  This version is only
>>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>>> +LSYM(udivsi3_skip_div0_test):
>>>> +	THUMB1_Div_Positive
>>>> +
>>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  
>>>>  	ARM_FUNC_START udivsi3
>>>> @@ -1023,12 +1173,21 @@ LSYM(udivsi3_skip_div0_test):
>>>>  FUNC_START aeabi_uidivmod
>>>>  	cmp	r1, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> +# if defined(__OPTIMIZE_SIZE__)
>>>>  	push	{r0, r1, lr}
>>>>  	bl	LSYM(udivsi3_skip_div0_test)
>>>>  	POP	{r1, r2, r3}
>>>>  	mul	r2, r0
>>>>  	sub	r1, r1, r2
>>>>  	bx	r3
>>>> +# else
>>>> +	/* Both the quotient and remainder are calculated simultaneously
>>>> +	   in THUMB1_Div_Positive.  There is no need to calculate the
>>>> +	   remainder again here.  */
>>>> +	b	LSYM(udivsi3_skip_div0_test)
>>>> +	RET
>>>> +# endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  ARM_FUNC_START aeabi_uidivmod
>>>>  	cmp	r1, #0
>>>> @@ -1084,7 +1243,7 @@ LSYM(Lover10):
>>>>  	RET
>>>>  	
>>>>  #else  /* ARM version.  */
>>>> -	
>>>> +
>>>>  	FUNC_START umodsi3
>>>>  
>>>>  	subs	r2, r1, #1			@ compare divisor with 1
>>>> @@ -1109,8 +1268,9 @@ LSYM(Lover10):
>>>>  
>>>>  #if defined(__prefer_thumb__)
>>>>  
>>>> -	FUNC_START divsi3	
>>>> +	FUNC_START divsi3
>>>>  	FUNC_ALIAS aeabi_idiv divsi3
>>>> +#if defined(__OPTIMIZE_SIZE__)
>>>>  
>>>>  	cmp	divisor, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> @@ -1133,7 +1293,7 @@ LSYM(Lover11):
>>>>  	blo	LSYM(Lgot_result)
>>>>  
>>>>  	THUMB_DIV_MOD_BODY 0
>>>> -	
>>>> +
>>>>  	mov	r0, result
>>>>  	mov	work, ip
>>>>  	cmp	work, #0
>>>> @@ -1142,6 +1302,21 @@ LSYM(Lover11):
>>>>  LSYM(Lover12):
>>>>  	pop	{ work }
>>>>  	RET
>>>> +#else
>>>> +	/* Implementation of aeabi_idiv for ARMv6m.  This version is only
>>>> +	   used in ARMv6-M when we need an efficient implementation.  */
>>>> +LSYM(divsi3_skip_div0_test):
>>>> +	cpy	curbit, dividend
>>>> +	orr	curbit, divisor
>>>> +	bmi	LSYM(Lthumb1_div_negative)
>>>> +
>>>> +LSYM(Lthumb1_div_positive):
>>>> +	THUMB1_Div_Positive
>>>> +
>>>> +LSYM(Lthumb1_div_negative):
>>>> +	THUMB1_Div_Negative
>>>> +
>>>> +#endif /* __OPTIMIZE_SIZE__ */
>>>>  
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  
>>>> @@ -1154,8 +1329,8 @@ LSYM(Lover12):
>>>>  	RET
>>>>  
>>>>  #else /* ARM/Thumb-2 version.  */
>>>> -	
>>>> -	ARM_FUNC_START divsi3	
>>>> +
>>>> +	ARM_FUNC_START divsi3
>>>>  	ARM_FUNC_ALIAS aeabi_idiv divsi3
>>>>  
>>>>  	cmp	r1, #0
>>>> @@ -1209,12 +1384,21 @@ LSYM(divsi3_skip_div0_test):
>>>>  FUNC_START aeabi_idivmod
>>>>  	cmp	r1, #0
>>>>  	beq	LSYM(Ldiv0)
>>>> +# if defined(__OPTIMIZE_SIZE__)
>>>>  	push	{r0, r1, lr}
>>>>  	bl	LSYM(divsi3_skip_div0_test)
>>>>  	POP	{r1, r2, r3}
>>>>  	mul	r2, r0
>>>>  	sub	r1, r1, r2
>>>>  	bx	r3
>>>> +# else
>>>> +	/* Both the quotient and remainder are calculated simultaneously
>>>> +	   in THUMB1_Div_Positive and THUMB1_Div_Negative.  There is no
>>>> +	   need to calculate the remainder again here.  */
>>>> +	b	LSYM(divsi3_skip_div0_test)
>>>> +	RET
>>>> +# endif /* __OPTIMIZE_SIZE__ */
>>>> +
>>>>  #elif defined(__ARM_ARCH_EXT_IDIV__)
>>>>  ARM_FUNC_START aeabi_idivmod
>>>>  	cmp 	r1, #0
>>>> -- 1.9.1
>>>>
>>>
>>> Otherwise OK if no regressions and the following request passes.
>>>
>>> Can you ensure that libgcc for one ARM state and one Thumb2 state non-v6m configuration should give identical binaries with and without your patch, no ? 
>>>
>>> regards
>>> Ramana
>>>
>> Hi Ramana,
>>
>> Thank you for the comments. Sorry about the license, must have been a
>> mixup somewhere.
>>
>> I put back the 'pop pc is safe' assembly comment and I moved some
>> comments before the #if and #else as requested. I left some in place
>> because they did not apply to the whole block but simply to the first
>> assembly instruction after the #if/else.
>>
>> I checked that the assembly generated for libgcc was the same with and
>> without the patch for armv7-a in arm mode and armv7-m in thumb mode.
>>
>> Is this OK?
>>
>> Cheers,
>> Andre
>>
>> libgcc/ChangeLog:
>> 2016-07-06  Hale Wang  <hale.wang@arm.com>
>>             Andre Vieira  <andre.simoesdiasvieira@arm.com>
>>
>>    * config/arm/lib1funcs.S: Add new wrapper.
>>
> I had to rebase the patch due to the ARMv8-M patches. This implied
> changing a context line that changed due to the code for ARMv6-M being
> reused for ARMv8-M Baseline.
> 
> I ran regression tests for both ARMv6-M and ARMv8-M Baseline and
> compared the generated libgcc for ARMv7-A in ARM mode and ARMv7-M in
> Thumb mode, observing no changes.
> 
> Applying patch, as it was previously OK'ed.
> 
> Cheers,
> Andre
> 
Backported to embedded-6-branch at revision r238240.

Cheers,
Andre

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2016-07-12 10:41 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-13 17:01 [PATCHv2, ARM, libgcc] New aeabi_idiv function for armv6-m Andre Vieira
2015-10-27 17:28 ` [PING][PATCHv2, " Andre Vieira
2016-01-25 17:15   ` [PING^2][PATCHv2, " Andre Vieira (lists)
2016-05-19 10:19     ` [PING^3][PATCHv2, " Andre Vieira (lists)
2016-06-08 14:36       ` [PING^4][PATCHv2, " Andre Vieira (lists)
2016-06-23 16:52         ` [PING^5][PATCHv2, " Andre Vieira (lists)
2016-07-01 13:41 ` [PATCHv2, " Ramana Radhakrishnan
2016-07-06 10:52   ` Andre Vieira (lists)
2016-07-11 17:09     ` Andre Vieira (lists)
2016-07-12 10:41       ` Andre Vieira (lists)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).