public inbox for gcc-patches@gcc.gnu.org
* [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
@ 2015-07-29 10:33 Prathamesh Kulkarni
  2015-07-29 11:10 ` Kyrill Tkachov
  2015-07-31  9:38 ` Ramana Radhakrishnan
  0 siblings, 2 replies; 13+ messages in thread
From: Prathamesh Kulkarni @ 2015-07-29 10:33 UTC (permalink / raw)
  To: gcc Patches, Charles Baylis

[-- Attachment #1: Type: text/plain, Size: 246 bytes --]

Hi,
This patch implements division as multiplication by the reciprocal,
using vrecpe/vrecps, when -funsafe-math-optimizations and
-freciprocal-math are enabled.
Tested on arm-none-linux-gnueabihf using qemu.
OK for trunk?
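
As a rough illustration only (not part of the patch), for V4SF the
expansion corresponds to the arm_neon.h intrinsics sketch below; the
function name is made up for the example:

#include <arm_neon.h>

/* vrecpeq_f32 gives the initial reciprocal estimate and each
   vrecpsq_f32 step computes (2 - x * den), so x * (2 - x * den) is one
   Newton-Raphson refinement of 1/den.  */
float32x4_t
div_v4sf_sketch (float32x4_t num, float32x4_t den)
{
  float32x4_t x = vrecpeq_f32 (den);          /* reciprocal estimate */
  x = vmulq_f32 (x, vrecpsq_f32 (x, den));    /* iteration 1 */
  x = vmulq_f32 (x, vrecpsq_f32 (x, den));    /* iteration 2 */
  return vmulq_f32 (num, x);                  /* num * (1 / den) */
}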

Thank you,
Prathamesh

[-- Attachment #2: ChangeLog.txt --]
[-- Type: text/plain, Size: 266 bytes --]

2015-07-28  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
	    Charles Baylis  <charles.baylis@linaro.org>

	* config/arm/neon.md (div<mode>3): New pattern.

testsuite/
	* gcc.target/arm/vect-div-1.c: New test-case.
	* gcc.target/arm/vect-div-2.c: Likewise.

[-- Attachment #3: patch.diff --]
[-- Type: text/plain, Size: 2660 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..28c2e2a 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -548,6 +548,32 @@
                     (const_string "neon_mul_<V_elem_ch><q>")))]
 )
 
+(define_expand "div<mode>3"
+  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
+		  (match_operand:VCVTF 2 "s_register_operand" "w")))]
+  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
+  {
+    rtx rec = gen_reg_rtx (<MODE>mode);
+    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
+
+    /* Reciprocal estimate */
+    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
+
+    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
+    for (int i = 0; i < 2; i++)
+      {
+	emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
+	emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
+      }
+
+    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
+    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
+    DONE;
+  }
+)
+
+
 (define_insn "mul<mode>3add<mode>_neon"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
new file mode 100644
index 0000000..e562ef3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
+/* { dg-add-options arm_v8_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
new file mode 100644
index 0000000..8e15d0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
+/* { dg-add-options arm_v8_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */


* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2015-07-29 10:33 [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations Prathamesh Kulkarni
@ 2015-07-29 11:10 ` Kyrill Tkachov
  2015-07-30 22:19   ` Prathamesh Kulkarni
  2015-07-31  9:38 ` Ramana Radhakrishnan
  1 sibling, 1 reply; 13+ messages in thread
From: Kyrill Tkachov @ 2015-07-29 11:10 UTC (permalink / raw)
  To: Prathamesh Kulkarni, gcc Patches, Charles Baylis

Hi Prathamesh,

This is probably not appropriate for -Os optimisation.
And for speed optimisation I imagine the benefit can vary a lot depending on the target the code is run on.
Do you have any benchmark results for this patch?

Thanks,
Kyrill

On 29/07/15 11:09, Prathamesh Kulkarni wrote:
> Hi,
> This patch tries to implement division with multiplication by
> reciprocal using vrecpe/vrecps
> with -funsafe-math-optimizations and -freciprocal-math enabled.
> Tested on arm-none-linux-gnueabihf using qemu.
> OK for trunk ?
>
> Thank you,
> Prathamesh
+    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
+    for (int i = 0; i < 2; i++)
+      {
+    emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
+    emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
+      }
+
+    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
+    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
+    DONE;
+  }
+)
+

Full stop and two spaces at the end of the comments.
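
For example, the first comment would become:

    /* Reciprocal estimate.  */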


* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2015-07-29 11:10 ` Kyrill Tkachov
@ 2015-07-30 22:19   ` Prathamesh Kulkarni
  0 siblings, 0 replies; 13+ messages in thread
From: Prathamesh Kulkarni @ 2015-07-30 22:19 UTC (permalink / raw)
  To: Kyrill Tkachov; +Cc: gcc Patches, Charles Baylis

[-- Attachment #1: Type: text/plain, Size: 2081 bytes --]

On 29 July 2015 at 16:03, Kyrill Tkachov <kyrylo.tkachov@arm.com> wrote:
> Hi Prathamesh,
>
> This is probably not appropriate for -Os optimisation.
> And for speed optimisation I imagine it can vary a lot on the target the
> code is run.
> Do you have any benchmark results for this patch?
Hi Kyrill,
Thanks for the review. I have attempted to address your comments in
the attached patch.
Does it look OK from a correctness perspective?
Unfortunately I haven't done benchmarking yet.
I ran a test case (attached) prepared by Charles for the
arm-linux-gnueabihf target (on APM Mustang),
and it appeared to run faster with the patch:
Options passed: -O3 -mfpu=neon -mfloat-abi=hard -funsafe-math-optimizations

Before:
t8a, len =       32, took   2593977 ticks
t8a, len =      128, took   2408907 ticks
t8a, len =     1024, took   2354950 ticks
t8a, len =    65536, took   2365041 ticks
t8a, len =  1048576, took   2692928 ticks

After:
t8a, len =       32, took   2027323 ticks
t8a, len =      128, took   1920595 ticks
t8a, len =     1024, took   1827250 ticks
t8a, len =    65536, took   1797924 ticks
t8a, len =  1048576, took   2026274 ticks

I will get back to you soon with benchmarking results.

Thanks,
Prathamesh
>
> Thanks,
> Kyrill
>
>
> On 29/07/15 11:09, Prathamesh Kulkarni wrote:
>>
>> Hi,
>> This patch tries to implement division with multiplication by
>> reciprocal using vrecpe/vrecps
>> with -funsafe-math-optimizations and -freciprocal-math enabled.
>> Tested on arm-none-linux-gnueabihf using qemu.
>> OK for trunk ?
>>
>> Thank you,
>> Prathamesh
>
> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
> +    for (int i = 0; i < 2; i++)
> +      {
> +    emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
> +    emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
> +      }
> +
> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] *
> rec */
> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
> +    DONE;
> +  }
> +)
> +
>
> Full stop and two spaces at the end of the comments.
>

[-- Attachment #2: ChangeLog.txt --]
[-- Type: text/plain, Size: 266 bytes --]

2015-07-28  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
	    Charles Baylis  <charles.baylis@linaro.org>

	* config/arm/neon.md (div<mode>3): New pattern.

testsuite/
	* gcc.target/arm/vect-div-1.c: New test-case.
	* gcc.target/arm/vect-div-2.c: Likewise.

[-- Attachment #3: patch2.diff --]
[-- Type: text/plain, Size: 2668 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 654d9d5..f2dbcc4 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -548,6 +548,33 @@
                     (const_string "neon_mul_<V_elem_ch><q>")))]
 )
 
+(define_expand "div<mode>3"
+  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
+		  (match_operand:VCVTF 2 "s_register_operand" "w")))]
+  "TARGET_NEON && !optimize_size
+   && flag_unsafe_math_optimizations && flag_reciprocal_math"
+  {
+    rtx rec = gen_reg_rtx (<MODE>mode);
+    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
+
+    /* Reciprocal estimate.  */
+    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
+
+    /* Perform 2 iterations of newton-raphson method.  */
+    for (int i = 0; i < 2; i++)
+      {
+	emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
+	emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
+      }
+
+    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec.  */
+    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
+    DONE;
+  }
+)
+
+
 (define_insn "mul<mode>3add<mode>_neon"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
new file mode 100644
index 0000000..e562ef3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
+/* { dg-add-options arm_v8_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
new file mode 100644
index 0000000..8e15d0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_neon_ok } */
+/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
+/* { dg-add-options arm_v8_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */

[-- Attachment #4: test.tar.gz --]
[-- Type: application/x-gzip, Size: 374 bytes --]


* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2015-07-29 10:33 [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations Prathamesh Kulkarni
  2015-07-29 11:10 ` Kyrill Tkachov
@ 2015-07-31  9:38 ` Ramana Radhakrishnan
  2015-07-31 12:36   ` Charles Baylis
  2016-01-17  9:06   ` Prathamesh Kulkarni
  1 sibling, 2 replies; 13+ messages in thread
From: Ramana Radhakrishnan @ 2015-07-31  9:38 UTC (permalink / raw)
  To: Prathamesh Kulkarni, gcc Patches, Charles Baylis



On 29/07/15 11:09, Prathamesh Kulkarni wrote:
> Hi,
> This patch tries to implement division with multiplication by
> reciprocal using vrecpe/vrecps
> with -funsafe-math-optimizations and -freciprocal-math enabled.
> Tested on arm-none-linux-gnueabihf using qemu.
> OK for trunk ?
> 
> Thank you,
> Prathamesh
> 

I've tried this in the past and never been convinced that 2 iterations are enough to get to stability, given that the results are only precise to 8 bits per iteration. Thus I've always believed you need 3 iterations rather than 2, at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.

I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable, as well as running something like SPEC2k(6) with at least one iteration to ensure correctness.


Moving on to the patches.

> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index 654d9d5..28c2e2a 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -548,6 +548,32 @@
>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>  )
>  

Please add a comment here.

> +(define_expand "div<mode>3"
> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
> +		  (match_operand:VCVTF 2 "s_register_operand" "w")))]

I want to double check that this doesn't collide with Alan's patches for FP16, especially if he reuses the VCVTF iterator for all the vcvt f16 cases.

> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
> +  {
> +    rtx rec = gen_reg_rtx (<MODE>mode);
> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
> +
> +    /* Reciprocal estimate */
> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
> +
> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
> +    for (int i = 0; i < 2; i++)
> +      {
> +	emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
> +	emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
> +      }
> +
> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
> +    DONE;
> +  }
> +)
> +
> +
>  (define_insn "mul<mode>3add<mode>_neon"
>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
> new file mode 100644
> index 0000000..e562ef3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_neon_ok } */
> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
> +/* { dg-add-options arm_v8_neon } */

No, this is wrong.

What is armv8-specific about this test? This is just like any other test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture; they have existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true, which can happen with -mfpu=neon -mfloat-abi={softfp/hard}.
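
As a sketch only (pick whatever options you need), directives along these lines would exercise the pattern on any Neon-capable configuration:

/* { dg-do compile } */
/* { dg-require-effective-target arm_neon_ok } */
/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
/* { dg-add-options arm_neon } */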

> +
> +void
> +foo (int len, float * __restrict p, float *__restrict x)
> +{
> +  len = len & ~31;
> +  for (int i = 0; i < len; i++)
> +    p[i] = p[i] / x[i];
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
> new file mode 100644
> index 0000000..8e15d0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_neon_ok } */

And likewise.

> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
> +/* { dg-add-options arm_v8_neon } */
> +
> +void
> +foo (int len, float * __restrict p, float *__restrict x)
> +{
> +  len = len & ~31;
> +  for (int i = 0; i < len; i++)
> +    p[i] = p[i] / x[i];
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */


regards
Ramana


* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2015-07-31  9:38 ` Ramana Radhakrishnan
@ 2015-07-31 12:36   ` Charles Baylis
  2016-01-17  9:06   ` Prathamesh Kulkarni
  1 sibling, 0 replies; 13+ messages in thread
From: Charles Baylis @ 2015-07-31 12:36 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Prathamesh Kulkarni, gcc Patches

On 31 July 2015 at 10:34, Ramana Radhakrishnan
<ramana.radhakrishnan@foss.arm.com> wrote:
> I've tried this in the past and never been convinced that 2 iterations are enough to get to stability with this given that the results are only precise for 8 bits / iteration. Thus I've always believed you need 3 iterations rather than 2 at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.

My understanding is that 2 iterations are sufficient for
single-precision floating point (although not for double precision),
because each iteration of Newton-Raphson doubles the number of bits of
accuracy.

I haven't worked through the maths myself, but
    https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division
says
    "This squaring of the error at each iteration step — the so-called
    quadratic convergence of Newton–Raphson's method — has the
    effect that the number of correct digits in the result roughly
    doubles for every iteration, a property that becomes extremely
    valuable when the numbers involved have many digits"

Therefore:
vrecpe -> 8 bits of accuracy
+1 iteration -> 16 bits of accuracy
+2 iterations -> 32 bits of accuracy (but in reality limited to the
precision of a 32-bit float)

Since 32 bits is much more accuracy than the 24 bits of precision in a
single-precision FP value, 2 iterations should be sufficient.
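
A toy scalar model of one lane makes the error squaring concrete (the
0.3 starting value below just stands in for the roughly 8-bit vrecpe
estimate; the numbers are made up for the example):

#include <stdio.h>
#include <math.h>

int
main (void)
{
  float d = 3.0f;       /* divisor */
  float x = 0.3f;       /* rough estimate of 1/d */

  for (int i = 0; i < 2; i++)
    {
      /* One Newton-Raphson step: the scalar equivalent of
         vrecps followed by a multiply.  */
      x = x * (2.0f - d * x);
      printf ("after iteration %d: relative error %g\n",
              i + 1, fabsf (x * d - 1.0f));
    }
  return 0;
}

which prints a relative error of roughly 1e-2 after the first iteration
and roughly 1e-4 after the second.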

> I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable as well as running something like SPEC2k(6) with atleast one iteration to ensure correctness.

I can't argue with confirming theory matches practice :)

Some corner cases (e.g. numbers around FLT_MAX, FLT_MIN, etc.) may result
in denormals or out-of-range values during the reciprocal calculation,
which could produce answers that are less accurate than the typical
case, but I think that is acceptable with -ffast-math.

Charles


* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2015-07-31  9:38 ` Ramana Radhakrishnan
  2015-07-31 12:36   ` Charles Baylis
@ 2016-01-17  9:06   ` Prathamesh Kulkarni
  2016-02-04 11:01     ` Ramana Radhakrishnan
  1 sibling, 1 reply; 13+ messages in thread
From: Prathamesh Kulkarni @ 2016-01-17  9:06 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: gcc Patches, Charles Baylis

On 31 July 2015 at 15:04, Ramana Radhakrishnan
<ramana.radhakrishnan@foss.arm.com> wrote:
>
>
> On 29/07/15 11:09, Prathamesh Kulkarni wrote:
>> Hi,
>> This patch tries to implement division with multiplication by
>> reciprocal using vrecpe/vrecps
>> with -funsafe-math-optimizations and -freciprocal-math enabled.
>> Tested on arm-none-linux-gnueabihf using qemu.
>> OK for trunk ?
>>
>> Thank you,
>> Prathamesh
>>
>
> I've tried this in the past and never been convinced that 2 iterations are enough to get to stability with this given that the results are only precise for 8 bits / iteration. Thus I've always believed you need 3 iterations rather than 2 at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.
>
> I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable as well as running something like SPEC2k(6) with atleast one iteration to ensure correctness.
Hi,
I got the results of the SPEC2k6 FP benchmarks:
a15: +0.64% overall, 481.wrf: +6.46%
a53: +0.21% overall, 416.gamess: -1.39%, 481.wrf: +6.76%
a57: +0.35% overall, 481.wrf: +3.84%
The other benchmarks had (almost) identical results.

Thanks,
Prathamesh
>
>
> moving on to the patches.
>
>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>> index 654d9d5..28c2e2a 100644
>> --- a/gcc/config/arm/neon.md
>> +++ b/gcc/config/arm/neon.md
>> @@ -548,6 +548,32 @@
>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>  )
>>
>
> Please add a comment here.
>
>> +(define_expand "div<mode>3"
>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>
> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>
>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>> +  {
>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>> +
>> +    /* Reciprocal estimate */
>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>> +
>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>> +    for (int i = 0; i < 2; i++)
>> +      {
>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>> +      }
>> +
>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>> +    DONE;
>> +  }
>> +)
>> +
>> +
>>  (define_insn "mul<mode>3add<mode>_neon"
>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>> new file mode 100644
>> index 0000000..e562ef3
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>> @@ -0,0 +1,14 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>> +/* { dg-add-options arm_v8_neon } */
>
> No this is wrong.
>
> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>
>> +
>> +void
>> +foo (int len, float * __restrict p, float *__restrict x)
>> +{
>> +  len = len & ~31;
>> +  for (int i = 0; i < len; i++)
>> +    p[i] = p[i] / x[i];
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>> new file mode 100644
>> index 0000000..8e15d0a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>> @@ -0,0 +1,14 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>
> And likewise.
>
>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>> +/* { dg-add-options arm_v8_neon } */
>> +
>> +void
>> +foo (int len, float * __restrict p, float *__restrict x)
>> +{
>> +  len = len & ~31;
>> +  for (int i = 0; i < len; i++)
>> +    p[i] = p[i] / x[i];
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>
>
> regards
> Ramana


* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2016-01-17  9:06   ` Prathamesh Kulkarni
@ 2016-02-04 11:01     ` Ramana Radhakrishnan
  2016-02-05 13:10       ` Prathamesh Kulkarni
  0 siblings, 1 reply; 13+ messages in thread
From: Ramana Radhakrishnan @ 2016-02-04 11:01 UTC (permalink / raw)
  To: Prathamesh Kulkarni; +Cc: Ramana Radhakrishnan, gcc Patches, Charles Baylis

On Sun, Jan 17, 2016 at 9:06 AM, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> On 31 July 2015 at 15:04, Ramana Radhakrishnan
> <ramana.radhakrishnan@foss.arm.com> wrote:
>>
>>
>> On 29/07/15 11:09, Prathamesh Kulkarni wrote:
>>> Hi,
>>> This patch tries to implement division with multiplication by
>>> reciprocal using vrecpe/vrecps
>>> with -funsafe-math-optimizations and -freciprocal-math enabled.
>>> Tested on arm-none-linux-gnueabihf using qemu.
>>> OK for trunk ?
>>>
>>> Thank you,
>>> Prathamesh
>>>
>>
>> I've tried this in the past and never been convinced that 2 iterations are enough to get to stability with this given that the results are only precise for 8 bits / iteration. Thus I've always believed you need 3 iterations rather than 2 at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.
>>
>> I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable as well as running something like SPEC2k(6) with atleast one iteration to ensure correctness.
> Hi,
> I got results of SPEC2k6 fp benchmarks:
> a15: +0.64% overall, 481.wrf: +6.46%
> a53: +0.21% overall, 416.gamess: -1.39%, 481.wrf: +6.76%
> a57: +0.35% overall, 481.wrf: +3.84%
> The other benchmarks had (almost) identical results.

Thanks for the benchmarking results.  Please repost the patch with
the changes that I had requested in my previous review; given it is
now stage4, I would rather queue changes like this for stage1.

Thanks,
Ramana

>
> Thanks,
> Prathamesh
>>
>>
>> moving on to the patches.
>>
>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>> index 654d9d5..28c2e2a 100644
>>> --- a/gcc/config/arm/neon.md
>>> +++ b/gcc/config/arm/neon.md
>>> @@ -548,6 +548,32 @@
>>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>>  )
>>>
>>
>> Please add a comment here.
>>
>>> +(define_expand "div<mode>3"
>>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>>
>> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>>
>>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>>> +  {
>>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>>> +
>>> +    /* Reciprocal estimate */
>>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>>> +
>>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>>> +    for (int i = 0; i < 2; i++)
>>> +      {
>>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>>> +      }
>>> +
>>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>>> +    DONE;
>>> +  }
>>> +)
>>> +
>>> +
>>>  (define_insn "mul<mode>3add<mode>_neon"
>>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>> new file mode 100644
>>> index 0000000..e562ef3
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>> @@ -0,0 +1,14 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>>> +/* { dg-add-options arm_v8_neon } */
>>
>> No this is wrong.
>>
>> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>>
>>> +
>>> +void
>>> +foo (int len, float * __restrict p, float *__restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  for (int i = 0; i < len; i++)
>>> +    p[i] = p[i] / x[i];
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>> new file mode 100644
>>> index 0000000..8e15d0a
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>> @@ -0,0 +1,14 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>
>> And likewise.
>>
>>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>>> +/* { dg-add-options arm_v8_neon } */
>>> +
>>> +void
>>> +foo (int len, float * __restrict p, float *__restrict x)
>>> +{
>>> +  len = len & ~31;
>>> +  for (int i = 0; i < len; i++)
>>> +    p[i] = p[i] / x[i];
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>>
>>
>> regards
>> Ramana


* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2016-02-04 11:01     ` Ramana Radhakrishnan
@ 2016-02-05 13:10       ` Prathamesh Kulkarni
  2016-05-23  9:30         ` Prathamesh Kulkarni
  0 siblings, 1 reply; 13+ messages in thread
From: Prathamesh Kulkarni @ 2016-02-05 13:10 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Ramana Radhakrishnan, gcc Patches, Charles Baylis

[-- Attachment #1: Type: text/plain, Size: 6465 bytes --]

On 4 February 2016 at 16:31, Ramana Radhakrishnan
<ramana.gcc@googlemail.com> wrote:
> On Sun, Jan 17, 2016 at 9:06 AM, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
>> On 31 July 2015 at 15:04, Ramana Radhakrishnan
>> <ramana.radhakrishnan@foss.arm.com> wrote:
>>>
>>>
>>> On 29/07/15 11:09, Prathamesh Kulkarni wrote:
>>>> Hi,
>>>> This patch tries to implement division with multiplication by
>>>> reciprocal using vrecpe/vrecps
>>>> with -funsafe-math-optimizations and -freciprocal-math enabled.
>>>> Tested on arm-none-linux-gnueabihf using qemu.
>>>> OK for trunk ?
>>>>
>>>> Thank you,
>>>> Prathamesh
>>>>
>>>
>>> I've tried this in the past and never been convinced that 2 iterations are enough to get to stability with this given that the results are only precise for 8 bits / iteration. Thus I've always believed you need 3 iterations rather than 2 at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.
>>>
>>> I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable as well as running something like SPEC2k(6) with atleast one iteration to ensure correctness.
>> Hi,
>> I got results of SPEC2k6 fp benchmarks:
>> a15: +0.64% overall, 481.wrf: +6.46%
>> a53: +0.21% overall, 416.gamess: -1.39%, 481.wrf: +6.76%
>> a57: +0.35% overall, 481.wrf: +3.84%
>> The other benchmarks had (almost) identical results.
>
> Thanks for the benchmarking results -  Please repost the patch with
> the changes that I had requested in my previous review - given it is
> now stage4 , I would rather queue changes like this for stage1 now.
Hi,
Please find the updated patch attached.
It passes the testsuite for arm-none-linux-gnueabi,
arm-none-linux-gnueabihf and arm-none-eabi.
However, the test case added in the patch (neon-vect-div-1.c) fails to
get vectorized at -O2 for armeb-none-linux-gnueabihf.
Charles suggested trying -O3, which worked.
It appears the test case fails to get vectorized with
-fvect-cost-model=cheap (which is enabled by default at -O2)
and passes with -fno-vect-cost-model / -fvect-cost-model=dynamic.

I can't figure out why it fails with -fvect-cost-model=cheap.
From the vect dump (attached):
neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9

Thanks,
Prathamesh
>
> Thanks,
> Ramana
>
>>
>> Thanks,
>> Prathamesh
>>>
>>>
>>> moving on to the patches.
>>>
>>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>>> index 654d9d5..28c2e2a 100644
>>>> --- a/gcc/config/arm/neon.md
>>>> +++ b/gcc/config/arm/neon.md
>>>> @@ -548,6 +548,32 @@
>>>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>>>  )
>>>>
>>>
>>> Please add a comment here.
>>>
>>>> +(define_expand "div<mode>3"
>>>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>>>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>>>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>>>
>>> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>>>
>>>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>>>> +  {
>>>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>>>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>>>> +
>>>> +    /* Reciprocal estimate */
>>>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>>>> +
>>>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>>>> +    for (int i = 0; i < 2; i++)
>>>> +      {
>>>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>>>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>>>> +      }
>>>> +
>>>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>>>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>>>> +    DONE;
>>>> +  }
>>>> +)
>>>> +
>>>> +
>>>>  (define_insn "mul<mode>3add<mode>_neon"
>>>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>> new file mode 100644
>>>> index 0000000..e562ef3
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>> @@ -0,0 +1,14 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>>>> +/* { dg-add-options arm_v8_neon } */
>>>
>>> No this is wrong.
>>>
>>> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>>>
>>>> +
>>>> +void
>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>> +{
>>>> +  len = len & ~31;
>>>> +  for (int i = 0; i < len; i++)
>>>> +    p[i] = p[i] / x[i];
>>>> +}
>>>> +
>>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>> new file mode 100644
>>>> index 0000000..8e15d0a
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>> @@ -0,0 +1,14 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>
>>> And likewise.
>>>
>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>>>> +/* { dg-add-options arm_v8_neon } */
>>>> +
>>>> +void
>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>> +{
>>>> +  len = len & ~31;
>>>> +  for (int i = 0; i < len; i++)
>>>> +    p[i] = p[i] / x[i];
>>>> +}
>>>> +
>>>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>>>
>>>
>>> regards
>>> Ramana

[-- Attachment #2: patch.diff --]
[-- Type: text/plain, Size: 2957 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 55b61eb..7eafee4 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -578,6 +578,38 @@
                     (const_string "neon_mul_<V_elem_ch><q>")))]
 )
 
+/* Perform division using multiply-by-reciprocal. 
+   Reciprocal is calculated using Newton-Raphson method.
+   Enabled with -funsafe-math-optimizations -freciprocal-math
+   and disabled for -Os since it increases code size .  */
+
+(define_expand "div<mode>3"
+  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
+		  (match_operand:VCVTF 2 "s_register_operand" "w")))]
+  "TARGET_NEON && !optimize_size
+   && flag_unsafe_math_optimizations && flag_reciprocal_math"
+  {
+    rtx rec = gen_reg_rtx (<MODE>mode);
+    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
+
+    /* Reciprocal estimate.  */
+    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
+
+    /* Perform 2 iterations of newton-raphson method.  */
+    for (int i = 0; i < 2; i++)
+      {
+	emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
+	emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
+      }
+
+    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec.  */
+    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
+    DONE;
+  }
+)
+
+
 (define_insn "mul<mode>3add<mode>_neon"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c b/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c
new file mode 100644
index 0000000..dae724a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c
@@ -0,0 +1,16 @@
+/* Test pattern div<mode>3.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3 -funsafe-math-optimizations -fdump-tree-vect-all" } */
+/* { dg-add-options arm_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c b/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c
new file mode 100644
index 0000000..0450b70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c
@@ -0,0 +1,16 @@
+/* Test pattern div<mode>3.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3 -funsafe-math-optimizations -fno-reciprocal-math -fdump-tree-vect-all" } */
+/* { dg-add-options arm_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */

[-- Attachment #3: ChangeLog --]
[-- Type: application/octet-stream, Size: 227 bytes --]

2016-02-05  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

	* config/arm/neon.md (div<mode>3): New pattern.

testsuite/
	* gcc.target/arm/neon-vect-div-1.c: New test-case.
	* gcc.target/arm/neon-vect-div-2.c: Likewise.

[-- Attachment #4: neon-vect-div-1.c.148t.vect --]
[-- Type: application/octet-stream, Size: 19209 bytes --]


;; Function foo (foo, funcdef_no=0, decl_uid=5329, cgraph_uid=0, symbol_order=0)


Analyzing loop at neon-vect-div-1.c:12
neon-vect-div-1.c:12:3: note: ===== analyze_loop_nest =====
neon-vect-div-1.c:12:3: note: === vect_analyze_loop_form ===
neon-vect-div-1.c:12:3: note: === get_loop_niters ===
Analyzing # of iterations of loop 1
  exit condition [1, + , 1](no_overflow) < len_4
  bounds on difference of bases: 0 ... 2147483646
Applying pattern match.pd:693, generic-match.c:64
Applying pattern match.pd:730, generic-match.c:10695
  result:
    # of iterations (unsigned int) len_4 + 4294967295, bounded by 2147483646
Applying pattern match.pd:1068, generic-match.c:5523
Applying pattern match.pd:83, generic-match.c:8994
neon-vect-div-1.c:12:3: note: Symbolic number of iterations is (unsigned int) len_4
Creating dr for *_9
analyze_innermost: success.
	base_address: p_8(D)
	offset from base address: 0
	constant offset from base address: 0
	step: 4
	aligned to: 64
	base_object: *p_8(D)
	Access function 0: {0B, +, 4}_1
Creating dr for *_12
analyze_innermost: success.
	base_address: x_11(D)
	offset from base address: 0
	constant offset from base address: 0
	step: 4
	aligned to: 64
	base_object: *x_11(D)
	Access function 0: {0B, +, 4}_1
Creating dr for *_9
analyze_innermost: success.
	base_address: p_8(D)
	offset from base address: 0
	constant offset from base address: 0
	step: 4
	aligned to: 64
	base_object: *p_8(D)
	Access function 0: {0B, +, 4}_1
neon-vect-div-1.c:12:3: note: === vect_analyze_data_refs ===
neon-vect-div-1.c:12:3: note: got vectype for stmt: _10 = *_9;
vector(4) float
neon-vect-div-1.c:12:3: note: got vectype for stmt: _13 = *_12;
vector(4) float
neon-vect-div-1.c:12:3: note: got vectype for stmt: *_9 = _14;
vector(4) float
neon-vect-div-1.c:12:3: note: === vect_analyze_scalar_cycles ===
neon-vect-div-1.c:12:3: note: Analyze phi: i_19 = PHI <0(4), i_16(7)>

neon-vect-div-1.c:12:3: note: Access function of PHI: {0, +, 1}_1
neon-vect-div-1.c:12:3: note: step: 1,  init: 0
neon-vect-div-1.c:12:3: note: Detected induction.
neon-vect-div-1.c:12:3: note: Analyze phi: .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>

neon-vect-div-1.c:12:3: note: === vect_pattern_recog ===
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand i.0_6
neon-vect-div-1.c:12:3: note: def_stmt: i.0_6 = (unsigned int) i_19;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand i_19
neon-vect-div-1.c:12:3: note: def_stmt: i_19 = PHI <0(4), i_16(7)>
neon-vect-div-1.c:12:3: note: type of def: induction
neon-vect-div-1.c:12:3: note: === vect_analyze_data_ref_accesses ===
neon-vect-div-1.c:12:3: note: === vect_mark_stmts_to_be_vectorized ===
neon-vect-div-1.c:12:3: note: init: phi relevant? i_19 = PHI <0(4), i_16(7)>
neon-vect-div-1.c:12:3: note: init: phi relevant? .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>
neon-vect-div-1.c:12:3: note: init: stmt relevant? i.0_6 = (unsigned int) i_19;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _7 = i.0_6 * 4;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _9 = p_8(D) + _7;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _10 = *_9;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _12 = x_11(D) + _7;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _13 = *_12;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: init: stmt relevant? *_9 = _14;
neon-vect-div-1.c:12:3: note: vec_stmt_relevant_p: stmt has vdefs.
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: *_9 = _14;
neon-vect-div-1.c:12:3: note: init: stmt relevant? i_16 = i_19 + 1;
neon-vect-div-1.c:12:3: note: init: stmt relevant? if (len_4 > i_16)
neon-vect-div-1.c:12:3: note: worklist: examine stmt: *_9 = _14;
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand _14
neon-vect-div-1.c:12:3: note: def_stmt: _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: worklist: examine stmt: _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand _10
neon-vect-div-1.c:12:3: note: def_stmt: _10 = *_9;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: _10 = *_9;
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand _13
neon-vect-div-1.c:12:3: note: def_stmt: _13 = *_12;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: _13 = *_12;
neon-vect-div-1.c:12:3: note: worklist: examine stmt: _13 = *_12;
neon-vect-div-1.c:12:3: note: worklist: examine stmt: _10 = *_9;
neon-vect-div-1.c:12:3: note: === vect_analyze_data_ref_dependences ===
(compute_affine_dependence
  stmt_a: _10 = *_9;
  stmt_b: _13 = *_12;
) -> no dependence
(compute_affine_dependence
  stmt_a: _10 = *_9;
  stmt_b: *_9 = _14;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
(compute_affine_dependence
  stmt_a: _13 = *_12;
  stmt_b: *_9 = _14;
) -> no dependence
(compute_affine_dependence
  stmt_a: _10 = *_9;
  stmt_b: _10 = *_9;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
(compute_affine_dependence
  stmt_a: _13 = *_12;
  stmt_b: _13 = *_12;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
(compute_affine_dependence
  stmt_a: *_9 = _14;
  stmt_b: *_9 = _14;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
neon-vect-div-1.c:12:3: note: dependence distance  = 0.
neon-vect-div-1.c:12:3: note: dependence distance == 0 between *_9 and *_9
neon-vect-div-1.c:12:3: note: === vect_determine_vectorization_factor ===
neon-vect-div-1.c:12:3: note: ==> examining phi: i_19 = PHI <0(4), i_16(7)>

neon-vect-div-1.c:12:3: note: ==> examining phi: .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>

neon-vect-div-1.c:12:3: note: ==> examining statement: i.0_6 = (unsigned int) i_19;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _7 = i.0_6 * 4;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _9 = p_8(D) + _7;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _10 = *_9;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(4) float
neon-vect-div-1.c:12:3: note: nunits = 4
neon-vect-div-1.c:12:3: note: ==> examining statement: _12 = x_11(D) + _7;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _13 = *_12;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(4) float
neon-vect-div-1.c:12:3: note: nunits = 4
neon-vect-div-1.c:12:3: note: ==> examining statement: _14 = _10 / _13;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(4) float
neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(4) float
neon-vect-div-1.c:12:3: note: nunits = 4
neon-vect-div-1.c:12:3: note: ==> examining statement: *_9 = _14;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(4) float
neon-vect-div-1.c:12:3: note: nunits = 4
neon-vect-div-1.c:12:3: note: ==> examining statement: i_16 = i_19 + 1;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: if (len_4 > i_16)

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: vectorization factor = 4
neon-vect-div-1.c:12:3: note: === vect_analyze_slp ===
neon-vect-div-1.c:12:3: note: === vect_make_slp_decision ===
neon-vect-div-1.c:12:3: note: === vect_analyze_data_refs_alignment ===
neon-vect-div-1.c:12:3: note: dependence distance  = 0.
neon-vect-div-1.c:12:3: note: accesses have the same alignment.
dependence distance modulo vf == 0 between *_9 and *_9
neon-vect-div-1.c:12:3: note: vect_compute_data_ref_alignment:
neon-vect-div-1.c:12:3: note: can't force alignment of ref: *_9
neon-vect-div-1.c:12:3: note: vect_compute_data_ref_alignment:
neon-vect-div-1.c:12:3: note: can't force alignment of ref: *_12
neon-vect-div-1.c:12:3: note: vect_compute_data_ref_alignment:
neon-vect-div-1.c:12:3: note: can't force alignment of ref: *_9
neon-vect-div-1.c:12:3: note: === vect_prune_runtime_alias_test_list ===
neon-vect-div-1.c:12:3: note: === vect_enhance_data_refs_alignment ===
neon-vect-div-1.c:12:3: note: Unknown misalignment, is_packed = 0
neon-vect-div-1.c:12:3: note: Unknown misalignment, is_packed = 0
neon-vect-div-1.c:12:3: note: Unknown misalignment, is_packed = 0
neon-vect-div-1.c:12:3: note: vect_can_advance_ivs_p:
neon-vect-div-1.c:12:3: note: Analyze phi: i_19 = PHI <0(4), i_16(7)>

neon-vect-div-1.c:12:3: note: Analyze phi: .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>

neon-vect-div-1.c:12:3: note: virtual phi. skip.
neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9
neon-vect-div-1.c:12:3: note: bad data alignment.
neon-vect-div-1.c:12:3: note: ***** Re-trying analysis with vector size 8
neon-vect-div-1.c:12:3: note: === vect_analyze_loop_form ===
neon-vect-div-1.c:12:3: note: === get_loop_niters ===
Applying pattern match.pd:1068, generic-match.c:5523
Applying pattern match.pd:83, generic-match.c:8994
neon-vect-div-1.c:12:3: note: Symbolic number of iterations is (unsigned int) len_4
Creating dr for *_9
analyze_innermost: success.
	base_address: p_8(D)
	offset from base address: 0
	constant offset from base address: 0
	step: 4
	aligned to: 64
	base_object: *p_8(D)
	Access function 0: {0B, +, 4}_1
Creating dr for *_12
analyze_innermost: success.
	base_address: x_11(D)
	offset from base address: 0
	constant offset from base address: 0
	step: 4
	aligned to: 64
	base_object: *x_11(D)
	Access function 0: {0B, +, 4}_1
Creating dr for *_9
analyze_innermost: success.
	base_address: p_8(D)
	offset from base address: 0
	constant offset from base address: 0
	step: 4
	aligned to: 64
	base_object: *p_8(D)
	Access function 0: {0B, +, 4}_1
neon-vect-div-1.c:12:3: note: === vect_analyze_data_refs ===
neon-vect-div-1.c:12:3: note: got vectype for stmt: _10 = *_9;
vector(2) float
neon-vect-div-1.c:12:3: note: got vectype for stmt: _13 = *_12;
vector(2) float
neon-vect-div-1.c:12:3: note: got vectype for stmt: *_9 = _14;
vector(2) float
neon-vect-div-1.c:12:3: note: === vect_analyze_scalar_cycles ===
neon-vect-div-1.c:12:3: note: Analyze phi: i_19 = PHI <0(4), i_16(7)>

neon-vect-div-1.c:12:3: note: Access function of PHI: {0, +, 1}_1
neon-vect-div-1.c:12:3: note: step: 1,  init: 0
neon-vect-div-1.c:12:3: note: Detected induction.
neon-vect-div-1.c:12:3: note: Analyze phi: .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>

neon-vect-div-1.c:12:3: note: === vect_pattern_recog ===
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand i.0_6
neon-vect-div-1.c:12:3: note: def_stmt: i.0_6 = (unsigned int) i_19;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand i_19
neon-vect-div-1.c:12:3: note: def_stmt: i_19 = PHI <0(4), i_16(7)>
neon-vect-div-1.c:12:3: note: type of def: induction
neon-vect-div-1.c:12:3: note: === vect_analyze_data_ref_accesses ===
neon-vect-div-1.c:12:3: note: === vect_mark_stmts_to_be_vectorized ===
neon-vect-div-1.c:12:3: note: init: phi relevant? i_19 = PHI <0(4), i_16(7)>
neon-vect-div-1.c:12:3: note: init: phi relevant? .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>
neon-vect-div-1.c:12:3: note: init: stmt relevant? i.0_6 = (unsigned int) i_19;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _7 = i.0_6 * 4;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _9 = p_8(D) + _7;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _10 = *_9;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _12 = x_11(D) + _7;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _13 = *_12;
neon-vect-div-1.c:12:3: note: init: stmt relevant? _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: init: stmt relevant? *_9 = _14;
neon-vect-div-1.c:12:3: note: vec_stmt_relevant_p: stmt has vdefs.
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: *_9 = _14;
neon-vect-div-1.c:12:3: note: init: stmt relevant? i_16 = i_19 + 1;
neon-vect-div-1.c:12:3: note: init: stmt relevant? if (len_4 > i_16)
neon-vect-div-1.c:12:3: note: worklist: examine stmt: *_9 = _14;
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand _14
neon-vect-div-1.c:12:3: note: def_stmt: _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: worklist: examine stmt: _14 = _10 / _13;
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand _10
neon-vect-div-1.c:12:3: note: def_stmt: _10 = *_9;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: _10 = *_9;
neon-vect-div-1.c:12:3: note: vect_is_simple_use: operand _13
neon-vect-div-1.c:12:3: note: def_stmt: _13 = *_12;
neon-vect-div-1.c:12:3: note: type of def: internal
neon-vect-div-1.c:12:3: note: mark relevant 4, live 0: _13 = *_12;
neon-vect-div-1.c:12:3: note: worklist: examine stmt: _13 = *_12;
neon-vect-div-1.c:12:3: note: worklist: examine stmt: _10 = *_9;
neon-vect-div-1.c:12:3: note: === vect_analyze_data_ref_dependences ===
(compute_affine_dependence
  stmt_a: _10 = *_9;
  stmt_b: _13 = *_12;
) -> no dependence
(compute_affine_dependence
  stmt_a: _10 = *_9;
  stmt_b: *_9 = _14;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
(compute_affine_dependence
  stmt_a: _13 = *_12;
  stmt_b: *_9 = _14;
) -> no dependence
(compute_affine_dependence
  stmt_a: _10 = *_9;
  stmt_b: _10 = *_9;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
(compute_affine_dependence
  stmt_a: _13 = *_12;
  stmt_b: _13 = *_12;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
(compute_affine_dependence
  stmt_a: *_9 = _14;
  stmt_b: *_9 = _14;
(analyze_overlapping_iterations 
  (chrec_a = {0B, +, 4}_1)
  (chrec_b = {0B, +, 4}_1)
  (overlap_iterations_a = [0])
  (overlap_iterations_b = [0]))
)
neon-vect-div-1.c:12:3: note: dependence distance  = 0.
neon-vect-div-1.c:12:3: note: dependence distance == 0 between *_9 and *_9
neon-vect-div-1.c:12:3: note: === vect_determine_vectorization_factor ===
neon-vect-div-1.c:12:3: note: ==> examining phi: i_19 = PHI <0(4), i_16(7)>

neon-vect-div-1.c:12:3: note: ==> examining phi: .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>

neon-vect-div-1.c:12:3: note: ==> examining statement: i.0_6 = (unsigned int) i_19;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _7 = i.0_6 * 4;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _9 = p_8(D) + _7;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _10 = *_9;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(2) float
neon-vect-div-1.c:12:3: note: nunits = 2
neon-vect-div-1.c:12:3: note: ==> examining statement: _12 = x_11(D) + _7;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: _13 = *_12;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(2) float
neon-vect-div-1.c:12:3: note: nunits = 2
neon-vect-div-1.c:12:3: note: ==> examining statement: _14 = _10 / _13;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(2) float
neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(2) float
neon-vect-div-1.c:12:3: note: nunits = 2
neon-vect-div-1.c:12:3: note: ==> examining statement: *_9 = _14;

neon-vect-div-1.c:12:3: note: get vectype for scalar type:  float
neon-vect-div-1.c:12:3: note: vectype: vector(2) float
neon-vect-div-1.c:12:3: note: nunits = 2
neon-vect-div-1.c:12:3: note: ==> examining statement: i_16 = i_19 + 1;

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: ==> examining statement: if (len_4 > i_16)

neon-vect-div-1.c:12:3: note: skip.
neon-vect-div-1.c:12:3: note: vectorization factor = 2
neon-vect-div-1.c:12:3: note: === vect_analyze_slp ===
neon-vect-div-1.c:12:3: note: === vect_make_slp_decision ===
neon-vect-div-1.c:12:3: note: === vect_analyze_data_refs_alignment ===
neon-vect-div-1.c:12:3: note: dependence distance  = 0.
neon-vect-div-1.c:12:3: note: accesses have the same alignment.
dependence distance modulo vf == 0 between *_9 and *_9
neon-vect-div-1.c:12:3: note: vect_compute_data_ref_alignment:
neon-vect-div-1.c:12:3: note: can't force alignment of ref: *_9
neon-vect-div-1.c:12:3: note: vect_compute_data_ref_alignment:
neon-vect-div-1.c:12:3: note: can't force alignment of ref: *_12
neon-vect-div-1.c:12:3: note: vect_compute_data_ref_alignment:
neon-vect-div-1.c:12:3: note: can't force alignment of ref: *_9
neon-vect-div-1.c:12:3: note: === vect_prune_runtime_alias_test_list ===
neon-vect-div-1.c:12:3: note: === vect_enhance_data_refs_alignment ===
neon-vect-div-1.c:12:3: note: Unknown misalignment, is_packed = 0
neon-vect-div-1.c:12:3: note: Unknown misalignment, is_packed = 0
neon-vect-div-1.c:12:3: note: Unknown misalignment, is_packed = 0
neon-vect-div-1.c:12:3: note: vect_can_advance_ivs_p:
neon-vect-div-1.c:12:3: note: Analyze phi: i_19 = PHI <0(4), i_16(7)>

neon-vect-div-1.c:12:3: note: Analyze phi: .MEM_20 = PHI <.MEM_5(D)(4), .MEM_15(7)>

neon-vect-div-1.c:12:3: note: virtual phi. skip.
neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9
neon-vect-div-1.c:12:3: note: bad data alignment.
neon-vect-div-1.c:9:1: note: vectorized 0 loops in function.
foo (int len, float * restrict p, float * restrict x)
{
  int i;
  unsigned int i.0_6;
  unsigned int _7;
  float * _9;
  float _10;
  float * _12;
  float _13;
  float _14;

  <bb 2>:
  len_4 = len_3(D) & -32;
  if (len_4 > 0)
    goto <bb 4>;
  else
    goto <bb 3>;

  <bb 3>:
  return;

  <bb 4>:

  <bb 5>:
  # i_19 = PHI <0(4), i_16(7)>
  i.0_6 = (unsigned int) i_19;
  _7 = i.0_6 * 4;
  _9 = p_8(D) + _7;
  _10 = *_9;
  _12 = x_11(D) + _7;
  _13 = *_12;
  _14 = _10 / _13;
  *_9 = _14;
  i_16 = i_19 + 1;
  if (len_4 > i_16)
    goto <bb 7>;
  else
    goto <bb 6>;

  <bb 6>:
  goto <bb 3>;

  <bb 7>:
  goto <bb 5>;

}



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2016-02-05 13:10       ` Prathamesh Kulkarni
@ 2016-05-23  9:30         ` Prathamesh Kulkarni
  2016-05-30 10:55           ` Prathamesh Kulkarni
  2016-06-07  8:37           ` Ramana Radhakrishnan
  0 siblings, 2 replies; 13+ messages in thread
From: Prathamesh Kulkarni @ 2016-05-23  9:30 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Ramana Radhakrishnan, gcc Patches, Charles Baylis

[-- Attachment #1: Type: text/plain, Size: 9994 bytes --]

On 5 February 2016 at 18:40, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> On 4 February 2016 at 16:31, Ramana Radhakrishnan
> <ramana.gcc@googlemail.com> wrote:
>> On Sun, Jan 17, 2016 at 9:06 AM, Prathamesh Kulkarni
>> <prathamesh.kulkarni@linaro.org> wrote:
>>> On 31 July 2015 at 15:04, Ramana Radhakrishnan
>>> <ramana.radhakrishnan@foss.arm.com> wrote:
>>>>
>>>>
>>>> On 29/07/15 11:09, Prathamesh Kulkarni wrote:
>>>>> Hi,
>>>>> This patch tries to implement division with multiplication by
>>>>> reciprocal using vrecpe/vrecps
>>>>> with -funsafe-math-optimizations and -freciprocal-math enabled.
>>>>> Tested on arm-none-linux-gnueabihf using qemu.
>>>>> OK for trunk ?
>>>>>
>>>>> Thank you,
>>>>> Prathamesh
>>>>>
>>>>
>>>> I've tried this in the past and never been convinced that 2 iterations are enough to get to stability with this given that the results are only precise for 8 bits / iteration. Thus I've always believed you need 3 iterations rather than 2 at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.
>>>>
>>>> I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable as well as running something like SPEC2k(6) with atleast one iteration to ensure correctness.
>>> Hi,
>>> I got results of SPEC2k6 fp benchmarks:
>>> a15: +0.64% overall, 481.wrf: +6.46%
>>> a53: +0.21% overall, 416.gamess: -1.39%, 481.wrf: +6.76%
>>> a57: +0.35% overall, 481.wrf: +3.84%
>>> The other benchmarks had (almost) identical results.
>>
>> Thanks for the benchmarking results -  Please repost the patch with
>> the changes that I had requested in my previous review - given it is
>> now stage4 , I would rather queue changes like this for stage1 now.
> Hi,
> Please find the updated patch attached.
> It passes testsuite for arm-none-linux-gnueabi, arm-none-linux-gnueabihf and
> arm-none-eabi.
> However the test-case added in the patch (neon-vect-div-1.c) fails to
> get vectorized at -O2
> for armeb-none-linux-gnueabihf.
> Charles suggested me to try with -O3, which worked.
> It appears the test-case fails to get vectorized with
> -fvect-cost-model=cheap (which is default enabled at -O2)
> and passes for -fno-vect-cost-model / -fvect-cost-model=dynamic
>
> I can't figure out why it fails -fvect-cost-model=cheap.
> From the vect dump (attached):
> neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
> neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9
Hi,
I think I have some idea why the test-cases attached with the patch
fail to get vectorized on armeb with -O2.

Issue with the big-endian vectorizer:
The patch does not cause regressions with the big-endian vectorizer,
but the test-cases attached with the patch fail to vectorize there,
while they do get vectorized on
little-endian.
On armeb they fail with the following message in the dump:
note: not vectorized: unsupported unaligned load.*_9

The big- and little-endian vectorizers behave differently in
arm_builtin_support_vector_misalignment(), which implements the hook
targetm.vectorize.support_vector_misalignment().

targetm.vectorize.support_vector_misalignment is called by
vect_supportable_dr_alignment (), which in turn is called
by verify_data_ref_alignment ().

Execution up to the following condition is common between arm and armeb
in vect_supportable_dr_alignment():

if ((TYPE_USER_ALIGN (type) && !is_packed)
      || targetm.vectorize.support_vector_misalignment (mode, type,
                                            DR_MISALIGNMENT (dr), is_packed))
        /* Can't software pipeline the loads, but can at least do them.  */
        return dr_unaligned_supported;

For the little-endian case:
arm_builtin_support_vector_misalignment() is called with
V2SF mode and misalignment == -1, and the following condition
becomes true:
/* If the misalignment is unknown, we should be able to handle the access
         so long as it is not to a member of a packed data structure.  */
  if (misalignment == -1)
    return true;

Since the hook returns true, we enter the condition above in
vect_supportable_dr_alignment() and return dr_unaligned_supported.

For big-endian:
arm_builtin_support_vector_misalignment() is called with V2SF mode.
The following condition that gates the entire function body fails:
 if (TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access)
so the default hook gets called with V2SF mode, and it returns false
because movmisalign_optab does not exist for V2SF mode.

So the condition above in vect_supportable_dr_alignment() fails
and we come here:
 /* Unsupported.  */
return dr_unaligned_unsupported;

Hence we get the "unsupported unaligned load" message in the dump
for armeb, from verify_data_ref_alignment ():

static bool
verify_data_ref_alignment (data_reference_p dr)
{
  enum dr_alignment_support supportable_dr_alignment
    = vect_supportable_dr_alignment (dr, false);
  if (!supportable_dr_alignment)
    {
      if (dump_enabled_p ())
        {
          if (DR_IS_READ (dr))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported unaligned load.");

With -O3, the test-cases vectorize for armeb, because loop peeling for alignment
is turned on.
The above behavior is also reproducible with a test-case that is
unrelated to the patch.
For instance, we get the same unsupported unaligned load for the
following test-case (with / replaced by +):

void
foo (int len, float * __restrict p, float *__restrict x)
{
  len = len & ~31;
  for (int i = 0; i < len; i++)
    p[i] = p[i] + x[i];
}
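
Condensing the above, the effective decision for these unknown-alignment
V2SF loads behaves roughly like the following sketch (a simplified
paraphrase of the logic described above, not the actual GCC sources):

#include <stdbool.h>

/* Sketch only: paraphrases the behaviour described above.  */
static bool
v2sf_misaligned_load_supported_p (bool bytes_big_endian)
{
  if (!bytes_big_endian)
    /* arm_builtin_support_vector_misalignment accepts the unknown
       misalignment (-1), so dr_unaligned_supported is returned.  */
    return true;

  /* armeb: the TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access
     guard fails, the default hook runs, and V2SF has no movmisalign
     optab, so dr_unaligned_unsupported is returned.  */
  return false;
}
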
Is the patch OK to commit after bootstrap+test ?

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
>>
>> Thanks,
>> Ramana
>>
>>>
>>> Thanks,
>>> Prathamesh
>>>>
>>>>
>>>> moving on to the patches.
>>>>
>>>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>>>> index 654d9d5..28c2e2a 100644
>>>>> --- a/gcc/config/arm/neon.md
>>>>> +++ b/gcc/config/arm/neon.md
>>>>> @@ -548,6 +548,32 @@
>>>>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>>>>  )
>>>>>
>>>>
>>>> Please add a comment here.
>>>>
>>>>> +(define_expand "div<mode>3"
>>>>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>>>>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>>>>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>>>>
>>>> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>>>>
>>>>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>>>>> +  {
>>>>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>>>>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>>>>> +
>>>>> +    /* Reciprocal estimate */
>>>>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>>>>> +
>>>>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>>>>> +    for (int i = 0; i < 2; i++)
>>>>> +      {
>>>>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>>>>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>>>>> +      }
>>>>> +
>>>>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>>>>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>>>>> +    DONE;
>>>>> +  }
>>>>> +)
>>>>> +
>>>>> +
>>>>>  (define_insn "mul<mode>3add<mode>_neon"
>>>>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>>>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>> new file mode 100644
>>>>> index 0000000..e562ef3
>>>>> --- /dev/null
>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>> @@ -0,0 +1,14 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>
>>>> No this is wrong.
>>>>
>>>> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>>>>
>>>>> +
>>>>> +void
>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>> +{
>>>>> +  len = len & ~31;
>>>>> +  for (int i = 0; i < len; i++)
>>>>> +    p[i] = p[i] / x[i];
>>>>> +}
>>>>> +
>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>> new file mode 100644
>>>>> index 0000000..8e15d0a
>>>>> --- /dev/null
>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>> @@ -0,0 +1,14 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>
>>>> And likewise.
>>>>
>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>> +
>>>>> +void
>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>> +{
>>>>> +  len = len & ~31;
>>>>> +  for (int i = 0; i < len; i++)
>>>>> +    p[i] = p[i] / x[i];
>>>>> +}
>>>>> +
>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>>>>
>>>>
>>>> regards
>>>> Ramana

[-- Attachment #2: patch-1.diff --]
[-- Type: text/plain, Size: 2957 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 6b4896d..862e31b 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -578,6 +578,38 @@
                     (const_string "neon_mul_<V_elem_ch><q>")))]
 )
 
+/* Perform division using multiply-by-reciprocal. 
+   Reciprocal is calculated using Newton-Raphson method.
+   Enabled with -funsafe-math-optimizations -freciprocal-math
+   and disabled for -Os since it increases code size .  */
+
+(define_expand "div<mode>3"
+  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
+		  (match_operand:VCVTF 2 "s_register_operand" "w")))]
+  "TARGET_NEON && !optimize_size
+   && flag_unsafe_math_optimizations && flag_reciprocal_math"
+  {
+    rtx rec = gen_reg_rtx (<MODE>mode);
+    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
+
+    /* Reciprocal estimate.  */
+    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
+
+    /* Perform 2 iterations of newton-raphson method.  */
+    for (int i = 0; i < 2; i++)
+      {
+	emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
+	emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
+      }
+
+    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec.  */
+    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
+    DONE;
+  }
+)
+
+
 (define_insn "mul<mode>3add<mode>_neon"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c b/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c
new file mode 100644
index 0000000..dae724a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c
@@ -0,0 +1,16 @@
+/* Test pattern div<mode>3.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3 -funsafe-math-optimizations -fdump-tree-vect-all" } */
+/* { dg-add-options arm_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c b/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c
new file mode 100644
index 0000000..0450b70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c
@@ -0,0 +1,16 @@
+/* Test pattern div<mode>3.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O3 -funsafe-math-optimizations -fno-reciprocal-math -fdump-tree-vect-all" } */
+/* { dg-add-options arm_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */

[-- Attachment #3: ChangeLog --]
[-- Type: application/octet-stream, Size: 227 bytes --]

2016-05-24  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

	* config/arm/neon.md (div<mode>3): New pattern.

testsuite/
	* gcc.target/arm/neon-vect-div-1.c: New test-case.
	* gcc.target/arm/neon-vect-div-2.c: Likewise.
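
For readers following the expander, the emitted sequence corresponds
roughly to the following intrinsics sketch (illustrative only; the
helper name is made up and this is not part of the patch).  vrecpe gives
the initial estimate, and each vrecps/vmul pair performs one
Newton-Raphson step x = x * (2 - d * x):

#include <arm_neon.h>

static float32x4_t
vdiv_sketch_f32 (float32x4_t num, float32x4_t den)
{
  float32x4_t rec = vrecpeq_f32 (den);            /* reciprocal estimate  */
  rec = vmulq_f32 (rec, vrecpsq_f32 (rec, den));  /* first N-R refinement  */
  rec = vmulq_f32 (rec, vrecpsq_f32 (rec, den));  /* second N-R refinement  */
  return vmulq_f32 (num, rec);                    /* num * (1 / den)  */
}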

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2016-05-23  9:30         ` Prathamesh Kulkarni
@ 2016-05-30 10:55           ` Prathamesh Kulkarni
  2016-06-07  8:26             ` Prathamesh Kulkarni
  2016-06-07  8:37           ` Ramana Radhakrishnan
  1 sibling, 1 reply; 13+ messages in thread
From: Prathamesh Kulkarni @ 2016-05-30 10:55 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Ramana Radhakrishnan, gcc Patches, Charles Baylis

On 23 May 2016 at 14:59, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> On 5 February 2016 at 18:40, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
>> On 4 February 2016 at 16:31, Ramana Radhakrishnan
>> <ramana.gcc@googlemail.com> wrote:
>>> On Sun, Jan 17, 2016 at 9:06 AM, Prathamesh Kulkarni
>>> <prathamesh.kulkarni@linaro.org> wrote:
>>>> On 31 July 2015 at 15:04, Ramana Radhakrishnan
>>>> <ramana.radhakrishnan@foss.arm.com> wrote:
>>>>>
>>>>>
>>>>> On 29/07/15 11:09, Prathamesh Kulkarni wrote:
>>>>>> Hi,
>>>>>> This patch tries to implement division with multiplication by
>>>>>> reciprocal using vrecpe/vrecps
>>>>>> with -funsafe-math-optimizations and -freciprocal-math enabled.
>>>>>> Tested on arm-none-linux-gnueabihf using qemu.
>>>>>> OK for trunk ?
>>>>>>
>>>>>> Thank you,
>>>>>> Prathamesh
>>>>>>
>>>>>
>>>>> I've tried this in the past and never been convinced that 2 iterations are enough to get to stability with this given that the results are only precise for 8 bits / iteration. Thus I've always believed you need 3 iterations rather than 2 at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.
>>>>>
>>>>> I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable as well as running something like SPEC2k(6) with atleast one iteration to ensure correctness.
>>>> Hi,
>>>> I got results of SPEC2k6 fp benchmarks:
>>>> a15: +0.64% overall, 481.wrf: +6.46%
>>>> a53: +0.21% overall, 416.gamess: -1.39%, 481.wrf: +6.76%
>>>> a57: +0.35% overall, 481.wrf: +3.84%
>>>> The other benchmarks had (almost) identical results.
>>>
>>> Thanks for the benchmarking results -  Please repost the patch with
>>> the changes that I had requested in my previous review - given it is
>>> now stage4 , I would rather queue changes like this for stage1 now.
>> Hi,
>> Please find the updated patch attached.
>> It passes testsuite for arm-none-linux-gnueabi, arm-none-linux-gnueabihf and
>> arm-none-eabi.
>> However the test-case added in the patch (neon-vect-div-1.c) fails to
>> get vectorized at -O2
>> for armeb-none-linux-gnueabihf.
>> Charles suggested me to try with -O3, which worked.
>> It appears the test-case fails to get vectorized with
>> -fvect-cost-model=cheap (which is default enabled at -O2)
>> and passes for -fno-vect-cost-model / -fvect-cost-model=dynamic
>>
>> I can't figure out why it fails -fvect-cost-model=cheap.
>> From the vect dump (attached):
>> neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
>> neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9
> Hi,
> I think I have some idea why the test-case fails attached with patch
> fail to get vectorized on armeb with -O2.
>
> Issue with big endian vectorizer:
> The patch does not cause regressions on big endian vectorizer but
> fails to vectorize the test-cases attached with the patch, while they
> get vectorized on
> litttle-endian.
> Fails with armeb with the following message in dump:
> note: not vectorized: unsupported unaligned load.*_9
>
> The behavior of big and little endian vectorizer seems to be different
> in arm_builtin_support_vector_misalignment() which overrides the hook
> targetm.vectorize.support_vector_misalignment().
>
> targetm.vectorize.support_vector_misalignment is called by
> vect_supportable_dr_alignment () which in turn is called
> by verify_data_refs_alignment ().
>
> Execution upto following condition is common between arm and armeb
> in vect_supportable_dr_alignment():
>
> if ((TYPE_USER_ALIGN (type) && !is_packed)
>       || targetm.vectorize.support_vector_misalignment (mode, type,
>                                             DR_MISALIGNMENT (dr), is_packed))
>         /* Can't software pipeline the loads, but can at least do them.  */
>         return dr_unaligned_supported;
>
> For little endian case:
> arm_builtin_support_vector_misalignment() is called with
> V2SF mode and misalignment == -1, and the following condition
> becomes true:
> /* If the misalignment is unknown, we should be able to handle the access
>          so long as it is not to a member of a packed data structure.  */
>   if (misalignment == -1)
>     return true;
>
> Since the hook returned true we enter the condition above in
> vect_supportable_dr_alignment() and return dr_unaligned_supported;
>
> For big-endian:
> arm_builtin_support_vector_misalignment() is called with V2SF mode.
> The following condition that gates the entire function body fails:
>  if (TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access)
> and the default hook gets called with V2SF mode and the default hook
> returns false because
> movmisalign_optab does not exist for V2SF mode.
>
> So the condition above in vect_supportable_dr_alignment() fails
> and we come here:
>  /* Unsupported.  */
> return dr_unaligned_unsupported;
>
> And hence we get the unaligned load not supported message in the dump
> for armeb in verify_data_ref_alignment ():
>
> static bool
> verify_data_ref_alignment (data_reference_p dr)
> {
>   enum dr_alignment_support supportable_dr_alignment
>     = vect_supportable_dr_alignment (dr, false);
>   if (!supportable_dr_alignment)
>     {
>       if (dump_enabled_p ())
>         {
>           if (DR_IS_READ (dr))
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                              "not vectorized: unsupported unaligned load.");
>
> With -O3, the test-cases vectorize for armeb, because loop peeling for alignment
> is turned on.
> The above behavior is also reproducible with test-case which is
> irrelevant to the patch.
> for instance, we get the same unsupported unaligned load for following
> test-case (replaced / with +)
>
> void
> foo (int len, float * __restrict p, float *__restrict x)
> {
>   len = len & ~31;
>   for (int i = 0; i < len; i++)
>     p[i] = p[i] + x[i];
> }
> Is the patch OK to commit after bootstrap+test ?
ping https://gcc.gnu.org/ml/gcc-patches/2016-05/msg01765.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
>>
>> Thanks,
>> Prathamesh
>>>
>>> Thanks,
>>> Ramana
>>>
>>>>
>>>> Thanks,
>>>> Prathamesh
>>>>>
>>>>>
>>>>> moving on to the patches.
>>>>>
>>>>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>>>>> index 654d9d5..28c2e2a 100644
>>>>>> --- a/gcc/config/arm/neon.md
>>>>>> +++ b/gcc/config/arm/neon.md
>>>>>> @@ -548,6 +548,32 @@
>>>>>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>>>>>  )
>>>>>>
>>>>>
>>>>> Please add a comment here.
>>>>>
>>>>>> +(define_expand "div<mode>3"
>>>>>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>>>>>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>>>>>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>>>>>
>>>>> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>>>>>
>>>>>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>>>>>> +  {
>>>>>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>>>>>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>>>>>> +
>>>>>> +    /* Reciprocal estimate */
>>>>>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>>>>>> +
>>>>>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>>>>>> +    for (int i = 0; i < 2; i++)
>>>>>> +      {
>>>>>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>>>>>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>>>>>> +      }
>>>>>> +
>>>>>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>>>>>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>>>>>> +    DONE;
>>>>>> +  }
>>>>>> +)
>>>>>> +
>>>>>> +
>>>>>>  (define_insn "mul<mode>3add<mode>_neon"
>>>>>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>>>>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>> new file mode 100644
>>>>>> index 0000000..e562ef3
>>>>>> --- /dev/null
>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>> @@ -0,0 +1,14 @@
>>>>>> +/* { dg-do compile } */
>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>
>>>>> No this is wrong.
>>>>>
>>>>> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>>>>>
>>>>>> +
>>>>>> +void
>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>> +{
>>>>>> +  len = len & ~31;
>>>>>> +  for (int i = 0; i < len; i++)
>>>>>> +    p[i] = p[i] / x[i];
>>>>>> +}
>>>>>> +
>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>> new file mode 100644
>>>>>> index 0000000..8e15d0a
>>>>>> --- /dev/null
>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>> @@ -0,0 +1,14 @@
>>>>>> +/* { dg-do compile } */
>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>
>>>>> And likewise.
>>>>>
>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>> +
>>>>>> +void
>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>> +{
>>>>>> +  len = len & ~31;
>>>>>> +  for (int i = 0; i < len; i++)
>>>>>> +    p[i] = p[i] / x[i];
>>>>>> +}
>>>>>> +
>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>>>>>
>>>>>
>>>>> regards
>>>>> Ramana

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2016-05-30 10:55           ` Prathamesh Kulkarni
@ 2016-06-07  8:26             ` Prathamesh Kulkarni
  0 siblings, 0 replies; 13+ messages in thread
From: Prathamesh Kulkarni @ 2016-06-07  8:26 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Ramana Radhakrishnan, gcc Patches, Charles Baylis

On 30 May 2016 at 13:52, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
> On 23 May 2016 at 14:59, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
>> On 5 February 2016 at 18:40, Prathamesh Kulkarni
>> <prathamesh.kulkarni@linaro.org> wrote:
>>> On 4 February 2016 at 16:31, Ramana Radhakrishnan
>>> <ramana.gcc@googlemail.com> wrote:
>>>> On Sun, Jan 17, 2016 at 9:06 AM, Prathamesh Kulkarni
>>>> <prathamesh.kulkarni@linaro.org> wrote:
>>>>> On 31 July 2015 at 15:04, Ramana Radhakrishnan
>>>>> <ramana.radhakrishnan@foss.arm.com> wrote:
>>>>>>
>>>>>>
>>>>>> On 29/07/15 11:09, Prathamesh Kulkarni wrote:
>>>>>>> Hi,
>>>>>>> This patch tries to implement division with multiplication by
>>>>>>> reciprocal using vrecpe/vrecps
>>>>>>> with -funsafe-math-optimizations and -freciprocal-math enabled.
>>>>>>> Tested on arm-none-linux-gnueabihf using qemu.
>>>>>>> OK for trunk ?
>>>>>>>
>>>>>>> Thank you,
>>>>>>> Prathamesh
>>>>>>>
>>>>>>
>>>>>> I've tried this in the past and never been convinced that 2 iterations are enough to get to stability with this given that the results are only precise for 8 bits / iteration. Thus I've always believed you need 3 iterations rather than 2 at which point I've never been sure that it's worth it. So the testing that you've done with this currently is not enough for this to go into the tree.
>>>>>>
>>>>>> I'd like this to be tested on a couple of different AArch32 implementations with a wider range of inputs to verify that the results are acceptable as well as running something like SPEC2k(6) with atleast one iteration to ensure correctness.
>>>>> Hi,
>>>>> I got results of SPEC2k6 fp benchmarks:
>>>>> a15: +0.64% overall, 481.wrf: +6.46%
>>>>> a53: +0.21% overall, 416.gamess: -1.39%, 481.wrf: +6.76%
>>>>> a57: +0.35% overall, 481.wrf: +3.84%
>>>>> The other benchmarks had (almost) identical results.
>>>>
>>>> Thanks for the benchmarking results -  Please repost the patch with
>>>> the changes that I had requested in my previous review - given it is
>>>> now stage4 , I would rather queue changes like this for stage1 now.
>>> Hi,
>>> Please find the updated patch attached.
>>> It passes testsuite for arm-none-linux-gnueabi, arm-none-linux-gnueabihf and
>>> arm-none-eabi.
>>> However the test-case added in the patch (neon-vect-div-1.c) fails to
>>> get vectorized at -O2
>>> for armeb-none-linux-gnueabihf.
>>> Charles suggested me to try with -O3, which worked.
>>> It appears the test-case fails to get vectorized with
>>> -fvect-cost-model=cheap (which is default enabled at -O2)
>>> and passes for -fno-vect-cost-model / -fvect-cost-model=dynamic
>>>
>>> I can't figure out why it fails -fvect-cost-model=cheap.
>>> From the vect dump (attached):
>>> neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
>>> neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9
>> Hi,
>> I think I have some idea why the test-case fails attached with patch
>> fail to get vectorized on armeb with -O2.
>>
>> Issue with big endian vectorizer:
>> The patch does not cause regressions on big endian vectorizer but
>> fails to vectorize the test-cases attached with the patch, while they
>> get vectorized on
>> litttle-endian.
>> Fails with armeb with the following message in dump:
>> note: not vectorized: unsupported unaligned load.*_9
>>
>> The behavior of big and little endian vectorizer seems to be different
>> in arm_builtin_support_vector_misalignment() which overrides the hook
>> targetm.vectorize.support_vector_misalignment().
>>
>> targetm.vectorize.support_vector_misalignment is called by
>> vect_supportable_dr_alignment () which in turn is called
>> by verify_data_refs_alignment ().
>>
>> Execution upto following condition is common between arm and armeb
>> in vect_supportable_dr_alignment():
>>
>> if ((TYPE_USER_ALIGN (type) && !is_packed)
>>       || targetm.vectorize.support_vector_misalignment (mode, type,
>>                                             DR_MISALIGNMENT (dr), is_packed))
>>         /* Can't software pipeline the loads, but can at least do them.  */
>>         return dr_unaligned_supported;
>>
>> For little endian case:
>> arm_builtin_support_vector_misalignment() is called with
>> V2SF mode and misalignment == -1, and the following condition
>> becomes true:
>> /* If the misalignment is unknown, we should be able to handle the access
>>          so long as it is not to a member of a packed data structure.  */
>>   if (misalignment == -1)
>>     return true;
>>
>> Since the hook returned true we enter the condition above in
>> vect_supportable_dr_alignment() and return dr_unaligned_supported;
>>
>> For big-endian:
>> arm_builtin_support_vector_misalignment() is called with V2SF mode.
>> The following condition that gates the entire function body fails:
>>  if (TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access)
>> and the default hook gets called with V2SF mode and the default hook
>> returns false because
>> movmisalign_optab does not exist for V2SF mode.
>>
>> So the condition above in vect_supportable_dr_alignment() fails
>> and we come here:
>>  /* Unsupported.  */
>> return dr_unaligned_unsupported;
>>
>> And hence we get the unaligned load not supported message in the dump
>> for armeb in verify_data_ref_alignment ():
>>
>> static bool
>> verify_data_ref_alignment (data_reference_p dr)
>> {
>>   enum dr_alignment_support supportable_dr_alignment
>>     = vect_supportable_dr_alignment (dr, false);
>>   if (!supportable_dr_alignment)
>>     {
>>       if (dump_enabled_p ())
>>         {
>>           if (DR_IS_READ (dr))
>>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>                              "not vectorized: unsupported unaligned load.");
>>
>> With -O3, the test-cases vectorize for armeb, because loop peeling for alignment
>> is turned on.
>> The above behavior is also reproducible with test-case which is
>> irrelevant to the patch.
>> for instance, we get the same unsupported unaligned load for following
>> test-case (replaced / with +)
>>
>> void
>> foo (int len, float * __restrict p, float *__restrict x)
>> {
>>   len = len & ~31;
>>   for (int i = 0; i < len; i++)
>>     p[i] = p[i] + x[i];
>> }
>> Is the patch OK to commit after bootstrap+test ?
> ping https://gcc.gnu.org/ml/gcc-patches/2016-05/msg01765.html
ping * 2 https://gcc.gnu.org/ml/gcc-patches/2016-05/msg01765.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
>>
>> Thanks,
>> Prathamesh
>>>
>>> Thanks,
>>> Prathamesh
>>>>
>>>> Thanks,
>>>> Ramana
>>>>
>>>>>
>>>>> Thanks,
>>>>> Prathamesh
>>>>>>
>>>>>>
>>>>>> moving on to the patches.
>>>>>>
>>>>>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>>>>>> index 654d9d5..28c2e2a 100644
>>>>>>> --- a/gcc/config/arm/neon.md
>>>>>>> +++ b/gcc/config/arm/neon.md
>>>>>>> @@ -548,6 +548,32 @@
>>>>>>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>>>>>>  )
>>>>>>>
>>>>>>
>>>>>> Please add a comment here.
>>>>>>
>>>>>>> +(define_expand "div<mode>3"
>>>>>>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>>>>>>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>>>>>>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>>>>>>
>>>>>> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>>>>>>
>>>>>>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>>>>>>> +  {
>>>>>>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>>>>>>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>>>>>>> +
>>>>>>> +    /* Reciprocal estimate */
>>>>>>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>>>>>>> +
>>>>>>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>>>>>>> +    for (int i = 0; i < 2; i++)
>>>>>>> +      {
>>>>>>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>>>>>>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>>>>>>> +      }
>>>>>>> +
>>>>>>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>>>>>>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>>>>>>> +    DONE;
>>>>>>> +  }
>>>>>>> +)
>>>>>>> +
>>>>>>> +
>>>>>>>  (define_insn "mul<mode>3add<mode>_neon"
>>>>>>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>>>>>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>>> new file mode 100644
>>>>>>> index 0000000..e562ef3
>>>>>>> --- /dev/null
>>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>>> @@ -0,0 +1,14 @@
>>>>>>> +/* { dg-do compile } */
>>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>>
>>>>>> No this is wrong.
>>>>>>
>>>>>> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>>>>>>
>>>>>>> +
>>>>>>> +void
>>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>>> +{
>>>>>>> +  len = len & ~31;
>>>>>>> +  for (int i = 0; i < len; i++)
>>>>>>> +    p[i] = p[i] / x[i];
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>>> new file mode 100644
>>>>>>> index 0000000..8e15d0a
>>>>>>> --- /dev/null
>>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>>> @@ -0,0 +1,14 @@
>>>>>>> +/* { dg-do compile } */
>>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>>
>>>>>> And likewise.
>>>>>>
>>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>>> +
>>>>>>> +void
>>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>>> +{
>>>>>>> +  len = len & ~31;
>>>>>>> +  for (int i = 0; i < len; i++)
>>>>>>> +    p[i] = p[i] / x[i];
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>>>>>>
>>>>>>
>>>>>> regards
>>>>>> Ramana

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2016-05-23  9:30         ` Prathamesh Kulkarni
  2016-05-30 10:55           ` Prathamesh Kulkarni
@ 2016-06-07  8:37           ` Ramana Radhakrishnan
  2016-06-10  8:56             ` Prathamesh Kulkarni
  1 sibling, 1 reply; 13+ messages in thread
From: Ramana Radhakrishnan @ 2016-06-07  8:37 UTC (permalink / raw)
  To: Prathamesh Kulkarni; +Cc: Ramana Radhakrishnan, gcc Patches, Charles Baylis

>> Please find the updated patch attached.
>> It passes testsuite for arm-none-linux-gnueabi, arm-none-linux-gnueabihf and
>> arm-none-eabi.
>> However the test-case added in the patch (neon-vect-div-1.c) fails to
>> get vectorized at -O2
>> for armeb-none-linux-gnueabihf.
>> Charles suggested me to try with -O3, which worked.
>> It appears the test-case fails to get vectorized with
>> -fvect-cost-model=cheap (which is default enabled at -O2)
>> and passes for -fno-vect-cost-model / -fvect-cost-model=dynamic
>>
>> I can't figure out why it fails -fvect-cost-model=cheap.
>> From the vect dump (attached):
>> neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
>> neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9
> Hi,
> I think I have some idea why the test-case fails attached with patch
> fail to get vectorized on armeb with -O2.
>
> Issue with big endian vectorizer:
> The patch does not cause regressions on big endian vectorizer but
> fails to vectorize the test-cases attached with the patch, while they
> get vectorized on
> litttle-endian.
> Fails with armeb with the following message in dump:
> note: not vectorized: unsupported unaligned load.*_9
>
> The behavior of big and little endian vectorizer seems to be different
> in arm_builtin_support_vector_misalignment() which overrides the hook
> targetm.vectorize.support_vector_misalignment().
>
> targetm.vectorize.support_vector_misalignment is called by
> vect_supportable_dr_alignment () which in turn is called
> by verify_data_refs_alignment ().
>
> Execution upto following condition is common between arm and armeb
> in vect_supportable_dr_alignment():
>
> if ((TYPE_USER_ALIGN (type) && !is_packed)
>       || targetm.vectorize.support_vector_misalignment (mode, type,
>                                             DR_MISALIGNMENT (dr), is_packed))
>         /* Can't software pipeline the loads, but can at least do them.  */
>         return dr_unaligned_supported;
>
> For little endian case:
> arm_builtin_support_vector_misalignment() is called with
> V2SF mode and misalignment == -1, and the following condition
> becomes true:
> /* If the misalignment is unknown, we should be able to handle the access
>          so long as it is not to a member of a packed data structure.  */
>   if (misalignment == -1)
>     return true;
>
> Since the hook returned true we enter the condition above in
> vect_supportable_dr_alignment() and return dr_unaligned_supported;
>
> For big-endian:
> arm_builtin_support_vector_misalignment() is called with V2SF mode.
> The following condition that gates the entire function body fails:
>  if (TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access)
> and the default hook gets called with V2SF mode and the default hook
> returns false because
> movmisalign_optab does not exist for V2SF mode.
>
> So the condition above in vect_supportable_dr_alignment() fails
> and we come here:
>  /* Unsupported.  */
> return dr_unaligned_unsupported;
>
> And hence we get the unaligned load not supported message in the dump
> for armeb in verify_data_ref_alignment ():
>
> static bool
> verify_data_ref_alignment (data_reference_p dr)
> {
>   enum dr_alignment_support supportable_dr_alignment
>     = vect_supportable_dr_alignment (dr, false);
>   if (!supportable_dr_alignment)
>     {
>       if (dump_enabled_p ())
>         {
>           if (DR_IS_READ (dr))
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                              "not vectorized: unsupported unaligned load.");
>
> With -O3, the test-cases vectorize for armeb, because loop peeling for alignment
> is turned on.
> The above behavior is also reproducible with test-case which is
> irrelevant to the patch.
> for instance, we get the same unsupported unaligned load for following
> test-case (replaced / with +)
>
> void
> foo (int len, float * __restrict p, float *__restrict x)
> {
>   len = len & ~31;
>   for (int i = 0; i < len; i++)
>     p[i] = p[i] + x[i];
> }
> Is the patch OK to commit after bootstrap+test ?


Thanks for the analysis - all the test needs is an additional marker
to skip it on armeb (is there a helper for misaligned loads from the
vectorizer?).  Ah, probably vect_hw_misalign is sufficient for your
use case, and you will want to fix it appropriately for little-endian arm
with Neon support enabled.
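
Concretely, that would just mean something like this in the test
headers (sketch; the exact set of directives is up to you):

/* { dg-require-effective-target arm_neon_ok } */
/* { dg-require-effective-target vect_hw_misalign } */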

From the patch.

>>+   && flag_unsafe_math_optimizations && flag_reciprocal_math"

Why do we need both flag_unsafe_math_optimizations and
flag_reciprocal_math?  flag_unsafe_math_optimizations should be
sufficient, since it enables flag_reciprocal_math - the reason for
requiring flag_unsafe_math_optimizations is the potential loss of
precision and the fact that on Neon denormalized numbers are flushed
to zero.
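
In other words the expander condition would then reduce to something
like (sketch):

  "TARGET_NEON && !optimize_size && flag_unsafe_math_optimizations"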

Ok with that change and a quick test with vect_hw_misalign added to
your testcase.

Sorry about the delay in reviewing.

Ramana


>
> Thanks,
> Prathamesh
>>
>> Thanks,
>> Prathamesh
>>>
>>> Thanks,
>>> Ramana
>>>
>>>>
>>>> Thanks,
>>>> Prathamesh
>>>>>
>>>>>
>>>>> moving on to the patches.
>>>>>
>>>>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>>>>> index 654d9d5..28c2e2a 100644
>>>>>> --- a/gcc/config/arm/neon.md
>>>>>> +++ b/gcc/config/arm/neon.md
>>>>>> @@ -548,6 +548,32 @@
>>>>>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>>>>>  )
>>>>>>
>>>>>
>>>>> Please add a comment here.
>>>>>
>>>>>> +(define_expand "div<mode>3"
>>>>>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>>>>>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>>>>>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>>>>>
>>>>> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>>>>>
>>>>>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>>>>>> +  {
>>>>>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>>>>>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>>>>>> +
>>>>>> +    /* Reciprocal estimate */
>>>>>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>>>>>> +
>>>>>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>>>>>> +    for (int i = 0; i < 2; i++)
>>>>>> +      {
>>>>>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>>>>>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>>>>>> +      }
>>>>>> +
>>>>>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>>>>>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>>>>>> +    DONE;
>>>>>> +  }
>>>>>> +)
>>>>>> +
>>>>>> +
>>>>>>  (define_insn "mul<mode>3add<mode>_neon"
>>>>>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>>>>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>> new file mode 100644
>>>>>> index 0000000..e562ef3
>>>>>> --- /dev/null
>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>> @@ -0,0 +1,14 @@
>>>>>> +/* { dg-do compile } */
>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>
>>>>> No this is wrong.
>>>>>
>>>>> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>>>>>
>>>>>> +
>>>>>> +void
>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>> +{
>>>>>> +  len = len & ~31;
>>>>>> +  for (int i = 0; i < len; i++)
>>>>>> +    p[i] = p[i] / x[i];
>>>>>> +}
>>>>>> +
>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>> new file mode 100644
>>>>>> index 0000000..8e15d0a
>>>>>> --- /dev/null
>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>> @@ -0,0 +1,14 @@
>>>>>> +/* { dg-do compile } */
>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>
>>>>> And likewise.
>>>>>
>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>> +
>>>>>> +void
>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>> +{
>>>>>> +  len = len & ~31;
>>>>>> +  for (int i = 0; i < len; i++)
>>>>>> +    p[i] = p[i] / x[i];
>>>>>> +}
>>>>>> +
>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>>>>>
>>>>>
>>>>> regards
>>>>> Ramana

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations
  2016-06-07  8:37           ` Ramana Radhakrishnan
@ 2016-06-10  8:56             ` Prathamesh Kulkarni
  0 siblings, 0 replies; 13+ messages in thread
From: Prathamesh Kulkarni @ 2016-06-10  8:56 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Ramana Radhakrishnan, gcc Patches, Charles Baylis

[-- Attachment #1: Type: text/plain, Size: 10243 bytes --]

On 7 June 2016 at 14:07, Ramana Radhakrishnan <ramana.gcc@googlemail.com> wrote:
>>> Please find the updated patch attached.
>>> It passes testsuite for arm-none-linux-gnueabi, arm-none-linux-gnueabihf and
>>> arm-none-eabi.
>>> However the test-case added in the patch (neon-vect-div-1.c) fails to
>>> get vectorized at -O2
>>> for armeb-none-linux-gnueabihf.
>>> Charles suggested me to try with -O3, which worked.
>>> It appears the test-case fails to get vectorized with
>>> -fvect-cost-model=cheap (which is default enabled at -O2)
>>> and passes for -fno-vect-cost-model / -fvect-cost-model=dynamic
>>>
>>> I can't figure out why it fails -fvect-cost-model=cheap.
>>> From the vect dump (attached):
>>> neon-vect-div-1.c:12:3: note: Setting misalignment to -1.
>>> neon-vect-div-1.c:12:3: note: not vectorized: unsupported unaligned load.*_9
>> Hi,
>> I think I have some idea why the test-case fails attached with patch
>> fail to get vectorized on armeb with -O2.
>>
>> Issue with big endian vectorizer:
>> The patch does not cause regressions on big endian vectorizer but
>> fails to vectorize the test-cases attached with the patch, while they
>> get vectorized on
>> litttle-endian.
>> Fails with armeb with the following message in dump:
>> note: not vectorized: unsupported unaligned load.*_9
>>
>> The behavior of big and little endian vectorizer seems to be different
>> in arm_builtin_support_vector_misalignment() which overrides the hook
>> targetm.vectorize.support_vector_misalignment().
>>
>> targetm.vectorize.support_vector_misalignment is called by
>> vect_supportable_dr_alignment () which in turn is called
>> by verify_data_refs_alignment ().
>>
>> Execution upto following condition is common between arm and armeb
>> in vect_supportable_dr_alignment():
>>
>> if ((TYPE_USER_ALIGN (type) && !is_packed)
>>       || targetm.vectorize.support_vector_misalignment (mode, type,
>>                                             DR_MISALIGNMENT (dr), is_packed))
>>         /* Can't software pipeline the loads, but can at least do them.  */
>>         return dr_unaligned_supported;
>>
>> For little endian case:
>> arm_builtin_support_vector_misalignment() is called with
>> V2SF mode and misalignment == -1, and the following condition
>> becomes true:
>> /* If the misalignment is unknown, we should be able to handle the access
>>          so long as it is not to a member of a packed data structure.  */
>>   if (misalignment == -1)
>>     return true;
>>
>> Since the hook returned true we enter the condition above in
>> vect_supportable_dr_alignment() and return dr_unaligned_supported;
>>
>> For big-endian:
>> arm_builtin_support_vector_misalignment() is called with V2SF mode.
>> The following condition that gates the entire function body fails:
>>  if (TARGET_NEON && !BYTES_BIG_ENDIAN && unaligned_access)
>> and the default hook gets called with V2SF mode and the default hook
>> returns false because
>> movmisalign_optab does not exist for V2SF mode.
>>
>> So the condition above in vect_supportable_dr_alignment() fails
>> and we come here:
>>  /* Unsupported.  */
>> return dr_unaligned_unsupported;
>>
>> And hence we get the unaligned load not supported message in the dump
>> for armeb in verify_data_ref_alignment ():
>>
>> static bool
>> verify_data_ref_alignment (data_reference_p dr)
>> {
>>   enum dr_alignment_support supportable_dr_alignment
>>     = vect_supportable_dr_alignment (dr, false);
>>   if (!supportable_dr_alignment)
>>     {
>>       if (dump_enabled_p ())
>>         {
>>           if (DR_IS_READ (dr))
>>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>                              "not vectorized: unsupported unaligned load.");
>>
>> With -O3, the test-cases vectorize for armeb, because loop peeling for alignment
>> is turned on.
>> The above behavior is also reproducible with test-case which is
>> irrelevant to the patch.
>> for instance, we get the same unsupported unaligned load for following
>> test-case (replaced / with +)
>>
>> void
>> foo (int len, float * __restrict p, float *__restrict x)
>> {
>>   len = len & ~31;
>>   for (int i = 0; i < len; i++)
>>     p[i] = p[i] + x[i];
>> }
>> Is the patch OK to commit after bootstrap+test ?
>
>
> Thanks for the analysis - all the test needs is an additional marker
> to skip it on armeb (is there a helper for misaligned loads from the
> vectorizer ? ) - Ah probably vect_hw_misalign is sufficient for your
> usecase and you want to appropriately fix it for little endian arm
> with neon support enabled.
Hi,
I added dg-require-effective-target vect_hw_misalign to the tests in the patch,
and modified vect_hw_misalign to return true for little-endian arm configs
with neon support enabled. The patch makes the tests unsupported for
armeb.
Does it look correct ?

Unfortunately the change to vect_hw_misalign breaks gcc.dg/vect/vect-align-1.c,
which was passing before:
XPASS: gcc.dg/vect/vect-align-1.c scan-tree-dump-times vect "Alignment
of access forced using versioning" 1
FAIL: gcc.dg/vect/vect-align-1.c scan-tree-dump-times vect
"Vectorizing an unaligned access" 1

I am not sure how to fix this and would be grateful for suggestions.

Thanks,
Prathamesh
>
> From the patch.
>
>>>+   && flag_unsafe_math_optimizations && flag_reciprocal_math"
>
> Why do we need flag_unsafe_math_optimizations && flag_reciprocal_math
> ? flag_unsafe_math_optimizations should be sufficient since it enables
> flag_reciprocal_math - the reason for flag_unsafe_math_optimizations
> is to prevent loss of precision and the fact that on neon denormalized
> numbers are flushed to zero.
>
> Ok with that change and a quick test with vect_hw_misalign added to
> your testcase.
>
> Sorry about the delay in reviewing.
>
> Ramana
>
>
>>
>> Thanks,
>> Prathamesh
>>>
>>> Thanks,
>>> Prathamesh
>>>>
>>>> Thanks,
>>>> Ramana
>>>>
>>>>>
>>>>> Thanks,
>>>>> Prathamesh
>>>>>>
>>>>>>
>>>>>> moving on to the patches.
>>>>>>
>>>>>>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>>>>>>> index 654d9d5..28c2e2a 100644
>>>>>>> --- a/gcc/config/arm/neon.md
>>>>>>> +++ b/gcc/config/arm/neon.md
>>>>>>> @@ -548,6 +548,32 @@
>>>>>>>                      (const_string "neon_mul_<V_elem_ch><q>")))]
>>>>>>>  )
>>>>>>>
>>>>>>
>>>>>> Please add a comment here.
>>>>>>
>>>>>>> +(define_expand "div<mode>3"
>>>>>>> +  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
>>>>>>> +        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
>>>>>>> +               (match_operand:VCVTF 2 "s_register_operand" "w")))]
>>>>>>
>>>>>> I want to double check that this doesn't collide with Alan's patches for FP16 especially if he reuses the VCVTF iterator for all the vcvt f16 cases.
>>>>>>
>>>>>>> +  "TARGET_NEON && flag_unsafe_math_optimizations && flag_reciprocal_math"
>>>>>>> +  {
>>>>>>> +    rtx rec = gen_reg_rtx (<MODE>mode);
>>>>>>> +    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
>>>>>>> +
>>>>>>> +    /* Reciprocal estimate */
>>>>>>> +    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
>>>>>>> +
>>>>>>> +    /* Perform 2 iterations of Newton-Raphson method for better accuracy */
>>>>>>> +    for (int i = 0; i < 2; i++)
>>>>>>> +      {
>>>>>>> +     emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
>>>>>>> +     emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
>>>>>>> +      }
>>>>>>> +
>>>>>>> +    /* We now have reciprocal in rec, perform operands[0] = operands[1] * rec */
>>>>>>> +    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
>>>>>>> +    DONE;
>>>>>>> +  }
>>>>>>> +)
>>>>>>> +
>>>>>>> +
>>>>>>>  (define_insn "mul<mode>3add<mode>_neon"
>>>>>>>    [(set (match_operand:VDQW 0 "s_register_operand" "=w")
>>>>>>>          (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
>>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-1.c b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>>> new file mode 100644
>>>>>>> index 0000000..e562ef3
>>>>>>> --- /dev/null
>>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-1.c
>>>>>>> @@ -0,0 +1,14 @@
>>>>>>> +/* { dg-do compile } */
>>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>>
>>>>>> No this is wrong.
>>>>>>
>>>>>> What is armv8 specific about this test ? This is just like another test that is for Neon. vrecpe / vrecps are not instructions that were introduced in the v8 version of the architecture. They've existed in the base Neon instruction set. The code generation above in the patterns will be enabled when TARGET_NEON is true which can happen when -mfpu=neon -mfloat-abi={softfp/hard} is true.
>>>>>>
>>>>>>> +
>>>>>>> +void
>>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>>> +{
>>>>>>> +  len = len & ~31;
>>>>>>> +  for (int i = 0; i < len; i++)
>>>>>>> +    p[i] = p[i] / x[i];
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>>>>>> diff --git a/gcc/testsuite/gcc.target/arm/vect-div-2.c b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>>> new file mode 100644
>>>>>>> index 0000000..8e15d0a
>>>>>>> --- /dev/null
>>>>>>> +++ b/gcc/testsuite/gcc.target/arm/vect-div-2.c
>>>>>>> @@ -0,0 +1,14 @@
>>>>>>> +/* { dg-do compile } */
>>>>>>> +/* { dg-require-effective-target arm_v8_neon_ok } */
>>>>>>
>>>>>> And likewise.
>>>>>>
>>>>>>> +/* { dg-options "-O2 -funsafe-math-optimizations -fno-reciprocal-math -ftree-vectorize -fdump-tree-vect-all" } */
>>>>>>> +/* { dg-add-options arm_v8_neon } */
>>>>>>> +
>>>>>>> +void
>>>>>>> +foo (int len, float * __restrict p, float *__restrict x)
>>>>>>> +{
>>>>>>> +  len = len & ~31;
>>>>>>> +  for (int i = 0; i < len; i++)
>>>>>>> +    p[i] = p[i] / x[i];
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
>>>>>>
>>>>>>
>>>>>> regards
>>>>>> Ramana
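
For reference, a minimal sketch of the sequence the div<mode>3 expander
emits, written with NEON intrinsics from arm_neon.h rather than RTL; the
function name is illustrative and this is not code from the patch:

    #include <arm_neon.h>

    /* Divide four floats by four floats via multiply-by-reciprocal:
       vrecpe gives a low-precision estimate of 1/den, and each
       vrecps + vmul pair performs one Newton-Raphson refinement step.  */
    float32x4_t
    div_by_recip (float32x4_t num, float32x4_t den)
    {
      float32x4_t rec = vrecpeq_f32 (den);
      rec = vmulq_f32 (rec, vrecpsq_f32 (rec, den));  /* step 1 */
      rec = vmulq_f32 (rec, vrecpsq_f32 (rec, den));  /* step 2 */
      return vmulq_f32 (num, rec);
    }

With -mfpu=neon this maps directly onto vrecpe.f32, vrecps.f32 and
vmul.f32.
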

[-- Attachment #2: tcwg-319-1_2.diff --]
[-- Type: text/plain, Size: 3751 bytes --]

diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e2fdfbb..fbd4bb6 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -578,6 +578,38 @@
                     (const_string "neon_mul_<V_elem_ch><q>")))]
 )
 
+/* Perform division using multiply-by-reciprocal.
+   The reciprocal is calculated using the Newton-Raphson method.
+   Enabled with -funsafe-math-optimizations -freciprocal-math
+   and disabled for -Os since it increases code size.  */
+
+(define_expand "div<mode>3"
+  [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+        (div:VCVTF (match_operand:VCVTF 1 "s_register_operand" "w")
+		  (match_operand:VCVTF 2 "s_register_operand" "w")))]
+  "TARGET_NEON && !optimize_size
+   && flag_unsafe_math_optimizations && flag_reciprocal_math"
+  {
+    rtx rec = gen_reg_rtx (<MODE>mode);
+    rtx vrecps_temp = gen_reg_rtx (<MODE>mode);
+
+    /* Reciprocal estimate.  */
+    emit_insn (gen_neon_vrecpe<mode> (rec, operands[2]));
+
+    /* Perform 2 iterations of the Newton-Raphson method.  */
+    for (int i = 0; i < 2; i++)
+      {
+	emit_insn (gen_neon_vrecps<mode> (vrecps_temp, rec, operands[2]));
+	emit_insn (gen_mul<mode>3 (rec, rec, vrecps_temp));
+      }
+
+    /* We now have the reciprocal in rec; compute operands[0] = operands[1] * rec.  */
+    emit_insn (gen_mul<mode>3 (operands[0], operands[1], rec));
+    DONE;
+  }
+)
+
+
 (define_insn "mul<mode>3add<mode>_neon"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
         (plus:VDQW (mult:VDQW (match_operand:VDQW 2 "s_register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c b/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c
new file mode 100644
index 0000000..dc507a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vect-div-1.c
@@ -0,0 +1,16 @@
+/* Test pattern div<mode>3.  */
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target vect_hw_misalign } */
+/* { dg-options "-O2 -ftree-vectorize -funsafe-math-optimizations -fdump-tree-vect-all" } */
+/* { dg-add-options arm_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c b/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c
new file mode 100644
index 0000000..9654232
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vect-div-2.c
@@ -0,0 +1,17 @@
+/* Test pattern div<mode>3.  */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target vect_hw_misalign } */
+/* { dg-options "-O3 -funsafe-math-optimizations -fno-reciprocal-math -fdump-tree-vect-all" } */
+/* { dg-add-options arm_neon } */
+
+void
+foo (int len, float * __restrict p, float *__restrict x)
+{
+  len = len & ~31;
+  for (int i = 0; i < len; i++)
+    p[i] = p[i] / x[i];
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 0b991a5..48feb99 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4812,7 +4812,9 @@ proc check_effective_target_vect_hw_misalign { } {
         set et_vect_hw_misalign_saved 0
        if { [istarget i?86-*-*] || [istarget x86_64-*-*]
             || ([istarget powerpc*-*-*] && [check_p8vector_hw_available])
-	    || [istarget aarch64*-*-*] } {
+	    || [istarget aarch64*-*-*]
+            || ([istarget arm-*-*]
+		&& [is-effective-target arm_neon_ok]) } {
           set et_vect_hw_misalign_saved 1
        }
     }

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2016-06-10  8:56 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-29 10:33 [ARM] implement division using vrecpe/vrecps with -funsafe-math-optimizations Prathamesh Kulkarni
2015-07-29 11:10 ` Kyrill Tkachov
2015-07-30 22:19   ` Prathamesh Kulkarni
2015-07-31  9:38 ` Ramana Radhakrishnan
2015-07-31 12:36   ` Charles Baylis
2016-01-17  9:06   ` Prathamesh Kulkarni
2016-02-04 11:01     ` Ramana Radhakrishnan
2016-02-05 13:10       ` Prathamesh Kulkarni
2016-05-23  9:30         ` Prathamesh Kulkarni
2016-05-30 10:55           ` Prathamesh Kulkarni
2016-06-07  8:26             ` Prathamesh Kulkarni
2016-06-07  8:37           ` Ramana Radhakrishnan
2016-06-10  8:56             ` Prathamesh Kulkarni
