public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Implement x86 reduc_plus_scal_v{16,32,64}qi (PR tree-optimization/91201)
@ 2019-07-31  7:20 Jakub Jelinek
  2019-07-31  8:57 ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Jakub Jelinek @ 2019-07-31  7:20 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

Hi!

As mentioned in the PR, we can use psadbw to shorten the final reductions to
scalar for 8-bit elements.  E.g. for -mavx2 the difference is:
-	vmovdqa	%xmm1, %xmm0
-	vextracti128	$0x1, %ymm1, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$8, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$4, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$2, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
-	vpsrldq	$1, %xmm0, %xmm1
-	vpaddb	%xmm1, %xmm0, %xmm0
+	vextracti128	$0x1, %ymm1, %xmm0
+	vpaddb	%xmm1, %xmm0, %xmm1
+	vpsrldq	$8, %xmm1, %xmm0
+	vpaddb	%xmm0, %xmm1, %xmm1
+	vpxor	%xmm0, %xmm0, %xmm0
+	vpsadbw	%xmm0, %xmm1, %xmm0
 	vpextrb	$0, %xmm0, %eax
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-07-31  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/91201
	* config/i386/sse.md (reduc_plus_scal_v16qi): New expander.
	(REDUC_PLUS_MODE): Add V32QImode for TARGET_AVX and V64QImode for
	TARGET_AVX512F.
	(reduc_plus_scal_<mode>): Improve formatting by introducing
	a temporary.

	* gcc.target/i386/sse2-pr91201.c: New test.
	* gcc.target/i386/avx2-pr91201.c: New test.
	* gcc.target/i386/avx512bw-pr91201.c: New test.

--- gcc/config/i386/sse.md.jj	2019-07-30 12:19:45.999490854 +0200
+++ gcc/config/i386/sse.md	2019-07-30 12:19:55.379352735 +0200
@@ -2728,9 +2728,30 @@ (define_expand "reduc_plus_scal_<mode>"
   DONE;
 })
 
+(define_expand "reduc_plus_scal_v16qi"
+ [(plus:V16QI
+    (match_operand:QI 0 "register_operand")
+    (match_operand:V16QI 1 "register_operand"))]
+ "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (V1TImode);
+  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, operands[1]),
+				 GEN_INT (64)));
+  rtx tmp2 = gen_reg_rtx (V16QImode);
+  emit_insn (gen_addv16qi3 (tmp2, operands[1], gen_lowpart (V16QImode, tmp)));
+  rtx tmp3 = gen_reg_rtx (V16QImode);
+  emit_move_insn (tmp3, CONST0_RTX (V16QImode));
+  rtx tmp4 = gen_reg_rtx (V2DImode);
+  emit_insn (gen_sse2_psadbw (tmp4, tmp2, tmp3));
+  tmp4 = gen_lowpart (V16QImode, tmp4);
+  emit_insn (gen_vec_extractv16qiqi (operands[0], tmp4, const0_rtx));
+  DONE;
+})
+
 (define_mode_iterator REDUC_PLUS_MODE
  [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
-  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
+  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+  (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
 
 (define_expand "reduc_plus_scal_<mode>"
  [(plus:REDUC_PLUS_MODE
@@ -2741,8 +2762,8 @@ (define_expand "reduc_plus_scal_<mode>"
   rtx tmp = gen_reg_rtx (<ssehalfvecmode>mode);
   emit_insn (gen_vec_extract_hi_<mode> (tmp, operands[1]));
   rtx tmp2 = gen_reg_rtx (<ssehalfvecmode>mode);
-  emit_insn (gen_add<ssehalfvecmodelower>3
-    (tmp2, tmp, gen_lowpart (<ssehalfvecmode>mode, operands[1])));
+  rtx tmp3 = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
+  emit_insn (gen_add<ssehalfvecmodelower>3 (tmp2, tmp, tmp3));
   emit_insn (gen_reduc_plus_scal_<ssehalfvecmodelower> (operands[0], tmp2));
   DONE;
 })
--- gcc/testsuite/gcc.target/i386/sse2-pr91201.c.jj	2019-07-30 12:23:48.930913778 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-pr91201.c	2019-07-30 12:23:45.518964018 +0200
@@ -0,0 +1,18 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse2 -mno-sse3" } */
+/* { dg-final { scan-assembler "\tpsadbw\t" } } */
+
+unsigned char bytes[1024];
+
+unsigned char
+sum (void)
+{
+  unsigned char r = 0;
+  unsigned char *p = (unsigned char *) bytes;
+  int n;
+
+  for (n = 0; n < sizeof (bytes); ++n)
+    r += p[n];
+  return r;
+}
--- gcc/testsuite/gcc.target/i386/avx2-pr91201.c.jj	2019-07-30 12:24:05.199674228 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-pr91201.c	2019-07-30 12:24:34.544242142 +0200
@@ -0,0 +1,6 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mno-avx512f" } */
+/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
+
+#include "sse2-pr91201.c"
--- gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c.jj	2019-07-30 12:24:50.079013395 +0200
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c	2019-07-30 12:25:10.685709971 +0200
@@ -0,0 +1,6 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
+
+#include "sse2-pr91201.c"

	Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Implement x86 reduc_plus_scal_v{16,32,64}qi (PR tree-optimization/91201)
  2019-07-31  7:20 [PATCH] Implement x86 reduc_plus_scal_v{16,32,64}qi (PR tree-optimization/91201) Jakub Jelinek
@ 2019-07-31  8:57 ` Uros Bizjak
  2019-07-31  9:51   ` [PATCH] Implement x86 reduc_plus_scal_v8qi " Jakub Jelinek
  0 siblings, 1 reply; 4+ messages in thread
From: Uros Bizjak @ 2019-07-31  8:57 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Wed, Jul 31, 2019 at 9:10 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> As mentioned in the PR, we can use psadbw to shorten the final reductions to
> scalar for 8-bit elements.  E.g. for -mavx2 the difference is:
> -       vmovdqa %xmm1, %xmm0
> -       vextracti128    $0x1, %ymm1, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $8, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $4, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $2, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> -       vpsrldq $1, %xmm0, %xmm1
> -       vpaddb  %xmm1, %xmm0, %xmm0
> +       vextracti128    $0x1, %ymm1, %xmm0
> +       vpaddb  %xmm1, %xmm0, %xmm1
> +       vpsrldq $8, %xmm1, %xmm0
> +       vpaddb  %xmm0, %xmm1, %xmm1
> +       vpxor   %xmm0, %xmm0, %xmm0
> +       vpsadbw %xmm0, %xmm1, %xmm0
>         vpextrb $0, %xmm0, %eax
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-07-31  Jakub Jelinek  <jakub@redhat.com>
>
>         PR tree-optimization/91201
>         * config/i386/sse.md (reduc_plus_scal_v16qi): New expander.
>         (REDUC_PLUS_MODE): Add V32QImode for TARGET_AVX and V64QImode for
>         TARGET_AVX512F.
>         (reduc_plus_scal_<mode>): Improve formatting by introducing
>         a temporary.
>
>         * gcc.target/i386/sse2-pr91201.c: New test.
>         * gcc.target/i386/avx2-pr91201.c: New test.
>         * gcc.target/i386/avx512bw-pr91201.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj   2019-07-30 12:19:45.999490854 +0200
> +++ gcc/config/i386/sse.md      2019-07-30 12:19:55.379352735 +0200
> @@ -2728,9 +2728,30 @@ (define_expand "reduc_plus_scal_<mode>"
>    DONE;
>  })
>
> +(define_expand "reduc_plus_scal_v16qi"
> + [(plus:V16QI
> +    (match_operand:QI 0 "register_operand")
> +    (match_operand:V16QI 1 "register_operand"))]
> + "TARGET_SSE2"
> +{
> +  rtx tmp = gen_reg_rtx (V1TImode);
> +  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, operands[1]),
> +                                GEN_INT (64)));
> +  rtx tmp2 = gen_reg_rtx (V16QImode);
> +  emit_insn (gen_addv16qi3 (tmp2, operands[1], gen_lowpart (V16QImode, tmp)));
> +  rtx tmp3 = gen_reg_rtx (V16QImode);
> +  emit_move_insn (tmp3, CONST0_RTX (V16QImode));
> +  rtx tmp4 = gen_reg_rtx (V2DImode);
> +  emit_insn (gen_sse2_psadbw (tmp4, tmp2, tmp3));
> +  tmp4 = gen_lowpart (V16QImode, tmp4);
> +  emit_insn (gen_vec_extractv16qiqi (operands[0], tmp4, const0_rtx));
> +  DONE;
> +})
> +
>  (define_mode_iterator REDUC_PLUS_MODE
>   [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
> -  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")])
> +  (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
> +  (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
>
>  (define_expand "reduc_plus_scal_<mode>"
>   [(plus:REDUC_PLUS_MODE
> @@ -2741,8 +2762,8 @@ (define_expand "reduc_plus_scal_<mode>"
>    rtx tmp = gen_reg_rtx (<ssehalfvecmode>mode);
>    emit_insn (gen_vec_extract_hi_<mode> (tmp, operands[1]));
>    rtx tmp2 = gen_reg_rtx (<ssehalfvecmode>mode);
> -  emit_insn (gen_add<ssehalfvecmodelower>3
> -    (tmp2, tmp, gen_lowpart (<ssehalfvecmode>mode, operands[1])));
> +  rtx tmp3 = gen_lowpart (<ssehalfvecmode>mode, operands[1]);
> +  emit_insn (gen_add<ssehalfvecmodelower>3 (tmp2, tmp, tmp3));
>    emit_insn (gen_reduc_plus_scal_<ssehalfvecmodelower> (operands[0], tmp2));
>    DONE;
>  })
> --- gcc/testsuite/gcc.target/i386/sse2-pr91201.c.jj     2019-07-30 12:23:48.930913778 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-pr91201.c        2019-07-30 12:23:45.518964018 +0200
> @@ -0,0 +1,18 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -msse2 -mno-sse3" } */
> +/* { dg-final { scan-assembler "\tpsadbw\t" } } */
> +
> +unsigned char bytes[1024];
> +
> +unsigned char
> +sum (void)
> +{
> +  unsigned char r = 0;
> +  unsigned char *p = (unsigned char *) bytes;
> +  int n;
> +
> +  for (n = 0; n < sizeof (bytes); ++n)
> +    r += p[n];
> +  return r;
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-pr91201.c.jj     2019-07-30 12:24:05.199674228 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-pr91201.c        2019-07-30 12:24:34.544242142 +0200
> @@ -0,0 +1,6 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -mno-avx512f" } */
> +/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
> +
> +#include "sse2-pr91201.c"
> --- gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c.jj 2019-07-30 12:24:50.079013395 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-pr91201.c    2019-07-30 12:25:10.685709971 +0200
> @@ -0,0 +1,6 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler "\tvpsadbw\t" } } */
> +
> +#include "sse2-pr91201.c"
>
>         Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH] Implement x86 reduc_plus_scal_v8qi (PR tree-optimization/91201)
  2019-07-31  8:57 ` Uros Bizjak
@ 2019-07-31  9:51   ` Jakub Jelinek
  2019-07-31 13:55     ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Jakub Jelinek @ 2019-07-31  9:51 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

Hi!

On Wed, Jul 31, 2019 at 10:51:22AM +0200, Uros Bizjak wrote:
> OK.

Thanks.  This follow-up implements the same for mmx with sse for V8QImode,
the testcase shows that it is useful too.  The difference is quite large:

-	movq	$0, -72(%rsp)
-	movl	$bytes, %eax
 	movq	bytes(%rip), %xmm0
+	movl	$bytes, %eax
+	pxor	%xmm2, %xmm2
 	.p2align 4,,10
 	.p2align 3
 .L2:
 	movdqa	%xmm0, %xmm1
 	movq	8(%rax), %xmm0
-	movq	-72(%rsp), %xmm2
 	addq	$8, %rax
 	paddb	%xmm0, %xmm1
 	paddb	%xmm0, %xmm2
 	movq	%xmm1, -8(%rax)
-	movq	%xmm2, -72(%rsp)
 	cmpq	$bytes+1016, %rax
 	jne	.L2
-	movq	-72(%rsp), %rcx
-	movzbl	-72(%rsp), %eax
-	movzbl	%ch, %edx
-	addl	%edx, %eax
-	movq	%rcx, %rdx
-	shrq	$16, %rdx
-	addl	%edx, %eax
-	movq	%rcx, %rdx
-	shrq	$24, %rdx
-	addl	%edx, %eax
-	movq	%rcx, %rdx
-	shrq	$32, %rdx
-	addl	%edx, %eax
-	movq	%rcx, %rdx
-	shrq	$40, %rdx
-	addl	%edx, %eax
-	movq	%rcx, %rdx
-	shrq	$48, %rdx
-	addl	%eax, %edx
-	movq	%rcx, %rax
-	shrq	$56, %rax
-	addl	%edx, %eax
+	pxor	%xmm0, %xmm0
+	movdqa	%xmm2, %xmm3
+	psadbw	%xmm0, %xmm3
+	movq	%xmm3, %rax

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-07-31  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/91201
	* config/i386/mmx.md (reduc_plus_scal_v8qi): New expander.

	* gcc.target/i386/sse2-pr91201-2.c: New test.

--- gcc/config/i386/mmx.md.jj	2019-07-20 08:35:05.720255567 +0200
+++ gcc/config/i386/mmx.md	2019-07-31 08:43:23.054776025 +0200
@@ -1897,6 +1897,21 @@ (define_insn "mmx_psadbw"
    (set_attr "type" "mmxshft,sseiadd,sseiadd")
    (set_attr "mode" "DI,TI,TI")])
 
+(define_expand "reduc_plus_scal_v8qi"
+ [(plus:V8QI
+    (match_operand:QI 0 "register_operand")
+    (match_operand:V8QI 1 "register_operand"))]
+ "TARGET_MMX_WITH_SSE"
+{
+  rtx tmp = gen_reg_rtx (V8QImode);
+  emit_move_insn (tmp, CONST0_RTX (V8QImode));
+  rtx tmp2 = gen_reg_rtx (V1DImode);
+  emit_insn (gen_mmx_psadbw (tmp2, operands[1], tmp));
+  tmp2 = gen_lowpart (V8QImode, tmp2);
+  emit_insn (gen_vec_extractv8qiqi (operands[0], tmp2, const0_rtx));
+  DONE;
+})
+
 (define_insn_and_split "mmx_pmovmskb"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
 	(unspec:SI [(match_operand:V8QI 1 "register_operand" "y,x")]
--- gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c.jj	2019-07-31 08:45:19.553086849 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c	2019-07-31 08:46:52.556738334 +0200
@@ -0,0 +1,21 @@
+/* PR tree-optimization/91201 */
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O3 -msse2 -mno-sse3" } */
+/* { dg-final { scan-assembler "\tpsadbw\t" } } */
+
+unsigned char bytes[1024];
+
+unsigned char
+sum (void)
+{
+  unsigned char r = 0;
+  unsigned char *p = (unsigned char *) bytes;
+  int n;
+
+  for (n = 8; n < sizeof (bytes); ++n)
+    {
+      p[n - 8] += p[n];
+      r += p[n];
+    }
+  return r;
+}


	Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Implement x86 reduc_plus_scal_v8qi (PR tree-optimization/91201)
  2019-07-31  9:51   ` [PATCH] Implement x86 reduc_plus_scal_v8qi " Jakub Jelinek
@ 2019-07-31 13:55     ` Uros Bizjak
  0 siblings, 0 replies; 4+ messages in thread
From: Uros Bizjak @ 2019-07-31 13:55 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Wed, Jul 31, 2019 at 11:30 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> On Wed, Jul 31, 2019 at 10:51:22AM +0200, Uros Bizjak wrote:
> > OK.
>
> Thanks.  This follow-up implements the same for mmx with sse for V8QImode,
> the testcase shows that it is useful too.  The difference is quite large:
>
> -       movq    $0, -72(%rsp)
> -       movl    $bytes, %eax
>         movq    bytes(%rip), %xmm0
> +       movl    $bytes, %eax
> +       pxor    %xmm2, %xmm2
>         .p2align 4,,10
>         .p2align 3
>  .L2:
>         movdqa  %xmm0, %xmm1
>         movq    8(%rax), %xmm0
> -       movq    -72(%rsp), %xmm2
>         addq    $8, %rax
>         paddb   %xmm0, %xmm1
>         paddb   %xmm0, %xmm2
>         movq    %xmm1, -8(%rax)
> -       movq    %xmm2, -72(%rsp)
>         cmpq    $bytes+1016, %rax
>         jne     .L2
> -       movq    -72(%rsp), %rcx
> -       movzbl  -72(%rsp), %eax
> -       movzbl  %ch, %edx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $16, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $24, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $32, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $40, %rdx
> -       addl    %edx, %eax
> -       movq    %rcx, %rdx
> -       shrq    $48, %rdx
> -       addl    %eax, %edx
> -       movq    %rcx, %rax
> -       shrq    $56, %rax
> -       addl    %edx, %eax
> +       pxor    %xmm0, %xmm0
> +       movdqa  %xmm2, %xmm3
> +       psadbw  %xmm0, %xmm3
> +       movq    %xmm3, %rax

Excellent!

IIRC, there are quite a few (integer) named patterns that can be
implemented using TARGET_MMX_WITH_SSE. I'm not at my keyboard right
now, but it looks like horizontal adds can be implemented using the
same approach. I'm glad that TARGET_MMX_WITH_SSE opens such noticeable
optimization opportunities.

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-07-31  Jakub Jelinek  <jakub@redhat.com>
>
>         PR tree-optimization/91201
>         * config/i386/mmx.md (reduc_plus_scal_v8qi): New expander.
>
>         * gcc.target/i386/sse2-pr91201-2.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/mmx.md.jj   2019-07-20 08:35:05.720255567 +0200
> +++ gcc/config/i386/mmx.md      2019-07-31 08:43:23.054776025 +0200
> @@ -1897,6 +1897,21 @@ (define_insn "mmx_psadbw"
>     (set_attr "type" "mmxshft,sseiadd,sseiadd")
>     (set_attr "mode" "DI,TI,TI")])
>
> +(define_expand "reduc_plus_scal_v8qi"
> + [(plus:V8QI
> +    (match_operand:QI 0 "register_operand")
> +    (match_operand:V8QI 1 "register_operand"))]
> + "TARGET_MMX_WITH_SSE"
> +{
> +  rtx tmp = gen_reg_rtx (V8QImode);
> +  emit_move_insn (tmp, CONST0_RTX (V8QImode));
> +  rtx tmp2 = gen_reg_rtx (V1DImode);
> +  emit_insn (gen_mmx_psadbw (tmp2, operands[1], tmp));
> +  tmp2 = gen_lowpart (V8QImode, tmp2);
> +  emit_insn (gen_vec_extractv8qiqi (operands[0], tmp2, const0_rtx));
> +  DONE;
> +})
> +
>  (define_insn_and_split "mmx_pmovmskb"
>    [(set (match_operand:SI 0 "register_operand" "=r,r")
>         (unspec:SI [(match_operand:V8QI 1 "register_operand" "y,x")]
> --- gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c.jj   2019-07-31 08:45:19.553086849 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-pr91201-2.c      2019-07-31 08:46:52.556738334 +0200
> @@ -0,0 +1,21 @@
> +/* PR tree-optimization/91201 */
> +/* { dg-do compile { target lp64 } } */
> +/* { dg-options "-O3 -msse2 -mno-sse3" } */
> +/* { dg-final { scan-assembler "\tpsadbw\t" } } */
> +
> +unsigned char bytes[1024];
> +
> +unsigned char
> +sum (void)
> +{
> +  unsigned char r = 0;
> +  unsigned char *p = (unsigned char *) bytes;
> +  int n;
> +
> +  for (n = 8; n < sizeof (bytes); ++n)
> +    {
> +      p[n - 8] += p[n];
> +      r += p[n];
> +    }
> +  return r;
> +}
>
>
>         Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2019-07-31 13:33 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-31  7:20 [PATCH] Implement x86 reduc_plus_scal_v{16,32,64}qi (PR tree-optimization/91201) Jakub Jelinek
2019-07-31  8:57 ` Uros Bizjak
2019-07-31  9:51   ` [PATCH] Implement x86 reduc_plus_scal_v8qi " Jakub Jelinek
2019-07-31 13:55     ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).