public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][ARM] Cleanup DImode shifts
@ 2019-07-22 16:19 Wilco Dijkstra
  2019-07-22 16:35 ` Ramana Radhakrishnan
  2019-07-31 16:39 ` Wilco Dijkstra
  0 siblings, 2 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2019-07-22 16:19 UTC (permalink / raw)
  To: GCC Patches; +Cc: nd

Like the logical operations, expand all shifts early rather than only
sometimes.  The Neon shift expansions are never emitted (not even with
-fneon-for-64bits), so they are not useful.  So all the late expansions
and Neon shift patterns can be removed, and shifts are more optimized
as a result.  Since some extend patterns use Neon DImode shifts, remove
the Neon extend variants and related splits.

A simple example (relying on [1]) generates the same efficient code after
this patch with -mfpu=neon and -mfpu=vfp (previously just the fact of
having Neon enabled resulted inefficient code for no reason).

unsigned long long f(unsigned long long x, unsigned long long y)
{ return x & (y >> 33); }

Before:
        strd    r4, r5, [sp, #-8]!
        lsr     r4, r3, #1
        mov     r5, #0
        and     r1, r1, r5
        and     r0, r0, r4
        ldrd    r4, r5, [sp]
        add     sp, sp, #8
        bx      lr

After:
        and     r0, r0, r3, lsr #1
        mov     r1, #0
        bx      lr

Bootstrap and regress OK on arm-none-linux-gnueabihf --with-cpu=cortex-a57

[1] https://gcc.gnu.org/ml/gcc-patches/2019-07/msg01301.html

ChangeLog:
2019-07-19  Wilco Dijkstra  <wdijkstr@arm.com>

	* config/arm/iterators.md (qhs_extenddi_cstr): Update.
	(qhs_extenddi_cstr): Likewise.
	* config/arm/arm.md (ashldi3): Always expand early.
	(ashlsi3): Likewise.
	(ashrsi3): Likewise.
	(zero_extend<mode>di2): Remove Neon variants.
	(extend<mode>di2): Likewise.
	* config/arm/neon.md (ashldi3_neon_noclobber): Remove.
	(signed_shift_di3_neon): Likewise.
	(unsigned_shift_di3_neon): Likewise.
	(ashrdi3_neon_imm_noclobber): Likewise.
	(lshrdi3_neon_imm_noclobber): Likewise.
	(<shift>di3_neon): Likewise.
	(split extend): Remove DI extend split patterns.

    testsuite/
	* gcc.target/arm/neon-extend-1.c: Remove test.
	* gcc.target/arm/neon-extend-2.c: Remove test.
---

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 0dba97a4ebeed0c2133936ca662f1c9e86ffc6ba..10ed70dac4384354c0a2453c5e51a29108c6c062 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3601,44 +3601,14 @@ (define_insn "*satsi_<SAT:code>_shift"
 (define_expand "ashldi3"
   [(set (match_operand:DI            0 "s_register_operand")
         (ashift:DI (match_operand:DI 1 "s_register_operand")
-                   (match_operand:SI 2 "general_operand")))]
+                   (match_operand:SI 2 "reg_or_int_operand")))]
   "TARGET_32BIT"
   "
-  if (TARGET_NEON)
-    {
-      /* Delay the decision whether to use NEON or core-regs until
-	 register allocation.  */
-      emit_insn (gen_ashldi3_neon (operands[0], operands[1], operands[2]));
-      DONE;
-    }
-  else
-    {
-      /* Only the NEON case can handle in-memory shift counts.  */
-      if (!reg_or_int_operand (operands[2], SImode))
-        operands[2] = force_reg (SImode, operands[2]);
-    }
-
-  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
-    ; /* No special preparation statements; expand pattern as above.  */
-  else
-    {
-      rtx scratch1, scratch2;
-
-      /* Ideally we should use iwmmxt here if we could know that operands[1]
-         ends up already living in an iwmmxt register. Otherwise it's
-         cheaper to have the alternate code being generated than moving
-         values to iwmmxt regs and back.  */
-
-      /* Expand operation using core-registers.
-	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
-      scratch1 = gen_reg_rtx (SImode);
-      scratch2 = gen_reg_rtx (SImode);
-      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
-				     operands[2], scratch1, scratch2);
-      DONE;
-    }
-  "
-)
+  arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				 operands[2], gen_reg_rtx (SImode),
+				 gen_reg_rtx (SImode));
+  DONE;
+")
 
 (define_expand "ashlsi3"
   [(set (match_operand:SI            0 "s_register_operand")
@@ -3661,35 +3631,11 @@ (define_expand "ashrdi3"
                      (match_operand:SI 2 "reg_or_int_operand")))]
   "TARGET_32BIT"
   "
-  if (TARGET_NEON)
-    {
-      /* Delay the decision whether to use NEON or core-regs until
-	 register allocation.  */
-      emit_insn (gen_ashrdi3_neon (operands[0], operands[1], operands[2]));
-      DONE;
-    }
-
-  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
-    ; /* No special preparation statements; expand pattern as above.  */
-  else
-    {
-      rtx scratch1, scratch2;
-
-      /* Ideally we should use iwmmxt here if we could know that operands[1]
-         ends up already living in an iwmmxt register. Otherwise it's
-         cheaper to have the alternate code being generated than moving
-         values to iwmmxt regs and back.  */
-
-      /* Expand operation using core-registers.
-	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
-      scratch1 = gen_reg_rtx (SImode);
-      scratch2 = gen_reg_rtx (SImode);
-      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
-				     operands[2], scratch1, scratch2);
-      DONE;
-    }
-  "
-)
+  arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+				 operands[2], gen_reg_rtx (SImode),
+				 gen_reg_rtx (SImode));
+  DONE;
+")
 
 (define_expand "ashrsi3"
   [(set (match_operand:SI              0 "s_register_operand")
@@ -3709,35 +3655,11 @@ (define_expand "lshrdi3"
                      (match_operand:SI 2 "reg_or_int_operand")))]
   "TARGET_32BIT"
   "
-  if (TARGET_NEON)
-    {
-      /* Delay the decision whether to use NEON or core-regs until
-	 register allocation.  */
-      emit_insn (gen_lshrdi3_neon (operands[0], operands[1], operands[2]));
-      DONE;
-    }
-
-  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
-    ; /* No special preparation statements; expand pattern as above.  */
-  else
-    {
-      rtx scratch1, scratch2;
-
-      /* Ideally we should use iwmmxt here if we could know that operands[1]
-         ends up already living in an iwmmxt register. Otherwise it's
-         cheaper to have the alternate code being generated than moving
-         values to iwmmxt regs and back.  */
-
-      /* Expand operation using core-registers.
-	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
-      scratch1 = gen_reg_rtx (SImode);
-      scratch2 = gen_reg_rtx (SImode);
-      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
-				     operands[2], scratch1, scratch2);
-      DONE;
-    }
-  "
-)
+  arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+				 operands[2], gen_reg_rtx (SImode),
+				 gen_reg_rtx (SImode));
+  DONE;
+")
 
 (define_expand "lshrsi3"
   [(set (match_operand:SI              0 "s_register_operand")
@@ -4762,30 +4684,30 @@ (define_expand "truncdfhf2"
 ;; Zero and sign extension instructions.
 
 (define_insn "zero_extend<mode>di2"
-  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,w")
+  [(set (match_operand:DI 0 "s_register_operand" "=r,?r")
         (zero_extend:DI (match_operand:QHSI 1 "<qhs_zextenddi_op>"
 					    "<qhs_zextenddi_cstr>")))]
   "TARGET_32BIT <qhs_zextenddi_cond>"
   "#"
-  [(set_attr "length" "8,4,8,8")
-   (set_attr "arch" "neon_for_64bits,*,*,avoid_neon_for_64bits")
+  [(set_attr "length" "4,8")
+   (set_attr "arch" "*,*")
    (set_attr "ce_count" "2")
    (set_attr "predicable" "yes")
-   (set_attr "type" "multiple,mov_reg,multiple,multiple")]
+   (set_attr "type" "mov_reg,multiple")]
 )
 
 (define_insn "extend<mode>di2"
-  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,?r,w")
+  [(set (match_operand:DI 0 "s_register_operand" "=r,?r,?r")
         (sign_extend:DI (match_operand:QHSI 1 "<qhs_extenddi_op>"
 					    "<qhs_extenddi_cstr>")))]
   "TARGET_32BIT <qhs_sextenddi_cond>"
   "#"
-  [(set_attr "length" "8,4,8,8,8")
+  [(set_attr "length" "4,8,8")
    (set_attr "ce_count" "2")
    (set_attr "shift" "1")
    (set_attr "predicable" "yes")
-   (set_attr "arch" "neon_for_64bits,*,a,t,avoid_neon_for_64bits")
-   (set_attr "type" "multiple,mov_reg,multiple,multiple,multiple")]
+   (set_attr "arch" "*,a,t")
+   (set_attr "type" "mov_reg,multiple,multiple")]
 )
 
 ;; Splits for all extensions to DImode
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index eca16636ade577d84dc62523d066e72f79dc38f6..fa6f0c0529d5364b1e1df705cb1029868578e38c 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -741,8 +741,8 @@ (define_mode_attr qhs_zextenddi_op [(SI "s_register_operand")
 (define_mode_attr qhs_extenddi_op [(SI "s_register_operand")
 				   (HI "nonimmediate_operand")
 				   (QI "arm_reg_or_extendqisi_mem_op")])
-(define_mode_attr qhs_extenddi_cstr [(SI "r,0,r,r,r") (HI "r,0,rm,rm,r") (QI "r,0,rUq,rm,r")])
-(define_mode_attr qhs_zextenddi_cstr [(SI "r,0,r,r") (HI "r,0,rm,r") (QI "r,0,rm,r")])
+(define_mode_attr qhs_extenddi_cstr [(SI "0,r,r") (HI "0,rm,rm") (QI "0,rUq,rm")])
+(define_mode_attr qhs_zextenddi_cstr [(SI "0,r") (HI "0,rm") (QI "0,rm")])
 
 ;; Mode attributes used for fixed-point support.
 (define_mode_attr qaddsub_suf [(V4UQQ "8") (V2UHQ "16") (UQQ "8") (UHQ "16")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index ef73c77abeeaa02947c70ffa435f7bedc431e3be..757f2c0f5377148c770e061849424aed924a7d7a 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -1135,173 +1135,6 @@ (define_insn "neon_load_count"
   [(set_attr "type" "neon_load1_1reg,neon_from_gp")]
 )
 
-(define_insn "ashldi3_neon_noclobber"
-  [(set (match_operand:DI 0 "s_register_operand"	    "=w,w")
-	(ashift:DI (match_operand:DI 1 "s_register_operand" " w,w")
-		   (match_operand:DI 2 "reg_or_int_operand" " i,w")))]
-  "TARGET_NEON && reload_completed
-   && (!CONST_INT_P (operands[2])
-       || (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))"
-  "@
-   vshl.u64\t%P0, %P1, %2
-   vshl.u64\t%P0, %P1, %P2"
-  [(set_attr "type" "neon_shift_imm, neon_shift_reg")]
-)
-
-(define_insn_and_split "ashldi3_neon"
-  [(set (match_operand:DI 0 "s_register_operand"	    "= w, w, &r, r, &r, ?w,?w")
-	(ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r, 0w, w")
-		   (match_operand:SI 2 "general_operand"    "rUm, i,  r, i,  i,rUm, i")))
-   (clobber (match_scratch:SI 3				    "= X, X, &r, X,  X,  X, X"))
-   (clobber (match_scratch:SI 4				    "= X, X, &r, X,  X,  X, X"))
-   (clobber (match_scratch:DI 5				    "=&w, X,  X, X,  X, &w, X"))
-   (clobber (reg:CC_C CC_REGNUM))]
-  "TARGET_NEON"
-  "#"
-  "TARGET_NEON && reload_completed"
-  [(const_int 0)]
-  "
-  {
-    if (IS_VFP_REGNUM (REGNO (operands[0])))
-      {
-        if (CONST_INT_P (operands[2]))
-	  {
-	    if (INTVAL (operands[2]) < 1)
-	      {
-	        emit_insn (gen_movdi (operands[0], operands[1]));
-		DONE;
-	      }
-	    else if (INTVAL (operands[2]) > 63)
-	      operands[2] = gen_rtx_CONST_INT (VOIDmode, 63);
-	  }
-	else
-	  {
-	    emit_insn (gen_neon_load_count (operands[5], operands[2]));
-	    operands[2] = operands[5];
-	  }
-
-	/* Ditch the unnecessary clobbers.  */
-	emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1],
-					       operands[2]));
-      }
-    else
-      {
-	/* The shift expanders support either full overlap or no overlap.  */
-	gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
-		    || REGNO (operands[0]) == REGNO (operands[1]));
-
-	arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
-				       operands[2], operands[3], operands[4]);
-      }
-    DONE;
-  }"
-  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
-   (set_attr "opt" "*,*,speed,speed,speed,*,*")
-   (set_attr "type" "multiple")]
-)
-
-; The shift amount needs to be negated for right-shifts
-(define_insn "signed_shift_di3_neon"
-  [(set (match_operand:DI 0 "s_register_operand"	     "=w")
-	(unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
-		    (match_operand:DI 2 "s_register_operand" " w")]
-		   UNSPEC_ASHIFT_SIGNED))]
-  "TARGET_NEON && reload_completed"
-  "vshl.s64\t%P0, %P1, %P2"
-  [(set_attr "type" "neon_shift_reg")]
-)
-
-; The shift amount needs to be negated for right-shifts
-(define_insn "unsigned_shift_di3_neon"
-  [(set (match_operand:DI 0 "s_register_operand"	     "=w")
-	(unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
-		    (match_operand:DI 2 "s_register_operand" " w")]
-		   UNSPEC_ASHIFT_UNSIGNED))]
-  "TARGET_NEON && reload_completed"
-  "vshl.u64\t%P0, %P1, %P2"
-  [(set_attr "type" "neon_shift_reg")]
-)
-
-(define_insn "ashrdi3_neon_imm_noclobber"
-  [(set (match_operand:DI 0 "s_register_operand"	      "=w")
-	(ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
-		     (match_operand:DI 2 "const_int_operand"  " i")))]
-  "TARGET_NEON && reload_completed
-   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
-  "vshr.s64\t%P0, %P1, %2"
-  [(set_attr "type" "neon_shift_imm")]
-)
-
-(define_insn "lshrdi3_neon_imm_noclobber"
-  [(set (match_operand:DI 0 "s_register_operand"	      "=w")
-	(lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
-		     (match_operand:DI 2 "const_int_operand"  " i")))]
-  "TARGET_NEON && reload_completed
-   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
-  "vshr.u64\t%P0, %P1, %2"
-  [(set_attr "type" "neon_shift_imm")]
-)
-
-;; ashrdi3_neon
-;; lshrdi3_neon
-(define_insn_and_split "<shift>di3_neon"
-  [(set (match_operand:DI 0 "s_register_operand"	     "= w, w, &r, r, &r,?w,?w")
-	(RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r,0w, w")
-		    (match_operand:SI 2 "reg_or_int_operand" "  r, i,  r, i,  i, r, i")))
-   (clobber (match_scratch:SI 3				     "=2r, X, &r, X,  X,2r, X"))
-   (clobber (match_scratch:SI 4				     "= X, X, &r, X,  X, X, X"))
-   (clobber (match_scratch:DI 5				     "=&w, X,  X, X, X,&w, X"))
-   (clobber (reg:CC CC_REGNUM))]
-  "TARGET_NEON"
-  "#"
-  "TARGET_NEON && reload_completed"
-  [(const_int 0)]
-  "
-  {
-    if (IS_VFP_REGNUM (REGNO (operands[0])))
-      {
-	if (CONST_INT_P (operands[2]))
-	  {
-	    if (INTVAL (operands[2]) < 1)
-	      {
-	        emit_insn (gen_movdi (operands[0], operands[1]));
-		DONE;
-	      }
-	    else if (INTVAL (operands[2]) > 64)
-	      operands[2] = gen_rtx_CONST_INT (VOIDmode, 64);
-
-	    /* Ditch the unnecessary clobbers.  */
-	    emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0],
-							  operands[1],
-							  operands[2]));
-	  }
-	else 
-	  {
-	    /* We must use a negative left-shift.  */
-	    emit_insn (gen_negsi2 (operands[3], operands[2]));
-	    emit_insn (gen_neon_load_count (operands[5], operands[3]));
-	    emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], operands[1],
-						       operands[5]));
-	  }
-      }
-    else
-      {
-	/* The shift expanders support either full overlap or no overlap.  */
-	gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
-		    || REGNO (operands[0]) == REGNO (operands[1]));
-
-	/* This clobbers CC (ASHIFTRT by register only).  */
-	arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
-				       operands[2], operands[3], operands[4]);
-      }
-
-    DONE;
-  }"
-  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
-   (set_attr "opt" "*,*,speed,speed,speed,*,*")
-   (set_attr "type" "multiple")]
-)
-
 ;; Widening operations
 
 (define_expand "widen_ssum<mode>3"
@@ -6792,65 +6625,3 @@ (define_insn "neon_vabd<mode>_3"
  "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
  [(set_attr "type" "neon_fp_abd_s<q>")]
 )
-
-;; Copy from core-to-neon regs, then extend, not vice-versa
-
-(define_split
-  [(set (match_operand:DI 0 "s_register_operand" "")
-	(sign_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
-  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
-  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
-   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 32)))]
-  {
-    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
-  })
-
-(define_split
-  [(set (match_operand:DI 0 "s_register_operand" "")
-	(sign_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
-  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
-  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
-   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 48)))]
-  {
-    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
-  })
-
-(define_split
-  [(set (match_operand:DI 0 "s_register_operand" "")
-	(sign_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
-  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
-  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
-   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 56)))]
-  {
-    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
-  })
-
-(define_split
-  [(set (match_operand:DI 0 "s_register_operand" "")
-	(zero_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
-  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
-  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
-   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 32)))]
-  {
-    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
-  })
-
-(define_split
-  [(set (match_operand:DI 0 "s_register_operand" "")
-	(zero_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
-  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
-  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
-   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 48)))]
-  {
-    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
-  })
-
-(define_split
-  [(set (match_operand:DI 0 "s_register_operand" "")
-	(zero_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
-  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
-  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
-   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 56)))]
-  {
-    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
-  })
diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-1.c b/gcc/testsuite/gcc.target/arm/neon-extend-1.c
deleted file mode 100644
index cfe83ce1bde260cdd99ae47963dde6676627cca3..0000000000000000000000000000000000000000
--- a/gcc/testsuite/gcc.target/arm/neon-extend-1.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/* { dg-require-effective-target arm_neon_hw } */
-/* { dg-options "-O2" } */
-/* { dg-add-options arm_neon } */
-
-void
-f (unsigned int a)
-{
-  unsigned long long b = a;
-  asm volatile ("@ extended to %0" : : "w" (b));
-}
-
-/* { dg-final { scan-assembler "vdup.32" } } */
-/* { dg-final { scan-assembler "vshr.u64" } } */
diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-2.c b/gcc/testsuite/gcc.target/arm/neon-extend-2.c
deleted file mode 100644
index 1c5a17e427859146f53828ec4a02b439ab0580ff..0000000000000000000000000000000000000000
--- a/gcc/testsuite/gcc.target/arm/neon-extend-2.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/* { dg-require-effective-target arm_neon_hw } */
-/* { dg-options "-O2" } */
-/* { dg-add-options arm_neon } */
-
-void
-f (int a)
-{
-  long long b = a;
-  asm volatile ("@ extended to %0" : : "w" (b));
-}
-
-/* { dg-final { scan-assembler "vdup.32" } } */
-/* { dg-final { scan-assembler "vshr.s64" } } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][ARM] Cleanup DImode shifts
  2019-07-22 16:19 [PATCH][ARM] Cleanup DImode shifts Wilco Dijkstra
@ 2019-07-22 16:35 ` Ramana Radhakrishnan
  2019-07-22 17:20   ` Wilco Dijkstra
  2019-07-31 16:39 ` Wilco Dijkstra
  1 sibling, 1 reply; 6+ messages in thread
From: Ramana Radhakrishnan @ 2019-07-22 16:35 UTC (permalink / raw)
  To: Wilco Dijkstra, GCC Patches; +Cc: nd

On 22/07/2019 17:16, Wilco Dijkstra wrote:
> Like the logical operations, expand all shifts early rather than only
> sometimes.  The Neon shift expansions are never emitted (not even with
> -fneon-for-64bits), so they are not useful.  So all the late expansions
> and Neon shift patterns can be removed, and shifts are more optimized
> as a result.  Since some extend patterns use Neon DImode shifts, remove
> the Neon extend variants and related splits.
> 
> A simple example (relying on [1]) generates the same efficient code after
> this patch with -mfpu=neon and -mfpu=vfp (previously just the fact of
> having Neon enabled resulted inefficient code for no reason).
> 
> unsigned long long f(unsigned long long x, unsigned long long y)
> { return x & (y >> 33); }
> 
> Before:
>          strd    r4, r5, [sp, #-8]!
>          lsr     r4, r3, #1
>          mov     r5, #0
>          and     r1, r1, r5
>          and     r0, r0, r4
>          ldrd    r4, r5, [sp]
>          add     sp, sp, #8
>          bx      lr
> 
> After:
>          and     r0, r0, r3, lsr #1
>          mov     r1, #0
>          bx      lr
> 
> Bootstrap and regress OK on arm-none-linux-gnueabihf --with-cpu=cortex-a57
> 
> [1] https://gcc.gnu.org/ml/gcc-patches/2019-07/msg01301.html

Thanks for this patch set - What I'm missing in this is any analysis as 
to what's the impact on code generation for neon intrinsics that use 
uint64_t ? Especially things like v<add/sub/...>_u64 ?


Ramana


> 
> ChangeLog:
> 2019-07-19  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* config/arm/iterators.md (qhs_extenddi_cstr): Update.
> 	(qhs_extenddi_cstr): Likewise.
> 	* config/arm/arm.md (ashldi3): Always expand early.
> 	(ashlsi3): Likewise.
> 	(ashrsi3): Likewise.
> 	(zero_extend<mode>di2): Remove Neon variants.
> 	(extend<mode>di2): Likewise.
> 	* config/arm/neon.md (ashldi3_neon_noclobber): Remove.
> 	(signed_shift_di3_neon): Likewise.
> 	(unsigned_shift_di3_neon): Likewise.
> 	(ashrdi3_neon_imm_noclobber): Likewise.
> 	(lshrdi3_neon_imm_noclobber): Likewise.
> 	(<shift>di3_neon): Likewise.
> 	(split extend): Remove DI extend split patterns.
> 
>      testsuite/
> 	* gcc.target/arm/neon-extend-1.c: Remove test.
> 	* gcc.target/arm/neon-extend-2.c: Remove test.
> ---
> 
> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
> index 0dba97a4ebeed0c2133936ca662f1c9e86ffc6ba..10ed70dac4384354c0a2453c5e51a29108c6c062 100644
> --- a/gcc/config/arm/arm.md
> +++ b/gcc/config/arm/arm.md
> @@ -3601,44 +3601,14 @@ (define_insn "*satsi_<SAT:code>_shift"
>   (define_expand "ashldi3"
>     [(set (match_operand:DI            0 "s_register_operand")
>           (ashift:DI (match_operand:DI 1 "s_register_operand")
> -                   (match_operand:SI 2 "general_operand")))]
> +                   (match_operand:SI 2 "reg_or_int_operand")))]
>     "TARGET_32BIT"
>     "
> -  if (TARGET_NEON)
> -    {
> -      /* Delay the decision whether to use NEON or core-regs until
> -	 register allocation.  */
> -      emit_insn (gen_ashldi3_neon (operands[0], operands[1], operands[2]));
> -      DONE;
> -    }
> -  else
> -    {
> -      /* Only the NEON case can handle in-memory shift counts.  */
> -      if (!reg_or_int_operand (operands[2], SImode))
> -        operands[2] = force_reg (SImode, operands[2]);
> -    }
> -
> -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
> -    ; /* No special preparation statements; expand pattern as above.  */
> -  else
> -    {
> -      rtx scratch1, scratch2;
> -
> -      /* Ideally we should use iwmmxt here if we could know that operands[1]
> -         ends up already living in an iwmmxt register. Otherwise it's
> -         cheaper to have the alternate code being generated than moving
> -         values to iwmmxt regs and back.  */
> -
> -      /* Expand operation using core-registers.
> -	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
> -      scratch1 = gen_reg_rtx (SImode);
> -      scratch2 = gen_reg_rtx (SImode);
> -      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
> -				     operands[2], scratch1, scratch2);
> -      DONE;
> -    }
> -  "
> -)
> +  arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
> +				 operands[2], gen_reg_rtx (SImode),
> +				 gen_reg_rtx (SImode));
> +  DONE;
> +")
>   
>   (define_expand "ashlsi3"
>     [(set (match_operand:SI            0 "s_register_operand")
> @@ -3661,35 +3631,11 @@ (define_expand "ashrdi3"
>                        (match_operand:SI 2 "reg_or_int_operand")))]
>     "TARGET_32BIT"
>     "
> -  if (TARGET_NEON)
> -    {
> -      /* Delay the decision whether to use NEON or core-regs until
> -	 register allocation.  */
> -      emit_insn (gen_ashrdi3_neon (operands[0], operands[1], operands[2]));
> -      DONE;
> -    }
> -
> -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
> -    ; /* No special preparation statements; expand pattern as above.  */
> -  else
> -    {
> -      rtx scratch1, scratch2;
> -
> -      /* Ideally we should use iwmmxt here if we could know that operands[1]
> -         ends up already living in an iwmmxt register. Otherwise it's
> -         cheaper to have the alternate code being generated than moving
> -         values to iwmmxt regs and back.  */
> -
> -      /* Expand operation using core-registers.
> -	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
> -      scratch1 = gen_reg_rtx (SImode);
> -      scratch2 = gen_reg_rtx (SImode);
> -      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
> -				     operands[2], scratch1, scratch2);
> -      DONE;
> -    }
> -  "
> -)
> +  arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
> +				 operands[2], gen_reg_rtx (SImode),
> +				 gen_reg_rtx (SImode));
> +  DONE;
> +")
>   
>   (define_expand "ashrsi3"
>     [(set (match_operand:SI              0 "s_register_operand")
> @@ -3709,35 +3655,11 @@ (define_expand "lshrdi3"
>                        (match_operand:SI 2 "reg_or_int_operand")))]
>     "TARGET_32BIT"
>     "
> -  if (TARGET_NEON)
> -    {
> -      /* Delay the decision whether to use NEON or core-regs until
> -	 register allocation.  */
> -      emit_insn (gen_lshrdi3_neon (operands[0], operands[1], operands[2]));
> -      DONE;
> -    }
> -
> -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
> -    ; /* No special preparation statements; expand pattern as above.  */
> -  else
> -    {
> -      rtx scratch1, scratch2;
> -
> -      /* Ideally we should use iwmmxt here if we could know that operands[1]
> -         ends up already living in an iwmmxt register. Otherwise it's
> -         cheaper to have the alternate code being generated than moving
> -         values to iwmmxt regs and back.  */
> -
> -      /* Expand operation using core-registers.
> -	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
> -      scratch1 = gen_reg_rtx (SImode);
> -      scratch2 = gen_reg_rtx (SImode);
> -      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
> -				     operands[2], scratch1, scratch2);
> -      DONE;
> -    }
> -  "
> -)
> +  arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
> +				 operands[2], gen_reg_rtx (SImode),
> +				 gen_reg_rtx (SImode));
> +  DONE;
> +")
>   
>   (define_expand "lshrsi3"
>     [(set (match_operand:SI              0 "s_register_operand")
> @@ -4762,30 +4684,30 @@ (define_expand "truncdfhf2"
>   ;; Zero and sign extension instructions.
>   
>   (define_insn "zero_extend<mode>di2"
> -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,w")
> +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r")
>           (zero_extend:DI (match_operand:QHSI 1 "<qhs_zextenddi_op>"
>   					    "<qhs_zextenddi_cstr>")))]
>     "TARGET_32BIT <qhs_zextenddi_cond>"
>     "#"
> -  [(set_attr "length" "8,4,8,8")
> -   (set_attr "arch" "neon_for_64bits,*,*,avoid_neon_for_64bits")
> +  [(set_attr "length" "4,8")
> +   (set_attr "arch" "*,*")
>      (set_attr "ce_count" "2")
>      (set_attr "predicable" "yes")
> -   (set_attr "type" "multiple,mov_reg,multiple,multiple")]
> +   (set_attr "type" "mov_reg,multiple")]
>   )
>   
>   (define_insn "extend<mode>di2"
> -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,?r,w")
> +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r,?r")
>           (sign_extend:DI (match_operand:QHSI 1 "<qhs_extenddi_op>"
>   					    "<qhs_extenddi_cstr>")))]
>     "TARGET_32BIT <qhs_sextenddi_cond>"
>     "#"
> -  [(set_attr "length" "8,4,8,8,8")
> +  [(set_attr "length" "4,8,8")
>      (set_attr "ce_count" "2")
>      (set_attr "shift" "1")
>      (set_attr "predicable" "yes")
> -   (set_attr "arch" "neon_for_64bits,*,a,t,avoid_neon_for_64bits")
> -   (set_attr "type" "multiple,mov_reg,multiple,multiple,multiple")]
> +   (set_attr "arch" "*,a,t")
> +   (set_attr "type" "mov_reg,multiple,multiple")]
>   )
>   
>   ;; Splits for all extensions to DImode
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index eca16636ade577d84dc62523d066e72f79dc38f6..fa6f0c0529d5364b1e1df705cb1029868578e38c 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -741,8 +741,8 @@ (define_mode_attr qhs_zextenddi_op [(SI "s_register_operand")
>   (define_mode_attr qhs_extenddi_op [(SI "s_register_operand")
>   				   (HI "nonimmediate_operand")
>   				   (QI "arm_reg_or_extendqisi_mem_op")])
> -(define_mode_attr qhs_extenddi_cstr [(SI "r,0,r,r,r") (HI "r,0,rm,rm,r") (QI "r,0,rUq,rm,r")])
> -(define_mode_attr qhs_zextenddi_cstr [(SI "r,0,r,r") (HI "r,0,rm,r") (QI "r,0,rm,r")])
> +(define_mode_attr qhs_extenddi_cstr [(SI "0,r,r") (HI "0,rm,rm") (QI "0,rUq,rm")])
> +(define_mode_attr qhs_zextenddi_cstr [(SI "0,r") (HI "0,rm") (QI "0,rm")])
>   
>   ;; Mode attributes used for fixed-point support.
>   (define_mode_attr qaddsub_suf [(V4UQQ "8") (V2UHQ "16") (UQQ "8") (UHQ "16")
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index ef73c77abeeaa02947c70ffa435f7bedc431e3be..757f2c0f5377148c770e061849424aed924a7d7a 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -1135,173 +1135,6 @@ (define_insn "neon_load_count"
>     [(set_attr "type" "neon_load1_1reg,neon_from_gp")]
>   )
>   
> -(define_insn "ashldi3_neon_noclobber"
> -  [(set (match_operand:DI 0 "s_register_operand"	    "=w,w")
> -	(ashift:DI (match_operand:DI 1 "s_register_operand" " w,w")
> -		   (match_operand:DI 2 "reg_or_int_operand" " i,w")))]
> -  "TARGET_NEON && reload_completed
> -   && (!CONST_INT_P (operands[2])
> -       || (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))"
> -  "@
> -   vshl.u64\t%P0, %P1, %2
> -   vshl.u64\t%P0, %P1, %P2"
> -  [(set_attr "type" "neon_shift_imm, neon_shift_reg")]
> -)
> -
> -(define_insn_and_split "ashldi3_neon"
> -  [(set (match_operand:DI 0 "s_register_operand"	    "= w, w, &r, r, &r, ?w,?w")
> -	(ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r, 0w, w")
> -		   (match_operand:SI 2 "general_operand"    "rUm, i,  r, i,  i,rUm, i")))
> -   (clobber (match_scratch:SI 3				    "= X, X, &r, X,  X,  X, X"))
> -   (clobber (match_scratch:SI 4				    "= X, X, &r, X,  X,  X, X"))
> -   (clobber (match_scratch:DI 5				    "=&w, X,  X, X,  X, &w, X"))
> -   (clobber (reg:CC_C CC_REGNUM))]
> -  "TARGET_NEON"
> -  "#"
> -  "TARGET_NEON && reload_completed"
> -  [(const_int 0)]
> -  "
> -  {
> -    if (IS_VFP_REGNUM (REGNO (operands[0])))
> -      {
> -        if (CONST_INT_P (operands[2]))
> -	  {
> -	    if (INTVAL (operands[2]) < 1)
> -	      {
> -	        emit_insn (gen_movdi (operands[0], operands[1]));
> -		DONE;
> -	      }
> -	    else if (INTVAL (operands[2]) > 63)
> -	      operands[2] = gen_rtx_CONST_INT (VOIDmode, 63);
> -	  }
> -	else
> -	  {
> -	    emit_insn (gen_neon_load_count (operands[5], operands[2]));
> -	    operands[2] = operands[5];
> -	  }
> -
> -	/* Ditch the unnecessary clobbers.  */
> -	emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1],
> -					       operands[2]));
> -      }
> -    else
> -      {
> -	/* The shift expanders support either full overlap or no overlap.  */
> -	gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
> -		    || REGNO (operands[0]) == REGNO (operands[1]));
> -
> -	arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
> -				       operands[2], operands[3], operands[4]);
> -      }
> -    DONE;
> -  }"
> -  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
> -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
> -   (set_attr "type" "multiple")]
> -)
> -
> -; The shift amount needs to be negated for right-shifts
> -(define_insn "signed_shift_di3_neon"
> -  [(set (match_operand:DI 0 "s_register_operand"	     "=w")
> -	(unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
> -		    (match_operand:DI 2 "s_register_operand" " w")]
> -		   UNSPEC_ASHIFT_SIGNED))]
> -  "TARGET_NEON && reload_completed"
> -  "vshl.s64\t%P0, %P1, %P2"
> -  [(set_attr "type" "neon_shift_reg")]
> -)
> -
> -; The shift amount needs to be negated for right-shifts
> -(define_insn "unsigned_shift_di3_neon"
> -  [(set (match_operand:DI 0 "s_register_operand"	     "=w")
> -	(unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
> -		    (match_operand:DI 2 "s_register_operand" " w")]
> -		   UNSPEC_ASHIFT_UNSIGNED))]
> -  "TARGET_NEON && reload_completed"
> -  "vshl.u64\t%P0, %P1, %P2"
> -  [(set_attr "type" "neon_shift_reg")]
> -)
> -
> -(define_insn "ashrdi3_neon_imm_noclobber"
> -  [(set (match_operand:DI 0 "s_register_operand"	      "=w")
> -	(ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
> -		     (match_operand:DI 2 "const_int_operand"  " i")))]
> -  "TARGET_NEON && reload_completed
> -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
> -  "vshr.s64\t%P0, %P1, %2"
> -  [(set_attr "type" "neon_shift_imm")]
> -)
> -
> -(define_insn "lshrdi3_neon_imm_noclobber"
> -  [(set (match_operand:DI 0 "s_register_operand"	      "=w")
> -	(lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
> -		     (match_operand:DI 2 "const_int_operand"  " i")))]
> -  "TARGET_NEON && reload_completed
> -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
> -  "vshr.u64\t%P0, %P1, %2"
> -  [(set_attr "type" "neon_shift_imm")]
> -)
> -
> -;; ashrdi3_neon
> -;; lshrdi3_neon
> -(define_insn_and_split "<shift>di3_neon"
> -  [(set (match_operand:DI 0 "s_register_operand"	     "= w, w, &r, r, &r,?w,?w")
> -	(RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r,0w, w")
> -		    (match_operand:SI 2 "reg_or_int_operand" "  r, i,  r, i,  i, r, i")))
> -   (clobber (match_scratch:SI 3				     "=2r, X, &r, X,  X,2r, X"))
> -   (clobber (match_scratch:SI 4				     "= X, X, &r, X,  X, X, X"))
> -   (clobber (match_scratch:DI 5				     "=&w, X,  X, X, X,&w, X"))
> -   (clobber (reg:CC CC_REGNUM))]
> -  "TARGET_NEON"
> -  "#"
> -  "TARGET_NEON && reload_completed"
> -  [(const_int 0)]
> -  "
> -  {
> -    if (IS_VFP_REGNUM (REGNO (operands[0])))
> -      {
> -	if (CONST_INT_P (operands[2]))
> -	  {
> -	    if (INTVAL (operands[2]) < 1)
> -	      {
> -	        emit_insn (gen_movdi (operands[0], operands[1]));
> -		DONE;
> -	      }
> -	    else if (INTVAL (operands[2]) > 64)
> -	      operands[2] = gen_rtx_CONST_INT (VOIDmode, 64);
> -
> -	    /* Ditch the unnecessary clobbers.  */
> -	    emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0],
> -							  operands[1],
> -							  operands[2]));
> -	  }
> -	else
> -	  {
> -	    /* We must use a negative left-shift.  */
> -	    emit_insn (gen_negsi2 (operands[3], operands[2]));
> -	    emit_insn (gen_neon_load_count (operands[5], operands[3]));
> -	    emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], operands[1],
> -						       operands[5]));
> -	  }
> -      }
> -    else
> -      {
> -	/* The shift expanders support either full overlap or no overlap.  */
> -	gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
> -		    || REGNO (operands[0]) == REGNO (operands[1]));
> -
> -	/* This clobbers CC (ASHIFTRT by register only).  */
> -	arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
> -				       operands[2], operands[3], operands[4]);
> -      }
> -
> -    DONE;
> -  }"
> -  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
> -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
> -   (set_attr "type" "multiple")]
> -)
> -
>   ;; Widening operations
>   
>   (define_expand "widen_ssum<mode>3"
> @@ -6792,65 +6625,3 @@ (define_insn "neon_vabd<mode>_3"
>    "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
>    [(set_attr "type" "neon_fp_abd_s<q>")]
>   )
> -
> -;; Copy from core-to-neon regs, then extend, not vice-versa
> -
> -(define_split
> -  [(set (match_operand:DI 0 "s_register_operand" "")
> -	(sign_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
> -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
> -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
> -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 32)))]
> -  {
> -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
> -  })
> -
> -(define_split
> -  [(set (match_operand:DI 0 "s_register_operand" "")
> -	(sign_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
> -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
> -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
> -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 48)))]
> -  {
> -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
> -  })
> -
> -(define_split
> -  [(set (match_operand:DI 0 "s_register_operand" "")
> -	(sign_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
> -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
> -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
> -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 56)))]
> -  {
> -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
> -  })
> -
> -(define_split
> -  [(set (match_operand:DI 0 "s_register_operand" "")
> -	(zero_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
> -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
> -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
> -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 32)))]
> -  {
> -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
> -  })
> -
> -(define_split
> -  [(set (match_operand:DI 0 "s_register_operand" "")
> -	(zero_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
> -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
> -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
> -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 48)))]
> -  {
> -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
> -  })
> -
> -(define_split
> -  [(set (match_operand:DI 0 "s_register_operand" "")
> -	(zero_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
> -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
> -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
> -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 56)))]
> -  {
> -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
> -  })
> diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-1.c b/gcc/testsuite/gcc.target/arm/neon-extend-1.c
> deleted file mode 100644
> index cfe83ce1bde260cdd99ae47963dde6676627cca3..0000000000000000000000000000000000000000
> --- a/gcc/testsuite/gcc.target/arm/neon-extend-1.c
> +++ /dev/null
> @@ -1,13 +0,0 @@
> -/* { dg-require-effective-target arm_neon_hw } */
> -/* { dg-options "-O2" } */
> -/* { dg-add-options arm_neon } */
> -
> -void
> -f (unsigned int a)
> -{
> -  unsigned long long b = a;
> -  asm volatile ("@ extended to %0" : : "w" (b));
> -}
> -
> -/* { dg-final { scan-assembler "vdup.32" } } */
> -/* { dg-final { scan-assembler "vshr.u64" } } */
> diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-2.c b/gcc/testsuite/gcc.target/arm/neon-extend-2.c
> deleted file mode 100644
> index 1c5a17e427859146f53828ec4a02b439ab0580ff..0000000000000000000000000000000000000000
> --- a/gcc/testsuite/gcc.target/arm/neon-extend-2.c
> +++ /dev/null
> @@ -1,13 +0,0 @@
> -/* { dg-require-effective-target arm_neon_hw } */
> -/* { dg-options "-O2" } */
> -/* { dg-add-options arm_neon } */
> -
> -void
> -f (int a)
> -{
> -  long long b = a;
> -  asm volatile ("@ extended to %0" : : "w" (b));
> -}
> -
> -/* { dg-final { scan-assembler "vdup.32" } } */
> -/* { dg-final { scan-assembler "vshr.s64" } } */
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][ARM] Cleanup DImode shifts
  2019-07-22 16:35 ` Ramana Radhakrishnan
@ 2019-07-22 17:20   ` Wilco Dijkstra
  0 siblings, 0 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2019-07-22 17:20 UTC (permalink / raw)
  To: Ramana Radhakrishnan, GCC Patches; +Cc: nd

Hi Ramana,

> Thanks for this patch set - What I'm missing in this is any analysis as 
> to what's the impact on code generation for neon intrinsics that use 
> uint64_t ? Especially things like v<add/sub/...>_u64 ?

Well things like this continue to work exactly like before:

uint64x1_t f20(uint64x1_t x, uint64x1_t y)
{
  return vshl_u64 (x, y);
}

uint64x1_t f21(uint64x1_t x)
{
  return vshl_n_u64 (x, 10);
}

f20:
	vmov	d16, r0, r1	@ int
	vmov	d17, r2, r3	@ int
	vshl.u64	d16, d16, d17
	vmov	r0, r1, d16	@ int
	bx	lr

f21:
	vmov	d16, r0, r1	@ int
	vshl.i64	d16, d16, #10
	vmov	r0, r1, d16	@ int
	bx	lr

As you can see there is a problem with the uint64x1_t type which for a strange
reason maps to DImode, so avoiding Neon here would avoid lots of moves...

The vadd_u64 variant emits the right code already:

uint64x1_t f22(uint64x1_t x, uint64x1_t y)
{
  return vadd_u64 (x, y);
}

f22:
	adds	r0, r0, r2
	adc	r1, r1, r3
	bx	lr

Wilco

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][ARM] Cleanup DImode shifts
  2019-07-22 16:19 [PATCH][ARM] Cleanup DImode shifts Wilco Dijkstra
  2019-07-22 16:35 ` Ramana Radhakrishnan
@ 2019-07-31 16:39 ` Wilco Dijkstra
  2019-08-19 16:18   ` Wilco Dijkstra
  2019-08-22 13:46   ` Kyrill Tkachov
  1 sibling, 2 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2019-07-31 16:39 UTC (permalink / raw)
  To: GCC Patches; +Cc: nd, Richard Earnshaw, Kyrylo Tkachov

ping
   
 
Like the logical operations, expand all shifts early rather than only
 sometimes.  The Neon shift expansions are never emitted (not even with
 -fneon-for-64bits), so they are not useful.  So all the late expansions
 and Neon shift patterns can be removed, and shifts are more optimized
 as a result.  Since some extend patterns use Neon DImode shifts, remove
 the Neon extend variants and related splits.
 
 A simple example (relying on [1]) generates the same efficient code after
 this patch with -mfpu=neon and -mfpu=vfp (previously just the fact of
 having Neon enabled resulted inefficient code for no reason).
 
 unsigned long long f(unsigned long long x, unsigned long long y)
 { return x & (y >> 33); }
 
 Before:
         strd    r4, r5, [sp, #-8]!
         lsr     r4, r3, #1
         mov     r5, #0
         and     r1, r1, r5
         and     r0, r0, r4
         ldrd    r4, r5, [sp]
         add     sp, sp, #8
         bx      lr
 
 After:
         and     r0, r0, r3, lsr #1
         mov     r1, #0
         bx      lr
 
 Bootstrap and regress OK on arm-none-linux-gnueabihf --with-cpu=cortex-a57
 
 [1] https://gcc.gnu.org/ml/gcc-patches/2019-07/msg01301.html
 
 ChangeLog:
 2019-07-19  Wilco Dijkstra  <wdijkstr@arm.com>
 
         * config/arm/iterators.md (qhs_extenddi_cstr): Update.
         (qhs_extenddi_cstr): Likewise.
         * config/arm/arm.md (ashldi3): Always expand early.
         (ashlsi3): Likewise.
         (ashrsi3): Likewise.
         (zero_extend<mode>di2): Remove Neon variants.
         (extend<mode>di2): Likewise.
         * config/arm/neon.md (ashldi3_neon_noclobber): Remove.
         (signed_shift_di3_neon): Likewise.
         (unsigned_shift_di3_neon): Likewise.
         (ashrdi3_neon_imm_noclobber): Likewise.
         (lshrdi3_neon_imm_noclobber): Likewise.
         (<shift>di3_neon): Likewise.
         (split extend): Remove DI extend split patterns.
 
     testsuite/
         * gcc.target/arm/neon-extend-1.c: Remove test.
         * gcc.target/arm/neon-extend-2.c: Remove test.
 ---
 
 diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
 index 0dba97a4ebeed0c2133936ca662f1c9e86ffc6ba..10ed70dac4384354c0a2453c5e51a29108c6c062 100644
 --- a/gcc/config/arm/arm.md
 +++ b/gcc/config/arm/arm.md
 @@ -3601,44 +3601,14 @@ (define_insn "*satsi_<SAT:code>_shift"
  (define_expand "ashldi3"
    [(set (match_operand:DI            0 "s_register_operand")
          (ashift:DI (match_operand:DI 1 "s_register_operand")
 -                   (match_operand:SI 2 "general_operand")))]
 +                   (match_operand:SI 2 "reg_or_int_operand")))]
    "TARGET_32BIT"
    "
 -  if (TARGET_NEON)
 -    {
 -      /* Delay the decision whether to use NEON or core-regs until
 -        register allocation.  */
 -      emit_insn (gen_ashldi3_neon (operands[0], operands[1], operands[2]));
 -      DONE;
 -    }
 -  else
 -    {
 -      /* Only the NEON case can handle in-memory shift counts.  */
 -      if (!reg_or_int_operand (operands[2], SImode))
 -        operands[2] = force_reg (SImode, operands[2]);
 -    }
 -
 -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
 -    ; /* No special preparation statements; expand pattern as above.  */
 -  else
 -    {
 -      rtx scratch1, scratch2;
 -
 -      /* Ideally we should use iwmmxt here if we could know that operands[1]
 -         ends up already living in an iwmmxt register. Otherwise it's
 -         cheaper to have the alternate code being generated than moving
 -         values to iwmmxt regs and back.  */
 -
 -      /* Expand operation using core-registers.
 -        'FAIL' would achieve the same thing, but this is a bit smarter.  */
 -      scratch1 = gen_reg_rtx (SImode);
 -      scratch2 = gen_reg_rtx (SImode);
 -      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
 -                                    operands[2], scratch1, scratch2);
 -      DONE;
 -    }
 -  "
 -)
 +  arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
 +                                operands[2], gen_reg_rtx (SImode),
 +                                gen_reg_rtx (SImode));
 +  DONE;
 +")
  
  (define_expand "ashlsi3"
    [(set (match_operand:SI            0 "s_register_operand")
 @@ -3661,35 +3631,11 @@ (define_expand "ashrdi3"
                       (match_operand:SI 2 "reg_or_int_operand")))]
    "TARGET_32BIT"
    "
 -  if (TARGET_NEON)
 -    {
 -      /* Delay the decision whether to use NEON or core-regs until
 -        register allocation.  */
 -      emit_insn (gen_ashrdi3_neon (operands[0], operands[1], operands[2]));
 -      DONE;
 -    }
 -
 -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
 -    ; /* No special preparation statements; expand pattern as above.  */
 -  else
 -    {
 -      rtx scratch1, scratch2;
 -
 -      /* Ideally we should use iwmmxt here if we could know that operands[1]
 -         ends up already living in an iwmmxt register. Otherwise it's
 -         cheaper to have the alternate code being generated than moving
 -         values to iwmmxt regs and back.  */
 -
 -      /* Expand operation using core-registers.
 -        'FAIL' would achieve the same thing, but this is a bit smarter.  */
 -      scratch1 = gen_reg_rtx (SImode);
 -      scratch2 = gen_reg_rtx (SImode);
 -      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
 -                                    operands[2], scratch1, scratch2);
 -      DONE;
 -    }
 -  "
 -)
 +  arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
 +                                operands[2], gen_reg_rtx (SImode),
 +                                gen_reg_rtx (SImode));
 +  DONE;
 +")
  
  (define_expand "ashrsi3"
    [(set (match_operand:SI              0 "s_register_operand")
 @@ -3709,35 +3655,11 @@ (define_expand "lshrdi3"
                       (match_operand:SI 2 "reg_or_int_operand")))]
    "TARGET_32BIT"
    "
 -  if (TARGET_NEON)
 -    {
 -      /* Delay the decision whether to use NEON or core-regs until
 -        register allocation.  */
 -      emit_insn (gen_lshrdi3_neon (operands[0], operands[1], operands[2]));
 -      DONE;
 -    }
 -
 -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
 -    ; /* No special preparation statements; expand pattern as above.  */
 -  else
 -    {
 -      rtx scratch1, scratch2;
 -
 -      /* Ideally we should use iwmmxt here if we could know that operands[1]
 -         ends up already living in an iwmmxt register. Otherwise it's
 -         cheaper to have the alternate code being generated than moving
 -         values to iwmmxt regs and back.  */
 -
 -      /* Expand operation using core-registers.
 -        'FAIL' would achieve the same thing, but this is a bit smarter.  */
 -      scratch1 = gen_reg_rtx (SImode);
 -      scratch2 = gen_reg_rtx (SImode);
 -      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
 -                                    operands[2], scratch1, scratch2);
 -      DONE;
 -    }
 -  "
 -)
 +  arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
 +                                operands[2], gen_reg_rtx (SImode),
 +                                gen_reg_rtx (SImode));
 +  DONE;
 +")
  
  (define_expand "lshrsi3"
    [(set (match_operand:SI              0 "s_register_operand")
 @@ -4762,30 +4684,30 @@ (define_expand "truncdfhf2"
  ;; Zero and sign extension instructions.
  
  (define_insn "zero_extend<mode>di2"
 -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,w")
 +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r")
          (zero_extend:DI (match_operand:QHSI 1 "<qhs_zextenddi_op>"
                                              "<qhs_zextenddi_cstr>")))]
    "TARGET_32BIT <qhs_zextenddi_cond>"
    "#"
 -  [(set_attr "length" "8,4,8,8")
 -   (set_attr "arch" "neon_for_64bits,*,*,avoid_neon_for_64bits")
 +  [(set_attr "length" "4,8")
 +   (set_attr "arch" "*,*")
     (set_attr "ce_count" "2")
     (set_attr "predicable" "yes")
 -   (set_attr "type" "multiple,mov_reg,multiple,multiple")]
 +   (set_attr "type" "mov_reg,multiple")]
  )
  
  (define_insn "extend<mode>di2"
 -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,?r,w")
 +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r,?r")
          (sign_extend:DI (match_operand:QHSI 1 "<qhs_extenddi_op>"
                                              "<qhs_extenddi_cstr>")))]
    "TARGET_32BIT <qhs_sextenddi_cond>"
    "#"
 -  [(set_attr "length" "8,4,8,8,8")
 +  [(set_attr "length" "4,8,8")
     (set_attr "ce_count" "2")
     (set_attr "shift" "1")
     (set_attr "predicable" "yes")
 -   (set_attr "arch" "neon_for_64bits,*,a,t,avoid_neon_for_64bits")
 -   (set_attr "type" "multiple,mov_reg,multiple,multiple,multiple")]
 +   (set_attr "arch" "*,a,t")
 +   (set_attr "type" "mov_reg,multiple,multiple")]
  )
  
  ;; Splits for all extensions to DImode
 diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
 index eca16636ade577d84dc62523d066e72f79dc38f6..fa6f0c0529d5364b1e1df705cb1029868578e38c 100644
 --- a/gcc/config/arm/iterators.md
 +++ b/gcc/config/arm/iterators.md
 @@ -741,8 +741,8 @@ (define_mode_attr qhs_zextenddi_op [(SI "s_register_operand")
  (define_mode_attr qhs_extenddi_op [(SI "s_register_operand")
                                     (HI "nonimmediate_operand")
                                     (QI "arm_reg_or_extendqisi_mem_op")])
 -(define_mode_attr qhs_extenddi_cstr [(SI "r,0,r,r,r") (HI "r,0,rm,rm,r") (QI "r,0,rUq,rm,r")])
 -(define_mode_attr qhs_zextenddi_cstr [(SI "r,0,r,r") (HI "r,0,rm,r") (QI "r,0,rm,r")])
 +(define_mode_attr qhs_extenddi_cstr [(SI "0,r,r") (HI "0,rm,rm") (QI "0,rUq,rm")])
 +(define_mode_attr qhs_zextenddi_cstr [(SI "0,r") (HI "0,rm") (QI "0,rm")])
  
  ;; Mode attributes used for fixed-point support.
  (define_mode_attr qaddsub_suf [(V4UQQ "8") (V2UHQ "16") (UQQ "8") (UHQ "16")
 diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
 index ef73c77abeeaa02947c70ffa435f7bedc431e3be..757f2c0f5377148c770e061849424aed924a7d7a 100644
 --- a/gcc/config/arm/neon.md
 +++ b/gcc/config/arm/neon.md
 @@ -1135,173 +1135,6 @@ (define_insn "neon_load_count"
    [(set_attr "type" "neon_load1_1reg,neon_from_gp")]
  )
  
 -(define_insn "ashldi3_neon_noclobber"
 -  [(set (match_operand:DI 0 "s_register_operand"           "=w,w")
 -       (ashift:DI (match_operand:DI 1 "s_register_operand" " w,w")
 -                  (match_operand:DI 2 "reg_or_int_operand" " i,w")))]
 -  "TARGET_NEON && reload_completed
 -   && (!CONST_INT_P (operands[2])
 -       || (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))"
 -  "@
 -   vshl.u64\t%P0, %P1, %2
 -   vshl.u64\t%P0, %P1, %P2"
 -  [(set_attr "type" "neon_shift_imm, neon_shift_reg")]
 -)
 -
 -(define_insn_and_split "ashldi3_neon"
 -  [(set (match_operand:DI 0 "s_register_operand"           "= w, w, &r, r, &r, ?w,?w")
 -       (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r, 0w, w")
 -                  (match_operand:SI 2 "general_operand"    "rUm, i,  r, i,  i,rUm, i")))
 -   (clobber (match_scratch:SI 3                                    "= X, X, &r, X,  X,  X, X"))
 -   (clobber (match_scratch:SI 4                                    "= X, X, &r, X,  X,  X, X"))
 -   (clobber (match_scratch:DI 5                                    "=&w, X,  X, X,  X, &w, X"))
 -   (clobber (reg:CC_C CC_REGNUM))]
 -  "TARGET_NEON"
 -  "#"
 -  "TARGET_NEON && reload_completed"
 -  [(const_int 0)]
 -  "
 -  {
 -    if (IS_VFP_REGNUM (REGNO (operands[0])))
 -      {
 -        if (CONST_INT_P (operands[2]))
 -         {
 -           if (INTVAL (operands[2]) < 1)
 -             {
 -               emit_insn (gen_movdi (operands[0], operands[1]));
 -               DONE;
 -             }
 -           else if (INTVAL (operands[2]) > 63)
 -             operands[2] = gen_rtx_CONST_INT (VOIDmode, 63);
 -         }
 -       else
 -         {
 -           emit_insn (gen_neon_load_count (operands[5], operands[2]));
 -           operands[2] = operands[5];
 -         }
 -
 -       /* Ditch the unnecessary clobbers.  */
 -       emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1],
 -                                              operands[2]));
 -      }
 -    else
 -      {
 -       /* The shift expanders support either full overlap or no overlap.  */
 -       gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
 -                   || REGNO (operands[0]) == REGNO (operands[1]));
 -
 -       arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
 -                                      operands[2], operands[3], operands[4]);
 -      }
 -    DONE;
 -  }"
 -  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
 -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
 -   (set_attr "type" "multiple")]
 -)
 -
 -; The shift amount needs to be negated for right-shifts
 -(define_insn "signed_shift_di3_neon"
 -  [(set (match_operand:DI 0 "s_register_operand"            "=w")
 -       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
 -                   (match_operand:DI 2 "s_register_operand" " w")]
 -                  UNSPEC_ASHIFT_SIGNED))]
 -  "TARGET_NEON && reload_completed"
 -  "vshl.s64\t%P0, %P1, %P2"
 -  [(set_attr "type" "neon_shift_reg")]
 -)
 -
 -; The shift amount needs to be negated for right-shifts
 -(define_insn "unsigned_shift_di3_neon"
 -  [(set (match_operand:DI 0 "s_register_operand"            "=w")
 -       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
 -                   (match_operand:DI 2 "s_register_operand" " w")]
 -                  UNSPEC_ASHIFT_UNSIGNED))]
 -  "TARGET_NEON && reload_completed"
 -  "vshl.u64\t%P0, %P1, %P2"
 -  [(set_attr "type" "neon_shift_reg")]
 -)
 -
 -(define_insn "ashrdi3_neon_imm_noclobber"
 -  [(set (match_operand:DI 0 "s_register_operand"             "=w")
 -       (ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
 -                    (match_operand:DI 2 "const_int_operand"  " i")))]
 -  "TARGET_NEON && reload_completed
 -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
 -  "vshr.s64\t%P0, %P1, %2"
 -  [(set_attr "type" "neon_shift_imm")]
 -)
 -
 -(define_insn "lshrdi3_neon_imm_noclobber"
 -  [(set (match_operand:DI 0 "s_register_operand"             "=w")
 -       (lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
 -                    (match_operand:DI 2 "const_int_operand"  " i")))]
 -  "TARGET_NEON && reload_completed
 -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
 -  "vshr.u64\t%P0, %P1, %2"
 -  [(set_attr "type" "neon_shift_imm")]
 -)
 -
 -;; ashrdi3_neon
 -;; lshrdi3_neon
 -(define_insn_and_split "<shift>di3_neon"
 -  [(set (match_operand:DI 0 "s_register_operand"            "= w, w, &r, r, &r,?w,?w")
 -       (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r,0w, w")
 -                   (match_operand:SI 2 "reg_or_int_operand" "  r, i,  r, i,  i, r, i")))
 -   (clobber (match_scratch:SI 3                                     "=2r, X, &r, X,  X,2r, X"))
 -   (clobber (match_scratch:SI 4                                     "= X, X, &r, X,  X, X, X"))
 -   (clobber (match_scratch:DI 5                                     "=&w, X,  X, X, X,&w, X"))
 -   (clobber (reg:CC CC_REGNUM))]
 -  "TARGET_NEON"
 -  "#"
 -  "TARGET_NEON && reload_completed"
 -  [(const_int 0)]
 -  "
 -  {
 -    if (IS_VFP_REGNUM (REGNO (operands[0])))
 -      {
 -       if (CONST_INT_P (operands[2]))
 -         {
 -           if (INTVAL (operands[2]) < 1)
 -             {
 -               emit_insn (gen_movdi (operands[0], operands[1]));
 -               DONE;
 -             }
 -           else if (INTVAL (operands[2]) > 64)
 -             operands[2] = gen_rtx_CONST_INT (VOIDmode, 64);
 -
 -           /* Ditch the unnecessary clobbers.  */
 -           emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0],
 -                                                         operands[1],
 -                                                         operands[2]));
 -         }
 -       else 
 -         {
 -           /* We must use a negative left-shift.  */
 -           emit_insn (gen_negsi2 (operands[3], operands[2]));
 -           emit_insn (gen_neon_load_count (operands[5], operands[3]));
 -           emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], operands[1],
 -                                                      operands[5]));
 -         }
 -      }
 -    else
 -      {
 -       /* The shift expanders support either full overlap or no overlap.  */
 -       gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
 -                   || REGNO (operands[0]) == REGNO (operands[1]));
 -
 -       /* This clobbers CC (ASHIFTRT by register only).  */
 -       arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
 -                                      operands[2], operands[3], operands[4]);
 -      }
 -
 -    DONE;
 -  }"
 -  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
 -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
 -   (set_attr "type" "multiple")]
 -)
 -
  ;; Widening operations
  
  (define_expand "widen_ssum<mode>3"
 @@ -6792,65 +6625,3 @@ (define_insn "neon_vabd<mode>_3"
   "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
   [(set_attr "type" "neon_fp_abd_s<q>")]
  )
 -
 -;; Copy from core-to-neon regs, then extend, not vice-versa
 -
 -(define_split
 -  [(set (match_operand:DI 0 "s_register_operand" "")
 -       (sign_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
 -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
 -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
 -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 32)))]
 -  {
 -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
 -  })
 -
 -(define_split
 -  [(set (match_operand:DI 0 "s_register_operand" "")
 -       (sign_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
 -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
 -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
 -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 48)))]
 -  {
 -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
 -  })
 -
 -(define_split
 -  [(set (match_operand:DI 0 "s_register_operand" "")
 -       (sign_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
 -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
 -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
 -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 56)))]
 -  {
 -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
 -  })
 -
 -(define_split
 -  [(set (match_operand:DI 0 "s_register_operand" "")
 -       (zero_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
 -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
 -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
 -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 32)))]
 -  {
 -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
 -  })
 -
 -(define_split
 -  [(set (match_operand:DI 0 "s_register_operand" "")
 -       (zero_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
 -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
 -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
 -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 48)))]
 -  {
 -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
 -  })
 -
 -(define_split
 -  [(set (match_operand:DI 0 "s_register_operand" "")
 -       (zero_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
 -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
 -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
 -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 56)))]
 -  {
 -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
 -  })
 diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-1.c b/gcc/testsuite/gcc.target/arm/neon-extend-1.c
 deleted file mode 100644
 index cfe83ce1bde260cdd99ae47963dde6676627cca3..0000000000000000000000000000000000000000
 --- a/gcc/testsuite/gcc.target/arm/neon-extend-1.c
 +++ /dev/null
 @@ -1,13 +0,0 @@
 -/* { dg-require-effective-target arm_neon_hw } */
 -/* { dg-options "-O2" } */
 -/* { dg-add-options arm_neon } */
 -
 -void
 -f (unsigned int a)
 -{
 -  unsigned long long b = a;
 -  asm volatile ("@ extended to %0" : : "w" (b));
 -}
 -
 -/* { dg-final { scan-assembler "vdup.32" } } */
 -/* { dg-final { scan-assembler "vshr.u64" } } */
 diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-2.c b/gcc/testsuite/gcc.target/arm/neon-extend-2.c
 deleted file mode 100644
 index 1c5a17e427859146f53828ec4a02b439ab0580ff..0000000000000000000000000000000000000000
 --- a/gcc/testsuite/gcc.target/arm/neon-extend-2.c
 +++ /dev/null
 @@ -1,13 +0,0 @@
 -/* { dg-require-effective-target arm_neon_hw } */
 -/* { dg-options "-O2" } */
 -/* { dg-add-options arm_neon } */
 -
 -void
 -f (int a)
 -{
 -  long long b = a;
 -  asm volatile ("@ extended to %0" : : "w" (b));
 -}
 -
 -/* { dg-final { scan-assembler "vdup.32" } } */
 -/* { dg-final { scan-assembler "vshr.s64" } } */    

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][ARM] Cleanup DImode shifts
  2019-07-31 16:39 ` Wilco Dijkstra
@ 2019-08-19 16:18   ` Wilco Dijkstra
  2019-08-22 13:46   ` Kyrill Tkachov
  1 sibling, 0 replies; 6+ messages in thread
From: Wilco Dijkstra @ 2019-08-19 16:18 UTC (permalink / raw)
  To: GCC Patches; +Cc: nd, Richard Earnshaw, Kyrylo Tkachov


   
 
ping
    
  
 Like the logical operations, expand all shifts early rather than only
  sometimes.  The Neon shift expansions are never emitted (not even with
  -fneon-for-64bits), so they are not useful.  So all the late expansions
  and Neon shift patterns can be removed, and shifts are more optimized
  as a result.  Since some extend patterns use Neon DImode shifts, remove
  the Neon extend variants and related splits.
  
  A simple example (relying on [1]) generates the same efficient code after
  this patch with -mfpu=neon and -mfpu=vfp (previously just the fact of
  having Neon enabled resulted inefficient code for no reason).
  
  unsigned long long f(unsigned long long x, unsigned long long y)
  { return x & (y >> 33); }
  
  Before:
          strd    r4, r5, [sp, #-8]!
          lsr     r4, r3, #1
          mov     r5, #0
          and     r1, r1, r5
          and     r0, r0, r4
          ldrd    r4, r5, [sp]
          add     sp, sp, #8
          bx      lr
  
  After:
          and     r0, r0, r3, lsr #1
          mov     r1, #0
          bx      lr
  
  Bootstrap and regress OK on arm-none-linux-gnueabihf --with-cpu=cortex-a57
  
  [1] https://gcc.gnu.org/ml/gcc-patches/2019-07/msg01301.html
  
  ChangeLog:
  2019-07-19  Wilco Dijkstra  <wdijkstr@arm.com>
  
          * config/arm/iterators.md (qhs_extenddi_cstr): Update.
          (qhs_extenddi_cstr): Likewise.
          * config/arm/arm.md (ashldi3): Always expand early.
          (ashlsi3): Likewise.
          (ashrsi3): Likewise.
          (zero_extend<mode>di2): Remove Neon variants.
          (extend<mode>di2): Likewise.
          * config/arm/neon.md (ashldi3_neon_noclobber): Remove.
          (signed_shift_di3_neon): Likewise.
          (unsigned_shift_di3_neon): Likewise.
          (ashrdi3_neon_imm_noclobber): Likewise.
          (lshrdi3_neon_imm_noclobber): Likewise.
          (<shift>di3_neon): Likewise.
          (split extend): Remove DI extend split patterns.
  
      testsuite/
          * gcc.target/arm/neon-extend-1.c: Remove test.
          * gcc.target/arm/neon-extend-2.c: Remove test.
  ---
  
  diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
  index 0dba97a4ebeed0c2133936ca662f1c9e86ffc6ba..10ed70dac4384354c0a2453c5e51a29108c6c062 100644
  --- a/gcc/config/arm/arm.md
  +++ b/gcc/config/arm/arm.md
  @@ -3601,44 +3601,14 @@ (define_insn "*satsi_<SAT:code>_shift"
   (define_expand "ashldi3"
     [(set (match_operand:DI            0 "s_register_operand")
           (ashift:DI (match_operand:DI 1 "s_register_operand")
  -                   (match_operand:SI 2 "general_operand")))]
  +                   (match_operand:SI 2 "reg_or_int_operand")))]
     "TARGET_32BIT"
     "
  -  if (TARGET_NEON)
  -    {
  -      /* Delay the decision whether to use NEON or core-regs until
  -        register allocation.  */
  -      emit_insn (gen_ashldi3_neon (operands[0], operands[1], operands[2]));
  -      DONE;
  -    }
  -  else
  -    {
  -      /* Only the NEON case can handle in-memory shift counts.  */
  -      if (!reg_or_int_operand (operands[2], SImode))
  -        operands[2] = force_reg (SImode, operands[2]);
  -    }
  -
  -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
  -    ; /* No special preparation statements; expand pattern as above.  */
  -  else
  -    {
  -      rtx scratch1, scratch2;
  -
  -      /* Ideally we should use iwmmxt here if we could know that operands[1]
  -         ends up already living in an iwmmxt register. Otherwise it's
  -         cheaper to have the alternate code being generated than moving
  -         values to iwmmxt regs and back.  */
  -
  -      /* Expand operation using core-registers.
  -        'FAIL' would achieve the same thing, but this is a bit smarter.  */
  -      scratch1 = gen_reg_rtx (SImode);
  -      scratch2 = gen_reg_rtx (SImode);
  -      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
  -                                    operands[2], scratch1, scratch2);
  -      DONE;
  -    }
  -  "
  -)
  +  arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
  +                                operands[2], gen_reg_rtx (SImode),
  +                                gen_reg_rtx (SImode));
  +  DONE;
  +")
   
   (define_expand "ashlsi3"
     [(set (match_operand:SI            0 "s_register_operand")
  @@ -3661,35 +3631,11 @@ (define_expand "ashrdi3"
                        (match_operand:SI 2 "reg_or_int_operand")))]
     "TARGET_32BIT"
     "
  -  if (TARGET_NEON)
  -    {
  -      /* Delay the decision whether to use NEON or core-regs until
  -        register allocation.  */
  -      emit_insn (gen_ashrdi3_neon (operands[0], operands[1], operands[2]));
  -      DONE;
  -    }
  -
  -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
  -    ; /* No special preparation statements; expand pattern as above.  */
  -  else
  -    {
  -      rtx scratch1, scratch2;
  -
  -      /* Ideally we should use iwmmxt here if we could know that operands[1]
  -         ends up already living in an iwmmxt register. Otherwise it's
  -         cheaper to have the alternate code being generated than moving
  -         values to iwmmxt regs and back.  */
  -
  -      /* Expand operation using core-registers.
  -        'FAIL' would achieve the same thing, but this is a bit smarter.  */
  -      scratch1 = gen_reg_rtx (SImode);
  -      scratch2 = gen_reg_rtx (SImode);
  -      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
  -                                    operands[2], scratch1, scratch2);
  -      DONE;
  -    }
  -  "
  -)
  +  arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
  +                                operands[2], gen_reg_rtx (SImode),
  +                                gen_reg_rtx (SImode));
  +  DONE;
  +")
   
   (define_expand "ashrsi3"
     [(set (match_operand:SI              0 "s_register_operand")
  @@ -3709,35 +3655,11 @@ (define_expand "lshrdi3"
                        (match_operand:SI 2 "reg_or_int_operand")))]
     "TARGET_32BIT"
     "
  -  if (TARGET_NEON)
  -    {
  -      /* Delay the decision whether to use NEON or core-regs until
  -        register allocation.  */
  -      emit_insn (gen_lshrdi3_neon (operands[0], operands[1], operands[2]));
  -      DONE;
  -    }
  -
  -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
  -    ; /* No special preparation statements; expand pattern as above.  */
  -  else
  -    {
  -      rtx scratch1, scratch2;
  -
  -      /* Ideally we should use iwmmxt here if we could know that operands[1]
  -         ends up already living in an iwmmxt register. Otherwise it's
  -         cheaper to have the alternate code being generated than moving
  -         values to iwmmxt regs and back.  */
  -
  -      /* Expand operation using core-registers.
  -        'FAIL' would achieve the same thing, but this is a bit smarter.  */
  -      scratch1 = gen_reg_rtx (SImode);
  -      scratch2 = gen_reg_rtx (SImode);
  -      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
  -                                    operands[2], scratch1, scratch2);
  -      DONE;
  -    }
  -  "
  -)
  +  arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
  +                                operands[2], gen_reg_rtx (SImode),
  +                                gen_reg_rtx (SImode));
  +  DONE;
  +")
   
   (define_expand "lshrsi3"
     [(set (match_operand:SI              0 "s_register_operand")
  @@ -4762,30 +4684,30 @@ (define_expand "truncdfhf2"
   ;; Zero and sign extension instructions.
   
   (define_insn "zero_extend<mode>di2"
  -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,w")
  +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r")
           (zero_extend:DI (match_operand:QHSI 1 "<qhs_zextenddi_op>"
                                               "<qhs_zextenddi_cstr>")))]
     "TARGET_32BIT <qhs_zextenddi_cond>"
     "#"
  -  [(set_attr "length" "8,4,8,8")
  -   (set_attr "arch" "neon_for_64bits,*,*,avoid_neon_for_64bits")
  +  [(set_attr "length" "4,8")
  +   (set_attr "arch" "*,*")
      (set_attr "ce_count" "2")
      (set_attr "predicable" "yes")
  -   (set_attr "type" "multiple,mov_reg,multiple,multiple")]
  +   (set_attr "type" "mov_reg,multiple")]
   )
   
   (define_insn "extend<mode>di2"
  -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,?r,w")
  +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r,?r")
           (sign_extend:DI (match_operand:QHSI 1 "<qhs_extenddi_op>"
                                               "<qhs_extenddi_cstr>")))]
     "TARGET_32BIT <qhs_sextenddi_cond>"
     "#"
  -  [(set_attr "length" "8,4,8,8,8")
  +  [(set_attr "length" "4,8,8")
      (set_attr "ce_count" "2")
      (set_attr "shift" "1")
      (set_attr "predicable" "yes")
  -   (set_attr "arch" "neon_for_64bits,*,a,t,avoid_neon_for_64bits")
  -   (set_attr "type" "multiple,mov_reg,multiple,multiple,multiple")]
  +   (set_attr "arch" "*,a,t")
  +   (set_attr "type" "mov_reg,multiple,multiple")]
   )
   
   ;; Splits for all extensions to DImode
  diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
  index eca16636ade577d84dc62523d066e72f79dc38f6..fa6f0c0529d5364b1e1df705cb1029868578e38c 100644
  --- a/gcc/config/arm/iterators.md
  +++ b/gcc/config/arm/iterators.md
  @@ -741,8 +741,8 @@ (define_mode_attr qhs_zextenddi_op [(SI "s_register_operand")
   (define_mode_attr qhs_extenddi_op [(SI "s_register_operand")
                                      (HI "nonimmediate_operand")
                                      (QI "arm_reg_or_extendqisi_mem_op")])
  -(define_mode_attr qhs_extenddi_cstr [(SI "r,0,r,r,r") (HI "r,0,rm,rm,r") (QI "r,0,rUq,rm,r")])
  -(define_mode_attr qhs_zextenddi_cstr [(SI "r,0,r,r") (HI "r,0,rm,r") (QI "r,0,rm,r")])
  +(define_mode_attr qhs_extenddi_cstr [(SI "0,r,r") (HI "0,rm,rm") (QI "0,rUq,rm")])
  +(define_mode_attr qhs_zextenddi_cstr [(SI "0,r") (HI "0,rm") (QI "0,rm")])
   
   ;; Mode attributes used for fixed-point support.
   (define_mode_attr qaddsub_suf [(V4UQQ "8") (V2UHQ "16") (UQQ "8") (UHQ "16")
  diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
  index ef73c77abeeaa02947c70ffa435f7bedc431e3be..757f2c0f5377148c770e061849424aed924a7d7a 100644
  --- a/gcc/config/arm/neon.md
  +++ b/gcc/config/arm/neon.md
  @@ -1135,173 +1135,6 @@ (define_insn "neon_load_count"
     [(set_attr "type" "neon_load1_1reg,neon_from_gp")]
   )
   
  -(define_insn "ashldi3_neon_noclobber"
  -  [(set (match_operand:DI 0 "s_register_operand"           "=w,w")
  -       (ashift:DI (match_operand:DI 1 "s_register_operand" " w,w")
  -                  (match_operand:DI 2 "reg_or_int_operand" " i,w")))]
  -  "TARGET_NEON && reload_completed
  -   && (!CONST_INT_P (operands[2])
  -       || (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))"
  -  "@
  -   vshl.u64\t%P0, %P1, %2
  -   vshl.u64\t%P0, %P1, %P2"
  -  [(set_attr "type" "neon_shift_imm, neon_shift_reg")]
  -)
  -
  -(define_insn_and_split "ashldi3_neon"
  -  [(set (match_operand:DI 0 "s_register_operand"           "= w, w, &r, r, &r, ?w,?w")
  -       (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r, 0w, w")
  -                  (match_operand:SI 2 "general_operand"    "rUm, i,  r, i,  i,rUm, i")))
  -   (clobber (match_scratch:SI 3                                    "= X, X, &r, X,  X,  X, X"))
  -   (clobber (match_scratch:SI 4                                    "= X, X, &r, X,  X,  X, X"))
  -   (clobber (match_scratch:DI 5                                    "=&w, X,  X, X,  X, &w, X"))
  -   (clobber (reg:CC_C CC_REGNUM))]
  -  "TARGET_NEON"
  -  "#"
  -  "TARGET_NEON && reload_completed"
  -  [(const_int 0)]
  -  "
  -  {
  -    if (IS_VFP_REGNUM (REGNO (operands[0])))
  -      {
  -        if (CONST_INT_P (operands[2]))
  -         {
  -           if (INTVAL (operands[2]) < 1)
  -             {
  -               emit_insn (gen_movdi (operands[0], operands[1]));
  -               DONE;
  -             }
  -           else if (INTVAL (operands[2]) > 63)
  -             operands[2] = gen_rtx_CONST_INT (VOIDmode, 63);
  -         }
  -       else
  -         {
  -           emit_insn (gen_neon_load_count (operands[5], operands[2]));
  -           operands[2] = operands[5];
  -         }
  -
  -       /* Ditch the unnecessary clobbers.  */
  -       emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1],
  -                                              operands[2]));
  -      }
  -    else
  -      {
  -       /* The shift expanders support either full overlap or no overlap.  */
  -       gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
  -                   || REGNO (operands[0]) == REGNO (operands[1]));
  -
  -       arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
  -                                      operands[2], operands[3], operands[4]);
  -      }
  -    DONE;
  -  }"
  -  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
  -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
  -   (set_attr "type" "multiple")]
  -)
  -
  -; The shift amount needs to be negated for right-shifts
  -(define_insn "signed_shift_di3_neon"
  -  [(set (match_operand:DI 0 "s_register_operand"            "=w")
  -       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
  -                   (match_operand:DI 2 "s_register_operand" " w")]
  -                  UNSPEC_ASHIFT_SIGNED))]
  -  "TARGET_NEON && reload_completed"
  -  "vshl.s64\t%P0, %P1, %P2"
  -  [(set_attr "type" "neon_shift_reg")]
  -)
  -
  -; The shift amount needs to be negated for right-shifts
  -(define_insn "unsigned_shift_di3_neon"
  -  [(set (match_operand:DI 0 "s_register_operand"            "=w")
  -       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
  -                   (match_operand:DI 2 "s_register_operand" " w")]
  -                  UNSPEC_ASHIFT_UNSIGNED))]
  -  "TARGET_NEON && reload_completed"
  -  "vshl.u64\t%P0, %P1, %P2"
  -  [(set_attr "type" "neon_shift_reg")]
  -)
  -
  -(define_insn "ashrdi3_neon_imm_noclobber"
  -  [(set (match_operand:DI 0 "s_register_operand"             "=w")
  -       (ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
  -                    (match_operand:DI 2 "const_int_operand"  " i")))]
  -  "TARGET_NEON && reload_completed
  -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
  -  "vshr.s64\t%P0, %P1, %2"
  -  [(set_attr "type" "neon_shift_imm")]
  -)
  -
  -(define_insn "lshrdi3_neon_imm_noclobber"
  -  [(set (match_operand:DI 0 "s_register_operand"             "=w")
  -       (lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
  -                    (match_operand:DI 2 "const_int_operand"  " i")))]
  -  "TARGET_NEON && reload_completed
  -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
  -  "vshr.u64\t%P0, %P1, %2"
  -  [(set_attr "type" "neon_shift_imm")]
  -)
  -
  -;; ashrdi3_neon
  -;; lshrdi3_neon
  -(define_insn_and_split "<shift>di3_neon"
  -  [(set (match_operand:DI 0 "s_register_operand"            "= w, w, &r, r, &r,?w,?w")
  -       (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0,  r,0w, w")
  -                   (match_operand:SI 2 "reg_or_int_operand" "  r, i,  r, i,  i, r, i")))
  -   (clobber (match_scratch:SI 3                                     "=2r, X, &r, X,  X,2r, X"))
  -   (clobber (match_scratch:SI 4                                     "= X, X, &r, X,  X, X, X"))
  -   (clobber (match_scratch:DI 5                                     "=&w, X,  X, X, X,&w, X"))
  -   (clobber (reg:CC CC_REGNUM))]
  -  "TARGET_NEON"
  -  "#"
  -  "TARGET_NEON && reload_completed"
  -  [(const_int 0)]
  -  "
  -  {
  -    if (IS_VFP_REGNUM (REGNO (operands[0])))
  -      {
  -       if (CONST_INT_P (operands[2]))
  -         {
  -           if (INTVAL (operands[2]) < 1)
  -             {
  -               emit_insn (gen_movdi (operands[0], operands[1]));
  -               DONE;
  -             }
  -           else if (INTVAL (operands[2]) > 64)
  -             operands[2] = gen_rtx_CONST_INT (VOIDmode, 64);
  -
  -           /* Ditch the unnecessary clobbers.  */
  -           emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0],
  -                                                         operands[1],
  -                                                         operands[2]));
  -         }
  -       else 
  -         {
  -           /* We must use a negative left-shift.  */
  -           emit_insn (gen_negsi2 (operands[3], operands[2]));
  -           emit_insn (gen_neon_load_count (operands[5], operands[3]));
  -           emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], operands[1],
  -                                                      operands[5]));
  -         }
  -      }
  -    else
  -      {
  -       /* The shift expanders support either full overlap or no overlap.  */
  -       gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
  -                   || REGNO (operands[0]) == REGNO (operands[1]));
  -
  -       /* This clobbers CC (ASHIFTRT by register only).  */
  -       arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
  -                                      operands[2], operands[3], operands[4]);
  -      }
  -
  -    DONE;
  -  }"
  -  [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
  -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
  -   (set_attr "type" "multiple")]
  -)
  -
   ;; Widening operations
   
   (define_expand "widen_ssum<mode>3"
  @@ -6792,65 +6625,3 @@ (define_insn "neon_vabd<mode>_3"
    "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
    [(set_attr "type" "neon_fp_abd_s<q>")]
   )
  -
  -;; Copy from core-to-neon regs, then extend, not vice-versa
  -
  -(define_split
  -  [(set (match_operand:DI 0 "s_register_operand" "")
  -       (sign_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
  -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
  -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 32)))]
  -  {
  -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
  -  })
  -
  -(define_split
  -  [(set (match_operand:DI 0 "s_register_operand" "")
  -       (sign_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
  -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
  -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 48)))]
  -  {
  -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
  -  })
  -
  -(define_split
  -  [(set (match_operand:DI 0 "s_register_operand" "")
  -       (sign_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
  -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
  -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 56)))]
  -  {
  -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
  -  })
  -
  -(define_split
  -  [(set (match_operand:DI 0 "s_register_operand" "")
  -       (zero_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
  -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
  -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 32)))]
  -  {
  -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
  -  })
  -
  -(define_split
  -  [(set (match_operand:DI 0 "s_register_operand" "")
  -       (zero_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
  -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
  -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 48)))]
  -  {
  -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
  -  })
  -
  -(define_split
  -  [(set (match_operand:DI 0 "s_register_operand" "")
  -       (zero_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO (operands[0]))"
  -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
  -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 56)))]
  -  {
  -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
  -  })
  diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-1.c b/gcc/testsuite/gcc.target/arm/neon-extend-1.c
  deleted file mode 100644
  index cfe83ce1bde260cdd99ae47963dde6676627cca3..0000000000000000000000000000000000000000
  --- a/gcc/testsuite/gcc.target/arm/neon-extend-1.c
  +++ /dev/null
  @@ -1,13 +0,0 @@
  -/* { dg-require-effective-target arm_neon_hw } */
  -/* { dg-options "-O2" } */
  -/* { dg-add-options arm_neon } */
  -
  -void
  -f (unsigned int a)
  -{
  -  unsigned long long b = a;
  -  asm volatile ("@ extended to %0" : : "w" (b));
  -}
  -
  -/* { dg-final { scan-assembler "vdup.32" } } */
  -/* { dg-final { scan-assembler "vshr.u64" } } */
  diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-2.c b/gcc/testsuite/gcc.target/arm/neon-extend-2.c
  deleted file mode 100644
  index 1c5a17e427859146f53828ec4a02b439ab0580ff..0000000000000000000000000000000000000000
  --- a/gcc/testsuite/gcc.target/arm/neon-extend-2.c
  +++ /dev/null
  @@ -1,13 +0,0 @@
  -/* { dg-require-effective-target arm_neon_hw } */
  -/* { dg-options "-O2" } */
  -/* { dg-add-options arm_neon } */
  -
  -void
  -f (int a)
  -{
  -  long long b = a;
  -  asm volatile ("@ extended to %0" : : "w" (b));
  -}
  -
  -/* { dg-final { scan-assembler "vdup.32" } } */
  -/* { dg-final { scan-assembler "vshr.s64" } } */        

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH][ARM] Cleanup DImode shifts
  2019-07-31 16:39 ` Wilco Dijkstra
  2019-08-19 16:18   ` Wilco Dijkstra
@ 2019-08-22 13:46   ` Kyrill Tkachov
  1 sibling, 0 replies; 6+ messages in thread
From: Kyrill Tkachov @ 2019-08-22 13:46 UTC (permalink / raw)
  To: Wilco Dijkstra, GCC Patches; +Cc: nd, Richard Earnshaw

Hi Wilco,

On 7/31/19 5:25 PM, Wilco Dijkstra wrote:
> ping
>
>
> Like the logical operations, expand all shifts early rather than only
>  sometimes.  The Neon shift expansions are never emitted (not even with
>  -fneon-for-64bits), so they are not useful.  So all the late expansions
>  and Neon shift patterns can be removed, and shifts are more optimized
>  as a result.  Since some extend patterns use Neon DImode shifts, remove
>  the Neon extend variants and related splits.
>
>  A simple example (relying on [1]) generates the same efficient code after
>  this patch with -mfpu=neon and -mfpu=vfp (previously just the fact of
>  having Neon enabled resulted inefficient code for no reason).
>
>  unsigned long long f(unsigned long long x, unsigned long long y)
>  { return x & (y >> 33); }
>
>  Before:
>          strd    r4, r5, [sp, #-8]!
>          lsr     r4, r3, #1
>          mov     r5, #0
>          and     r1, r1, r5
>          and     r0, r0, r4
>          ldrd    r4, r5, [sp]
>          add     sp, sp, #8
>          bx      lr
>
>  After:
>          and     r0, r0, r3, lsr #1
>          mov     r1, #0
>          bx      lr
>
>  Bootstrap and regress OK on arm-none-linux-gnueabihf 
> --with-cpu=cortex-a57
>
Seems to me we should deprecate -mneon-for-64bits for GCC 10 and look to 
remove (or make it a no-op at least) in future releases...

Ok for trunk.

Please keep an eye for regressions as with the other patches in this series.

Thanks,

Kyrill


>  [1] https://gcc.gnu.org/ml/gcc-patches/2019-07/msg01301.html
>
>  ChangeLog:
>  2019-07-19  Wilco Dijkstra  <wdijkstr@arm.com>
>
>          * config/arm/iterators.md (qhs_extenddi_cstr): Update.
>          (qhs_extenddi_cstr): Likewise.
>          * config/arm/arm.md (ashldi3): Always expand early.
>          (ashlsi3): Likewise.
>          (ashrsi3): Likewise.
>          (zero_extend<mode>di2): Remove Neon variants.
>          (extend<mode>di2): Likewise.
>          * config/arm/neon.md (ashldi3_neon_noclobber): Remove.
>          (signed_shift_di3_neon): Likewise.
>          (unsigned_shift_di3_neon): Likewise.
>          (ashrdi3_neon_imm_noclobber): Likewise.
>          (lshrdi3_neon_imm_noclobber): Likewise.
>          (<shift>di3_neon): Likewise.
>          (split extend): Remove DI extend split patterns.
>
>      testsuite/
>          * gcc.target/arm/neon-extend-1.c: Remove test.
>          * gcc.target/arm/neon-extend-2.c: Remove test.
>  ---
>
>  diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
>  index 
> 0dba97a4ebeed0c2133936ca662f1c9e86ffc6ba..10ed70dac4384354c0a2453c5e51a29108c6c062 
> 100644
>  --- a/gcc/config/arm/arm.md
>  +++ b/gcc/config/arm/arm.md
>  @@ -3601,44 +3601,14 @@ (define_insn "*satsi_<SAT:code>_shift"
>   (define_expand "ashldi3"
>     [(set (match_operand:DI            0 "s_register_operand")
>           (ashift:DI (match_operand:DI 1 "s_register_operand")
>  -                   (match_operand:SI 2 "general_operand")))]
>  +                   (match_operand:SI 2 "reg_or_int_operand")))]
>     "TARGET_32BIT"
>     "
>  -  if (TARGET_NEON)
>  -    {
>  -      /* Delay the decision whether to use NEON or core-regs until
>  -        register allocation.  */
>  -      emit_insn (gen_ashldi3_neon (operands[0], operands[1], 
> operands[2]));
>  -      DONE;
>  -    }
>  -  else
>  -    {
>  -      /* Only the NEON case can handle in-memory shift counts.  */
>  -      if (!reg_or_int_operand (operands[2], SImode))
>  -        operands[2] = force_reg (SImode, operands[2]);
>  -    }
>  -
>  -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>  -    ; /* No special preparation statements; expand pattern as above.  */
>  -  else
>  -    {
>  -      rtx scratch1, scratch2;
>  -
>  -      /* Ideally we should use iwmmxt here if we could know that 
> operands[1]
>  -         ends up already living in an iwmmxt register. Otherwise it's
>  -         cheaper to have the alternate code being generated than moving
>  -         values to iwmmxt regs and back.  */
>  -
>  -      /* Expand operation using core-registers.
>  -        'FAIL' would achieve the same thing, but this is a bit 
> smarter.  */
>  -      scratch1 = gen_reg_rtx (SImode);
>  -      scratch2 = gen_reg_rtx (SImode);
>  -      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
>  -                                    operands[2], scratch1, scratch2);
>  -      DONE;
>  -    }
>  -  "
>  -)
>  +  arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
>  +                                operands[2], gen_reg_rtx (SImode),
>  +                                gen_reg_rtx (SImode));
>  +  DONE;
>  +")
>
>   (define_expand "ashlsi3"
>     [(set (match_operand:SI            0 "s_register_operand")
>  @@ -3661,35 +3631,11 @@ (define_expand "ashrdi3"
>                        (match_operand:SI 2 "reg_or_int_operand")))]
>     "TARGET_32BIT"
>     "
>  -  if (TARGET_NEON)
>  -    {
>  -      /* Delay the decision whether to use NEON or core-regs until
>  -        register allocation.  */
>  -      emit_insn (gen_ashrdi3_neon (operands[0], operands[1], 
> operands[2]));
>  -      DONE;
>  -    }
>  -
>  -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>  -    ; /* No special preparation statements; expand pattern as above.  */
>  -  else
>  -    {
>  -      rtx scratch1, scratch2;
>  -
>  -      /* Ideally we should use iwmmxt here if we could know that 
> operands[1]
>  -         ends up already living in an iwmmxt register. Otherwise it's
>  -         cheaper to have the alternate code being generated than moving
>  -         values to iwmmxt regs and back.  */
>  -
>  -      /* Expand operation using core-registers.
>  -        'FAIL' would achieve the same thing, but this is a bit 
> smarter.  */
>  -      scratch1 = gen_reg_rtx (SImode);
>  -      scratch2 = gen_reg_rtx (SImode);
>  -      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
>  -                                    operands[2], scratch1, scratch2);
>  -      DONE;
>  -    }
>  -  "
>  -)
>  +  arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
>  +                                operands[2], gen_reg_rtx (SImode),
>  +                                gen_reg_rtx (SImode));
>  +  DONE;
>  +")
>
>   (define_expand "ashrsi3"
>     [(set (match_operand:SI              0 "s_register_operand")
>  @@ -3709,35 +3655,11 @@ (define_expand "lshrdi3"
>                        (match_operand:SI 2 "reg_or_int_operand")))]
>     "TARGET_32BIT"
>     "
>  -  if (TARGET_NEON)
>  -    {
>  -      /* Delay the decision whether to use NEON or core-regs until
>  -        register allocation.  */
>  -      emit_insn (gen_lshrdi3_neon (operands[0], operands[1], 
> operands[2]));
>  -      DONE;
>  -    }
>  -
>  -  if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>  -    ; /* No special preparation statements; expand pattern as above.  */
>  -  else
>  -    {
>  -      rtx scratch1, scratch2;
>  -
>  -      /* Ideally we should use iwmmxt here if we could know that 
> operands[1]
>  -         ends up already living in an iwmmxt register. Otherwise it's
>  -         cheaper to have the alternate code being generated than moving
>  -         values to iwmmxt regs and back.  */
>  -
>  -      /* Expand operation using core-registers.
>  -        'FAIL' would achieve the same thing, but this is a bit 
> smarter.  */
>  -      scratch1 = gen_reg_rtx (SImode);
>  -      scratch2 = gen_reg_rtx (SImode);
>  -      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
>  -                                    operands[2], scratch1, scratch2);
>  -      DONE;
>  -    }
>  -  "
>  -)
>  +  arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
>  +                                operands[2], gen_reg_rtx (SImode),
>  +                                gen_reg_rtx (SImode));
>  +  DONE;
>  +")
>
>   (define_expand "lshrsi3"
>     [(set (match_operand:SI              0 "s_register_operand")
>  @@ -4762,30 +4684,30 @@ (define_expand "truncdfhf2"
>   ;; Zero and sign extension instructions.
>
>   (define_insn "zero_extend<mode>di2"
>  -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,w")
>  +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r")
>           (zero_extend:DI (match_operand:QHSI 1 "<qhs_zextenddi_op>"
> "<qhs_zextenddi_cstr>")))]
>     "TARGET_32BIT <qhs_zextenddi_cond>"
>     "#"
>  -  [(set_attr "length" "8,4,8,8")
>  -   (set_attr "arch" "neon_for_64bits,*,*,avoid_neon_for_64bits")
>  +  [(set_attr "length" "4,8")
>  +   (set_attr "arch" "*,*")
>      (set_attr "ce_count" "2")
>      (set_attr "predicable" "yes")
>  -   (set_attr "type" "multiple,mov_reg,multiple,multiple")]
>  +   (set_attr "type" "mov_reg,multiple")]
>   )
>
>   (define_insn "extend<mode>di2"
>  -  [(set (match_operand:DI 0 "s_register_operand" "=w,r,?r,?r,w")
>  +  [(set (match_operand:DI 0 "s_register_operand" "=r,?r,?r")
>           (sign_extend:DI (match_operand:QHSI 1 "<qhs_extenddi_op>"
> "<qhs_extenddi_cstr>")))]
>     "TARGET_32BIT <qhs_sextenddi_cond>"
>     "#"
>  -  [(set_attr "length" "8,4,8,8,8")
>  +  [(set_attr "length" "4,8,8")
>      (set_attr "ce_count" "2")
>      (set_attr "shift" "1")
>      (set_attr "predicable" "yes")
>  -   (set_attr "arch" "neon_for_64bits,*,a,t,avoid_neon_for_64bits")
>  -   (set_attr "type" "multiple,mov_reg,multiple,multiple,multiple")]
>  +   (set_attr "arch" "*,a,t")
>  +   (set_attr "type" "mov_reg,multiple,multiple")]
>   )
>
>   ;; Splits for all extensions to DImode
>  diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
>  index 
> eca16636ade577d84dc62523d066e72f79dc38f6..fa6f0c0529d5364b1e1df705cb1029868578e38c 
> 100644
>  --- a/gcc/config/arm/iterators.md
>  +++ b/gcc/config/arm/iterators.md
>  @@ -741,8 +741,8 @@ (define_mode_attr qhs_zextenddi_op [(SI 
> "s_register_operand")
>   (define_mode_attr qhs_extenddi_op [(SI "s_register_operand")
>                                      (HI "nonimmediate_operand")
>                                      (QI "arm_reg_or_extendqisi_mem_op")])
>  -(define_mode_attr qhs_extenddi_cstr [(SI "r,0,r,r,r") (HI 
> "r,0,rm,rm,r") (QI "r,0,rUq,rm,r")])
>  -(define_mode_attr qhs_zextenddi_cstr [(SI "r,0,r,r") (HI "r,0,rm,r") 
> (QI "r,0,rm,r")])
>  +(define_mode_attr qhs_extenddi_cstr [(SI "0,r,r") (HI "0,rm,rm") (QI 
> "0,rUq,rm")])
>  +(define_mode_attr qhs_zextenddi_cstr [(SI "0,r") (HI "0,rm") (QI 
> "0,rm")])
>
>   ;; Mode attributes used for fixed-point support.
>   (define_mode_attr qaddsub_suf [(V4UQQ "8") (V2UHQ "16") (UQQ "8") 
> (UHQ "16")
>  diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
>  index 
> ef73c77abeeaa02947c70ffa435f7bedc431e3be..757f2c0f5377148c770e061849424aed924a7d7a 
> 100644
>  --- a/gcc/config/arm/neon.md
>  +++ b/gcc/config/arm/neon.md
>  @@ -1135,173 +1135,6 @@ (define_insn "neon_load_count"
>     [(set_attr "type" "neon_load1_1reg,neon_from_gp")]
>   )
>
>  -(define_insn "ashldi3_neon_noclobber"
>  -  [(set (match_operand:DI 0 "s_register_operand"           "=w,w")
>  -       (ashift:DI (match_operand:DI 1 "s_register_operand" " w,w")
>  -                  (match_operand:DI 2 "reg_or_int_operand" " i,w")))]
>  -  "TARGET_NEON && reload_completed
>  -   && (!CONST_INT_P (operands[2])
>  -       || (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))"
>  -  "@
>  -   vshl.u64\t%P0, %P1, %2
>  -   vshl.u64\t%P0, %P1, %P2"
>  -  [(set_attr "type" "neon_shift_imm, neon_shift_reg")]
>  -)
>  -
>  -(define_insn_and_split "ashldi3_neon"
>  -  [(set (match_operand:DI 0 "s_register_operand"           "= w, w, 
> &r, r, &r, ?w,?w")
>  -       (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 
> 0r, 0,  r, 0w, w")
>  -                  (match_operand:SI 2 "general_operand"    "rUm, i,  
> r, i,  i,rUm, i")))
>  -   (clobber (match_scratch:SI 3                                    
> "= X, X, &r, X,  X,  X, X"))
>  -   (clobber (match_scratch:SI 4                                    
> "= X, X, &r, X,  X,  X, X"))
>  -   (clobber (match_scratch:DI 5                                    
> "=&w, X,  X, X,  X, &w, X"))
>  -   (clobber (reg:CC_C CC_REGNUM))]
>  -  "TARGET_NEON"
>  -  "#"
>  -  "TARGET_NEON && reload_completed"
>  -  [(const_int 0)]
>  -  "
>  -  {
>  -    if (IS_VFP_REGNUM (REGNO (operands[0])))
>  -      {
>  -        if (CONST_INT_P (operands[2]))
>  -         {
>  -           if (INTVAL (operands[2]) < 1)
>  -             {
>  -               emit_insn (gen_movdi (operands[0], operands[1]));
>  -               DONE;
>  -             }
>  -           else if (INTVAL (operands[2]) > 63)
>  -             operands[2] = gen_rtx_CONST_INT (VOIDmode, 63);
>  -         }
>  -       else
>  -         {
>  -           emit_insn (gen_neon_load_count (operands[5], operands[2]));
>  -           operands[2] = operands[5];
>  -         }
>  -
>  -       /* Ditch the unnecessary clobbers.  */
>  -       emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1],
>  - operands[2]));
>  -      }
>  -    else
>  -      {
>  -       /* The shift expanders support either full overlap or no 
> overlap.  */
>  -       gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
>  -                   || REGNO (operands[0]) == REGNO (operands[1]));
>  -
>  -       arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
>  -                                      operands[2], operands[3], 
> operands[4]);
>  -      }
>  -    DONE;
>  -  }"
>  -  [(set_attr "arch" 
> "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
>  -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
>  -   (set_attr "type" "multiple")]
>  -)
>  -
>  -; The shift amount needs to be negated for right-shifts
>  -(define_insn "signed_shift_di3_neon"
>  -  [(set (match_operand:DI 0 "s_register_operand"            "=w")
>  -       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
>  -                   (match_operand:DI 2 "s_register_operand" " w")]
>  -                  UNSPEC_ASHIFT_SIGNED))]
>  -  "TARGET_NEON && reload_completed"
>  -  "vshl.s64\t%P0, %P1, %P2"
>  -  [(set_attr "type" "neon_shift_reg")]
>  -)
>  -
>  -; The shift amount needs to be negated for right-shifts
>  -(define_insn "unsigned_shift_di3_neon"
>  -  [(set (match_operand:DI 0 "s_register_operand"            "=w")
>  -       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
>  -                   (match_operand:DI 2 "s_register_operand" " w")]
>  -                  UNSPEC_ASHIFT_UNSIGNED))]
>  -  "TARGET_NEON && reload_completed"
>  -  "vshl.u64\t%P0, %P1, %P2"
>  -  [(set_attr "type" "neon_shift_reg")]
>  -)
>  -
>  -(define_insn "ashrdi3_neon_imm_noclobber"
>  -  [(set (match_operand:DI 0 "s_register_operand"             "=w")
>  -       (ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
>  -                    (match_operand:DI 2 "const_int_operand"  " i")))]
>  -  "TARGET_NEON && reload_completed
>  -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
>  -  "vshr.s64\t%P0, %P1, %2"
>  -  [(set_attr "type" "neon_shift_imm")]
>  -)
>  -
>  -(define_insn "lshrdi3_neon_imm_noclobber"
>  -  [(set (match_operand:DI 0 "s_register_operand"             "=w")
>  -       (lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
>  -                    (match_operand:DI 2 "const_int_operand"  " i")))]
>  -  "TARGET_NEON && reload_completed
>  -   && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
>  -  "vshr.u64\t%P0, %P1, %2"
>  -  [(set_attr "type" "neon_shift_imm")]
>  -)
>  -
>  -;; ashrdi3_neon
>  -;; lshrdi3_neon
>  -(define_insn_and_split "<shift>di3_neon"
>  -  [(set (match_operand:DI 0 "s_register_operand"            "= w, w, 
> &r, r, &r,?w,?w")
>  -       (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 
> 0r, 0,  r,0w, w")
>  -                   (match_operand:SI 2 "reg_or_int_operand" "  r, 
> i,  r, i,  i, r, i")))
>  -   (clobber (match_scratch:SI 3                                     
> "=2r, X, &r, X,  X,2r, X"))
>  -   (clobber (match_scratch:SI 4                                     
> "= X, X, &r, X,  X, X, X"))
>  -   (clobber (match_scratch:DI 5                                     
> "=&w, X,  X, X, X,&w, X"))
>  -   (clobber (reg:CC CC_REGNUM))]
>  -  "TARGET_NEON"
>  -  "#"
>  -  "TARGET_NEON && reload_completed"
>  -  [(const_int 0)]
>  -  "
>  -  {
>  -    if (IS_VFP_REGNUM (REGNO (operands[0])))
>  -      {
>  -       if (CONST_INT_P (operands[2]))
>  -         {
>  -           if (INTVAL (operands[2]) < 1)
>  -             {
>  -               emit_insn (gen_movdi (operands[0], operands[1]));
>  -               DONE;
>  -             }
>  -           else if (INTVAL (operands[2]) > 64)
>  -             operands[2] = gen_rtx_CONST_INT (VOIDmode, 64);
>  -
>  -           /* Ditch the unnecessary clobbers.  */
>  -           emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0],
>  -                                                         operands[1],
>  - operands[2]));
>  -         }
>  -       else
>  -         {
>  -           /* We must use a negative left-shift.  */
>  -           emit_insn (gen_negsi2 (operands[3], operands[2]));
>  -           emit_insn (gen_neon_load_count (operands[5], operands[3]));
>  -           emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], 
> operands[1],
>  - operands[5]));
>  -         }
>  -      }
>  -    else
>  -      {
>  -       /* The shift expanders support either full overlap or no 
> overlap.  */
>  -       gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
>  -                   || REGNO (operands[0]) == REGNO (operands[1]));
>  -
>  -       /* This clobbers CC (ASHIFTRT by register only).  */
>  -       arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
>  -                                      operands[2], operands[3], 
> operands[4]);
>  -      }
>  -
>  -    DONE;
>  -  }"
>  -  [(set_attr "arch" 
> "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
>  -   (set_attr "opt" "*,*,speed,speed,speed,*,*")
>  -   (set_attr "type" "multiple")]
>  -)
>  -
>   ;; Widening operations
>
>   (define_expand "widen_ssum<mode>3"
>  @@ -6792,65 +6625,3 @@ (define_insn "neon_vabd<mode>_3"
>    "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
>    [(set_attr "type" "neon_fp_abd_s<q>")]
>   )
>  -
>  -;; Copy from core-to-neon regs, then extend, not vice-versa
>  -
>  -(define_split
>  -  [(set (match_operand:DI 0 "s_register_operand" "")
>  -       (sign_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
>  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO 
> (operands[0]))"
>  -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
>  -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 32)))]
>  -  {
>  -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
>  -  })
>  -
>  -(define_split
>  -  [(set (match_operand:DI 0 "s_register_operand" "")
>  -       (sign_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
>  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO 
> (operands[0]))"
>  -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
>  -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 48)))]
>  -  {
>  -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
>  -  })
>  -
>  -(define_split
>  -  [(set (match_operand:DI 0 "s_register_operand" "")
>  -       (sign_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
>  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO 
> (operands[0]))"
>  -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
>  -   (set (match_dup 0) (ashiftrt:DI (match_dup 0) (const_int 56)))]
>  -  {
>  -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
>  -  })
>  -
>  -(define_split
>  -  [(set (match_operand:DI 0 "s_register_operand" "")
>  -       (zero_extend:DI (match_operand:SI 1 "s_register_operand" "")))]
>  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO 
> (operands[0]))"
>  -  [(set (match_dup 2) (vec_duplicate:V2SI (match_dup 1)))
>  -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 32)))]
>  -  {
>  -    operands[2] = gen_rtx_REG (V2SImode, REGNO (operands[0]));
>  -  })
>  -
>  -(define_split
>  -  [(set (match_operand:DI 0 "s_register_operand" "")
>  -       (zero_extend:DI (match_operand:HI 1 "s_register_operand" "")))]
>  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO 
> (operands[0]))"
>  -  [(set (match_dup 2) (vec_duplicate:V4HI (match_dup 1)))
>  -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 48)))]
>  -  {
>  -    operands[2] = gen_rtx_REG (V4HImode, REGNO (operands[0]));
>  -  })
>  -
>  -(define_split
>  -  [(set (match_operand:DI 0 "s_register_operand" "")
>  -       (zero_extend:DI (match_operand:QI 1 "s_register_operand" "")))]
>  -  "TARGET_NEON && reload_completed && IS_VFP_REGNUM (REGNO 
> (operands[0]))"
>  -  [(set (match_dup 2) (vec_duplicate:V8QI (match_dup 1)))
>  -   (set (match_dup 0) (lshiftrt:DI (match_dup 0) (const_int 56)))]
>  -  {
>  -    operands[2] = gen_rtx_REG (V8QImode, REGNO (operands[0]));
>  -  })
>  diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-1.c 
> b/gcc/testsuite/gcc.target/arm/neon-extend-1.c
>  deleted file mode 100644
>  index 
> cfe83ce1bde260cdd99ae47963dde6676627cca3..0000000000000000000000000000000000000000
>  --- a/gcc/testsuite/gcc.target/arm/neon-extend-1.c
>  +++ /dev/null
>  @@ -1,13 +0,0 @@
>  -/* { dg-require-effective-target arm_neon_hw } */
>  -/* { dg-options "-O2" } */
>  -/* { dg-add-options arm_neon } */
>  -
>  -void
>  -f (unsigned int a)
>  -{
>  -  unsigned long long b = a;
>  -  asm volatile ("@ extended to %0" : : "w" (b));
>  -}
>  -
>  -/* { dg-final { scan-assembler "vdup.32" } } */
>  -/* { dg-final { scan-assembler "vshr.u64" } } */
>  diff --git a/gcc/testsuite/gcc.target/arm/neon-extend-2.c 
> b/gcc/testsuite/gcc.target/arm/neon-extend-2.c
>  deleted file mode 100644
>  index 
> 1c5a17e427859146f53828ec4a02b439ab0580ff..0000000000000000000000000000000000000000
>  --- a/gcc/testsuite/gcc.target/arm/neon-extend-2.c
>  +++ /dev/null
>  @@ -1,13 +0,0 @@
>  -/* { dg-require-effective-target arm_neon_hw } */
>  -/* { dg-options "-O2" } */
>  -/* { dg-add-options arm_neon } */
>  -
>  -void
>  -f (int a)
>  -{
>  -  long long b = a;
>  -  asm volatile ("@ extended to %0" : : "w" (b));
>  -}
>  -
>  -/* { dg-final { scan-assembler "vdup.32" } } */
>  -/* { dg-final { scan-assembler "vshr.s64" } } */

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2019-08-22 12:05 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-22 16:19 [PATCH][ARM] Cleanup DImode shifts Wilco Dijkstra
2019-07-22 16:35 ` Ramana Radhakrishnan
2019-07-22 17:20   ` Wilco Dijkstra
2019-07-31 16:39 ` Wilco Dijkstra
2019-08-19 16:18   ` Wilco Dijkstra
2019-08-22 13:46   ` Kyrill Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).