public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
@ 2020-08-14  8:27 Hongtao Liu
  2020-08-17 10:08 ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: Hongtao Liu @ 2020-08-14  8:27 UTC (permalink / raw)
  To: GCC Patches, Uros Bizjak, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 1027 bytes --]

Enable the or/xor/and/andn/not operators for mask registers.  kxnor is
not enabled since there's no corresponding instruction for general
registers.

gcc/
        PR target/88808
        * config/i386/i386.md: (*movsi_internal): Adjust constraints
        for mask registers.
        (*movhi_internal): Ditto.
        (*movqi_internal): Ditto.
        (*anddi_1): Support mask register operations
        (*and<mode>_1): Ditto.
        (*andqi_1): Ditto.
        (*andn<mode>_1): Ditto.
        (*<code><mode>_1): Ditto.
        (*<code>qi_1): Ditto.
        (*one_cmpl<mode>2_1): Ditto.
        (*one_cmplsi2_1_zext): Ditto.
        (*one_cmplqi2_1): Ditto.

gcc/testsuite/
        * gcc.target/i386/bitwise_mask_op-1.c: New test.
        * gcc.target/i386/bitwise_mask_op-2.c: New test.
        * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
        * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
        * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
        * gcc.target/i386/avx512f-kmovw-5.c: Ditto.


-- 
BR,
Hongtao

[-- Attachment #2: 0004-Enable-bitwise-operation-for-type-mask.patch --]
[-- Type: text/x-patch, Size: 22840 bytes --]

From df816952e6e76e3dccd53b6384075c41eed1a0f9 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 13 Aug 2020 14:20:43 +0800
Subject: [PATCH 4/4] Enable bitwise operation for type mask.

Enable the or/xor/and/andn/not operators for mask registers.  kxnor is
not enabled since there's no corresponding instruction for general
registers.

gcc/
	PR target/88808
	* config/i386/i386.md: (*movsi_internal): Adjust constraints
	for mask registers.
	(*movhi_internal): Ditto.
	(*movqi_internal): Ditto.
	(*anddi_1): Support mask register operations
	(*and<mode>_1): Ditto.
	(*andqi_1): Ditto.
	(*andn<mode>_1): Ditto.
	(*<code><mode>_1): Ditto.
	(*<code>qi_1): Ditto.
	(*one_cmpl<mode>2_1): Ditto.
	(*one_cmplsi2_1_zext): Ditto.
	(*one_cmplqi2_1): Ditto.

gcc/testsuite/
	* gcc.target/i386/bitwise_mask_op-1.c: New test.
	* gcc.target/i386/bitwise_mask_op-2.c: New test.
	* gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
	* gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
	* gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
	* gcc.target/i386/avx512f-kmovw-5.c: Ditto.
---
 gcc/config/i386/i386.md                       | 262 +++++++++++++-----
 .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
 .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
 .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
 .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
 .../gcc.target/i386/bitwise_mask_op-1.c       | 177 ++++++++++++
 .../gcc.target/i386/bitwise_mask_op-2.c       |   7 +
 7 files changed, 377 insertions(+), 77 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 74d207c3711..e8ad79d1b0a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2294,7 +2294,7 @@
 
 (define_insn "*movsi_internal"
   [(set (match_operand:SI 0 "nonimmediate_operand"
-    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
+    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
 	(match_operand:SI 1 "general_operand"
     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
@@ -2403,8 +2403,8 @@
 	   (symbol_ref "true")))])
 
 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
-	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,r,km,k,k,CBC"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,k")
+	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2491,9 +2491,9 @@
 
 (define_insn "*movqi_internal"
   [(set (match_operand:QI 0 "nonimmediate_operand"
-			"=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
+			"=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,k,k")
 	(match_operand:QI 1 "general_operand"
-			"Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
+			"Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   char buf[128];
@@ -9044,19 +9044,21 @@
 })
 
 (define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
 	(and:DI
-	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
-	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
+	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
+	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
   "@
    and{l}\t{%k2, %k0|%k0, %k2}
    and{q}\t{%2, %0|%0, %2}
    and{q}\t{%2, %0|%0, %2}
-   #"
-  [(set_attr "type" "alu,alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,*,0")
+   and{q}\t{%2, %0|%0, %2}
+   kandq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
+   (set_attr "type" "alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9064,7 +9066,7 @@
 		 (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "SI,DI,DI,SI")])
+   (set_attr "mode" "SI,DI,DI,SI,DI")])
 
 (define_insn_and_split "*anddi_1_btr"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
@@ -9130,17 +9132,25 @@
    (set_attr "mode" "SI")])
 
 (define_insn "*and<mode>_1"
-  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
-	(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
-		   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
+	(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
+		   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
   "@
    and{<imodesuffix>}\t{%2, %0|%0, %2}
    and{<imodesuffix>}\t{%2, %0|%0, %2}
-   #"
-  [(set_attr "type" "alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,0")
+   and{<imodesuffix>}\t{%2, %0|%0, %2}
+   kand<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "3")
+		 (if_then_else (eq_attr "mode" "SI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9148,20 +9158,39 @@
 		 (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "<MODE>,<MODE>,SI")])
+   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
 
 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		(match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		(match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, QImode, operands)"
-  "@
-   and{b}\t{%2, %0|%0, %2}
-   and{b}\t{%2, %0|%0, %2}
-   and{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+{
+  switch (which_alternative)
+    {
+     case 0:
+     case 1:
+       return "and{b}\t{%2, %0|%0, %2}";
+     case 2:
+       return "and{l}\t{%k2, %k0|%k0, %k2}";
+     case 3:
+       if (TARGET_AVX512DQ)
+	 return "kandb\t{%2, %1, %0|%0, %1, %2}";
+       return "kandw\t{%2, %1, %0|%0, %1, %2}";
+     default:
+       gcc_unreachable ();
+     }
+}
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "2")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "3")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -9539,28 +9568,53 @@
 })
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
 	(and:SWI48
-	  (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
-	  (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
+	  (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
+	  (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct, double")
+  "TARGET_BMI || TARGET_AVX512BW"
+  "@
+    andn\t{%2, %1, %0|%0, %1, %2}
+    andn\t{%2, %1, %0|%0, %1, %2}
+    kandn<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "bmi,bmi,avx512bw")
+   (set_attr "type" "bitmanip,bitmanip,msklog")
+   (set_attr "btver2_decode" "direct, double,*")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI12 0 "register_operand" "=r")
+  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
 	(and:SWI12
-	  (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
-	  (match_operand:SWI12 2 "register_operand" "r")))
+	  (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
+	  (match_operand:SWI12 2 "register_operand" "r,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct")
-   (set_attr "mode" "SI")])
+  "TARGET_BMI || TARGET_AVX512BW"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}";
+    case 1:
+      if (TARGET_AVX512DQ)
+	return "kandn<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}";
+      return "kandnw\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+
+  [(set_attr "isa" "bmi,avx512f")
+   (set_attr "type" "bitmanip,msklog")
+   (set_attr "btver2_decode" "direct,*")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "0")
+		 (const_string "SI")
+	       (and (eq_attr "alternative" "1")
+		    (match_test "!TARGET_AVX512DQ"))
+		  (const_string "HI")
+	      ]
+	      (const_string "<MODE>")))])
 
 (define_insn "*andn_<mode>_ccno"
   [(set (reg FLAGS_REG)
@@ -9631,14 +9685,24 @@
 })
 
 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
 	(any_or:SWI248
-	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
-	 (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
+	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
+	 (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
-  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
-  [(set_attr "type" "alu")
+  "@
+    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+    k<logic><mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "2")
+		 (if_then_else (eq_attr "mode" "SI,DI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])
 
 (define_insn_and_split "*iordi_1_bts"
@@ -9711,17 +9775,37 @@
    (set_attr "mode" "SI")])
 
 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		   (match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, QImode, operands)"
-  "@
-   <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+{
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+      return "<logic>{b}\t{%2, %0|%0, %2}";
+    case 2:
+      return "<logic>{l}\t{%k2, %k0|%k0, %k2}";
+    case 3:
+      if (TARGET_AVX512DQ)
+	return "k<logic>b\t{%2, %1, %0|%0, %1, %2}";
+      return "k<logic>w\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,*,avx512f")
+   (set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "2")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "3")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -10370,31 +10454,63 @@
   "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
 
 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
-	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
+	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
-  "not{<imodesuffix>}\t%0"
-  [(set_attr "type" "negnot")
+  "@
+  not{<imodesuffix>}\t%0
+  knot<mskmodesuffix>\t{%1, %0|%0, %1}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "2")
+		 (if_then_else (eq_attr "mode" "SI,DI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "negnot,msklog")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*one_cmplsi2_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=r,k")
 	(zero_extend:DI
-	  (not:SI (match_operand:SI 1 "register_operand" "0"))))]
+	  (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
   "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
-  "not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "SI")])
+  "@
+    not{l}\t%k0
+    knotd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "x64,avx512bw")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "mode" "SI,SI")])
 
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
-	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
+	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
-  "@
-   not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "not{b}\t%0";
+    case 1:
+      return "not{l}\t%k0";
+    case 2:
+      if (TARGET_AVX512DQ)
+	return "knotb\t{%1, %0|%0, %1}";
+      return "knotw\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "1")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "2")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 1.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "1")
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
index 94422f36010..46d9351f275 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
index c68ad8cc1f7..fe13f4f33fc 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
index 49817097e26..114e03ee93d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512dq -O2" } */
+/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
index 7bb34d34d8d..79d37394b36 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512f -O2" } */
+/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovw\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
new file mode 100644
index 00000000000..2757bcaaf50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
@@ -0,0 +1,177 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
+
+#include <immintrin.h>
+__m512i
+foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
+
+__m512i
+foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kord" "1" } }  */
+
+__m512i
+foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
+}
+
+__m512i
+foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
+
+__m512i
+foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxord" "1" } }  */
+
+__m512i
+foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
+}
+
+__m512i
+foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  return _mm512_mask_add_epi8 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  return _mm512_mask_add_epi16 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  return _mm512_mask_add_epi32 (c, ~m1, a, d);
+}
+
+__m512i
+foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  return _mm512_mask_add_epi64 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotw" "4" } }  */
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
new file mode 100644
index 00000000000..277c5a98079
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-times "knotb" "2" } }  */
+/* { dg-final { scan-assembler-times "korb" "1" } }  */
+/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
+#include "bitwise_mask_op-1.c"
+
-- 
2.18.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-14  8:27 [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks Hongtao Liu
@ 2020-08-17 10:08 ` Uros Bizjak
  2020-08-19  2:26   ` Hongtao Liu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-17 10:08 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> Enable operator or/xor/and/andn/not for mask register, kxnor is not
> enabled since there's no corresponding instruction for general
> registers.
>
> gcc/
>         PR target/88808
>         * config/i386/i386.md: (*movsi_internal): Adjust constraints
>         for mask registers.
>         (*movhi_internal): Ditto.
>         (*movqi_internal): Ditto.
>         (*anddi_1): Support mask register operations
>         (*and<mode>_1): Ditto.
>         (*andqi_1): Ditto.
>         (*andn<mode>_1): Ditto.
>         (*<code><mode>_1): Ditto.
>         (*<code>qi_1): Ditto.
>         (*one_cmpl<mode>2_1): Ditto.
>         (*one_cmplsi2_1_zext): Ditto.
>         (*one_cmplqi2_1): Ditto.
>
> gcc/testsuite/
>         * gcc.target/i386/bitwise_mask_op-1.c: New test.
>         * gcc.target/i386/bitwise_mask_op-2.c: New test.
>         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
>         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
>         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
>         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.

index 74d207c3711..e8ad79d1b0a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2294,7 +2294,7 @@

 (define_insn "*movsi_internal"
   [(set (match_operand:SI 0 "nonimmediate_operand"
-    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
+    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
     (match_operand:SI 1 "general_operand"
     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"

I'd rather see *k everywhere, also with *movqi_internal and
*movhi_internal patterns. The "*" means that the allocator won't
allocate a mask register by default, but it will be used to optimize
moves. With the above change, you are risking that during integer
register pressure, the register allocator will allocate zero to a mask
register, and later "optimize" the move with a direct maskreg-intreg
move.

The current strategy is that only general registers get allocated for
integer modes. Let's keep it this way for now.

Otherwise, the patchset LGTM, but please test the suggested changes and repost.

BTW: Do you plan to remove mask operations from sse.md? ATM, they are
used to distinguish mask operations generated from builtins from
generic operations, so I'd like to keep them for a while. The drawback
is that they are not combined with other operations, but at the end
of the day, this is what the programmer asked for by using builtins.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-17 10:08 ` Uros Bizjak
@ 2020-08-19  2:26   ` Hongtao Liu
  2020-08-19  7:05     ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: Hongtao Liu @ 2020-08-19  2:26 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

[-- Attachment #1: Type: text/plain, Size: 3610 bytes --]

On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > enabled since there's no corresponding instruction for general
> > registers.
> >
> > gcc/
> >         PR target/88808
> >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> >         for mask registers.
> >         (*movhi_internal): Ditto.
> >         (*movqi_internal): Ditto.
> >         (*anddi_1): Support mask register operations
> >         (*and<mode>_1): Ditto.
> >         (*andqi_1): Ditto.
> >         (*andn<mode>_1): Ditto.
> >         (*<code><mode>_1): Ditto.
> >         (*<code>qi_1): Ditto.
> >         (*one_cmpl<mode>2_1): Ditto.
> >         (*one_cmplsi2_1_zext): Ditto.
> >         (*one_cmplqi2_1): Ditto.
> >
> > gcc/testsuite/
> >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
>
> index 74d207c3711..e8ad79d1b0a 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2294,7 +2294,7 @@
>
>  (define_insn "*movsi_internal"
>    [(set (match_operand:SI 0 "nonimmediate_operand"
> -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
>      (match_operand:SI 1 "general_operand"
>      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>
> I'd rather see *k everywhere, also with *movqi_internal and
> *movhi_internal patterns. The "*" means that the allocator won't
> allocate a mask register by default, but it will be used to optimize
> moves. With the above change, you are risking that during integer
> register pressure, the register allocator will allocate zero to a mask
> register, and later "optimize" the move with a direct maskreg-intreg
> move.
>
> The current strategy is that only general registers get allocated for
> integer modes. Let's keep it this way for now.
>

Yes. Although this change makes gcc.target/i386/avx512dq-pr88465.c and
gcc.target/i386/avx512f-pr88465.c fail, I think it's more reasonable not
to move zero into a mask register directly.

> Otherwise, the patchset LGTM, but please test the suggested changes and repost.
>
> BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> used to distinguish mask operations, generated from builtins from
> generic operations, so I'd like to keep them for a while. The drawback
> is, that they are not combined with other operations, but at the end
> of the day, this is what the programmer asked for by using builtins.

Agree, I prefer to keep them.

>
> Uros.

Bootstrap is OK, and regression testing is OK for the i386/x86-64 backend
(after adjusting the testcases).

Impact on SPEC2017 on SKL:

500.perlbench_r 0.00%
502.gcc_r 1.59%
505.mcf_r 1.49%
520.omnetpp_r 1.91%
523.xalancbmk_r -1.22%
525.x264_r 0.00%
531.deepsjeng_r 0.00%
541.leela_r -0.22%
548.exchange2_r 2.27%
557.xz_r 0.63%
INT geomean 0.64%

503.bwaves_r 3.68%
507.cactuBSSN_r -0.62%
508.namd_r 0.51%
510.parest_r -0.16%
511.povray_r 0.57%
519.lbm_r 0.50%
521.wrf_r 0.00%
526.blender_r 0.00%
527.cam4_r 0.00%
538.imagick_r -0.41%
544.nab_r 0.00%
549.fotonik3d_r -0.20%
554.roms_r 4.19%
FP geomean 0.66%

-- 
BR,
Hongtao

[-- Attachment #2: 0004-Enable-bitwise-operation-for-type-mask_V2.patch --]
[-- Type: text/x-patch, Size: 24005 bytes --]

From e546516449ec4ed9301b83a063efdefbf0f7e75a Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Thu, 13 Aug 2020 14:20:43 +0800
Subject: [PATCH 4/4] Enable bitwise operation for type mask.

Enable the or/xor/and/andn/not operators for mask registers.  kxnor is
not enabled since there's no corresponding instruction for general
registers.

gcc/
	PR target/88808
	* config/i386/i386.md: (*movsi_internal): Adjust constraints
	for mask registers.
	(*movhi_internal): Ditto.
	(*movqi_internal): Ditto.
	(*anddi_1): Support mask register operations
	(*and<mode>_1): Ditto.
	(*andqi_1): Ditto.
	(*andn<mode>_1): Ditto.
	(*<code><mode>_1): Ditto.
	(*<code>qi_1): Ditto.
	(*one_cmpl<mode>2_1): Ditto.
	(*one_cmplsi2_1_zext): Ditto.
	(*one_cmplqi2_1): Ditto.

gcc/testsuite/
	* gcc.target/i386/bitwise_mask_op-1.c: New test.
	* gcc.target/i386/bitwise_mask_op-2.c: New test.
	* gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
	* gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
	* gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
	* gcc.target/i386/avx512f-kmovw-5.c: Ditto.
	* gcc.target/i386/avx512dq-pr88465.c: Ditto.
	* gcc.target/i386/avx512f-pr88465.c: Ditto.
---
 gcc/config/i386/i386.md                       | 260 +++++++++++++-----
 .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
 .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
 .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
 .../gcc.target/i386/avx512dq-pr88465.c        |   4 +-
 .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
 .../gcc.target/i386/avx512f-pr88465.c         |   4 +-
 .../gcc.target/i386/bitwise_mask_op-1.c       | 177 ++++++++++++
 .../gcc.target/i386/bitwise_mask_op-2.c       |   7 +
 9 files changed, 380 insertions(+), 80 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 3a15941c3e8..4255b9a7a64 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2403,8 +2403,8 @@
 	   (symbol_ref "true")))])
 
 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
-	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,r,km,k,k,CBC"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k ,*r,*m,*k")
+	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2491,9 +2491,9 @@
 
 (define_insn "*movqi_internal"
   [(set (match_operand:QI 0 "nonimmediate_operand"
-			"=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
+			"=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
 	(match_operand:QI 1 "general_operand"
-			"Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
+			"Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   char buf[128];
@@ -9044,19 +9044,21 @@
 })
 
 (define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
 	(and:DI
-	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
-	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
+	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
+	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
   "@
    and{l}\t{%k2, %k0|%k0, %k2}
    and{q}\t{%2, %0|%0, %2}
    and{q}\t{%2, %0|%0, %2}
-   #"
-  [(set_attr "type" "alu,alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,*,0")
+   and{q}\t{%2, %0|%0, %2}
+   kandq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
+   (set_attr "type" "alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9064,7 +9066,7 @@
 		 (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "SI,DI,DI,SI")])
+   (set_attr "mode" "SI,DI,DI,SI,DI")])
 
 (define_insn_and_split "*anddi_1_btr"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
@@ -9130,17 +9132,25 @@
    (set_attr "mode" "SI")])
 
 (define_insn "*and<mode>_1"
-  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
-	(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
-		   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
+	(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
+		   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
   "@
    and{<imodesuffix>}\t{%2, %0|%0, %2}
    and{<imodesuffix>}\t{%2, %0|%0, %2}
-   #"
-  [(set_attr "type" "alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,0")
+   and{<imodesuffix>}\t{%2, %0|%0, %2}
+   kand<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "3")
+		 (if_then_else (eq_attr "mode" "SI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9148,20 +9158,39 @@
 		 (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "<MODE>,<MODE>,SI")])
+   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
 
 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		(match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		(match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, QImode, operands)"
-  "@
-   and{b}\t{%2, %0|%0, %2}
-   and{b}\t{%2, %0|%0, %2}
-   and{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+{
+  switch (which_alternative)
+    {
+     case 0:
+     case 1:
+       return "and{b}\t{%2, %0|%0, %2}";
+     case 2:
+       return "and{l}\t{%k2, %k0|%k0, %k2}";
+     case 3:
+       if (TARGET_AVX512DQ)
+	 return "kandb\t{%2, %1, %0|%0, %1, %2}";
+       return "kandw\t{%2, %1, %0|%0, %1, %2}";
+     default:
+       gcc_unreachable ();
+     }
+}
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "2")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "3")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -9539,28 +9568,53 @@
 })
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
 	(and:SWI48
-	  (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
-	  (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
+	  (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
+	  (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct, double")
+  "TARGET_BMI || TARGET_AVX512BW"
+  "@
+    andn\t{%2, %1, %0|%0, %1, %2}
+    andn\t{%2, %1, %0|%0, %1, %2}
+    kandn<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "bmi,bmi,avx512bw")
+   (set_attr "type" "bitmanip,bitmanip,msklog")
+   (set_attr "btver2_decode" "direct, double,*")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI12 0 "register_operand" "=r")
+  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
 	(and:SWI12
-	  (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
-	  (match_operand:SWI12 2 "register_operand" "r")))
+	  (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
+	  (match_operand:SWI12 2 "register_operand" "r,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct")
-   (set_attr "mode" "SI")])
+  "TARGET_BMI || TARGET_AVX512BW"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}";
+    case 1:
+      if (TARGET_AVX512DQ)
+	return "kandn<mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}";
+      return "kandnw\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+
+  [(set_attr "isa" "bmi,avx512f")
+   (set_attr "type" "bitmanip,msklog")
+   (set_attr "btver2_decode" "direct,*")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "0")
+		 (const_string "SI")
+	       (and (eq_attr "alternative" "1")
+		    (match_test "!TARGET_AVX512DQ"))
+		  (const_string "HI")
+	      ]
+	      (const_string "<MODE>")))])
 
 (define_insn "*andn_<mode>_ccno"
   [(set (reg FLAGS_REG)
@@ -9631,14 +9685,24 @@
 })
 
 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
 	(any_or:SWI248
-	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
-	 (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
+	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
+	 (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
-  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
-  [(set_attr "type" "alu")
+  "@
+    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+    <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+    k<logic><mskmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "2")
+		 (if_then_else (eq_attr "mode" "SI,DI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])
 
 (define_insn_and_split "*iordi_1_bts"
@@ -9711,17 +9775,37 @@
    (set_attr "mode" "SI")])
 
 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-		   (match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, QImode, operands)"
-  "@
-   <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+{
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+      return "<logic>{b}\t{%2, %0|%0, %2}";
+    case 2:
+      return "<logic>{l}\t{%k2, %k0|%k0, %k2}";
+    case 3:
+      if (TARGET_AVX512DQ)
+	return "k<logic>b\t{%2, %1, %0|%0, %1, %2}";
+      return "k<logic>w\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,*,avx512f")
+   (set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "2")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "3")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -10370,31 +10454,63 @@
   "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
 
 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
-	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
+	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
-  "not{<imodesuffix>}\t%0"
-  [(set_attr "type" "negnot")
+  "@
+  not{<imodesuffix>}\t%0
+  knot<mskmodesuffix>\t{%1, %0|%0, %1}"
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "1")
+		 (if_then_else (eq_attr "mode" "SI,DI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "negnot,msklog")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*one_cmplsi2_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=r,k")
 	(zero_extend:DI
-	  (not:SI (match_operand:SI 1 "register_operand" "0"))))]
+	  (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
   "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
-  "not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "SI")])
+  "@
+    not{l}\t%k0
+    knotd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "x64,avx512bw")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "mode" "SI,SI")])
 
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
-	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
+	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
-  "@
-   not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "not{b}\t%0";
+    case 1:
+      return "not{l}\t%k0";
+    case 2:
+      if (TARGET_AVX512DQ)
+	return "knotb\t{%1, %0|%0, %1}";
+      return "knotw\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "1")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "2")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
    ;; Potential partial reg stall on alternative 1.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "1")
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
index 94422f36010..46d9351f275 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
index c68ad8cc1f7..fe13f4f33fc 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
index 49817097e26..114e03ee93d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512dq -O2" } */
+/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-pr88465.c b/gcc/testsuite/gcc.target/i386/avx512dq-pr88465.c
index a11fd26a44e..4690e7ba9e8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-pr88465.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-pr88465.c
@@ -1,8 +1,8 @@
 /* PR target/88465 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512dq -mno-avx512bw" } */
-/* { dg-final { scan-assembler-times "kxorb\[ \t]" 1 } } */
-/* { dg-final { scan-assembler-times "kxnorb\[ \t]" 1 } } */
+/* { dg-final { scan-assembler-times "kxorb\[ \t]" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "kxnorb\[ \t]" 1 { xfail *-*-* } } } */
 
 void
 foo (void)
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
index 7bb34d34d8d..79d37394b36 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512f -O2" } */
+/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovw\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
 
 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88465.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88465.c
index e66ea64db02..b1ab9633522 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88465.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88465.c
@@ -1,8 +1,8 @@
 /* PR target/88465 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512f -mno-avx512dq -mno-avx512bw" } */
-/* { dg-final { scan-assembler-times "kxorw\[ \t]" 2 } } */
-/* { dg-final { scan-assembler-times "kxnorw\[ \t]" 1 } } */
+/* { dg-final { scan-assembler-times "kxorw\[ \t]" 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "kxnorw\[ \t]" 1 { xfail *-*-* } } } */
 
 void
 foo (void)
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
new file mode 100644
index 00000000000..2757bcaaf50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
@@ -0,0 +1,177 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
+
+#include <immintrin.h>
+__m512i
+foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
+
+__m512i
+foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kord" "1" } }  */
+
+__m512i
+foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
+}
+
+__m512i
+foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
+
+__m512i
+foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxord" "1" } }  */
+
+__m512i
+foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
+}
+
+__m512i
+foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxorw" "2" } }  */
+
+__m512i
+foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  return _mm512_mask_add_epi8 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  return _mm512_mask_add_epi16 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  return _mm512_mask_add_epi32 (c, ~m1, a, d);
+}
+
+__m512i
+foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  return _mm512_mask_add_epi64 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotw" "4" } }  */
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
new file mode 100644
index 00000000000..277c5a98079
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-times "knotb" "2" } }  */
+/* { dg-final { scan-assembler-times "korb" "1" } }  */
+/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
+#include "bitwise_mask_op-1.c"
+
-- 
2.18.1


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-19  2:26   ` Hongtao Liu
@ 2020-08-19  7:05     ` Uros Bizjak
  2020-08-20  7:24       ` Hongtao Liu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-19  7:05 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > enabled since there's no corresponding instruction for general
> > > registers.
> > >
> > > gcc/
> > >         PR target/88808
> > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > >         for mask registers.
> > >         (*movhi_internal): Ditto.
> > >         (*movqi_internal): Ditto.
> > >         (*anddi_1): Support mask register operations
> > >         (*and<mode>_1): Ditto.
> > >         (*andqi_1): Ditto.
> > >         (*andn<mode>_1): Ditto.
> > >         (*<code><mode>_1): Ditto.
> > >         (*<code>qi_1): Ditto.
> > >         (*one_cmpl<mode>2_1): Ditto.
> > >         (*one_cmplsi2_1_zext): Ditto.
> > >         (*one_cmplqi2_1): Ditto.
> > >
> > > gcc/testsuite/
> > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> >
> > index 74d207c3711..e8ad79d1b0a 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -2294,7 +2294,7 @@
> >
> >  (define_insn "*movsi_internal"
> >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> >      (match_operand:SI 1 "general_operand"
> >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> >
> > I'd rather see *k everywhere, also with *movqi_internal and
> > *movhi_internal patterns. The "*" means that the allocator won't
> > allocate a mask register by default, but it will be used to optimize
> > moves. With the above change, you are risking that during integer
> > register pressure, the register allocator will allocate zero to a mask
> > register, and later "optimize" the move with a direct maskreg-intreg
> > move.
> >
> > The current strategy is that only general registers get allocated for
> > integer modes. Let's keep it this way for now.
> >
>
> Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> move zero into mask register directly.

Although it would be nice if the register allocator was smart enough,
the current strategy is to introduce peephole2 patterns to fix these
problems, similar to [1]. These peepholes can be introduced in a
follow-up patch.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html

> > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> >
> > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > used to distinguish mask operations, generated from builtins from
> > generic operations, so I'd like to keep them for a while. The drawback
> > is, that they are not combined with other operations, but at the end
> > of the day, this is what the programmer asked for by using builtins.
>
> Agree, I prefer to keep them.

Thinking some more about the approach, it looks to me that the optimal
solution is a post-reload splitter that would convert "generic"
patterns to mask operations from sse.md. The mask operations don't set
flags, so we can substantially improve post reload scheduling of these
instructions by removing flags clobber.

So, simply add "#" to relevant alternatives of logic patterns and add
something like:

--cut here--
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 41c6dbfa668..ad49bdc7583 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1470,6 +1470,18 @@
           ]
           (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+       (any_logic:SWI1248_AVX512BW
+         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
+         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "kandn<mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
        (and:SWI1248_AVX512BW
--cut here--

and similar for kandn and knot in sse.md. You will have to add
mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
for example.

We don't lose anything, because all important transformations,
propagations and simplifications with these patterns happen before
reload.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-19  7:05     ` Uros Bizjak
@ 2020-08-20  7:24       ` Hongtao Liu
  2020-08-20  7:32         ` Hongtao Liu
  0 siblings, 1 reply; 39+ messages in thread
From: Hongtao Liu @ 2020-08-20  7:24 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > enabled since there's no corresponding instruction for general
> > > > registers.
> > > >
> > > > gcc/
> > > >         PR target/88808
> > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > >         for mask registers.
> > > >         (*movhi_internal): Ditto.
> > > >         (*movqi_internal): Ditto.
> > > >         (*anddi_1): Support mask register operations
> > > >         (*and<mode>_1): Ditto.
> > > >         (*andqi_1): Ditto.
> > > >         (*andn<mode>_1): Ditto.
> > > >         (*<code><mode>_1): Ditto.
> > > >         (*<code>qi_1): Ditto.
> > > >         (*one_cmpl<mode>2_1): Ditto.
> > > >         (*one_cmplsi2_1_zext): Ditto.
> > > >         (*one_cmplqi2_1): Ditto.
> > > >
> > > > gcc/testsuite/
> > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > >
> > > index 74d207c3711..e8ad79d1b0a 100644
> > > --- a/gcc/config/i386/i386.md
> > > +++ b/gcc/config/i386/i386.md
> > > @@ -2294,7 +2294,7 @@
> > >
> > >  (define_insn "*movsi_internal"
> > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > >      (match_operand:SI 1 "general_operand"
> > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > >
> > > I'd rather see *k everywhere, also with *movqi_internal and
> > > *movhi_internal patterns. The "*" means that the allocator won't
> > > allocate a mask register by default, but it will be used to optimize
> > > moves. With the above change, you are risking that during integer
> > > register pressure, the register allocator will allocate zero to a mask
> > > register, and later "optimize" the move with a direct maskreg-intreg
> > > move.
> > >
> > > The current strategy is that only general registers get allocated for
> > > integer modes. Let's keep it this way for now.
> > >
> >
> > Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> > gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> > move zero into mask register directly.
>
> Although it would be nice if the register allocator was smart enough,
> the current strategy is to introduce peephole2 patterns to fix these
> problems, similar to [1]. These peepholes can be introduced in a
> follow-up patch.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
>

peephole2 added.

> > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > >
> > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > used to distinguish mask operations, generated from builtins from
> > > generic operations, so I'd like to keep them for a while. The drawback
> > > is, that they are not combined with other operations, but at the end
> > > of the day, this is what the programmer asked for by using builtins.
> >
> > Agree, I prefer to keep them.
>
> Thinking some more about the approach, it looks to me that the optimal
> solution is a post-reload splitter that would convert "generic"
> patterns to mask operations from sse.md. The mask operations don't set
> flags, so we can substantially improve post reload scheduling of these
> instructions by removing flags clobber.
>
> So, simply add "#" to relevant alternatives of logic patterns and add
> something like:
>
> --cut here--
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 41c6dbfa668..ad49bdc7583 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1470,6 +1470,18 @@
>            ]
>            (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +       (any_logic:SWI1248_AVX512BW
> +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "kandn<mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>         (and:SWI1248_AVX512BW
> --cut here--
>
> and similar for kandn and knot in sse.md. You will have to add
> mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> for example.
>
> We don't lose anything, because all important transformations,
> propagations and simplifications with these patterns happen before
> reload.

define_splits are added for those bitwise operations.

>
> Uros.

Also added the part below, which makes gcc.target/i386/bitwise_mask_op-3.c pass:

-     must go into Q_REGS.  */
+     must go into Q_REGS or ALL_MASK_REGS.  */
   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
     {
       if (Q_CLASS_P (regclass))
        return regclass;
       else if (reg_class_subset_p (Q_REGS, regclass))
        return Q_REGS;
+      else if (MASK_CLASS_P (regclass))
+       return regclass;
       else
        return NO_REGS;


Update patch.


--
BR,
Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-20  7:24       ` Hongtao Liu
@ 2020-08-20  7:32         ` Hongtao Liu
  2020-08-20  7:40           ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: Hongtao Liu @ 2020-08-20  7:32 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Thu, Aug 20, 2020 at 3:24 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > > enabled since there's no corresponding instruction for general
> > > > > registers.
> > > > >
> > > > > gcc/
> > > > >         PR target/88808
> > > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > > >         for mask registers.
> > > > >         (*movhi_internal): Ditto.
> > > > >         (*movqi_internal): Ditto.
> > > > >         (*anddi_1): Support mask register operations
> > > > >         (*and<mode>_1): Ditto.
> > > > >         (*andqi_1): Ditto.
> > > > >         (*andn<mode>_1): Ditto.
> > > > >         (*<code><mode>_1): Ditto.
> > > > >         (*<code>qi_1): Ditto.
> > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > >         (*one_cmplqi2_1): Ditto.
> > > > >
> > > > > gcc/testsuite/
> > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > >
> > > > index 74d207c3711..e8ad79d1b0a 100644
> > > > --- a/gcc/config/i386/i386.md
> > > > +++ b/gcc/config/i386/i386.md
> > > > @@ -2294,7 +2294,7 @@
> > > >
> > > >  (define_insn "*movsi_internal"
> > > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > > >      (match_operand:SI 1 "general_operand"
> > > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > >
> > > > I'd rather see *k everywhere, also with *movqi_internal and
> > > > *movhi_internal patterns. The "*" means that the allocator won't
> > > > allocate a mask register by default, but it will be used to optimize
> > > > moves. With the above change, you are risking that during integer
> > > > register pressure, the register allocator will allocate zero to a mask
> > > > register, and later "optimize" the move with a direct maskreg-intreg
> > > > move.
> > > >
> > > > The current strategy is that only general registers get allocated for
> > > > integer modes. Let's keep it this way for now.
> > > >
> > >
> > > Yes,  though it would fail gcc.target/i386/avx512dq-pr88465.c and
> > > gcc.target/i386/avx512f-pr88465.c, i think it's more reasonable not to
> > > move zero into mask register directly.
> >
> > Although it would be nice if the register allocator was smart enough,
> > the current strategy is to introduce peephole2 patterns to fix these
> > problems, similar to [1]. These peepholes can be introduced in a
> > follow-up patch.
> >
> > [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
> >
>
> peephole2 added.
>
> > > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > > >
> > > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > > used to distinguish mask operations, generated from builtins from
> > > > generic operations, so I'd like to keep them for a while. The drawback
> > > > is, that they are not combined with other operations, but at the end
> > > > of the day, this is what the programmer asked for by using builtins.
> > >
> > > Agree, I prefer to keep them.
> >
> > Thinking some more about the approach, it looks to me that the optimal
> > solution is a post-reload splitter that would convert "generic"
> > patterns to mask operations from sse.md. The mask operations don't set
> > flags, so we can substantially improve post reload scheduling of these
> > instructions by removing flags clobber.
> >
> > So, simply add "#" to relevant alternatives of logic patterns and add
> > something like:
> >
> > --cut here--
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 41c6dbfa668..ad49bdc7583 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1470,6 +1470,18 @@
> >            ]
> >            (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +       (any_logic:SWI1248_AVX512BW
> > +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "kandn<mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >         (and:SWI1248_AVX512BW
> > --cut here--
> >
> > and similar for kandn and knot in sse.md. You will have to add
> > mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> > for example.
> >
> > We don't lose anything, because all important transformations,
> > propagations and simplifications with these patterns happen before
> > reload.
>
> define_splits are added for those bitwise operations.
>
> >
> > Uros.
>
> Also add bellow part which will pass gcc.target/i386/bitwise_mask_op-3.c
>
> -     must go into Q_REGS.  */
> +     must go into Q_REGS or ALL_MASK_REGS.  */
>    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
>      {
>        if (Q_CLASS_P (regclass))
>         return regclass;
>        else if (reg_class_subset_p (Q_REGS, regclass))
>         return Q_REGS;
> +      else if (MASK_CLASS_P (regclass))
> +       return regclass;
>        else
>         return NO_REGS;
>
>
> Update patch.
>
>
> --
> BR,
> Hongtao

The network is too slow to send mail with an attachment, so I have
copied the patch inline into this mail.

gcc/
        PR target/88808
        * config/i386/i386.c (ix86_preferred_reload_class): Allow
        QImode data to go into mask registers.
        * config/i386/i386.md (*movhi_internal): Adjust constraints
        for mask registers.
        (*movqi_internal): Ditto.
        (*anddi_1): Support mask register operations.
        (*and<mode>_1): Ditto.
        (*andqi_1): Ditto.
        (*andn<mode>_1): Ditto.
        (*<code><mode>_1): Ditto.
        (*<code>qi_1): Ditto.
        (*one_cmpl<mode>2_1): Ditto.
        (*one_cmplsi2_1_zext): Ditto.
        (*one_cmplqi2_1): Ditto.
        (define_peephole2): Move constant 0/-1 directly into mask
        registers.
        * config/i386/predicates.md (mask_reg_operand): New predicate.
        * config/i386/sse.md (define_split): Add post-reload splitters
        that would convert "generic" patterns to mask patterns.
        (*knotsi_1_zext): New define_insn.

gcc/testsuite/
        * gcc.target/i386/bitwise_mask_op-1.c: New test.
        * gcc.target/i386/bitwise_mask_op-2.c: New test.
        * gcc.target/i386/bitwise_mask_op-3.c: New test.
        * gcc.target/i386/avx512bw-pr88465.c: New testcase.
        * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
        * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
        * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
        * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
---
 gcc/config/i386/i386.c                        |   4 +-
 gcc/config/i386/i386.md                       | 209 ++++++++++++------
 gcc/config/i386/predicates.md                 |   5 +
 gcc/config/i386/sse.md                        |  59 +++++
 .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
 .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
 .../gcc.target/i386/avx512bw-pr88465.c        |  23 ++
 .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
 .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
 .../gcc.target/i386/bitwise_mask_op-1.c       | 178 +++++++++++++++
 .../gcc.target/i386/bitwise_mask_op-2.c       |   8 +
 .../gcc.target/i386/bitwise_mask_op-3.c       |  44 ++++
 12 files changed, 471 insertions(+), 67 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d71d6d55be6..e8a2182ceb0 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18407,13 +18407,15 @@ ix86_preferred_reload_class (rtx x,
reg_class_t regclass)
     return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;

   /* QImode constants are easy to load, but non-constant QImode data
-     must go into Q_REGS.  */
+     must go into Q_REGS or ALL_MASK_REGS.  */
   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
     {
       if (Q_CLASS_P (regclass))
         return regclass;
       else if (reg_class_subset_p (Q_REGS, regclass))
         return Q_REGS;
+      else if (MASK_CLASS_P (regclass))
+        return regclass;
       else
         return NO_REGS;
     }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 3a15941c3e8..676525fbc1f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2403,8 +2403,8 @@
            (symbol_ref "true")))])

 (define_insn "*movhi_internal"
-  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
-        (match_operand:HI 1 "general_operand"      "r
,rn,rm,rn,r,km,k,k,CBC"))]
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k
,*r,*m,*k")
+        (match_operand:HI 1 "general_operand"      "r
,rn,rm,rn,*r,*km,*k,*k,CBC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
@@ -2491,9 +2491,9 @@

 (define_insn "*movqi_internal"
   [(set (match_operand:QI 0 "nonimmediate_operand"
-                        "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
+                        "=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
         (match_operand:QI 1 "general_operand"
-                        "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
+                        "Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   char buf[128];
@@ -2624,6 +2624,19 @@
            ]
            (const_string "QI")))])

+/* Reload dislikes loading 0/-1 directly into mask registers.
+   Try to tidy things up here.  */
+(define_peephole2
+  [(set (match_operand:SWI 0 "general_reg_operand")
+        (match_operand:SWI 1 "immediate_operand"))
+   (set (match_operand:SWI 2 "mask_reg_operand")
+        (match_dup 0))]
+  "peep2_reg_dead_p (2, operands[0])
+   && (const0_operand (operands[1], <MODE>mode)
+       || (constm1_operand (operands[1], <MODE>mode)
+           && (<MODE_SIZE> > 1 || TARGET_AVX512DQ)))"
+  [(set (match_dup 2) (match_dup 1))])
+
 ;; Stores and loads of ax to arbitrary constant address.
 ;; We fake an second form of instruction to force reload to load address
 ;; into register when rax is not available
@@ -9044,19 +9057,21 @@
 })

 (define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
         (and:DI
-         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
-         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
+         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
+         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
   "@
    and{l}\t{%k2, %k0|%k0, %k2}
    and{q}\t{%2, %0|%0, %2}
    and{q}\t{%2, %0|%0, %2}
+   #
    #"
-  [(set_attr "type" "alu,alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,*,0")
+  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
+   (set_attr "type" "alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9064,7 +9079,7 @@
                  (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "SI,DI,DI,SI")])
+   (set_attr "mode" "SI,DI,DI,SI,DI")])

 (define_insn_and_split "*anddi_1_btr"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
@@ -9130,17 +9145,25 @@
    (set_attr "mode" "SI")])

 (define_insn "*and<mode>_1"
-  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
-        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
-                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
+        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
+                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
   "@
    and{<imodesuffix>}\t{%2, %0|%0, %2}
    and{<imodesuffix>}\t{%2, %0|%0, %2}
+   #
    #"
-  [(set_attr "type" "alu,alu,imovx")
-   (set_attr "length_immediate" "*,*,0")
+  [(set (attr "isa")
+        (cond [(eq_attr "alternative" "3")
+                 (if_then_else (eq_attr "mode" "SI")
+                   (const_string "avx512bw")
+                   (const_string "avx512f"))
+              ]
+              (const_string "*")))
+   (set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
    (set (attr "prefix_rex")
      (if_then_else
        (and (eq_attr "type" "imovx")
@@ -9148,20 +9171,28 @@
                  (match_operand 1 "ext_QIreg_operand")))
        (const_string "1")
        (const_string "*")))
-   (set_attr "mode" "<MODE>,<MODE>,SI")])
+   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])

 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-                (match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+                (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (AND, QImode, operands)"
   "@
    and{b}\t{%2, %0|%0, %2}
    and{b}\t{%2, %0|%0, %2}
-   and{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+   and{l}\t{%k2, %k0|%k0, %k2}
+   #"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "2")
+                 (const_string "SI")
+                (and (eq_attr "alternative" "3")
+                     (match_test "!TARGET_AVX512DQ"))
+                 (const_string "HI")
+               ]
+               (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -9539,28 +9570,42 @@
 })

 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
         (and:SWI48
-          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
-          (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
+          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
+          (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct, double")
+  "TARGET_BMI || TARGET_AVX512BW"
+  "@
+   andn\t{%2, %1, %0|%0, %1, %2}
+   andn\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set_attr "isa" "bmi,bmi,avx512bw")
+   (set_attr "type" "bitmanip,bitmanip,msklog")
+   (set_attr "btver2_decode" "direct, double,*")
    (set_attr "mode" "<MODE>")])

 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI12 0 "register_operand" "=r")
+  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
         (and:SWI12
-          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
-          (match_operand:SWI12 2 "register_operand" "r")))
+          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
+          (match_operand:SWI12 2 "register_operand" "r,k")))
    (clobber (reg:CC FLAGS_REG))]
-  "TARGET_BMI"
-  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
-  [(set_attr "type" "bitmanip")
-   (set_attr "btver2_decode" "direct")
-   (set_attr "mode" "SI")])
+  "TARGET_BMI || TARGET_AVX512BW"
+  "@
+   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   #"
+  [(set_attr "isa" "bmi,avx512f")
+   (set_attr "type" "bitmanip,msklog")
+   (set_attr "btver2_decode" "direct,*")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "0")
+                 (const_string "SI")
+               (and (eq_attr "alternative" "1")
+                    (match_test "!TARGET_AVX512DQ"))
+                  (const_string "HI")
+              ]
+              (const_string "<MODE>")))])

 (define_insn "*andn_<mode>_ccno"
   [(set (reg FLAGS_REG)
@@ -9631,14 +9676,24 @@
 })

 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
         (any_or:SWI248
-         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
-         (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
+         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
+         (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
-  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
-  [(set_attr "type" "alu")
+  "@
+   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
+   #"
+  [(set (attr "isa")
+        (cond [(eq_attr "alternative" "2")
+                 (if_then_else (eq_attr "mode" "SI,DI")
+                   (const_string "avx512bw")
+                   (const_string "avx512f"))
+              ]
+              (const_string "*")))
+   (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])

 (define_insn_and_split "*iordi_1_bts"
@@ -9711,17 +9766,26 @@
    (set_attr "mode" "SI")])

 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
-        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
-                   (match_operand:QI 2 "general_operand" "qn,m,rn")))
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+                   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
   "ix86_binary_operator_ok (<CODE>, QImode, operands)"
   "@
    <logic>{b}\t{%2, %0|%0, %2}
    <logic>{b}\t{%2, %0|%0, %2}
-   <logic>{l}\t{%k2, %k0|%k0, %k2}"
-  [(set_attr "type" "alu")
-   (set_attr "mode" "QI,QI,SI")
+   <logic>{l}\t{%k2, %k0|%k0, %k2}
+   #"
+  [(set_attr "isa" "*,*,*,avx512f")
+   (set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "2")
+                 (const_string "SI")
+                (and (eq_attr "alternative" "3")
+                     (match_test "!TARGET_AVX512DQ"))
+                 (const_string "HI")
+               ]
+               (const_string "QI")))
    ;; Potential partial reg stall on alternative 2.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "2")
@@ -10370,31 +10434,52 @@
   "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")

 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
-        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
+        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
-  "not{<imodesuffix>}\t%0"
-  [(set_attr "type" "negnot")
+  "@
+   not{<imodesuffix>}\t%0
+   #"
+  [(set (attr "isa")
+        (cond [(eq_attr "alternative" "2")
+                 (if_then_else (eq_attr "mode" "SI,DI")
+                   (const_string "avx512bw")
+                   (const_string "avx512f"))
+              ]
+              (const_string "*")))
+   (set_attr "type" "negnot,msklog")
    (set_attr "mode" "<MODE>")])

 (define_insn "*one_cmplsi2_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=r,k")
         (zero_extend:DI
-          (not:SI (match_operand:SI 1 "register_operand" "0"))))]
+          (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
   "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
-  "not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "SI")])
+  "@
+   not{l}\t%k0
+   #"
+  [(set_attr "isa" "x64,avx512bw")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "mode" "SI,SI")])

 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
-        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
+        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
   "@
    not{b}\t%0
-   not{l}\t%k0"
-  [(set_attr "type" "negnot")
-   (set_attr "mode" "QI,SI")
+   not{l}\t%k0
+   #"
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "1")
+                 (const_string "SI")
+                (and (eq_attr "alternative" "2")
+                     (match_test "!TARGET_AVX512DQ"))
+                 (const_string "HI")
+               ]
+               (const_string "QI")))
    ;; Potential partial reg stall on alternative 1.
    (set (attr "preferred_for_speed")
      (cond [(eq_attr "alternative" "1")
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 07e69d555c0..dd1b31479f5 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -87,6 +87,11 @@
   (and (match_code "reg")
        (match_test "REGNO (op) == FLAGS_REG")))

+;; True if the operand is a MASK register.
+(define_predicate "mask_reg_operand"
+  (and (match_code "reg")
+       (match_test "MASK_REGNO_P (REGNO (op))")))
+
 ;; Match a DI, SI, HI or QImode nonimmediate_operand.
 (define_special_predicate "int_nonimmediate_operand"
   (and (match_operand 0 "nonimmediate_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b6348de67cb..4372a9fd785 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1452,6 +1452,18 @@
   "TARGET_AVX512F
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))")

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+        (any_logic:SWI1248_AVX512BW
+          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
+          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "k<code><mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
         (any_logic:SWI1248_AVX512BW
@@ -1474,6 +1486,21 @@
            ]
            (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+        (and:SWI1248_AVX512BW
+          (not:SWI1248_AVX512BW
+            (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand"))
+          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (and:SWI1248_AVX512BW
+             (not:SWI1248_AVX512BW (match_dup 1))
+             (match_dup 2)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "kandn<mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
         (and:SWI1248_AVX512BW
@@ -1520,6 +1547,16 @@
            ]
            (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
+        (not:SWI1248_AVX512BW
+          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")))]
+  "TARGET_AVX512F && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (not:SWI1248_AVX512BW (match_dup 1)))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "knot<mode>"
   [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
         (not:SWI1248_AVX512BW
@@ -1541,6 +1578,28 @@
            ]
            (const_string "<MODE>")))])

+(define_split
+  [(set (match_operand:DI 0 "mask_reg_operand")
+        (zero_extend:DI
+          (not:DI (match_operand:SI 1 "mask_reg_operand"))))]
+  "TARGET_AVX512BW && reload_completed"
+  [(parallel
+     [(set (match_dup 0)
+           (zero_extend:DI
+             (not:SI (match_dup 1))))
+      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
+(define_insn "*knotsi_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+        (zero_extend:DI
+          (not:SI (match_operand:SI 1 "register_operand" "k"))))
+   (unspec [(const_int 0)] UNSPEC_MASKOP)]
+  "TARGET_AVX512BW"
+  "knotd\t{%1, %0|%0, %1}";
+  [(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "SI")])
+
 (define_insn "kadd<mode>"
   [(set (match_operand:SWI1248_AVX512BWDQ2 0 "register_operand" "=k")
         (plus:SWI1248_AVX512BWDQ2
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
index 94422f36010..46d9351f275 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>

diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
index c68ad8cc1f7..fe13f4f33fc 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -O2" } */
-/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "kunpckwd\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>

diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
new file mode 100644
index 00000000000..8e34bf45365
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
@@ -0,0 +1,23 @@
+/* PR target/88465 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "kxor\[qd\]\[ \t]" 2 } } */
+/* { dg-final { scan-assembler-times "kxnor\[dq\]\[ \t]" 2 } } */
+
+void
+foo (void)
+{
+  unsigned int k = 0;
+  __asm volatile ("" : : "k" (k));
+  k = -1;
+  __asm volatile ("" : : "k" (k));
+}
+
+void
+bar (void)
+{
+  unsigned long long k = 0;
+  __asm volatile ("" : : "k" (k));
+  k = -1;
+  __asm volatile ("" : : "k" (k));
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
index 49817097e26..114e03ee93d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512dq -O2" } */
+/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovb\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
index 7bb34d34d8d..79d37394b36 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512f -O2" } */
+/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
 /* { dg-final { scan-assembler-times "kmovw\[
\\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */

 #include <immintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
new file mode 100644
index 00000000000..61f71ab8b23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
@@ -0,0 +1,178 @@
+/* PR target/88808  */
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
+
+#include <immintrin.h>
+__m512i
+foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
+
+__m512i
+foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kord" "1" } }  */
+
+__m512i
+foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
+}
+
+__m512i
+foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
+
+__m512i
+foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "kxord" "1" } }  */
+
+__m512i
+foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
+}
+
+__m512i
+foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
+}
+
+/* { dg-final { scan-assembler-times "korw" "2" } }  */
+
+__m512i
+foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
+}
+
+__m512i
+foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
+  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
+  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
+  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
+  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
+}
+
+__m512i
+foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
+  return _mm512_mask_add_epi8 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
+  return _mm512_mask_add_epi16 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
+
+__m512i
+foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
+  return _mm512_mask_add_epi32 (c, ~m1, a, d);
+}
+
+__m512i
+foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
+  return _mm512_mask_add_epi64 (c, ~m1, a, d);
+}
+
+/* { dg-final { scan-assembler-times "knotw" "4" } }  */
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
new file mode 100644
index 00000000000..850f0b42652
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
@@ -0,0 +1,8 @@
+/* PR target/88808  */
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-times "knotb" "2" } }  */
+/* { dg-final { scan-assembler-times "korb" "1" } }  */
+/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
+#include "bitwise_mask_op-1.c"
+
diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
new file mode 100644
index 00000000000..18bf4f0d768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
@@ -0,0 +1,44 @@
+/* PR target/88808  */
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512dq -O2" } */
+
+#include <immintrin.h>
+volatile __mmask8 foo;
+void
+foo_orb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 | m2;
+}
+
+/* { dg-final { scan-assembler-times "korb\[\t \]" "1" } }  */
+
+void
+foo_xorb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 ^ m2;
+}
+
+/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" } }  */
+
+void
+foo_andb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 & m2;
+}
+
+void
+foo_andnb (__m512i a, __m512i b)
+{
+  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
+  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
+  foo = m1 & ~m2;
+}
+
+/* { dg-final { scan-assembler-times "knotb\[\t \]" "1" } }  */
+/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4"} }  */
-- 
2.18.1


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-20  7:32         ` Hongtao Liu
@ 2020-08-20  7:40           ` Uros Bizjak
  2020-08-20  7:45             ` Hongtao Liu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-20  7:40 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Thu, Aug 20, 2020 at 9:31 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Aug 20, 2020 at 3:24 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > > > enabled since there's no corresponding instruction for general
> > > > > > registers.
> > > > > >
> > > > > > gcc/
> > > > > >         PR target/88808
> > > > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > > > >         for mask registers.
> > > > > >         (*movhi_internal): Ditto.
> > > > > >         (*movqi_internal): Ditto.
> > > > > >         (*anddi_1): Support mask register operations
> > > > > >         (*and<mode>_1): Ditto.
> > > > > >         (*andqi_1): Ditto.
> > > > > >         (*andn<mode>_1): Ditto.
> > > > > >         (*<code><mode>_1): Ditto.
> > > > > >         (*<code>qi_1): Ditto.
> > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > >
> > > > > index 74d207c3711..e8ad79d1b0a 100644
> > > > > --- a/gcc/config/i386/i386.md
> > > > > +++ b/gcc/config/i386/i386.md
> > > > > @@ -2294,7 +2294,7 @@
> > > > >
> > > > >  (define_insn "*movsi_internal"
> > > > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > > > >      (match_operand:SI 1 "general_operand"
> > > > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > > >
> > > > > I'd rather see *k everywhere, also with *movqi_internal and
> > > > > *movhi_internal patterns. The "*" means that the allocator won't
> > > > > allocate a mask register by default, but it will be used to optimize
> > > > > moves. With the above change, you are risking that during integer
> > > > > register pressure, the register allocator will allocate zero to a mask
> > > > > register, and later "optimize" the move with a direct maskreg-intreg
> > > > > move.
> > > > >
> > > > > The current strategy is that only general registers get allocated for
> > > > > integer modes. Let's keep it this way for now.
> > > > >
> > > >
> > > > Yes. Although it would fail gcc.target/i386/avx512dq-pr88465.c and
> > > > gcc.target/i386/avx512f-pr88465.c, I think it's more reasonable not to
> > > > move zero into a mask register directly.
> > >
> > > Although it would be nice if the register allocator was smart enough,
> > > the current strategy is to introduce peephole2 patterns to fix these
> > > problems, similar to [1]. These peepholes can be introduced in a
> > > follow-up patch.
> > >
> > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
> > >
> >
> > peephole2 added.
> >
> > > > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > > > >
> > > > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > > > used to distinguish mask operations, generated from builtins from
> > > > > generic operations, so I'd like to keep them for a while. The drawback
> > > > > is, that they are not combined with other operations, but at the end
> > > > > of the day, this is what the programmer asked for by using builtins.
> > > >
> > > > Agree, I prefer to keep them.
> > >
> > > Thinking some more about the approach, it looks to me that the optimal
> > > solution is a post-reload splitter that would convert "generic"
> > > patterns to mask operations from sse.md. The mask operations don't set
> > > flags, so we can substantially improve post reload scheduling of these
> > > instructions by removing flags clobber.
> > >
> > > So, simply add "#" to relevant alternatives of logic patterns and add
> > > something like:
> > >
> > > --cut here--
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 41c6dbfa668..ad49bdc7583 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -1470,6 +1470,18 @@
> > >            ]
> > >            (const_string "<MODE>")))])
> > >
> > > +(define_split
> > > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > > +       (any_logic:SWI1248_AVX512BW
> > > +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > > +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > > +   (clobber (reg:CC FLAGS_REG))]
> > > +  "TARGET_AVX512F && reload_completed"
> > > +  [(parallel
> > > +     [(set (match_dup 0)
> > > +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > > +
> > >  (define_insn "kandn<mode>"
> > >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> > >         (and:SWI1248_AVX512BW
> > > --cut here--
> > >
> > > and similar for kandn and knot in sse.md. You will have to add
> > > mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> > > for example.
> > >
> > > We don't lose anything, because all important transformations,
> > > propagations and simplifications with these patterns happen before
> > > reload.
> >
> > define_splits are added for those bitwise operations.
> >
> > >
> > > Uros.
> >
> > Also added the part below, which makes gcc.target/i386/bitwise_mask_op-3.c pass.
> >
> > -     must go into Q_REGS.  */
> > +     must go into Q_REGS or ALL_MASK_REGS.  */
> >    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
> >      {
> >        if (Q_CLASS_P (regclass))
> >         return regclass;
> >        else if (reg_class_subset_p (Q_REGS, regclass))
> >         return Q_REGS;
> > +      else if (MASK_CLASS_P (regclass))
> > +       return regclass;
> >        else
> >         return NO_REGS;
> >
> >
> > Update patch.
> >
> >
> > --
> > BR,
> > Hongtao
>
> The network is too slow to send mail with an attachment, so I have copied
> the patch into this mail.
>
> gcc/
>         PR target/88808
>         * config/i386/i386.c (ix86_preferred_reload_class): Allow
>         QImode data to go into mask registers.
>         * config/i386/i386.md: (*movhi_internal): Adjust constraints
>         for mask registers.
>         (*movqi_internal): Ditto.
>         (*anddi_1): Support mask register operations
>         (*and<mode>_1): Ditto.
>         (*andqi_1): Ditto.
>         (*andn<mode>_1): Ditto.
>         (*<code><mode>_1): Ditto.
>         (*<code>qi_1): Ditto.
>         (*one_cmpl<mode>2_1): Ditto.
>         (*one_cmplsi2_1_zext): Ditto.
>         (*one_cmplqi2_1): Ditto.
>         (define_peephole2): Move constant 0/-1 directly into mask
>         registers.
>         * config/i386/predicates.md (mask_reg_operand): New predicate.
>         * config/i386/sse.md (define_split): Add post-reload splitters
>         that would convert "generic" patterns to mask patterns.
>         (*knotsi_1_zext): New define_insn.
>
> gcc/testsuite/
>         * gcc.target/i386/bitwise_mask_op-1.c: New test.
>         * gcc.target/i386/bitwise_mask_op-2.c: New test.
>         * gcc.target/i386/bitwise_mask_op-3.c: New test.
>         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
>         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
>         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
>         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
>         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.

A little nit: please put the new splitters after the instruction pattern.

OK for the whole patch set with the above change,

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c                        |   4 +-
>  gcc/config/i386/i386.md                       | 209 ++++++++++++------
>  gcc/config/i386/predicates.md                 |   5 +
>  gcc/config/i386/sse.md                        |  59 +++++
>  .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
>  .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
>  .../gcc.target/i386/avx512bw-pr88465.c        |  23 ++
>  .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
>  .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
>  .../gcc.target/i386/bitwise_mask_op-1.c       | 178 +++++++++++++++
>  .../gcc.target/i386/bitwise_mask_op-2.c       |   8 +
>  .../gcc.target/i386/bitwise_mask_op-3.c       |  44 ++++
>  12 files changed, 471 insertions(+), 67 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index d71d6d55be6..e8a2182ceb0 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -18407,13 +18407,15 @@ ix86_preferred_reload_class (rtx x,
> reg_class_t regclass)
>      return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
>
>    /* QImode constants are easy to load, but non-constant QImode data
> -     must go into Q_REGS.  */
> +     must go into Q_REGS or ALL_MASK_REGS.  */
>    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
>      {
>        if (Q_CLASS_P (regclass))
>          return regclass;
>        else if (reg_class_subset_p (Q_REGS, regclass))
>          return Q_REGS;
> +      else if (MASK_CLASS_P (regclass))
> +        return regclass;
>        else
>          return NO_REGS;
>      }
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 3a15941c3e8..676525fbc1f 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2403,8 +2403,8 @@
>             (symbol_ref "true")))])
>
>  (define_insn "*movhi_internal"
> -  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
> -        (match_operand:HI 1 "general_operand"      "r
> ,rn,rm,rn,r,km,k,k,CBC"))]
> +  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k
> ,*r,*m,*k")
> +        (match_operand:HI 1 "general_operand"      "r
> ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>  {
>    switch (get_attr_type (insn))
> @@ -2491,9 +2491,9 @@
>
>  (define_insn "*movqi_internal"
>    [(set (match_operand:QI 0 "nonimmediate_operand"
> -                        "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
> +                        "=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
>          (match_operand:QI 1 "general_operand"
> -                        "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
> +                        "Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>  {
>    char buf[128];
> @@ -2624,6 +2624,19 @@
>             ]
>             (const_string "QI")))])
>
> +/* Reload dislikes loading 0/-1 directly into mask registers.
> +   Try to tidy things up here.  */
> +(define_peephole2
> +  [(set (match_operand:SWI 0 "general_reg_operand")
> +        (match_operand:SWI 1 "immediate_operand"))
> +   (set (match_operand:SWI 2 "mask_reg_operand")
> +        (match_dup 0))]
> +  "peep2_reg_dead_p (2, operands[0])
> +   && (const0_operand (operands[1], <MODE>mode)
> +       || (constm1_operand (operands[1], <MODE>mode)
> +           && (<MODE_SIZE> > 1 || TARGET_AVX512DQ)))"
> +  [(set (match_dup 2) (match_dup 1))])
> +
>  ;; Stores and loads of ax to arbitrary constant address.
>  ;; We fake an second form of instruction to force reload to load address
>  ;; into register when rax is not available
> @@ -9044,19 +9057,21 @@
>  })
>
>  (define_insn "*anddi_1"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
>          (and:DI
> -         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
> -         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
> +         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
> +         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
>    "@
>     and{l}\t{%k2, %k0|%k0, %k2}
>     and{q}\t{%2, %0|%0, %2}
>     and{q}\t{%2, %0|%0, %2}
> +   #
>     #"
> -  [(set_attr "type" "alu,alu,alu,imovx")
> -   (set_attr "length_immediate" "*,*,*,0")
> +  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
> +   (set_attr "type" "alu,alu,alu,imovx,msklog")
> +   (set_attr "length_immediate" "*,*,*,0,*")
>     (set (attr "prefix_rex")
>       (if_then_else
>         (and (eq_attr "type" "imovx")
> @@ -9064,7 +9079,7 @@
>                   (match_operand 1 "ext_QIreg_operand")))
>         (const_string "1")
>         (const_string "*")))
> -   (set_attr "mode" "SI,DI,DI,SI")])
> +   (set_attr "mode" "SI,DI,DI,SI,DI")])
>
>  (define_insn_and_split "*anddi_1_btr"
>    [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
> @@ -9130,17 +9145,25 @@
>     (set_attr "mode" "SI")])
>
>  (define_insn "*and<mode>_1"
> -  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
> -        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
> -                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
> +  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
> +        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
> +                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
>    "@
>     and{<imodesuffix>}\t{%2, %0|%0, %2}
>     and{<imodesuffix>}\t{%2, %0|%0, %2}
> +   #
>     #"
> -  [(set_attr "type" "alu,alu,imovx")
> -   (set_attr "length_immediate" "*,*,0")
> +  [(set (attr "isa")
> +        (cond [(eq_attr "alternative" "3")
> +                 (if_then_else (eq_attr "mode" "SI")
> +                   (const_string "avx512bw")
> +                   (const_string "avx512f"))
> +              ]
> +              (const_string "*")))
> +   (set_attr "type" "alu,alu,imovx,msklog")
> +   (set_attr "length_immediate" "*,*,0,*")
>     (set (attr "prefix_rex")
>       (if_then_else
>         (and (eq_attr "type" "imovx")
> @@ -9148,20 +9171,28 @@
>                   (match_operand 1 "ext_QIreg_operand")))
>         (const_string "1")
>         (const_string "*")))
> -   (set_attr "mode" "<MODE>,<MODE>,SI")])
> +   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
>
>  (define_insn "*andqi_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> -        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> -                (match_operand:QI 2 "general_operand" "qn,m,rn")))
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> +        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> +                (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (AND, QImode, operands)"
>    "@
>     and{b}\t{%2, %0|%0, %2}
>     and{b}\t{%2, %0|%0, %2}
> -   and{l}\t{%k2, %k0|%k0, %k2}"
> -  [(set_attr "type" "alu")
> -   (set_attr "mode" "QI,QI,SI")
> +   and{l}\t{%k2, %k0|%k0, %k2}
> +   #"
> +  [(set_attr "type" "alu,alu,alu,msklog")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "2")
> +                 (const_string "SI")
> +                (and (eq_attr "alternative" "3")
> +                     (match_test "!TARGET_AVX512DQ"))
> +                 (const_string "HI")
> +               ]
> +               (const_string "QI")))
>     ;; Potential partial reg stall on alternative 2.
>     (set (attr "preferred_for_speed")
>       (cond [(eq_attr "alternative" "2")
> @@ -9539,28 +9570,42 @@
>  })
>
>  (define_insn "*andn<mode>_1"
> -  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
> +  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
>          (and:SWI48
> -          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
> -          (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
> +          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
> +          (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
>     (clobber (reg:CC FLAGS_REG))]
> -  "TARGET_BMI"
> -  "andn\t{%2, %1, %0|%0, %1, %2}"
> -  [(set_attr "type" "bitmanip")
> -   (set_attr "btver2_decode" "direct, double")
> +  "TARGET_BMI || TARGET_AVX512BW"
> +  "@
> +   andn\t{%2, %1, %0|%0, %1, %2}
> +   andn\t{%2, %1, %0|%0, %1, %2}
> +   #"
> +  [(set_attr "isa" "bmi,bmi,avx512bw")
> +   (set_attr "type" "bitmanip,bitmanip,msklog")
> +   (set_attr "btver2_decode" "direct, double,*")
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*andn<mode>_1"
> -  [(set (match_operand:SWI12 0 "register_operand" "=r")
> +  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
>          (and:SWI12
> -          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
> -          (match_operand:SWI12 2 "register_operand" "r")))
> +          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
> +          (match_operand:SWI12 2 "register_operand" "r,k")))
>     (clobber (reg:CC FLAGS_REG))]
> -  "TARGET_BMI"
> -  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
> -  [(set_attr "type" "bitmanip")
> -   (set_attr "btver2_decode" "direct")
> -   (set_attr "mode" "SI")])
> +  "TARGET_BMI || TARGET_AVX512BW"
> +  "@
> +   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
> +   #"
> +  [(set_attr "isa" "bmi,avx512f")
> +   (set_attr "type" "bitmanip,msklog")
> +   (set_attr "btver2_decode" "direct,*")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "0")
> +                 (const_string "SI")
> +               (and (eq_attr "alternative" "1")
> +                    (match_test "!TARGET_AVX512DQ"))
> +                  (const_string "HI")
> +              ]
> +              (const_string "<MODE>")))])
>
>  (define_insn "*andn_<mode>_ccno"
>    [(set (reg FLAGS_REG)
> @@ -9631,14 +9676,24 @@
>  })
>
>  (define_insn "*<code><mode>_1"
> -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
>          (any_or:SWI248
> -         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
> -         (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
> +         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> +         (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> -  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
> -  [(set_attr "type" "alu")
> +  "@
> +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> +   #"
> +  [(set (attr "isa")
> +        (cond [(eq_attr "alternative" "2")
> +                 (if_then_else (eq_attr "mode" "SI,DI")
> +                   (const_string "avx512bw")
> +                   (const_string "avx512f"))
> +              ]
> +              (const_string "*")))
> +   (set_attr "type" "alu, alu, msklog")
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn_and_split "*iordi_1_bts"
> @@ -9711,17 +9766,26 @@
>     (set_attr "mode" "SI")])
>
>  (define_insn "*<code>qi_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> -        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> -                   (match_operand:QI 2 "general_operand" "qn,m,rn")))
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> +        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> +                   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
>     (clobber (reg:CC FLAGS_REG))]
>    "ix86_binary_operator_ok (<CODE>, QImode, operands)"
>    "@
>     <logic>{b}\t{%2, %0|%0, %2}
>     <logic>{b}\t{%2, %0|%0, %2}
> -   <logic>{l}\t{%k2, %k0|%k0, %k2}"
> -  [(set_attr "type" "alu")
> -   (set_attr "mode" "QI,QI,SI")
> +   <logic>{l}\t{%k2, %k0|%k0, %k2}
> +   #"
> +  [(set_attr "isa" "*,*,*,avx512f")
> +   (set_attr "type" "alu,alu,alu,msklog")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "2")
> +                 (const_string "SI")
> +                (and (eq_attr "alternative" "3")
> +                     (match_test "!TARGET_AVX512DQ"))
> +                 (const_string "HI")
> +               ]
> +               (const_string "QI")))
>     ;; Potential partial reg stall on alternative 2.
>     (set (attr "preferred_for_speed")
>       (cond [(eq_attr "alternative" "2")
> @@ -10370,31 +10434,52 @@
>    "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
>
>  (define_insn "*one_cmpl<mode>2_1"
> -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
> -        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
> +        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
>    "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
> -  "not{<imodesuffix>}\t%0"
> -  [(set_attr "type" "negnot")
> +  "@
> +   not{<imodesuffix>}\t%0
> +   #"
> +  [(set (attr "isa")
> +        (cond [(eq_attr "alternative" "2")
> +                 (if_then_else (eq_attr "mode" "SI,DI")
> +                   (const_string "avx512bw")
> +                   (const_string "avx512f"))
> +              ]
> +              (const_string "*")))
> +   (set_attr "type" "negnot,msklog")
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*one_cmplsi2_1_zext"
> -  [(set (match_operand:DI 0 "register_operand" "=r")
> +  [(set (match_operand:DI 0 "register_operand" "=r,k")
>          (zero_extend:DI
> -          (not:SI (match_operand:SI 1 "register_operand" "0"))))]
> +          (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
>    "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
> -  "not{l}\t%k0"
> -  [(set_attr "type" "negnot")
> -   (set_attr "mode" "SI")])
> +  "@
> +   not{l}\t%k0
> +   #"
> +  [(set_attr "isa" "x64,avx512bw")
> +   (set_attr "type" "negnot,msklog")
> +   (set_attr "mode" "SI,SI")])
>
>  (define_insn "*one_cmplqi2_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
> -        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
> +        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
>    "ix86_unary_operator_ok (NOT, QImode, operands)"
>    "@
>     not{b}\t%0
> -   not{l}\t%k0"
> -  [(set_attr "type" "negnot")
> -   (set_attr "mode" "QI,SI")
> +   not{l}\t%k0
> +   #"
> +  [(set_attr "isa" "*,*,avx512f")
> +   (set_attr "type" "negnot,negnot,msklog")
> +   (set (attr "mode")
> +        (cond [(eq_attr "alternative" "1")
> +                 (const_string "SI")
> +                (and (eq_attr "alternative" "2")
> +                     (match_test "!TARGET_AVX512DQ"))
> +                 (const_string "HI")
> +               ]
> +               (const_string "QI")))
>     ;; Potential partial reg stall on alternative 1.
>     (set (attr "preferred_for_speed")
>       (cond [(eq_attr "alternative" "1")
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index 07e69d555c0..dd1b31479f5 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -87,6 +87,11 @@
>    (and (match_code "reg")
>         (match_test "REGNO (op) == FLAGS_REG")))
>
> +;; True if the operand is a MASK register.
> +(define_predicate "mask_reg_operand"
> +  (and (match_code "reg")
> +       (match_test "MASK_REGNO_P (REGNO (op))")))
> +
>  ;; Match a DI, SI, HI or QImode nonimmediate_operand.
>  (define_special_predicate "int_nonimmediate_operand"
>    (and (match_operand 0 "nonimmediate_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index b6348de67cb..4372a9fd785 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1452,6 +1452,18 @@
>    "TARGET_AVX512F
>     && !(MEM_P (operands[0]) && MEM_P (operands[1]))")
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +        (any_logic:SWI1248_AVX512BW
> +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "k<code><mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>          (any_logic:SWI1248_AVX512BW
> @@ -1474,6 +1486,21 @@
>             ]
>             (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +        (and:SWI1248_AVX512BW
> +          (not:SWI1248_AVX512BW
> +            (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand"))
> +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (and:SWI1248_AVX512BW
> +             (not:SWI1248_AVX512BW (match_dup 1))
> +             (match_dup 2)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "kandn<mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>          (and:SWI1248_AVX512BW
> @@ -1520,6 +1547,16 @@
>             ]
>             (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> +        (not:SWI1248_AVX512BW
> +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")))]
> +  "TARGET_AVX512F && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (not:SWI1248_AVX512BW (match_dup 1)))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
>  (define_insn "knot<mode>"
>    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
>          (not:SWI1248_AVX512BW
> @@ -1541,6 +1578,28 @@
>             ]
>             (const_string "<MODE>")))])
>
> +(define_split
> +  [(set (match_operand:DI 0 "mask_reg_operand")
> +        (zero_extend:DI
> +          (not:DI (match_operand:SI 1 "mask_reg_operand"))))]
> +  "TARGET_AVX512BW && reload_completed"
> +  [(parallel
> +     [(set (match_dup 0)
> +           (zero_extend:DI
> +             (not:SI (match_dup 1))))
> +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> +
> +(define_insn "*knotsi_1_zext"
> +  [(set (match_operand:DI 0 "register_operand" "=k")
> +        (zero_extend:DI
> +          (not:SI (match_operand:SI 1 "register_operand" "k"))))
> +   (unspec [(const_int 0)] UNSPEC_MASKOP)]
> +  "TARGET_AVX512BW"
> +  "knotd\t{%1, %0|%0, %1}";
> +  [(set_attr "type" "msklog")
> +   (set_attr "prefix" "vex")
> +   (set_attr "mode" "SI")])
> +
>  (define_insn "kadd<mode>"
>    [(set (match_operand:SWI1248_AVX512BWDQ2 0 "register_operand" "=k")
>          (plus:SWI1248_AVX512BWDQ2
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> index 94422f36010..46d9351f275 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bw -O2" } */
> -/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> index c68ad8cc1f7..fe13f4f33fc 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512bw -O2" } */
> -/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "kunpckwd\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> new file mode 100644
> index 00000000000..8e34bf45365
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> @@ -0,0 +1,23 @@
> +/* PR target/88465 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx512bw" } */
> +/* { dg-final { scan-assembler-times "kxor\[qd\]\[ \t]" 2 } } */
> +/* { dg-final { scan-assembler-times "kxnor\[dq\]\[ \t]" 2 } } */
> +
> +void
> +foo (void)
> +{
> +  unsigned int k = 0;
> +  __asm volatile ("" : : "k" (k));
> +  k = -1;
> +  __asm volatile ("" : : "k" (k));
> +}
> +
> +void
> +bar (void)
> +{
> +  unsigned long long k = 0;
> +  __asm volatile ("" : : "k" (k));
> +  k = -1;
> +  __asm volatile ("" : : "k" (k));
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> index 49817097e26..114e03ee93d 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-mavx512dq -O2" } */
> +/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
>  /* { dg-final { scan-assembler-times "kmovb\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> index 7bb34d34d8d..79d37394b36 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-mavx512f -O2" } */
> +/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
>  /* { dg-final { scan-assembler-times "kmovw\[
> \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
>
>  #include <immintrin.h>
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> new file mode 100644
> index 00000000000..61f71ab8b23
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> @@ -0,0 +1,178 @@
> +/* PR target/88808  */
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
> +
> +#include <immintrin.h>
> +__m512i
> +foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
> +
> +__m512i
> +foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "kord" "1" } }  */
> +
> +__m512i
> +foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
> +}
> +
> +__m512i
> +foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> +
> +__m512i
> +foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
> +
> +__m512i
> +foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "kxord" "1" } }  */
> +
> +__m512i
> +foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
> +}
> +
> +__m512i
> +foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> +
> +__m512i
> +foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
> +}
> +
> +__m512i
> +foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> +  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> +  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> +  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> +  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
> +}
> +
> +__m512i
> +foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> +  return _mm512_mask_add_epi8 (c, ~m1, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
> +
> +__m512i
> +foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> +  return _mm512_mask_add_epi16 (c, ~m1, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
> +
> +__m512i
> +foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> +  return _mm512_mask_add_epi32 (c, ~m1, a, d);
> +}
> +
> +__m512i
> +foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> +  return _mm512_mask_add_epi64 (c, ~m1, a, d);
> +}
> +
> +/* { dg-final { scan-assembler-times "knotw" "4" } }  */
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> new file mode 100644
> index 00000000000..850f0b42652
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> @@ -0,0 +1,8 @@
> +/* PR target/88808  */
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> +/* { dg-final { scan-assembler-times "knotb" "2" } }  */
> +/* { dg-final { scan-assembler-times "korb" "1" } }  */
> +/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
> +#include "bitwise_mask_op-1.c"
> +
> diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> new file mode 100644
> index 00000000000..18bf4f0d768
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> @@ -0,0 +1,44 @@
> +/* PR target/88808  */
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> +
> +#include <immintrin.h>
> +volatile __mmask8 foo;
> +void
> +foo_orb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 | m2;
> +}
> +
> +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" } }  */
> +
> +void
> +foo_xorb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 ^ m2;
> +}
> +
> +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" } }  */
> +
> +void
> +foo_andb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 & m2;
> +}
> +
> +void
> +foo_andnb (__m512i a, __m512i b)
> +{
> +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> +  foo = m1 & ~m2;
> +}
> +
> +/* { dg-final { scan-assembler-times "knotb\[\t \]" "1" } }  */
> +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4"} }  */
> --
> 2.18.1
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-20  7:40           ` Uros Bizjak
@ 2020-08-20  7:45             ` Hongtao Liu
  2020-08-21 13:15               ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: Hongtao Liu @ 2020-08-20  7:45 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Thu, Aug 20, 2020 at 3:40 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Aug 20, 2020 at 9:31 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Thu, Aug 20, 2020 at 3:24 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Wed, Aug 19, 2020 at 3:05 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Wed, Aug 19, 2020 at 4:25 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Mon, Aug 17, 2020 at 6:08 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 14, 2020 at 10:26 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > Enable operator or/xor/and/andn/not for mask register, kxnor is not
> > > > > > > enabled since there's no corresponding instruction for general
> > > > > > > registers.
> > > > > > >
> > > > > > > gcc/
> > > > > > >         PR target/88808
> > > > > > >         * config/i386/i386.md: (*movsi_internal): Adjust constraints
> > > > > > >         for mask registers.
> > > > > > >         (*movhi_internal): Ditto.
> > > > > > >         (*movqi_internal): Ditto.
> > > > > > >         (*anddi_1): Support mask register operations
> > > > > > >         (*and<mode>_1): Ditto.
> > > > > > >         (*andqi_1): Ditto.
> > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > >         (*<code>qi_1): Ditto.
> > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > >
> > > > > > index 74d207c3711..e8ad79d1b0a 100644
> > > > > > --- a/gcc/config/i386/i386.md
> > > > > > +++ b/gcc/config/i386/i386.md
> > > > > > @@ -2294,7 +2294,7 @@
> > > > > >
> > > > > >  (define_insn "*movsi_internal"
> > > > > >    [(set (match_operand:SI 0 "nonimmediate_operand"
> > > > > > -    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > > > > +    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,k")
> > > > > >      (match_operand:SI 1 "general_operand"
> > > > > >      "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > > > >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > > > >
> > > > > > I'd rather see *k everywhere, also with *movqi_internal and
> > > > > > *movhi_internal patterns. The "*" means that the allocator won't
> > > > > > allocate a mask register by default, but it will be used to optimize
> > > > > > moves. With the above change, you are risking that during integer
> > > > > > register pressure, the register allocator will allocate zero to a mask
> > > > > > register, and later "optimize" the move with a direct maskreg-intreg
> > > > > > move.
> > > > > >
> > > > > > The current strategy is that only general registers get allocated for
> > > > > > integer modes. Let's keep it this way for now.
> > > > > >
> > > > >
> > > > > Yes, though it would fail gcc.target/i386/avx512dq-pr88465.c and
> > > > > gcc.target/i386/avx512f-pr88465.c, I think it's more reasonable not to
> > > > > move zero into a mask register directly.
> > > >
> > > > Although it would be nice if the register allocator was smart enough,
> > > > the current strategy is to introduce peephole2 patterns to fix these
> > > > problems, similar to [1]. These peepholes can be introduced in a
> > > > follow-up patch.
> > > >
> > > > [1] https://gcc.gnu.org/pipermail/gcc-patches/2020-August/551744.html
> > > >
> > >
> > > peephole2 added.
> > >
> > > > > > Otherwise, the patchset LGTM, but please test the suggested changes and repost.
> > > > > >
> > > > > > BTW: Do you plan to remove mask operations from sse.md? ATM, they are
> > > > > > used to distinguish mask operations, generated from builtins from
> > > > > > generic operations, so I'd like to keep them for a while. The drawback
> > > > > > is, that they are not combined with other operations, but at the end
> > > > > > of the day, this is what the programmer asked for by using builtins.
> > > > >
> > > > > Agree, I prefer to keep them.
> > > >
> > > > Thinking some more about the approach, it looks to me that the optimal
> > > > solution is a post-reload splitter that would convert "generic"
> > > > patterns to mask operations from sse.md. The mask operations don't set
> > > > flags, so we can substantially improve post reload scheduling of these
> > > > instructions by removing flags clobber.
> > > >
> > > > So, simply add "#" to relevant alternatives of logic patterns and add
> > > > something like:
> > > >
> > > > --cut here--
> > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > > index 41c6dbfa668..ad49bdc7583 100644
> > > > --- a/gcc/config/i386/sse.md
> > > > +++ b/gcc/config/i386/sse.md
> > > > @@ -1470,6 +1470,18 @@
> > > >            ]
> > > >            (const_string "<MODE>")))])
> > > >
> > > > +(define_split
> > > > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > > > +       (any_logic:SWI1248_AVX512BW
> > > > +         (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > > > +         (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > > > +   (clobber (reg:CC FLAGS_REG))]
> > > > +  "TARGET_AVX512F && reload_completed"
> > > > +  [(parallel
> > > > +     [(set (match_dup 0)
> > > > +          (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > > > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > > > +
> > > >  (define_insn "kandn<mode>"
> > > >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> > > >         (and:SWI1248_AVX512BW
> > > > --cut here--
> > > >
> > > > and similar for kandn and knot in sse.md. You will have to add
> > > > mask_reg_operand predicate, see e.g. sse_reg_operand in predicates.md
> > > > for example.
> > > >
> > > > We don't lose anything, because all important transformations,
> > > > propagations and simplifications with these patterns happen before
> > > > reload.
> > >
> > > define_splits are added for those bitwise operations.
> > >
> > > >
> > > > Uros.
> > >
> > > Also added the part below, which makes gcc.target/i386/bitwise_mask_op-3.c pass:
> > >
> > > -     must go into Q_REGS.  */
> > > +     must go into Q_REGS or ALL_MASK_REGS.  */
> > >    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
> > >      {
> > >        if (Q_CLASS_P (regclass))
> > >         return regclass;
> > >        else if (reg_class_subset_p (Q_REGS, regclass))
> > >         return Q_REGS;
> > > +      else if (MASK_CLASS_P (regclass))
> > > +       return regclass;
> > >        else
> > >         return NO_REGS;
> > >
> > >
> > > Update patch.
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
> >
> > The network is too slow to send mail with an attachment, so I have copied the
> > patch into the mail body.
> >
> > gcc/
> >         PR target/88808
> >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> >         QImode data to go into mask registers.
> >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> >         for mask registers.
> >         (*movqi_internal): Ditto.
> >         (*anddi_1): Support mask register operations
> >         (*and<mode>_1): Ditto.
> >         (*andqi_1): Ditto.
> >         (*andn<mode>_1): Ditto.
> >         (*<code><mode>_1): Ditto.
> >         (*<code>qi_1): Ditto.
> >         (*one_cmpl<mode>2_1): Ditto.
> >         (*one_cmplsi2_1_zext): Ditto.
> >         (*one_cmplqi2_1): Ditto.
> >         (define_peephole2): Move constant 0/-1 directly into mask
> >         registers.
> >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> >         * config/i386/sse.md (define_split): Add post-reload splitters
> >         that would convert "generic" patterns to mask patterns.
> >         (*knotsi_1_zext): New define_insn.
> >
> > gcc/testsuite/
> >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
>
> A little nit, please put new splitters after the instruction pattern.
>
> OK for the whole patch set with the above change,
>

Yes, thanks for the review.

> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386.c                        |   4 +-
> >  gcc/config/i386/i386.md                       | 209 ++++++++++++------
> >  gcc/config/i386/predicates.md                 |   5 +
> >  gcc/config/i386/sse.md                        |  59 +++++
> >  .../gcc.target/i386/avx512bw-kunpckwd-1.c     |   2 +-
> >  .../gcc.target/i386/avx512bw-kunpckwd-3.c     |   2 +-
> >  .../gcc.target/i386/avx512bw-pr88465.c        |  23 ++
> >  .../gcc.target/i386/avx512dq-kmovb-5.c        |   2 +-
> >  .../gcc.target/i386/avx512f-kmovw-5.c         |   2 +-
> >  .../gcc.target/i386/bitwise_mask_op-1.c       | 178 +++++++++++++++
> >  .../gcc.target/i386/bitwise_mask_op-2.c       |   8 +
> >  .../gcc.target/i386/bitwise_mask_op-3.c       |  44 ++++
> >  12 files changed, 471 insertions(+), 67 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index d71d6d55be6..e8a2182ceb0 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -18407,13 +18407,15 @@ ix86_preferred_reload_class (rtx x,
> > reg_class_t regclass)
> >      return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
> >
> >    /* QImode constants are easy to load, but non-constant QImode data
> > -     must go into Q_REGS.  */
> > +     must go into Q_REGS or ALL_MASK_REGS.  */
> >    if (GET_MODE (x) == QImode && !CONSTANT_P (x))
> >      {
> >        if (Q_CLASS_P (regclass))
> >          return regclass;
> >        else if (reg_class_subset_p (Q_REGS, regclass))
> >          return Q_REGS;
> > +      else if (MASK_CLASS_P (regclass))
> > +        return regclass;
> >        else
> >          return NO_REGS;
> >      }
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index 3a15941c3e8..676525fbc1f 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -2403,8 +2403,8 @@
> >             (symbol_ref "true")))])
> >
> >  (define_insn "*movhi_internal"
> > -  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k ,r,m,k")
> > -        (match_operand:HI 1 "general_operand"      "r
> > ,rn,rm,rn,r,km,k,k,CBC"))]
> > +  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,*k,*k
> > ,*r,*m,*k")
> > +        (match_operand:HI 1 "general_operand"      "r
> > ,rn,rm,rn,*r,*km,*k,*k,CBC"))]
> >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> >  {
> >    switch (get_attr_type (insn))
> > @@ -2491,9 +2491,9 @@
> >
> >  (define_insn "*movqi_internal"
> >    [(set (match_operand:QI 0 "nonimmediate_operand"
> > -                        "=Q,R,r,q,q,r,r ,?r,m ,k,k,r,m,k,k,k")
> > +                        "=Q,R,r,q,q,r,r ,?r,m ,*k,*k,*r,*m,*k,*k,*k")
> >          (match_operand:QI 1 "general_operand"
> > -                        "Q ,R,r,n,m,q,rn, m,qn,r,k,k,k,m,C,BC"))]
> > +                        "Q ,R,r,n,m,q,rn, m,qn,*r,*k,*k,*k,*m,C,BC"))]
> >    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> >  {
> >    char buf[128];
> > @@ -2624,6 +2624,19 @@
> >             ]
> >             (const_string "QI")))])
> >
> > +/* Reload dislikes loading 0/-1 directly into mask registers.
> > +   Try to tidy things up here.  */
> > +(define_peephole2
> > +  [(set (match_operand:SWI 0 "general_reg_operand")
> > +        (match_operand:SWI 1 "immediate_operand"))
> > +   (set (match_operand:SWI 2 "mask_reg_operand")
> > +        (match_dup 0))]
> > +  "peep2_reg_dead_p (2, operands[0])
> > +   && (const0_operand (operands[1], <MODE>mode)
> > +       || (constm1_operand (operands[1], <MODE>mode)
> > +           && (<MODE_SIZE> > 1 || TARGET_AVX512DQ)))"
> > +  [(set (match_dup 2) (match_dup 1))])
> > +
> >  ;; Stores and loads of ax to arbitrary constant address.
> >  ;; We fake an second form of instruction to force reload to load address
> >  ;; into register when rax is not available
> > @@ -9044,19 +9057,21 @@
> >  })
> >
> >  (define_insn "*anddi_1"
> > -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
> > +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
> >          (and:DI
> > -         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
> > -         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L")))
> > +         (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
> > +         (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
> >    "@
> >     and{l}\t{%k2, %k0|%k0, %k2}
> >     and{q}\t{%2, %0|%0, %2}
> >     and{q}\t{%2, %0|%0, %2}
> > +   #
> >     #"
> > -  [(set_attr "type" "alu,alu,alu,imovx")
> > -   (set_attr "length_immediate" "*,*,*,0")
> > +  [(set_attr "isa" "x64,x64,x64,x64,avx512bw")
> > +   (set_attr "type" "alu,alu,alu,imovx,msklog")
> > +   (set_attr "length_immediate" "*,*,*,0,*")
> >     (set (attr "prefix_rex")
> >       (if_then_else
> >         (and (eq_attr "type" "imovx")
> > @@ -9064,7 +9079,7 @@
> >                   (match_operand 1 "ext_QIreg_operand")))
> >         (const_string "1")
> >         (const_string "*")))
> > -   (set_attr "mode" "SI,DI,DI,SI")])
> > +   (set_attr "mode" "SI,DI,DI,SI,DI")])
> >
> >  (define_insn_and_split "*anddi_1_btr"
> >    [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
> > @@ -9130,17 +9145,25 @@
> >     (set_attr "mode" "SI")])
> >
> >  (define_insn "*and<mode>_1"
> > -  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya")
> > -        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm")
> > -                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L")))
> > +  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
> > +        (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
> > +                   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (AND, <MODE>mode, operands)"
> >    "@
> >     and{<imodesuffix>}\t{%2, %0|%0, %2}
> >     and{<imodesuffix>}\t{%2, %0|%0, %2}
> > +   #
> >     #"
> > -  [(set_attr "type" "alu,alu,imovx")
> > -   (set_attr "length_immediate" "*,*,0")
> > +  [(set (attr "isa")
> > +        (cond [(eq_attr "alternative" "3")
> > +                 (if_then_else (eq_attr "mode" "SI")
> > +                   (const_string "avx512bw")
> > +                   (const_string "avx512f"))
> > +              ]
> > +              (const_string "*")))
> > +   (set_attr "type" "alu,alu,imovx,msklog")
> > +   (set_attr "length_immediate" "*,*,0,*")
> >     (set (attr "prefix_rex")
> >       (if_then_else
> >         (and (eq_attr "type" "imovx")
> > @@ -9148,20 +9171,28 @@
> >                   (match_operand 1 "ext_QIreg_operand")))
> >         (const_string "1")
> >         (const_string "*")))
> > -   (set_attr "mode" "<MODE>,<MODE>,SI")])
> > +   (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
> >
> >  (define_insn "*andqi_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> > -        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> > -                (match_operand:QI 2 "general_operand" "qn,m,rn")))
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> > +        (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> > +                (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (AND, QImode, operands)"
> >    "@
> >     and{b}\t{%2, %0|%0, %2}
> >     and{b}\t{%2, %0|%0, %2}
> > -   and{l}\t{%k2, %k0|%k0, %k2}"
> > -  [(set_attr "type" "alu")
> > -   (set_attr "mode" "QI,QI,SI")
> > +   and{l}\t{%k2, %k0|%k0, %k2}
> > +   #"
> > +  [(set_attr "type" "alu,alu,alu,msklog")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (const_string "SI")
> > +                (and (eq_attr "alternative" "3")
> > +                     (match_test "!TARGET_AVX512DQ"))
> > +                 (const_string "HI")
> > +               ]
> > +               (const_string "QI")))
> >     ;; Potential partial reg stall on alternative 2.
> >     (set (attr "preferred_for_speed")
> >       (cond [(eq_attr "alternative" "2")
> > @@ -9539,28 +9570,42 @@
> >  })
> >
> >  (define_insn "*andn<mode>_1"
> > -  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
> > +  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
> >          (and:SWI48
> > -          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r"))
> > -          (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
> > +          (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
> > +          (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> > -  "TARGET_BMI"
> > -  "andn\t{%2, %1, %0|%0, %1, %2}"
> > -  [(set_attr "type" "bitmanip")
> > -   (set_attr "btver2_decode" "direct, double")
> > +  "TARGET_BMI || TARGET_AVX512BW"
> > +  "@
> > +   andn\t{%2, %1, %0|%0, %1, %2}
> > +   andn\t{%2, %1, %0|%0, %1, %2}
> > +   #"
> > +  [(set_attr "isa" "bmi,bmi,avx512bw")
> > +   (set_attr "type" "bitmanip,bitmanip,msklog")
> > +   (set_attr "btver2_decode" "direct, double,*")
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn "*andn<mode>_1"
> > -  [(set (match_operand:SWI12 0 "register_operand" "=r")
> > +  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
> >          (and:SWI12
> > -          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r"))
> > -          (match_operand:SWI12 2 "register_operand" "r")))
> > +          (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
> > +          (match_operand:SWI12 2 "register_operand" "r,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> > -  "TARGET_BMI"
> > -  "andn\t{%k2, %k1, %k0|%k0, %k1, %k2}"
> > -  [(set_attr "type" "bitmanip")
> > -   (set_attr "btver2_decode" "direct")
> > -   (set_attr "mode" "SI")])
> > +  "TARGET_BMI || TARGET_AVX512BW"
> > +  "@
> > +   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
> > +   #"
> > +  [(set_attr "isa" "bmi,avx512f")
> > +   (set_attr "type" "bitmanip,msklog")
> > +   (set_attr "btver2_decode" "direct,*")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "0")
> > +                 (const_string "SI")
> > +               (and (eq_attr "alternative" "1")
> > +                    (match_test "!TARGET_AVX512DQ"))
> > +                  (const_string "HI")
> > +              ]
> > +              (const_string "<MODE>")))])
> >
> >  (define_insn "*andn_<mode>_ccno"
> >    [(set (reg FLAGS_REG)
> > @@ -9631,14 +9676,24 @@
> >  })
> >
> >  (define_insn "*<code><mode>_1"
> > -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r")
> > +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
> >          (any_or:SWI248
> > -         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0")
> > -         (match_operand:SWI248 2 "<general_operand>" "r<i>,m")))
> > +         (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> > +         (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> > -  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
> > -  [(set_attr "type" "alu")
> > +  "@
> > +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> > +   <logic>{<imodesuffix>}\t{%2, %0|%0, %2}
> > +   #"
> > +  [(set (attr "isa")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (if_then_else (eq_attr "mode" "SI,DI")
> > +                   (const_string "avx512bw")
> > +                   (const_string "avx512f"))
> > +              ]
> > +              (const_string "*")))
> > +   (set_attr "type" "alu, alu, msklog")
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn_and_split "*iordi_1_bts"
> > @@ -9711,17 +9766,26 @@
> >     (set_attr "mode" "SI")])
> >
> >  (define_insn "*<code>qi_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r")
> > -        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
> > -                   (match_operand:QI 2 "general_operand" "qn,m,rn")))
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> > +        (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> > +                   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "ix86_binary_operator_ok (<CODE>, QImode, operands)"
> >    "@
> >     <logic>{b}\t{%2, %0|%0, %2}
> >     <logic>{b}\t{%2, %0|%0, %2}
> > -   <logic>{l}\t{%k2, %k0|%k0, %k2}"
> > -  [(set_attr "type" "alu")
> > -   (set_attr "mode" "QI,QI,SI")
> > +   <logic>{l}\t{%k2, %k0|%k0, %k2}
> > +   #"
> > +  [(set_attr "isa" "*,*,*,avx512f")
> > +   (set_attr "type" "alu,alu,alu,msklog")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (const_string "SI")
> > +                (and (eq_attr "alternative" "3")
> > +                     (match_test "!TARGET_AVX512DQ"))
> > +                 (const_string "HI")
> > +               ]
> > +               (const_string "QI")))
> >     ;; Potential partial reg stall on alternative 2.
> >     (set (attr "preferred_for_speed")
> >       (cond [(eq_attr "alternative" "2")
> > @@ -10370,31 +10434,52 @@
> >    "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
> >
> >  (define_insn "*one_cmpl<mode>2_1"
> > -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm")
> > -        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))]
> > +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
> > +        (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
> >    "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
> > -  "not{<imodesuffix>}\t%0"
> > -  [(set_attr "type" "negnot")
> > +  "@
> > +   not{<imodesuffix>}\t%0
> > +   #"
> > +  [(set (attr "isa")
> > +        (cond [(eq_attr "alternative" "2")
> > +                 (if_then_else (eq_attr "mode" "SI,DI")
> > +                   (const_string "avx512bw")
> > +                   (const_string "avx512f"))
> > +              ]
> > +              (const_string "*")))
> > +   (set_attr "type" "negnot,msklog")
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn "*one_cmplsi2_1_zext"
> > -  [(set (match_operand:DI 0 "register_operand" "=r")
> > +  [(set (match_operand:DI 0 "register_operand" "=r,k")
> >          (zero_extend:DI
> > -          (not:SI (match_operand:SI 1 "register_operand" "0"))))]
> > +          (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
> >    "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
> > -  "not{l}\t%k0"
> > -  [(set_attr "type" "negnot")
> > -   (set_attr "mode" "SI")])
> > +  "@
> > +   not{l}\t%k0
> > +   #"
> > +  [(set_attr "isa" "x64,avx512bw")
> > +   (set_attr "type" "negnot,msklog")
> > +   (set_attr "mode" "SI,SI")])
> >
> >  (define_insn "*one_cmplqi2_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r")
> > -        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0")))]
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
> > +        (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
> >    "ix86_unary_operator_ok (NOT, QImode, operands)"
> >    "@
> >     not{b}\t%0
> > -   not{l}\t%k0"
> > -  [(set_attr "type" "negnot")
> > -   (set_attr "mode" "QI,SI")
> > +   not{l}\t%k0
> > +   #"
> > +  [(set_attr "isa" "*,*,avx512f")
> > +   (set_attr "type" "negnot,negnot,msklog")
> > +   (set (attr "mode")
> > +        (cond [(eq_attr "alternative" "1")
> > +                 (const_string "SI")
> > +                (and (eq_attr "alternative" "2")
> > +                     (match_test "!TARGET_AVX512DQ"))
> > +                 (const_string "HI")
> > +               ]
> > +               (const_string "QI")))
> >     ;; Potential partial reg stall on alternative 1.
> >     (set (attr "preferred_for_speed")
> >       (cond [(eq_attr "alternative" "1")
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index 07e69d555c0..dd1b31479f5 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -87,6 +87,11 @@
> >    (and (match_code "reg")
> >         (match_test "REGNO (op) == FLAGS_REG")))
> >
> > +;; True if the operand is a MASK register.
> > +(define_predicate "mask_reg_operand"
> > +  (and (match_code "reg")
> > +       (match_test "MASK_REGNO_P (REGNO (op))")))
> > +
> >  ;; Match a DI, SI, HI or QImode nonimmediate_operand.
> >  (define_special_predicate "int_nonimmediate_operand"
> >    (and (match_operand 0 "nonimmediate_operand")
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index b6348de67cb..4372a9fd785 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1452,6 +1452,18 @@
> >    "TARGET_AVX512F
> >     && !(MEM_P (operands[0]) && MEM_P (operands[1]))")
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +        (any_logic:SWI1248_AVX512BW
> > +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")
> > +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (any_logic:SWI1248_AVX512BW (match_dup 1) (match_dup 2)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "k<code><mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >          (any_logic:SWI1248_AVX512BW
> > @@ -1474,6 +1486,21 @@
> >             ]
> >             (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +        (and:SWI1248_AVX512BW
> > +          (not:SWI1248_AVX512BW
> > +            (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand"))
> > +          (match_operand:SWI1248_AVX512BW 2 "mask_reg_operand")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (and:SWI1248_AVX512BW
> > +             (not:SWI1248_AVX512BW (match_dup 1))
> > +             (match_dup 2)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "kandn<mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >          (and:SWI1248_AVX512BW
> > @@ -1520,6 +1547,16 @@
> >             ]
> >             (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:SWI1248_AVX512BW 0 "mask_reg_operand")
> > +        (not:SWI1248_AVX512BW
> > +          (match_operand:SWI1248_AVX512BW 1 "mask_reg_operand")))]
> > +  "TARGET_AVX512F && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (not:SWI1248_AVX512BW (match_dup 1)))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> >  (define_insn "knot<mode>"
> >    [(set (match_operand:SWI1248_AVX512BW 0 "register_operand" "=k")
> >          (not:SWI1248_AVX512BW
> > @@ -1541,6 +1578,28 @@
> >             ]
> >             (const_string "<MODE>")))])
> >
> > +(define_split
> > +  [(set (match_operand:DI 0 "mask_reg_operand")
> > +        (zero_extend:DI
> > +          (not:DI (match_operand:SI 1 "mask_reg_operand"))))]
> > +  "TARGET_AVX512BW && reload_completed"
> > +  [(parallel
> > +     [(set (match_dup 0)
> > +           (zero_extend:DI
> > +             (not:SI (match_dup 1))))
> > +      (unspec [(const_int 0)] UNSPEC_MASKOP)])])
> > +
> > +(define_insn "*knotsi_1_zext"
> > +  [(set (match_operand:DI 0 "register_operand" "=k")
> > +        (zero_extend:DI
> > +          (not:SI (match_operand:SI 1 "register_operand" "k"))))
> > +   (unspec [(const_int 0)] UNSPEC_MASKOP)]
> > +  "TARGET_AVX512BW"
> > +  "knotd\t{%1, %0|%0, %1}";
> > +  [(set_attr "type" "msklog")
> > +   (set_attr "prefix" "vex")
> > +   (set_attr "mode" "SI")])
> > +
> >  (define_insn "kadd<mode>"
> >    [(set (match_operand:SWI1248_AVX512BWDQ2 0 "register_operand" "=k")
> >          (plus:SWI1248_AVX512BWDQ2
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > index 94422f36010..46d9351f275 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-1.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-mavx512bw -O2" } */
> > -/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > index c68ad8cc1f7..fe13f4f33fc 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-mavx512bw -O2" } */
> > -/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */
> > +/* { dg-final { scan-assembler-times "kunpckwd\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> > b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> > new file mode 100644
> > index 00000000000..8e34bf45365
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr88465.c
> > @@ -0,0 +1,23 @@
> > +/* PR target/88465 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mavx512bw" } */
> > +/* { dg-final { scan-assembler-times "kxor\[qd\]\[ \t]" 2 } } */
> > +/* { dg-final { scan-assembler-times "kxnor\[dq\]\[ \t]" 2 } } */
> > +
> > +void
> > +foo (void)
> > +{
> > +  unsigned int k = 0;
> > +  __asm volatile ("" : : "k" (k));
> > +  k = -1;
> > +  __asm volatile ("" : : "k" (k));
> > +}
> > +
> > +void
> > +bar (void)
> > +{
> > +  unsigned long long k = 0;
> > +  __asm volatile ("" : : "k" (k));
> > +  k = -1;
> > +  __asm volatile ("" : : "k" (k));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > index 49817097e26..114e03ee93d 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kmovb-5.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-mavx512dq -O2" } */
> > +/* { dg-options "-mavx512dq -mno-avx512bw -O2" } */
> >  /* { dg-final { scan-assembler-times "kmovb\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > index 7bb34d34d8d..79d37394b36 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx512f-kmovw-5.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-mavx512f -O2" } */
> > +/* { dg-options "-mavx512f -mno-avx512bw -O2" } */
> >  /* { dg-final { scan-assembler-times "kmovw\[
> > \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */
> >
> >  #include <immintrin.h>
> > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> > b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> > new file mode 100644
> > index 00000000000..61f71ab8b23
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-1.c
> > @@ -0,0 +1,178 @@
> > +/* PR target/88808  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mno-avx512dq -O2" } */
> > +
> > +#include <immintrin.h>
> > +__m512i
> > +foo_orq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 | m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korq" "1" { target { ! ia32 } } } } */
> > +
> > +__m512i
> > +foo_ord (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 | m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kord" "1" } }  */
> > +
> > +__m512i
> > +foo_orw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 | m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_orb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 | m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> > +
> > +__m512i
> > +foo_xorq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kxorq" "1" { target { ! ia32 } } } }  */
> > +
> > +__m512i
> > +foo_xord (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kxord" "1" } }  */
> > +
> > +__m512i
> > +foo_xorw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_xorb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 ^ m2, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korw" "2" } }  */
> > +
> > +__m512i
> > +foo_andq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andd (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 & m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  __mmask64 m2 = _mm512_cmpeq_epi8_mask (c, d);
> > +  return _mm512_mask_add_epi8 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnd (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  __mmask32 m2 = _mm512_cmpeq_epi16_mask (c, d);
> > +  return _mm512_mask_add_epi16 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  __mmask16 m2 = _mm512_cmpeq_epi32_mask (c, d);
> > +  return _mm512_mask_add_epi32 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_andnb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  __mmask8 m2 = _mm512_cmpeq_epi64_mask (c, d);
> > +  return _mm512_mask_add_epi64 (c, m1 & ~m2, a, d);
> > +}
> > +
> > +__m512i
> > +foo_notq (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask64 m1 = _mm512_cmpeq_epi8_mask (a, b);
> > +  return _mm512_mask_add_epi8 (c, ~m1, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotq" "2" { target { ! ia32 } } } }  */
> > +
> > +__m512i
> > +foo_notd (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask32 m1 = _mm512_cmpeq_epi16_mask (a, b);
> > +  return _mm512_mask_add_epi16 (c, ~m1, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotd" "2" { target { ! ia32 } } } }  */
> > +
> > +__m512i
> > +foo_notw (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask16 m1 = _mm512_cmpeq_epi32_mask (a, b);
> > +  return _mm512_mask_add_epi32 (c, ~m1, a, d);
> > +}
> > +
> > +__m512i
> > +foo_notb (__m512i a, __m512i b, __m512i c, __m512i d)
> > +{
> > +  __mmask8 m1 = _mm512_cmpeq_epi64_mask (a, b);
> > +  return _mm512_mask_add_epi64 (c, ~m1, a, d);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotw" "4" } }  */
> > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> > b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> > new file mode 100644
> > index 00000000000..850f0b42652
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-2.c
> > @@ -0,0 +1,8 @@
> > +/* PR target/88808  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> > +/* { dg-final { scan-assembler-times "knotb" "2" } }  */
> > +/* { dg-final { scan-assembler-times "korb" "1" } }  */
> > +/* { dg-final { scan-assembler-times "kxorb" "1" } }  */
> > +#include "bitwise_mask_op-1.c"
> > +
> > diff --git a/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> > b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> > new file mode 100644
> > index 00000000000..18bf4f0d768
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/bitwise_mask_op-3.c
> > @@ -0,0 +1,44 @@
> > +/* PR target/88808  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512bw -mavx512dq -O2" } */
> > +
> > +#include <immintrin.h>
> > +volatile __mmask8 foo;
> > +void
> > +foo_orb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 | m2;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "korb\[\t \]" "1" } }  */
> > +
> > +void
> > +foo_xorb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 ^ m2;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "kxorb\[\t \]" "1" } }  */
> > +
> > +void
> > +foo_andb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 & m2;
> > +}
> > +
> > +void
> > +foo_andnb (__m512i a, __m512i b)
> > +{
> > +  __mmask8 m1 = _mm512_cmp_epi64_mask (a, b, 2);
> > +  __mmask8 m2 = _mm512_cmp_epi64_mask (a, b, 4);
> > +  foo = m1 & ~m2;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "knotb\[\t \]" "1" } }  */
> > +/* { dg-final { scan-assembler-times "kmovb\[\t \]" "4"} }  */
> > --
> > 2.18.1
> >
> >
> > --
> > BR,
> > Hongtao



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-20  7:45             ` Hongtao Liu
@ 2020-08-21 13:15               ` Uros Bizjak
  2020-08-21 15:41                 ` Hongtao Liu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-21 13:15 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

> > > gcc/
> > >         PR target/88808
> > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > >         QImode data go into mask registers.
> > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > >         for mask registers.
> > >         (*movqi_internal): Ditto.
> > >         (*anddi_1): Support mask register operations
> > >         (*and<mode>_1): Ditto.
> > >         (*andqi_1): Ditto.
> > >         (*andn<mode>_1): Ditto.
> > >         (*<code><mode>_1): Ditto.
> > >         (*<code>qi_1): Ditto.
> > >         (*one_cmpl<mode>2_1): Ditto.
> > >         (*one_cmplsi2_1_zext): Ditto.
> > >         (*one_cmplqi2_1): Ditto.
> > >         (define_peephole2): Move constant 0/-1 directly into mask
> > >         registers.
> > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > >         that would convert "generic" patterns to mask patterns.
> > >         (*knotsi_1_zext): New define_insn.
> > >
> > > gcc/testsuite/
> > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> >
> > A little nit, please put new splitters after the instruction pattern.
> >
> > OK for the whole patch set with the above change,
> >
>
> Yes, thanks for the review.

Please note that your patch introduces several testsuite failures with -m32:

gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c

Program received signal SIGILL, Illegal instruction.
0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
__ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
pointer>,
    __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);

   0x080490a3 <+51>:    cpuid
   0x080490a5 <+53>:    mov    $0x1,%eax
   0x080490aa <+58>:    mov    %ecx,%esi
=> 0x080490ac <+60>:    kmovd  %ebx,%k0
   0x080490b0 <+64>:    mov    %edi,%ecx
   0x080490b2 <+66>:    mov    %edi,%ebx

A kmov insn is generated in the __cpuid_count function, where the binary
determines whether the new instructions are supported. The binary will
crash in the detection code if the processor lacks AVX512
instructions.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 13:15               ` Uros Bizjak
@ 2020-08-21 15:41                 ` Hongtao Liu
  2020-08-21 15:50                   ` H.J. Lu
  2020-08-21 15:50                   ` Uros Bizjak
  0 siblings, 2 replies; 39+ messages in thread
From: Hongtao Liu @ 2020-08-21 15:41 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> > > > gcc/
> > > >         PR target/88808
> > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > >         QImode data go into mask registers.
> > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > >         for mask registers.
> > > >         (*movqi_internal): Ditto.
> > > >         (*anddi_1): Support mask register operations
> > > >         (*and<mode>_1): Ditto.
> > > >         (*andqi_1): Ditto.
> > > >         (*andn<mode>_1): Ditto.
> > > >         (*<code><mode>_1): Ditto.
> > > >         (*<code>qi_1): Ditto.
> > > >         (*one_cmpl<mode>2_1): Ditto.
> > > >         (*one_cmplsi2_1_zext): Ditto.
> > > >         (*one_cmplqi2_1): Ditto.
> > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > >         registers.
> > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > >         that would convert "generic" patterns to mask patterns.
> > > >         (*knotsi_1_zext): New define_insn.
> > > >
> > > > gcc/testsuite/
> > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > >
> > > A little nit, please put new splitters after the instruction pattern.
> > >
> > > OK for the whole patch set with the above change,
> > >
> >
> > Yes, thanks for the review.
>
> Please note that your patch introduces several testsuite fails with -m32:
>
> gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
>

I can't reproduce this failure.

> Program received signal SIGILL, Illegal instruction.
> 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> pointer>,
>     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
>
>    0x080490a3 <+51>:    cpuid
>    0x080490a5 <+53>:    mov    $0x1,%eax
>    0x080490aa <+58>:    mov    %ecx,%esi
> => 0x080490ac <+60>:    kmovd  %ebx,%k0
>    0x080490b0 <+64>:    mov    %edi,%ecx
>    0x080490b2 <+66>:    mov    %edi,%ebx
>
> kmov insn is generated for __cpuid_count function, where the binary
> determines, if the new instructions are supported. The binary will
> crash in the detection code if the processor lacks AVX512
> instructions.
>

IMHO, the testcase shouldn't be run on processors without AVX512BW,
because avx512bitalg-vpopcntb-1.c contains /* {
dg-require-effective-target avx512bw } */.

What's the version of your assembler?

> Uros.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 15:41                 ` Hongtao Liu
@ 2020-08-21 15:50                   ` H.J. Lu
  2020-08-21 15:50                   ` Uros Bizjak
  1 sibling, 0 replies; 39+ messages in thread
From: H.J. Lu @ 2020-08-21 15:50 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

On Fri, Aug 21, 2020 at 8:41 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > > > > gcc/
> > > > >         PR target/88808
> > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > >         QImode data go into mask registers.
> > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > >         for mask registers.
> > > > >         (*movqi_internal): Ditto.
> > > > >         (*anddi_1): Support mask register operations
> > > > >         (*and<mode>_1): Ditto.
> > > > >         (*andqi_1): Ditto.
> > > > >         (*andn<mode>_1): Ditto.
> > > > >         (*<code><mode>_1): Ditto.
> > > > >         (*<code>qi_1): Ditto.
> > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > >         (*one_cmplqi2_1): Ditto.
> > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > >         registers.
> > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > >         that would convert "generic" patterns to mask patterns.
> > > > >         (*knotsi_1_zext): New define_insn.
> > > > >
> > > > > gcc/testsuite/
> > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > >
> > > > A little nit, please put new splitters after the instruction pattern.
> > > >
> > > > OK for the whole patch set with the above change,
> > > >
> > >
> > > Yes, thanks for the review.
> >
> > Please note that your patch introduces several testsuite fails with -m32:
> >
> > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> >
>
> I can't reproduce this failure.
>
> > Program received signal SIGILL, Illegal instruction.
> > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > pointer>,
> >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> >
> >    0x080490a3 <+51>:    cpuid
> >    0x080490a5 <+53>:    mov    $0x1,%eax
> >    0x080490aa <+58>:    mov    %ecx,%esi
> > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> >    0x080490b0 <+64>:    mov    %edi,%ecx
> >    0x080490b2 <+66>:    mov    %edi,%ebx
> >
> > kmov insn is generated for __cpuid_count function, where the binary
> > determines, if the new instructions are supported. The binary will
> > crash in the detection code if the processor lacks AVX512
> > instructions.
> >
>
> IMHO, the testcase shouldn't be run on processors without AVX512BW.
> Because in  avx512bitalg-vpopcntb-1.c, there's /* {
> dg-require-effective-target avx512bw } */.
>

dg-require-effective-target avx512bw checks assembler support.
__cpuid_count must not use any mask instructions.  Please run this test
on a CPU without AVX512.


-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 15:41                 ` Hongtao Liu
  2020-08-21 15:50                   ` H.J. Lu
@ 2020-08-21 15:50                   ` Uros Bizjak
  2020-08-21 16:29                     ` Hongtao Liu
  1 sibling, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-21 15:50 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > > > > gcc/
> > > > >         PR target/88808
> > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > >         QImode data go into mask registers.
> > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > >         for mask registers.
> > > > >         (*movqi_internal): Ditto.
> > > > >         (*anddi_1): Support mask register operations
> > > > >         (*and<mode>_1): Ditto.
> > > > >         (*andqi_1): Ditto.
> > > > >         (*andn<mode>_1): Ditto.
> > > > >         (*<code><mode>_1): Ditto.
> > > > >         (*<code>qi_1): Ditto.
> > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > >         (*one_cmplqi2_1): Ditto.
> > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > >         registers.
> > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > >         that would convert "generic" patterns to mask patterns.
> > > > >         (*knotsi_1_zext): New define_insn.
> > > > >
> > > > > gcc/testsuite/
> > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > >
> > > > A little nit, please put new splitters after the instruction pattern.
> > > >
> > > > OK for the whole patch set with the above change,
> > > >
> > >
> > > Yes, thanks for the review.
> >
> > Please note that your patch introduces several testsuite fails with -m32:
> >
> > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> >
>
> I can't reproduce this failure.

Because you are running it on an AVX512-enabled target.

> > Program received signal SIGILL, Illegal instruction.
> > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > pointer>,
> >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> >
> >    0x080490a3 <+51>:    cpuid
> >    0x080490a5 <+53>:    mov    $0x1,%eax
> >    0x080490aa <+58>:    mov    %ecx,%esi
> > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> >    0x080490b0 <+64>:    mov    %edi,%ecx
> >    0x080490b2 <+66>:    mov    %edi,%ebx
> >
> > kmov insn is generated for __cpuid_count function, where the binary
> > determines, if the new instructions are supported. The binary will
> > crash in the detection code if the processor lacks AVX512
> > instructions.
> >
>
> IMHO, the testcase shouldn't be run on processors without AVX512BW.

No, it could run, because it checks for AVX512BW at runtime.

> Because in  avx512bitalg-vpopcntb-1.c, there's /*
> dg-require-effective-target avx512bw } */.

This is to check the toolchain for support.

> what's the version of your assembler?

GNU assembler version 2.34-4.fc32

Please add something like
X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).

Handle this in inline_secondary_memory_needed to reject direct moves
for all other targets. This should disable direct moves for generic
targets.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 15:50                   ` Uros Bizjak
@ 2020-08-21 16:29                     ` Hongtao Liu
  2020-08-21 16:35                       ` H.J. Lu
  2020-08-21 18:25                       ` Uros Bizjak
  0 siblings, 2 replies; 39+ messages in thread
From: Hongtao Liu @ 2020-08-21 16:29 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > > > > gcc/
> > > > > >         PR target/88808
> > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > >         QImode data go into mask registers.
> > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > >         for mask registers.
> > > > > >         (*movqi_internal): Ditto.
> > > > > >         (*anddi_1): Support mask register operations
> > > > > >         (*and<mode>_1): Ditto.
> > > > > >         (*andqi_1): Ditto.
> > > > > >         (*andn<mode>_1): Ditto.
> > > > > >         (*<code><mode>_1): Ditto.
> > > > > >         (*<code>qi_1): Ditto.
> > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > >         registers.
> > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > >
> > > > > A little nit, please put new splitters after the instruction pattern.
> > > > >
> > > > > OK for the whole patch set with the above change,
> > > > >
> > > >
> > > > Yes, thanks for the review.
> > >
> > > Please note that your patch introduces several testsuite fails with -m32:
> > >
> > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > >
> >
> > I can't reproduce this failure.
>
> Because you are running it on AVX512 enabled target.
>
> > > Program received signal SIGILL, Illegal instruction.
> > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > pointer>,
> > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > >
> > >    0x080490a3 <+51>:    cpuid
> > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > >
> > > kmov insn is generated for __cpuid_count function, where the binary
> > > determines, if the new instructions are supported. The binary will
> > > crash in the detection code if the processor lacks AVX512
> > > instructions.
> > >
> >
> > IMHO, the testcase shouldn't be run on processors without AVX512BW.
>
> No, it could run, because it checks for AVX512BW at runtime.
>

Got it.

> > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > dg-require-effective-target avx512bw } */.
>
> This is to check the toolchain for support.
>
> > what's the version of your assembler?
>
> GNU assembler version 2.34-4.fc32
>

If the assembler supports avx512bw but the processor does not, the test
would pass the `dg-require-effective-target avx512bw` condition and be
run, and then crash with an illegal instruction.

> Please add something like
> X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
>
> Handle this in inline_secondary_memory_needed to reject direct moves
> for all other targets. This should disable direct moves for generic
> targets.
>

Yes, I'll add it.

> Uros.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 16:29                     ` Hongtao Liu
@ 2020-08-21 16:35                       ` H.J. Lu
  2020-08-21 16:45                         ` H.J. Lu
  2020-08-21 16:46                         ` [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks Hongtao Liu
  2020-08-21 18:25                       ` Uros Bizjak
  1 sibling, 2 replies; 39+ messages in thread
From: H.J. Lu @ 2020-08-21 16:35 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > > > > gcc/
> > > > > > >         PR target/88808
> > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > >         QImode data go into mask registers.
> > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > >         for mask registers.
> > > > > > >         (*movqi_internal): Ditto.
> > > > > > >         (*anddi_1): Support mask register operations
> > > > > > >         (*and<mode>_1): Ditto.
> > > > > > >         (*andqi_1): Ditto.
> > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > >         (*<code>qi_1): Ditto.
> > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > >         registers.
> > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > >
> > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > >
> > > > > > OK for the whole patch set with the above change,
> > > > > >
> > > > >
> > > > > Yes, thanks for the review.
> > > >
> > > > Please note that your patch introduces several testsuite fails with -m32:
> > > >
> > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > >
> > >
> > > I can't reproduce this failure.
> >
> > Because you are running it on AVX512 enabled target.
> >
> > > > Program received signal SIGILL, Illegal instruction.
> > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > pointer>,
> > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > >
> > > >    0x080490a3 <+51>:    cpuid
> > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > >
> > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > determines, if the new instructions are supported. The binary will
> > > > crash in the detection code if the processor lacks AVX512
> > > > instructions.
> > > >
> > >
> > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> >
> > No, it could run, because it checks for AVX512BW at runtime.
> >
>
> Got it.
>
> > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > dg-require-effective-target avx512bw } */.
> >
> > This is to check the toolchain for support.
> >
> > > what's the version of your assembler?
> >
> > GNU assembler version 2.34-4.fc32
> >
>
> If assembler supports avx512bw, but processor not, the test would pass
> condition `dg-require-effective-target avx512bw` and be runned.
> then crashed for illegal instruction.
>
> > Please add something like
> > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> >
> > Handle this in inline_secondary_memory_needed to reject direct moves
> > for all other targets. This should disable direct moves for generic
> > targets.
> >
>
> Yes, I'll add it.
>


(define_insn "*movsi_internal"
  [(set (match_operand:SI 0 "nonimmediate_operand"
    "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
        (match_operand:SI 1 "general_operand"
    "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
...
 [(set (attr "isa")
     (cond [(eq_attr "alternative" "12,13")
              (const_string "sse2")
           ]
           (const_string "*")))

is wrong.  Mask register alternatives should be marked with avx512f.
Please fix it.  Other integer move patterns may have the same issue.
Once these are fixed,

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..576e9b390c6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -25,6 +25,7 @@ do_test (void)
 }
 #endif

+__attribute__((target ("no-avx512f")))
 static int
 check_osxsave (void)
 {
@@ -34,6 +35,7 @@ check_osxsave (void)
   return (ecx & bit_OSXSAVE) != 0;
 }

+__attribute__((target ("no-avx512f")))
 int
 main ()
 {

should work.

-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 16:35                       ` H.J. Lu
@ 2020-08-21 16:45                         ` H.J. Lu
  2020-08-22 16:26                           ` [PATCH] x86: Disable SSE, AVX and AVX512 during CPUID check H.J. Lu
  2020-08-21 16:46                         ` [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks Hongtao Liu
  1 sibling, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-21 16:45 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 6341 bytes --]

On Fri, Aug 21, 2020 at 9:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > > > > gcc/
> > > > > > > >         PR target/88808
> > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > >         QImode data go into mask registers.
> > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > >         for mask registers.
> > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > >         (*andqi_1): Ditto.
> > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > >         registers.
> > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > >
> > > > > > > > gcc/testsuite/
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > >
> > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > >
> > > > > > > OK for the whole patch set with the above change,
> > > > > > >
> > > > > >
> > > > > > Yes, thanks for the review.
> > > > >
> > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > >
> > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > >
> > > >
> > > > I can't reproduce this failure.
> > >
> > > Because you are running it on AVX512 enabled target.
> > >
> > > > > Program received signal SIGILL, Illegal instruction.
> > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > pointer>,
> > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > >
> > > > >    0x080490a3 <+51>:    cpuid
> > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > >
> > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > determines, if the new instructions are supported. The binary will
> > > > > crash in the detection code if the processor lacks AVX512
> > > > > instructions.
> > > > >
> > > >
> > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > >
> > > No, it could run, because it checks for AVX512BW at runtime.
> > >
> >
> > Got it.
> >
> > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > dg-require-effective-target avx512bw } */.
> > >
> > > This is to check the toolchain for support.
> > >
> > > > what's the version of your assembler?
> > >
> > > GNU assembler version 2.34-4.fc32
> > >
> >
> > If assembler supports avx512bw, but processor not, the test would pass
> > condition `dg-require-effective-target avx512bw` and be runned.
> > then crashed for illegal instruction.
> >
> > > Please add something like
> > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > >
> > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > for all other targets. This should disable direct moves for generic
> > > targets.
> > >
> >
> > Yes, I'll add it.
> >
>
>
> (define_insn "*movsi_internal"
>   [(set (match_operand:SI 0 "nonimmediate_operand"
>     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
>         (match_operand:SI 1 "general_operand"
>     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
>   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> ...
>  [(set (attr "isa")
>      (cond [(eq_attr "alternative" "12,13")
>               (const_string "sse2")
>            ]
>            (const_string "*")))
>
> is wrong.   mask register alternatives should be marked with avx512f.
> Please fix it.   Other integer move patterns may have the same issue.
> Once these are fixed,
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> b/gcc/testsuite/gcc.target/i386/avx512-check.h
> index 0a377dba1d5..576e9b390c6 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> @@ -25,6 +25,7 @@ do_test (void)
>  }
>  #endif
>
> +__attribute__((target ("no-avx512f")))
>  static int
>  check_osxsave (void)
>  {
> @@ -34,6 +35,7 @@ check_osxsave (void)
>    return (ecx & bit_OSXSAVE) != 0;
>  }
>
> +__attribute__((target ("no-avx512f")))
>  int
>  main ()
>  {
>
> should work.
>

Like this.  You need to check all integer patterns with mskmov and msklog.

-- 
H.J.

[-- Attachment #2: 0001-Fix-movsi_internal.patch --]
[-- Type: text/x-patch, Size: 1833 bytes --]

From ac4407e52514679012312e7f223c342f9223ddff Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 21 Aug 2020 09:42:49 -0700
Subject: [PATCH] Fix *movsi_internal

---
 gcc/config/i386/cpuid.h                      | 5 +++++
 gcc/config/i386/i386.md                      | 2 ++
 gcc/testsuite/gcc.target/i386/avx512-check.h | 5 +++++
 3 files changed, 12 insertions(+)

diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index bca61d620db..ecb249fade7 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -24,6 +24,9 @@
 #ifndef _CPUID_H_INCLUDED
 #define _CPUID_H_INCLUDED
 
+#pragma GCC push_options
+#pragma GCC target("no-avx")
+
 /* %eax */
 #define bit_AVX512BF16	(1 << 5)
 
@@ -324,4 +327,6 @@ __cpuidex (int __cpuid_info[4], int __leaf, int __subleaf)
 		 __cpuid_info[2], __cpuid_info[3]);
 }
 
+#pragma GCC pop_options
+
 #endif /* _CPUID_H_INCLUDED */
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 446793b78db..cf5828c23d5 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2349,6 +2349,8 @@ (define_insn "*movsi_internal"
   [(set (attr "isa")
      (cond [(eq_attr "alternative" "12,13")
 	      (const_string "sse2")
+	    (eq_attr "alternative" "14,15,16,17")
+	      (const_string "avx512f")
 	   ]
 	   (const_string "*")))
    (set (attr "type")
diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..094453e5e0e 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -25,6 +25,9 @@ do_test (void)
 }
 #endif
 
+#pragma GCC push_options
+#pragma GCC target("no-avx")
+
 static int
 check_osxsave (void)
 {
@@ -110,3 +113,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
-- 
2.26.2


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 16:35                       ` H.J. Lu
  2020-08-21 16:45                         ` H.J. Lu
@ 2020-08-21 16:46                         ` Hongtao Liu
  2020-08-21 17:02                           ` H.J. Lu
  1 sibling, 1 reply; 39+ messages in thread
From: Hongtao Liu @ 2020-08-21 16:46 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > > > > gcc/
> > > > > > > >         PR target/88808
> > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > >         QImode data go into mask registers.
> > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > >         for mask registers.
> > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > >         (*andqi_1): Ditto.
> > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > >         registers.
> > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > >
> > > > > > > > gcc/testsuite/
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > >
> > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > >
> > > > > > > OK for the whole patch set with the above change,
> > > > > > >
> > > > > >
> > > > > > Yes, thanks for the review.
> > > > >
> > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > >
> > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > >
> > > >
> > > > I can't reproduce this failure.
> > >
> > > Because you are running it on AVX512 enabled target.
> > >
> > > > > Program received signal SIGILL, Illegal instruction.
> > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > pointer>,
> > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > >
> > > > >    0x080490a3 <+51>:    cpuid
> > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > >
> > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > determines, if the new instructions are supported. The binary will
> > > > > crash in the detection code if the processor lacks AVX512
> > > > > instructions.
> > > > >
> > > >
> > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > >
> > > No, it could run, because it checks for AVX512BW at runtime.
> > >
> >
> > Got it.
> >
> > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > dg-require-effective-target avx512bw } */.
> > >
> > > This is to check the toolchain for support.
> > >
> > > > what's the version of your assembler?
> > >
> > > GNU assembler version 2.34-4.fc32
> > >
> >
> > If assembler supports avx512bw, but processor not, the test would pass
> > condition `dg-require-effective-target avx512bw` and be runned.
> > then crashed for illegal instruction.
> >
> > > Please add something like
> > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > >
> > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > for all other targets. This should disable direct moves for generic
> > > targets.
> > >
> >
> > Yes, I'll add it.
> >
>
>
> (define_insn "*movsi_internal"
>   [(set (match_operand:SI 0 "nonimmediate_operand"
>     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
>         (match_operand:SI 1 "general_operand"
>     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
>   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> ...
>  [(set (attr "isa")
>      (cond [(eq_attr "alternative" "12,13")
>               (const_string "sse2")
>            ]
>            (const_string "*")))
>
> is wrong.   mask register alternatives should be marked with avx512f.
> Please fix it.   Other integer move patterns may have the same issue.
> Once these are fixed,
>

It is restricted by ix86_hard_regno_mode_ok
---
18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
18977              || (TARGET_AVX512BW
18978                  && VALID_MASK_AVX512BW_MODE (mode)));
18979    }
---

> diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> b/gcc/testsuite/gcc.target/i386/avx512-check.h
> index 0a377dba1d5..576e9b390c6 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> @@ -25,6 +25,7 @@ do_test (void)
>  }
>  #endif
>
> +__attribute__((target ("no-avx512f")))
>  static int
>  check_osxsave (void)
>  {
> @@ -34,6 +35,7 @@ check_osxsave (void)
>    return (ecx & bit_OSXSAVE) != 0;
>  }
>
> +__attribute__((target ("no-avx512f")))
>  int
>  main ()
>  {
>
> should work.
>

That's what I was thinking just now.
The real problem is that we use -mavx512bw to build the binary that
determines whether the processor supports AVX512BW.

> --
> H.J.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 16:46                         ` [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks Hongtao Liu
@ 2020-08-21 17:02                           ` H.J. Lu
  2020-08-21 17:07                             ` H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-21 17:02 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

On Fri, Aug 21, 2020 at 9:46 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > > > > gcc/
> > > > > > > > >         PR target/88808
> > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > >         QImode data go into mask registers.
> > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > >         for mask registers.
> > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > >         registers.
> > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > >
> > > > > > > > > gcc/testsuite/
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > >
> > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > >
> > > > > > > > OK for the whole patch set with the above change,
> > > > > > > >
> > > > > > >
> > > > > > > Yes, thanks for the review.
> > > > > >
> > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > >
> > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > >
> > > > >
> > > > > I can't reproduce this failure.
> > > >
> > > > Because you are running it on AVX512 enabled target.
> > > >
> > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > pointer>,
> > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > >
> > > > > >    0x080490a3 <+51>:    cpuid
> > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > >
> > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > determines, if the new instructions are supported. The binary will
> > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > instructions.
> > > > > >
> > > > >
> > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > >
> > > > No, it could run, because it checks for AVX512BW at runtime.
> > > >
> > >
> > > Got it.
> > >
> > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > dg-require-effective-target avx512bw } */.
> > > >
> > > > This is to check the toolchain for support.
> > > >
> > > > > what's the version of your assembler?
> > > >
> > > > GNU assembler version 2.34-4.fc32
> > > >
> > >
> > > If assembler supports avx512bw, but processor not, the test would pass
> > > condition `dg-require-effective-target avx512bw` and be runned.
> > > then crashed for illegal instruction.
> > >
> > > > Please add something like
> > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > >
> > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > for all other targets. This should disable direct moves for generic
> > > > targets.
> > > >
> > >
> > > Yes, I'll add it.
> > >
> >
> >
> > (define_insn "*movsi_internal"
> >   [(set (match_operand:SI 0 "nonimmediate_operand"
> >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> >         (match_operand:SI 1 "general_operand"
> >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > ...
> >  [(set (attr "isa")
> >      (cond [(eq_attr "alternative" "12,13")
> >               (const_string "sse2")
> >            ]
> >            (const_string "*")))
> >
> > is wrong.   mask register alternatives should be marked with avx512f.
> > Please fix it.   Other integer move patterns may have the same issue.
> > Once these are fixed,
> >
>
> It is restricted by ix86_hard_regno_mode_ok
> ---
> 18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
> 18977              || (TARGET_AVX512BW
> 18978                  && VALID_MASK_AVX512BW_MODE (mode)));
> 18979    }
> ---

That restriction may not be checked by the register allocator (RA).

> > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > index 0a377dba1d5..576e9b390c6 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > @@ -25,6 +25,7 @@ do_test (void)
> >  }
> >  #endif
> >
> > +__attribute__((target ("no-avx512f")))
> >  static int
> >  check_osxsave (void)
> >  {
> > @@ -34,6 +35,7 @@ check_osxsave (void)
> >    return (ecx & bit_OSXSAVE) != 0;
> >  }
> >
> > +__attribute__((target ("no-avx512f")))
> >  int
> >  main ()
> >  {
> >
> > should work.
> >
>
> That's what i thought right now.
> The real problem is we use -mavx512bw to build a binary to determine
> whether the processor support AVX512BW
>

This isn't a problem as long as AVX512BW isn't used in the AVX512BW detection
code.   All CPUID code should have

#pragma GCC push_options
#pragma GCC target("no-avx")  // no-sse?

Otherwise we will run into problems with integer registers being spilled
using AVX instructions.


-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 17:02                           ` H.J. Lu
@ 2020-08-21 17:07                             ` H.J. Lu
  2020-08-21 17:29                               ` Hongtao Liu
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-21 17:07 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

On Fri, Aug 21, 2020 at 10:02 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:46 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > >
> > > > > > > > > > gcc/
> > > > > > > > > >         PR target/88808
> > > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > > >         QImode data go into mask registers.
> > > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > > >         for mask registers.
> > > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > > >         registers.
> > > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > > >
> > > > > > > > > > gcc/testsuite/
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > > >
> > > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > > >
> > > > > > > > > OK for the whole patch set with the above change,
> > > > > > > > >
> > > > > > > >
> > > > > > > > Yes, thanks for the review.
> > > > > > >
> > > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > > >
> > > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > > >
> > > > > >
> > > > > > I can't reproduce this failure.
> > > > >
> > > > > Because you are running it on AVX512 enabled target.
> > > > >
> > > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > > pointer>,
> > > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > > >
> > > > > > >    0x080490a3 <+51>:    cpuid
> > > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > > >
> > > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > > determines, if the new instructions are supported. The binary will
> > > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > > instructions.
> > > > > > >
> > > > > >
> > > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > > >
> > > > > No, it could run, because it checks for AVX512BW at runtime.
> > > > >
> > > >
> > > > Got it.
> > > >
> > > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > > dg-require-effective-target avx512bw } */.
> > > > >
> > > > > This is to check the toolchain for support.
> > > > >
> > > > > > what's the version of your assembler?
> > > > >
> > > > > GNU assembler version 2.34-4.fc32
> > > > >
> > > >
> > > > If assembler supports avx512bw, but processor not, the test would pass
> > > > condition `dg-require-effective-target avx512bw` and be runned.
> > > > then crashed for illegal instruction.
> > > >
> > > > > Please add something like
> > > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > > >
> > > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > > for all other targets. This should disable direct moves for generic
> > > > > targets.
> > > > >
> > > >
> > > > Yes, I'll add it.
> > > >
> > >
> > >
> > > (define_insn "*movsi_internal"
> > >   [(set (match_operand:SI 0 "nonimmediate_operand"
> > >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > >         (match_operand:SI 1 "general_operand"
> > >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > ...
> > >  [(set (attr "isa")
> > >      (cond [(eq_attr "alternative" "12,13")
> > >               (const_string "sse2")
> > >            ]
> > >            (const_string "*")))
> > >
> > > is wrong.   mask register alternatives should be marked with avx512f.
> > > Please fix it.   Other integer move patterns may have the same issue.
> > > Once these are fixed,
> > >
> >
> > It is restricted by ix86_hard_regno_mode_ok
> > ---
> > 18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
> > 18977              || (TARGET_AVX512BW
> > 18978                  && VALID_MASK_AVX512BW_MODE (mode)));
> > 18979    }
> > ---
>
> It may not be checked by RA.

See:

https://gcc.gnu.org/pipermail/gcc-patches/2020-April/543838.html

> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > index 0a377dba1d5..576e9b390c6 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > @@ -25,6 +25,7 @@ do_test (void)
> > >  }
> > >  #endif
> > >
> > > +__attribute__((target ("no-avx512f")))
> > >  static int
> > >  check_osxsave (void)
> > >  {
> > > @@ -34,6 +35,7 @@ check_osxsave (void)
> > >    return (ecx & bit_OSXSAVE) != 0;
> > >  }
> > >
> > > +__attribute__((target ("no-avx512f")))
> > >  int
> > >  main ()
> > >  {
> > >
> > > should work.
> > >
> >
> > That's what i thought right now.
> > The real problem is we use -mavx512bw to build a binary to determine
> > whether the processor support AVX512BW
> >
>
> This isn't a problem as long as AVX512BW isn't used in AVX512BW detection
> codes.   All CPUID codes should have
>
> #pragma GCC push_options
> #pragma GCC target("no-avx")  // no-sse?
>
> Otherwise we will run into problems with integer register spill with
> AVX instructions.
>
>
> --
> H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 17:07                             ` H.J. Lu
@ 2020-08-21 17:29                               ` Hongtao Liu
  0 siblings, 0 replies; 39+ messages in thread
From: Hongtao Liu @ 2020-08-21 17:29 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

On Sat, Aug 22, 2020 at 1:08 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 10:02 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:46 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Sat, Aug 22, 2020 at 12:36 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > > >
> > > > > > > > > > > gcc/
> > > > > > > > > > >         PR target/88808
> > > > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > > > >         QImode data go into mask registers.
> > > > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > > > >         for mask registers.
> > > > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > > > >         registers.
> > > > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > > > >
> > > > > > > > > > > gcc/testsuite/
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > > > >
> > > > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > > > >
> > > > > > > > > > OK for the whole patch set with the above change,
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Yes, thanks for the review.
> > > > > > > >
> > > > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > > > >
> > > > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > > > >
> > > > > > >
> > > > > > > I can't reproduce this failure.
> > > > > >
> > > > > > Because you are running it on AVX512 enabled target.
> > > > > >
> > > > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > > > pointer>,
> > > > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > > > >
> > > > > > > >    0x080490a3 <+51>:    cpuid
> > > > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > > > >
> > > > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > > > determines, if the new instructions are supported. The binary will
> > > > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > > > instructions.
> > > > > > > >
> > > > > > >
> > > > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > > > >
> > > > > > No, it could run, because it checks for AVX512BW at runtime.
> > > > > >
> > > > >
> > > > > Got it.
> > > > >
> > > > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > > > dg-require-effective-target avx512bw } */.
> > > > > >
> > > > > > This is to check the toolchain for support.
> > > > > >
> > > > > > > what's the version of your assembler?
> > > > > >
> > > > > > GNU assembler version 2.34-4.fc32
> > > > > >
> > > > >
> > > > > If assembler supports avx512bw, but processor not, the test would pass
> > > > > condition `dg-require-effective-target avx512bw` and be runned.
> > > > > then crashed for illegal instruction.
> > > > >
> > > > > > Please add something like
> > > > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > > > >
> > > > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > > > for all other targets. This should disable direct moves for generic
> > > > > > targets.
> > > > > >
> > > > >
> > > > > Yes, I'll add it.
> > > > >
> > > >
> > > >
> > > > (define_insn "*movsi_internal"
> > > >   [(set (match_operand:SI 0 "nonimmediate_operand"
> > > >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > >         (match_operand:SI 1 "general_operand"
> > > >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > > ...
> > > >  [(set (attr "isa")
> > > >      (cond [(eq_attr "alternative" "12,13")
> > > >               (const_string "sse2")
> > > >            ]
> > > >            (const_string "*")))
> > > >
> > > > is wrong.   mask register alternatives should be marked with avx512f.
> > > > Please fix it.   Other integer move patterns may have the same issue.
> > > > Once these are fixed,
> > > >
> > >
> > > It is restricted by ix86_hard_regno_mode_ok
> > > ---
> > > 18976      return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
> > > 18977              || (TARGET_AVX512BW
> > > 18978                  && VALID_MASK_AVX512BW_MODE (mode)));
> > > 18979    }
> > > ---
> >
> > It may not be checked by RA.
>

It is also restricted by the "k" constraint in constraints.md:
(define_register_constraint "k" "TARGET_AVX512F ? ALL_MASK_REGS : NO_REGS"
"@internal Any mask register.")

> See:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2020-April/543838.html
>

(define_register_constraint "v" "TARGET_SSE ? ALL_SSE_REGS : NO_REGS"
 "Any EVEX encodable SSE register (@code{%xmm0-%xmm31}).")
Does this need adjustment?

> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > index 0a377dba1d5..576e9b390c6 100644
> > > > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > @@ -25,6 +25,7 @@ do_test (void)
> > > >  }
> > > >  #endif
> > > >
> > > > +__attribute__((target ("no-avx512f")))
> > > >  static int
> > > >  check_osxsave (void)
> > > >  {
> > > > @@ -34,6 +35,7 @@ check_osxsave (void)
> > > >    return (ecx & bit_OSXSAVE) != 0;
> > > >  }
> > > >
> > > > +__attribute__((target ("no-avx512f")))
> > > >  int
> > > >  main ()
> > > >  {
> > > >
> > > > should work.
> > > >
> > >
> > > That's what i thought right now.
> > > The real problem is we use -mavx512bw to build a binary to determine
> > > whether the processor support AVX512BW
> > >
> >
> > This isn't a problem as long as AVX512BW isn't used in AVX512BW detection
> > codes.   All CPUID codes should have
> >
> > #pragma GCC push_options
> > #pragma GCC target("no-avx")  // no-sse?
> >
> > Otherwise we will run into problems with integer register spill with
> > AVX instructions.
> >
> >
> > --
> > H.J.
>
>
>
> --
> H.J.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks.
  2020-08-21 16:29                     ` Hongtao Liu
  2020-08-21 16:35                       ` H.J. Lu
@ 2020-08-21 18:25                       ` Uros Bizjak
  1 sibling, 0 replies; 39+ messages in thread
From: Uros Bizjak @ 2020-08-21 18:25 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches, Kirill Yukhin, H. J. Lu

On Fri, Aug 21, 2020 at 6:29 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > > > > gcc/
> > > > > > >         PR target/88808
> > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > >         QImode data to go into mask registers.
> > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > >         for mask registers.
> > > > > > >         (*movqi_internal): Ditto.
> > > > > > >         (*anddi_1): Support mask register operations
> > > > > > >         (*and<mode>_1): Ditto.
> > > > > > >         (*andqi_1): Ditto.
> > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > >         (*<code>qi_1): Ditto.
> > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > >         registers.
> > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > >
> > > > > > > gcc/testsuite/
> > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > >
> > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > >
> > > > > > OK for the whole patch set with the above change,
> > > > > >
> > > > >
> > > > > Yes, thanks for the review.
> > > >
> > > > Please note that your patch introduces several testsuite failures with -m32:
> > > >
> > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > >
> > >
> > > I can't reproduce this failure.
> >
> > Because you are running it on AVX512 enabled target.
> >
> > > > Program received signal SIGILL, Illegal instruction.
> > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > pointer>,
> > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > >
> > > >    0x080490a3 <+51>:    cpuid
> > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > >
> > > > A kmov insn is generated in the __cpuid_count function, where the binary
> > > > determines whether the new instructions are supported. The binary will
> > > > crash in the detection code if the processor lacks AVX512
> > > > instructions.
> > > >
> > >
> > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> >
> > No, it could run, because it checks for AVX512BW at runtime.
> >
>
> Got it.
>
> > > Because in avx512bitalg-vpopcntb-1.c, there's /* {
> > > dg-require-effective-target avx512bw } */.
> >
> > This is to check the toolchain for support.
> >
> > > what's the version of your assembler?
> >
> > GNU assembler version 2.34-4.fc32
> >
>
> If the assembler supports avx512bw but the processor does not, the test
> would pass the `dg-require-effective-target avx512bw` condition and be run,
> then crash with an illegal instruction.

Please look at avx512-check.h. This is where the main function lives.

> > Please add something like
> > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> >
> > Handle this in inline_secondary_memory_needed to reject direct moves
> > for all other targets. This should disable direct moves for generic
> > targets.
> >
>
> Yes, I'll add it.

Thanks.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH] x86: Disable SSE, AVX and AVX512 during CPUID check
  2020-08-21 16:45                         ` H.J. Lu
@ 2020-08-22 16:26                           ` H.J. Lu
  2020-08-22 17:11                             ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-22 16:26 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Uros Bizjak, GCC Patches, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 6957 bytes --]

On Fri, Aug 21, 2020 at 9:45 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > > > > gcc/
> > > > > > > > >         PR target/88808
> > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > >         QImode data go into mask registers.
> > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > >         for mask registers.
> > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > >         registers.
> > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > >
> > > > > > > > > gcc/testsuite/
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > >
> > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > >
> > > > > > > > OK for the whole patch set with the above change,
> > > > > > > >
> > > > > > >
> > > > > > > Yes, thanks for the review.
> > > > > >
> > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > >
> > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > >
> > > > >
> > > > > I can't reproduce this failure.
> > > >
> > > > Because you are running it on AVX512 enabled target.
> > > >
> > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > pointer>,
> > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > >
> > > > > >    0x080490a3 <+51>:    cpuid
> > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > >
> > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > determines, if the new instructions are supported. The binary will
> > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > instructions.
> > > > > >
> > > > >
> > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > >
> > > > No, it could run, because it checks for AVX512BW at runtime.
> > > >
> > >
> > > Got it.
> > >
> > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > dg-require-effective-target avx512bw } */.
> > > >
> > > > This is to check the toolchain for support.
> > > >
> > > > > what's the version of your assembler?
> > > >
> > > > GNU assembler version 2.34-4.fc32
> > > >
> > >
> > > If assembler supports avx512bw, but processor not, the test would pass
> > > condition `dg-require-effective-target avx512bw` and be runned.
> > > then crashed for illegal instruction.
> > >
> > > > Please add something like
> > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > >
> > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > for all other targets. This should disable direct moves for generic
> > > > targets.
> > > >
> > >
> > > Yes, I'll add it.
> > >
> >
> >
> > (define_insn "*movsi_internal"
> >   [(set (match_operand:SI 0 "nonimmediate_operand"
> >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> >         (match_operand:SI 1 "general_operand"
> >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > ...
> >  [(set (attr "isa")
> >      (cond [(eq_attr "alternative" "12,13")
> >               (const_string "sse2")
> >            ]
> >            (const_string "*")))
> >
> > is wrong.  Mask register alternatives should be marked with avx512f.
> > Please fix it.   Other integer move patterns may have the same issue.
> > Once these are fixed,
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > index 0a377dba1d5..576e9b390c6 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > @@ -25,6 +25,7 @@ do_test (void)
> >  }
> >  #endif
> >
> > +__attribute__((target ("no-avx512f")))
> >  static int
> >  check_osxsave (void)
> >  {
> > @@ -34,6 +35,7 @@ check_osxsave (void)
> >    return (ecx & bit_OSXSAVE) != 0;
> >  }
> >
> > +__attribute__((target ("no-avx512f")))
> >  int
> >  main ()
> >  {
> >
> > should work.
> >
>
> Like this.  You need to check all integer patterns with mskmov and msklog.

Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
AVX512 during CPUID check to avoid vector and mask register operations.

Note: -mfpmath=387 is needed to override -mfpmath=sse.

OK for master branch?

-- 
H.J.

[-- Attachment #2: 0001-x86-Disable-SSE-AVX-and-AVX512-during-CPUID-check.patch --]
[-- Type: text/x-patch, Size: 6341 bytes --]

From 7e1ccab16bfbea9c6836296936e87e783290db24 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 21 Aug 2020 09:42:49 -0700
Subject: [PATCH] x86: Disable SSE, AVX and AVX512 during CPUID check

Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
AVX512 during CPUID check to avoid vector and mask register operations.

Note: -mfpmath=387 is needed to override -mfpmath=sse.

gcc/

	PR target/96744
	* config/i386/cpuid.h: Add #pragma GCC target("no-sse,fpmath=387")
	to disable SSE, AVX and AVX512.

gcc/testsuite/

	PR target/96744
	* gcc.target/i386/adx-check.h: Add #pragma GCC target("no-sse,fpmath=387")
	to disable SSE, AVX and AVX512.
	* gcc.target/i386/avx2-check.h: Likewise.
	* gcc.target/i386/avx512-check.h: Likewise.
	* gcc.target/i386/bmi-check.h: Likewise.
	* gcc.target/i386/bmi2-check.h: Likewise.
	* gcc.target/i386/pr77756.c: Likewise.
	* gcc.target/i386/pr95973.c: Likewise.
	* gcc.target/i386/rtm-check.h: Likewise.
	* gcc.target/i386/sha-check.h: Likewise.
---
 gcc/config/i386/cpuid.h                      | 5 +++++
 gcc/testsuite/gcc.target/i386/adx-check.h    | 4 ++++
 gcc/testsuite/gcc.target/i386/avx2-check.h   | 5 +++++
 gcc/testsuite/gcc.target/i386/avx512-check.h | 5 +++++
 gcc/testsuite/gcc.target/i386/bmi-check.h    | 5 +++++
 gcc/testsuite/gcc.target/i386/bmi2-check.h   | 5 +++++
 gcc/testsuite/gcc.target/i386/pr77756.c      | 5 +++++
 gcc/testsuite/gcc.target/i386/pr95973.c      | 5 +++++
 gcc/testsuite/gcc.target/i386/rtm-check.h    | 5 +++++
 gcc/testsuite/gcc.target/i386/sha-check.h    | 5 +++++
 10 files changed, 49 insertions(+)

diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index bca61d620db..01aba002bcf 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -24,6 +24,9 @@
 #ifndef _CPUID_H_INCLUDED
 #define _CPUID_H_INCLUDED
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 /* %eax */
 #define bit_AVX512BF16	(1 << 5)
 
@@ -324,4 +327,6 @@ __cpuidex (int __cpuid_info[4], int __leaf, int __subleaf)
 		 __cpuid_info[2], __cpuid_info[3]);
 }
 
+#pragma GCC pop_options
+
 #endif /* _CPUID_H_INCLUDED */
diff --git a/gcc/testsuite/gcc.target/i386/adx-check.h b/gcc/testsuite/gcc.target/i386/adx-check.h
index cfed1a38483..942f248df43 100644
--- a/gcc/testsuite/gcc.target/i386/adx-check.h
+++ b/gcc/testsuite/gcc.target/i386/adx-check.h
@@ -8,6 +8,9 @@ static void __attribute__ ((noinline)) do_test (void)
   adx_test ();
 }
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 int
 main ()
 {
@@ -32,3 +35,4 @@ main ()
   return 0;
 }
 
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/avx2-check.h b/gcc/testsuite/gcc.target/i386/avx2-check.h
index 25bed5e0da6..861308ceb5c 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx2-check.h
@@ -10,6 +10,9 @@ static void __attribute__ ((noinline)) do_test (void)
   avx2_test ();
 }
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 static int
 check_osxsave (void)
 {
@@ -42,3 +45,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..74e1cce16c3 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -25,6 +25,9 @@ do_test (void)
 }
 #endif
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 static int
 check_osxsave (void)
 {
@@ -110,3 +113,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/bmi-check.h b/gcc/testsuite/gcc.target/i386/bmi-check.h
index 1973f3b6468..35b46528d8c 100644
--- a/gcc/testsuite/gcc.target/i386/bmi-check.h
+++ b/gcc/testsuite/gcc.target/i386/bmi-check.h
@@ -12,6 +12,9 @@ do_test (void)
   bmi_test ();
 }
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 int
 main ()
 {
@@ -35,3 +38,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-check.h b/gcc/testsuite/gcc.target/i386/bmi2-check.h
index ba91ef9b780..06e738e704d 100644
--- a/gcc/testsuite/gcc.target/i386/bmi2-check.h
+++ b/gcc/testsuite/gcc.target/i386/bmi2-check.h
@@ -11,6 +11,9 @@ do_test (void)
   bmi2_test ();
 }
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 int
 main ()
 {
@@ -34,3 +37,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/pr77756.c b/gcc/testsuite/gcc.target/i386/pr77756.c
index 1eee7cd5a00..3d2f371d1f0 100644
--- a/gcc/testsuite/gcc.target/i386/pr77756.c
+++ b/gcc/testsuite/gcc.target/i386/pr77756.c
@@ -2,6 +2,9 @@
 
 #include "cpuid.h"
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 int
 main ()
 {
@@ -20,3 +23,5 @@ main ()
 
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/pr95973.c b/gcc/testsuite/gcc.target/i386/pr95973.c
index 08c7dba8f46..f1d3b7870a3 100644
--- a/gcc/testsuite/gcc.target/i386/pr95973.c
+++ b/gcc/testsuite/gcc.target/i386/pr95973.c
@@ -4,6 +4,9 @@
 #include <cpuid.h>
 #include <cpuid.h>
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 int
 main ()
 {
@@ -23,3 +26,5 @@ main ()
 
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/rtm-check.h b/gcc/testsuite/gcc.target/i386/rtm-check.h
index bdb5a6dc0bf..f131a4135ca 100644
--- a/gcc/testsuite/gcc.target/i386/rtm-check.h
+++ b/gcc/testsuite/gcc.target/i386/rtm-check.h
@@ -8,6 +8,9 @@ static void __attribute__ ((noinline)) do_test (void)
   rtm_test ();
 }
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 int
 main ()
 {
@@ -31,3 +34,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/sha-check.h b/gcc/testsuite/gcc.target/i386/sha-check.h
index 5bc5a59ab80..2c2c4e94403 100644
--- a/gcc/testsuite/gcc.target/i386/sha-check.h
+++ b/gcc/testsuite/gcc.target/i386/sha-check.h
@@ -10,6 +10,9 @@ do_test (void)
   sha_test ();
 }
 
+#pragma GCC push_options
+#pragma GCC target("no-sse,fpmath=387")
+
 int
 main ()
 {
@@ -33,3 +36,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
-- 
2.26.2


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Disable SSE, AVX and AVX512 during CPUID check
  2020-08-22 16:26                           ` [PATCH] x86: Disable SSE, AVX and AVX512 during CPUID check H.J. Lu
@ 2020-08-22 17:11                             ` Uros Bizjak
  2020-08-22 19:08                               ` [PATCH] x86: Only use general-purpose registers " H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-22 17:11 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

On Sat, Aug 22, 2020 at 6:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Aug 21, 2020 at 9:45 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > >
> > > > > > > > > > gcc/
> > > > > > > > > >         PR target/88808
> > > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > > >         QImode data go into mask registers.
> > > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > > >         for mask registers.
> > > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > > >         registers.
> > > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > > >
> > > > > > > > > > gcc/testsuite/
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > > >
> > > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > > >
> > > > > > > > > OK for the whole patch set with the above change,
> > > > > > > > >
> > > > > > > >
> > > > > > > > Yes, thanks for the review.
> > > > > > >
> > > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > > >
> > > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > > >
> > > > > >
> > > > > > I can't reproduce this failure.
> > > > >
> > > > > Because you are running it on AVX512 enabled target.
> > > > >
> > > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > > pointer>,
> > > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > > >
> > > > > > >    0x080490a3 <+51>:    cpuid
> > > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > > >
> > > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > > determines, if the new instructions are supported. The binary will
> > > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > > instructions.
> > > > > > >
> > > > > >
> > > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > > >
> > > > > No, it could run, because it checks for AVX512BW at runtime.
> > > > >
> > > >
> > > > Got it.
> > > >
> > > > > > Because in  avx512bitalg-vpopcntb-1.c, there's /*
> > > > > > dg-require-effective-target avx512bw } */.
> > > > >
> > > > > This is to check the toolchain for support.
> > > > >
> > > > > > what's the version of your assembler?
> > > > >
> > > > > GNU assembler version 2.34-4.fc32
> > > > >
> > > >
> > > > If assembler supports avx512bw, but processor not, the test would pass
> > > > condition `dg-require-effective-target avx512bw` and be runned.
> > > > then crashed for illegal instruction.
> > > >
> > > > > Please add something like
> > > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > > >
> > > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > > for all other targets. This should disable direct moves for generic
> > > > > targets.
> > > > >
> > > >
> > > > Yes, I'll add it.
> > > >
> > >
> > >
> > > (define_insn "*movsi_internal"
> > >   [(set (match_operand:SI 0 "nonimmediate_operand"
> > >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > >         (match_operand:SI 1 "general_operand"
> > >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > ...
> > >  [(set (attr "isa")
> > >      (cond [(eq_attr "alternative" "12,13")
> > >               (const_string "sse2")
> > >            ]
> > >            (const_string "*")))
> > >
> > > is wrong.   mask register alternatives should be marked with avx512f.
> > > Please fix it.   Other integer move patterns may have the same issue.
> > > Once these are fixed,
> > >
> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > index 0a377dba1d5..576e9b390c6 100644
> > > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > @@ -25,6 +25,7 @@ do_test (void)
> > >  }
> > >  #endif
> > >
> > > +__attribute__((target ("no-avx512f")))
> > >  static int
> > >  check_osxsave (void)
> > >  {
> > > @@ -34,6 +35,7 @@ check_osxsave (void)
> > >    return (ecx & bit_OSXSAVE) != 0;
> > >  }
> > >
> > > +__attribute__((target ("no-avx512f")))
> > >  int
> > >  main ()
> > >  {
> > >
> > > should work.
> > >
> >
> > Like this.  You need to check all integer patterns with mskmov and msklog.
>
> Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> AVX512 during CPUID check to avoid vector and mask register operations.

-mgeneral-regs-only ?

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH] x86: Only use general-purpose registers during CPUID check
  2020-08-22 17:11                             ` Uros Bizjak
@ 2020-08-22 19:08                               ` H.J. Lu
  2020-08-23  8:18                                 ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-22 19:08 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 7860 bytes --]

On Sat, Aug 22, 2020 at 10:11 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Sat, Aug 22, 2020 at 6:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Aug 21, 2020 at 9:45 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Aug 21, 2020 at 9:35 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Fri, Aug 21, 2020 at 9:29 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Fri, Aug 21, 2020 at 11:50 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Fri, Aug 21, 2020 at 5:41 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > On Fri, Aug 21, 2020 at 9:15 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > > > >
> > > > > > > > > > > gcc/
> > > > > > > > > > >         PR target/88808
> > > > > > > > > > >         * config/i386/i386.c (ix86_preferred_reload_class): Allow
> > > > > > > > > > >         QImode data go into mask registers.
> > > > > > > > > > >         * config/i386/i386.md: (*movhi_internal): Adjust constraints
> > > > > > > > > > >         for mask registers.
> > > > > > > > > > >         (*movqi_internal): Ditto.
> > > > > > > > > > >         (*anddi_1): Support mask register operations
> > > > > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > > > > >         (*andqi_1): Ditto.
> > > > > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > > > > >         (define_peephole2): Move constant 0/-1 directly into mask
> > > > > > > > > > >         registers.
> > > > > > > > > > >         * config/i386/predicates.md (mask_reg_operand): New predicate.
> > > > > > > > > > >         * config/i386/sse.md (define_split): Add post-reload splitters
> > > > > > > > > > >         that would convert "generic" patterns to mask patterns.
> > > > > > > > > > >         (*knotsi_1_zext): New define_insn.
> > > > > > > > > > >
> > > > > > > > > > > gcc/testsuite/
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-1.c: New test.
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-2.c: New test.
> > > > > > > > > > >         * gcc.target/i386/bitwise_mask_op-3.c: New test.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-pr88465.c: New testcase.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-1.c: Adjust testcase.
> > > > > > > > > > >         * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto.
> > > > > > > > > > >         * gcc.target/i386/avx512dq-kmovb-5.c: Ditto.
> > > > > > > > > > >         * gcc.target/i386/avx512f-kmovw-5.c: Ditto.
> > > > > > > > > >
> > > > > > > > > > A little nit, please put new splitters after the instruction pattern.
> > > > > > > > > >
> > > > > > > > > > OK for the whole patch set with the above change,
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Yes, thanks for the review.
> > > > > > > >
> > > > > > > > Please note that your patch introduces several testsuite fails with -m32:
> > > > > > > >
> > > > > > > > gcc -O2 -mavx512bitalg -mavx512bw -m32 -g avx512bitalg-vpopcntb-1.c
> > > > > > > >
> > > > > > >
> > > > > > > I can't reproduce this failure.
> > > > > >
> > > > > > Because you are running it on AVX512 enabled target.
> > > > > >
> > > > > > > > Program received signal SIGILL, Illegal instruction.
> > > > > > > > 0x080490ac in __get_cpuid_count (__edx=<synthetic pointer>,
> > > > > > > > __ecx=<synthetic pointer>, __ebx=<synthetic pointer>, __eax=<synthetic
> > > > > > > > pointer>,
> > > > > > > >     __subleaf=0, __leaf=7) at /hdd/uros/gcc-build-fast/gcc/include/cpuid.h:316
> > > > > > > > 316       __cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
> > > > > > > >
> > > > > > > >    0x080490a3 <+51>:    cpuid
> > > > > > > >    0x080490a5 <+53>:    mov    $0x1,%eax
> > > > > > > >    0x080490aa <+58>:    mov    %ecx,%esi
> > > > > > > > => 0x080490ac <+60>:    kmovd  %ebx,%k0
> > > > > > > >    0x080490b0 <+64>:    mov    %edi,%ecx
> > > > > > > >    0x080490b2 <+66>:    mov    %edi,%ebx
> > > > > > > >
> > > > > > > > kmov insn is generated for __cpuid_count function, where the binary
> > > > > > > > determines, if the new instructions are supported. The binary will
> > > > > > > > crash in the detection code if the processor lacks AVX512
> > > > > > > > instructions.
> > > > > > > >
> > > > > > >
> > > > > > > IMHO, the testcase shouldn't be run on processors without AVX512BW.
> > > > > >
> > > > > > No, it could run, because it checks for AVX512BW at runtime.
> > > > > >
> > > > >
> > > > > Got it.
> > > > >
> > > > > > > Because in avx512bitalg-vpopcntb-1.c, there's /* {
> > > > > > > dg-require-effective-target avx512bw } */.
> > > > > >
> > > > > > This is to check the toolchain for support.
> > > > > >
> > > > > > > what's the version of your assembler?
> > > > > >
> > > > > > GNU assembler version 2.34-4.fc32
> > > > > >
> > > > >
> > > > > If the assembler supports avx512bw but the processor does not, the test
> > > > > would pass the condition `dg-require-effective-target avx512bw` and be
> > > > > run, then crash with an illegal instruction.
> > > > >
> > > > > > Please add something like
> > > > > > X86_TUNE_INTER_UNIT_MOVES_FROM_MASK/X86_TUNE_INTER_UNIT_MOVES_TO_MASK
> > > > > > and enable them only for m_CORE_AVX512 (or perhaps m_INTEL).
> > > > > >
> > > > > > Handle this in inline_secondary_memory_needed to reject direct moves
> > > > > > for all other targets. This should disable direct moves for generic
> > > > > > targets.
> > > > > >
> > > > >
> > > > > Yes, I'll add it.
> > > > >
> > > >
> > > >
> > > > (define_insn "*movsi_internal"
> > > >   [(set (match_operand:SI 0 "nonimmediate_operand"
> > > >     "=r,m ,*y,*y,?*y,?m,?r,?*y,*v,*v,*v,m ,?r,?*v,*k,*k ,*rm,*k")
> > > >         (match_operand:SI 1 "general_operand"
> > > >     "g ,re,C ,*y,m  ,*y,*y,r  ,C ,*v,m ,*v,*v,r  ,*r,*km,*k ,CBC"))]
> > > >   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> > > > ...
> > > >  [(set (attr "isa")
> > > >      (cond [(eq_attr "alternative" "12,13")
> > > >               (const_string "sse2")
> > > >            ]
> > > >            (const_string "*")))
> > > >
> > > > is wrong.   mask register alternatives should be marked with avx512f.
> > > > Please fix it.   Other integer move patterns may have the same issue.
> > > > Once these are fixed,
> > > >
> > > > diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > index 0a377dba1d5..576e9b390c6 100644
> > > > --- a/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > +++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
> > > > @@ -25,6 +25,7 @@ do_test (void)
> > > >  }
> > > >  #endif
> > > >
> > > > +__attribute__((target ("no-avx512f")))
> > > >  static int
> > > >  check_osxsave (void)
> > > >  {
> > > > @@ -34,6 +35,7 @@ check_osxsave (void)
> > > >    return (ecx & bit_OSXSAVE) != 0;
> > > >  }
> > > >
> > > > +__attribute__((target ("no-avx512f")))
> > > >  int
> > > >  main ()
> > > >  {
> > > >
> > > > should work.
> > > >
> > >
> > > Like this.  You need to check all integer patterns with mskmov and msklog.
> >
> > Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> > AVX512 during CPUID check to avoid vector and mask register operations.
>
> -mgeneral-regs-only ?
>

Here is a patch to add target("general-regs-only") function
attribute and use it for CPUID check.   OK for master if there
are no regressions?

Thanks.

-- 
H.J.

[-- Attachment #2: 0001-x86-Only-use-general-purpose-registers-during-CPUID-.patch --]
[-- Type: text/x-patch, Size: 9743 bytes --]

From fda2012d7b2100eabec4610595c113fa5fc83638 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 21 Aug 2020 09:42:49 -0700
Subject: [PATCH] x86: Only use general-purpose registers during CPUID check

Compile CPUID check with -mgeneral-regs-only or -mno-sse -mfpmath=387
target attribute to only use general-purpose registers.

Note: -mno-sse -mfpmath=387 target attribute is used for GCCs older than
GCC 11.

gcc/

	PR target/96744
	* common/config/i386/i386-common.c (ix86_handle_option): Set
	x_ix86_fpmath to FPMATH_387 for -mgeneral-regs-only.
	* config/i386/cpuid.h: Add #pragma GCC target("general-regs-only")
	or #pragma GCC target("no-sse,fpmath=387") to only use
	general-purpose registers.
	* config/i386/i386-options.c (IX86_ATTR_IX86_YES): New.
	(IX86_ATTR_IX86_NO): Likewise.
	(ix86_opt_type): Add ix86_opt_ix86_yes and ix86_opt_ix86_no.
	(ix86_valid_target_attribute_inner_p): Handle general-regs-only,
	ix86_opt_ix86_yes and ix86_opt_ix86_no.
	* doc/extend.texi: Document target("general-regs-only") function
	attribute.

gcc/testsuite/

	PR target/96744
	* gcc.target/i386/adx-check.h: Add
	__attribute__((__target__("general-regs-only"))) to only use
	general-purpose registers.
	* gcc.target/i386/bmi-check.h: Likewise.
	* gcc.target/i386/bmi2-check.h: Likewise.
	* gcc.target/i386/pr77756.c: Likewise.
	* gcc.target/i386/pr95973.c: Likewise.
	* gcc.target/i386/rtm-check.h: Likewise.
	* gcc.target/i386/sha-check.h: Likewise.
	* gcc.target/i386/avx2-check.h: Add
	#pragma GCC target("general-regs-only") to only use
	general-purpose registers.
	* gcc.target/i386/avx512-check.h: Likewise.
---
 gcc/common/config/i386/i386-common.c         |  1 +
 gcc/config/i386/cpuid.h                      |  9 +++++
 gcc/config/i386/i386-options.c               | 37 ++++++++++++++++++++
 gcc/doc/extend.texi                          |  4 +++
 gcc/testsuite/gcc.target/i386/adx-check.h    |  2 +-
 gcc/testsuite/gcc.target/i386/avx2-check.h   |  5 +++
 gcc/testsuite/gcc.target/i386/avx512-check.h |  5 +++
 gcc/testsuite/gcc.target/i386/bmi-check.h    |  1 +
 gcc/testsuite/gcc.target/i386/bmi2-check.h   |  1 +
 gcc/testsuite/gcc.target/i386/pr77756.c      |  1 +
 gcc/testsuite/gcc.target/i386/pr95973.c      |  1 +
 gcc/testsuite/gcc.target/i386/rtm-check.h    |  1 +
 gcc/testsuite/gcc.target/i386/sha-check.h    |  1 +
 13 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index bb14305ad7b..77a67d0dd20 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -333,6 +333,7 @@ ix86_handle_option (struct gcc_options *opts,
 	    |= OPTION_MASK_ISA2_GENERAL_REGS_ONLY_UNSET;
 
 	  opts->x_target_flags &= ~MASK_80387;
+	  opts->x_ix86_fpmath = FPMATH_387;
 	}
       else
 	gcc_unreachable ();
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index bca61d620db..d3bcf4ae313 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -24,6 +24,13 @@
 #ifndef _CPUID_H_INCLUDED
 #define _CPUID_H_INCLUDED
 
+#pragma GCC push_options
+#if __GNUC__ >= 11
+#pragma GCC target("general-regs-only")
+#else
+#pragma GCC target("no-sse,fpmath=387")
+#endif
+
 /* %eax */
 #define bit_AVX512BF16	(1 << 5)
 
@@ -324,4 +331,6 @@ __cpuidex (int __cpuid_info[4], int __leaf, int __subleaf)
 		 __cpuid_info[2], __cpuid_info[3]);
 }
 
+#pragma GCC pop_options
+
 #endif /* _CPUID_H_INCLUDED */
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index 26d1ea18ef1..27cba65ccf9 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -922,12 +922,18 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
 #define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
 #define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
+#define IX86_ATTR_IX86_YES(S,O,M) \
+  { S, sizeof (S)-1, ix86_opt_ix86_yes, O, M }
+#define IX86_ATTR_IX86_NO(S,O,M) \
+  { S, sizeof (S)-1, ix86_opt_ix86_no,  O, M }
 
   enum ix86_opt_type
   {
     ix86_opt_unknown,
     ix86_opt_yes,
     ix86_opt_no,
+    ix86_opt_ix86_yes,
+    ix86_opt_ix86_no,
     ix86_opt_str,
     ix86_opt_enum,
     ix86_opt_isa
@@ -1062,6 +1068,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
     IX86_ATTR_YES ("recip",
 		   OPT_mrecip,
 		   MASK_RECIP),
+
+    IX86_ATTR_IX86_YES ("general-regs-only",
+			OPT_mgeneral_regs_only,
+			OPTION_MASK_GENERAL_REGS_ONLY),
   };
 
   location_t loc
@@ -1175,6 +1185,33 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
 	    opts->x_target_flags &= ~mask;
 	}
 
+      else if (type == ix86_opt_ix86_yes || type == ix86_opt_ix86_no)
+	{
+	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY)
+	    {
+	      if (type != ix86_opt_ix86_yes)
+		gcc_unreachable ();
+
+	      opts->x_ix86_target_flags |= mask;
+
+	      struct cl_decoded_option decoded;
+	      generate_option (opt, NULL, opt_set_p, CL_TARGET,
+			       &decoded);
+	      ix86_handle_option (opts, opts_set, &decoded,
+				  input_location);
+	    }
+	  else
+	    {
+	      if (type == ix86_opt_ix86_no)
+		opt_set_p = !opt_set_p;
+
+	      if (opt_set_p)
+		opts->x_ix86_target_flags |= mask;
+	      else
+		opts->x_ix86_target_flags &= ~mask;
+	    }
+	}
+
       else if (type == ix86_opt_str)
 	{
 	  if (p_strings[opt])
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index fd794961e0a..2bb9b2f72f5 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -6656,6 +6656,10 @@ Enable/disable the generation of RCPSS, RCPPS, RSQRTSS and RSQRTPS
 instructions followed an additional Newton-Raphson step instead of
 doing a floating-point division.
 
+@item general-regs-only
+@cindex @code{target("general-regs-only")} function attribute, x86
+Generate code which uses only the general registers.
+
 @item arch=@var{ARCH}
 @cindex @code{target("arch=@var{ARCH}")} function attribute, x86
 Specify the architecture to generate code for in compiling the function.
diff --git a/gcc/testsuite/gcc.target/i386/adx-check.h b/gcc/testsuite/gcc.target/i386/adx-check.h
index cfed1a38483..40f3b523f2b 100644
--- a/gcc/testsuite/gcc.target/i386/adx-check.h
+++ b/gcc/testsuite/gcc.target/i386/adx-check.h
@@ -8,6 +8,7 @@ static void __attribute__ ((noinline)) do_test (void)
   adx_test ();
 }
 
+__attribute__((__target__("general-regs-only")))
 int
 main ()
 {
@@ -31,4 +32,3 @@ main ()
 #endif
   return 0;
 }
-
diff --git a/gcc/testsuite/gcc.target/i386/avx2-check.h b/gcc/testsuite/gcc.target/i386/avx2-check.h
index 25bed5e0da6..dd0f99db0e2 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx2-check.h
@@ -10,6 +10,9 @@ static void __attribute__ ((noinline)) do_test (void)
   avx2_test ();
 }
 
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
 static int
 check_osxsave (void)
 {
@@ -42,3 +45,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..8aa3361dc63 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -25,6 +25,9 @@ do_test (void)
 }
 #endif
 
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
 static int
 check_osxsave (void)
 {
@@ -110,3 +113,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/bmi-check.h b/gcc/testsuite/gcc.target/i386/bmi-check.h
index 1973f3b6468..ee4cad96a8b 100644
--- a/gcc/testsuite/gcc.target/i386/bmi-check.h
+++ b/gcc/testsuite/gcc.target/i386/bmi-check.h
@@ -12,6 +12,7 @@ do_test (void)
   bmi_test ();
 }
 
+__attribute__((__target__("general-regs-only")))
 int
 main ()
 {
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-check.h b/gcc/testsuite/gcc.target/i386/bmi2-check.h
index ba91ef9b780..7977e353640 100644
--- a/gcc/testsuite/gcc.target/i386/bmi2-check.h
+++ b/gcc/testsuite/gcc.target/i386/bmi2-check.h
@@ -11,6 +11,7 @@ do_test (void)
   bmi2_test ();
 }
 
+__attribute__((__target__("general-regs-only")))
 int
 main ()
 {
diff --git a/gcc/testsuite/gcc.target/i386/pr77756.c b/gcc/testsuite/gcc.target/i386/pr77756.c
index 1eee7cd5a00..bfe5d866bca 100644
--- a/gcc/testsuite/gcc.target/i386/pr77756.c
+++ b/gcc/testsuite/gcc.target/i386/pr77756.c
@@ -2,6 +2,7 @@
 
 #include "cpuid.h"
 
+__attribute__((__target__("general-regs-only")))
 int
 main ()
 {
diff --git a/gcc/testsuite/gcc.target/i386/pr95973.c b/gcc/testsuite/gcc.target/i386/pr95973.c
index 08c7dba8f46..9fc50afe43f 100644
--- a/gcc/testsuite/gcc.target/i386/pr95973.c
+++ b/gcc/testsuite/gcc.target/i386/pr95973.c
@@ -4,6 +4,7 @@
 #include <cpuid.h>
 #include <cpuid.h>
 
+__attribute__((__target__("general-regs-only")))
 int
 main ()
 {
diff --git a/gcc/testsuite/gcc.target/i386/rtm-check.h b/gcc/testsuite/gcc.target/i386/rtm-check.h
index bdb5a6dc0bf..b7936ed0455 100644
--- a/gcc/testsuite/gcc.target/i386/rtm-check.h
+++ b/gcc/testsuite/gcc.target/i386/rtm-check.h
@@ -8,6 +8,7 @@ static void __attribute__ ((noinline)) do_test (void)
   rtm_test ();
 }
 
+__attribute__((__target__("general-regs-only")))
 int
 main ()
 {
diff --git a/gcc/testsuite/gcc.target/i386/sha-check.h b/gcc/testsuite/gcc.target/i386/sha-check.h
index 5bc5a59ab80..2bf9e8315d4 100644
--- a/gcc/testsuite/gcc.target/i386/sha-check.h
+++ b/gcc/testsuite/gcc.target/i386/sha-check.h
@@ -10,6 +10,7 @@ do_test (void)
   sha_test ();
 }
 
+__attribute__((__target__("general-regs-only")))
 int
 main ()
 {
-- 
2.26.2


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Only use general-purpose registers during CPUID check
  2020-08-22 19:08                               ` [PATCH] x86: Only use general-purpose registers " H.J. Lu
@ 2020-08-23  8:18                                 ` Uros Bizjak
  2020-08-23 15:07                                   ` [PATCH] x86: Add target("general-regs-only") function attribute H.J. Lu
  2020-08-23 15:23                                   ` [PATCH] x86: Only use general-purpose registers during CPUID check H.J. Lu
  0 siblings, 2 replies; 39+ messages in thread
From: Uros Bizjak @ 2020-08-23  8:18 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

On Sat, Aug 22, 2020 at 9:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> > > AVX512 during CPUID check to avoid vector and mask register operations.
> >
> > -mgeneral-regs-only ?
> >
>
> Here is a patch to add target("general-regs-only") function
> attribute and use it for CPUID check.   OK for master if there
> are no regressions?

Please test it first, then ask for an approval.

Please submit the general-regs-only part as an independent patch. (I
think this is the option linux should use for compilation).

OTOH, wrapping CPUID check in a target attribute is a bad idea. We
should disable spills to mask registers for generic targets by either
raising costs of moves between general and mask registers and/or (as
suggested earlier) introducing TARGET_SPILL_TO_MASK_REGS tuning and
use it in secondary_memory_needed to prevent inter register unit
spills.

So, compiling with -mavx512bw would NOT enable spills by default,
where compiling with -march=skylake-avx512 (or using equivalent
-mtune) would. This is IMO the least surprising approach, and would
avoid changing sources (as you now have to do for several testcases).

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH] x86: Add target("general-regs-only") function attribute
  2020-08-23  8:18                                 ` Uros Bizjak
@ 2020-08-23 15:07                                   ` H.J. Lu
  2020-08-23 15:37                                     ` Uros Bizjak
  2020-08-23 15:23                                   ` [PATCH] x86: Only use general-purpose registers during CPUID check H.J. Lu
  1 sibling, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-23 15:07 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

On Sun, Aug 23, 2020 at 10:18:28AM +0200, Uros Bizjak wrote:
> On Sat, Aug 22, 2020 at 9:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> 
> > > > Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> > > > AVX512 during CPUID check to avoid vector and mask register operations.
> > >
> > > -mgeneral-regs-only ?
> > >
> >
> > Here is a patch to add target("general-regs-only") function
> > attribute and use it for CPUID check.   OK for master if there
> > are no regressions?
> 
> Please test it first, then ask for an approval.
> 
> Please submit the general-regs-only part as an independent patch. (I
> think this is the option linux should use for compilation).
> 

Tested on Linux/x86-64.  OK for master?

Thanks.

H.J.
---
gcc/

	PR target/96744
	* config/i386/i386-options.c (IX86_ATTR_IX86_YES): New.
	(IX86_ATTR_IX86_NO): Likewise.
	(ix86_opt_type): Add ix86_opt_ix86_yes and ix86_opt_ix86_no.
	(ix86_valid_target_attribute_inner_p): Handle general-regs-only,
	ix86_opt_ix86_yes and ix86_opt_ix86_no.
	(ix86_option_override_internal): Check opts->x_ix86_target_flags
	instead of opts_set->x_ix86_target_flags.
	* doc/extend.texi: Document target("general-regs-only") function
	attribute.

gcc/testsuite/

	PR target/96744
	* gcc.target/i386/pr96744-1.c: New test.
	* gcc.target/i386/pr96744-2.c: Likewise.
	* gcc.target/i386/pr96744-3a.c: Likewise.
	* gcc.target/i386/pr96744-3b.c: Likewise.
	* gcc.target/i386/pr96744-4.c: Likewise.
	* gcc.target/i386/pr96744-5.c: Likewise.
	* gcc.target/i386/pr96744-6.c: Likewise.
	* gcc.target/i386/pr96744-7.c: Likewise.
	* gcc.target/i386/pr96744-8a.c: Likewise.
	* gcc.target/i386/pr96744-8b.c: Likewise.
	* gcc.target/i386/pr96744-9.c: Likewise.
---
 gcc/config/i386/i386-options.c             | 44 ++++++++++++++++++++--
 gcc/doc/extend.texi                        |  4 ++
 gcc/testsuite/gcc.target/i386/pr96744-1.c  | 10 +++++
 gcc/testsuite/gcc.target/i386/pr96744-2.c  | 11 ++++++
 gcc/testsuite/gcc.target/i386/pr96744-3a.c | 12 ++++++
 gcc/testsuite/gcc.target/i386/pr96744-3b.c | 16 ++++++++
 gcc/testsuite/gcc.target/i386/pr96744-4.c  | 11 ++++++
 gcc/testsuite/gcc.target/i386/pr96744-5.c  | 17 +++++++++
 gcc/testsuite/gcc.target/i386/pr96744-6.c  | 11 ++++++
 gcc/testsuite/gcc.target/i386/pr96744-7.c  | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr96744-8a.c | 33 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-8b.c | 35 +++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-9.c  | 25 ++++++++++++
 13 files changed, 240 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-9.c

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index 26d1ea18ef1..e0fc68c27bf 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -922,12 +922,18 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
 #define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
 #define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
+#define IX86_ATTR_IX86_YES(S,O,M) \
+  { S, sizeof (S)-1, ix86_opt_ix86_yes, O, M }
+#define IX86_ATTR_IX86_NO(S,O,M) \
+  { S, sizeof (S)-1, ix86_opt_ix86_no,  O, M }
 
   enum ix86_opt_type
   {
     ix86_opt_unknown,
     ix86_opt_yes,
     ix86_opt_no,
+    ix86_opt_ix86_yes,
+    ix86_opt_ix86_no,
     ix86_opt_str,
     ix86_opt_enum,
     ix86_opt_isa
@@ -1062,6 +1068,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
     IX86_ATTR_YES ("recip",
 		   OPT_mrecip,
 		   MASK_RECIP),
+
+    IX86_ATTR_IX86_YES ("general-regs-only",
+			OPT_mgeneral_regs_only,
+			OPTION_MASK_GENERAL_REGS_ONLY),
   };
 
   location_t loc
@@ -1175,6 +1185,33 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
 	    opts->x_target_flags &= ~mask;
 	}
 
+      else if (type == ix86_opt_ix86_yes || type == ix86_opt_ix86_no)
+	{
+	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY)
+	    {
+	      if (type != ix86_opt_ix86_yes)
+		gcc_unreachable ();
+
+	      opts->x_ix86_target_flags |= mask;
+
+	      struct cl_decoded_option decoded;
+	      generate_option (opt, NULL, opt_set_p, CL_TARGET,
+			       &decoded);
+	      ix86_handle_option (opts, opts_set, &decoded,
+				  input_location);
+	    }
+	  else
+	    {
+	      if (type == ix86_opt_ix86_no)
+		opt_set_p = !opt_set_p;
+
+	      if (opt_set_p)
+		opts->x_ix86_target_flags |= mask;
+	      else
+		opts->x_ix86_target_flags &= ~mask;
+	    }
+	}
+
       else if (type == ix86_opt_str)
 	{
 	  if (p_strings[opt])
@@ -2260,9 +2297,10 @@ ix86_option_override_internal (bool main_args_p,
 	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
 	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
 
-	/* Don't enable x87 instructions if only
-	   general registers are allowed.  */
-	if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
+	/* Don't enable x87 instructions if only general registers are
+	   allowed by target("general-regs-only") function attribute or
+	   -mgeneral-regs-only.  */
+	if (!(opts->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
 	    && !(opts_set->x_target_flags & MASK_80387))
 	  {
 	    if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index fd794961e0a..2bb9b2f72f5 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -6656,6 +6656,10 @@ Enable/disable the generation of RCPSS, RCPPS, RSQRTSS and RSQRTPS
 instructions followed an additional Newton-Raphson step instead of
 doing a floating-point division.
 
+@item general-regs-only
+@cindex @code{target("general-regs-only")} function attribute, x86
+Generate code which uses only the general registers.
+
 @item arch=@var{ARCH}
 @cindex @code{target("arch=@var{ARCH}")} function attribute, x86
 Specify the architecture to generate code for in compiling the function.
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-1.c b/gcc/testsuite/gcc.target/i386/pr96744-1.c
new file mode 100644
index 00000000000..46f3ce6ddd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse2" } */
+
+typedef int int32x2_t __attribute__ ((__vector_size__ ((8))));
+
+__attribute__((__target__("general-regs-only")))
+int32x2_t test (int32x2_t a, int32x2_t b)
+{ /* { dg-error "SSE register return with SSE disabled" } */
+  return a + b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-2.c b/gcc/testsuite/gcc.target/i386/pr96744-2.c
new file mode 100644
index 00000000000..4a436d8ef04
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-mmmx" } */
+
+typedef int int32x2_t __attribute__ ((__vector_size__ ((8))));
+
+__attribute__((__target__("general-regs-only")))
+int32x2_t
+test (int32x2_t a, int32x2_t b) /* { dg-warning "MMX vector argument without MMX enabled" } */
+{ /* { dg-warning "MMX vector return without MMX enabled" } */
+  return a + b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-3a.c b/gcc/testsuite/gcc.target/i386/pr96744-3a.c
new file mode 100644
index 00000000000..79191544eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-3a.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2" } */
+
+typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
+extern int32x4_t c;
+
+__attribute__((__target__("general-regs-only")))
+void
+test (int32x4_t a, int32x4_t b) /* { dg-warning "SSE vector argument without SSE enabled" } */
+{
+  c = a + b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-3b.c b/gcc/testsuite/gcc.target/i386/pr96744-3b.c
new file mode 100644
index 00000000000..389b5cf9897
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-3b.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2" } */
+
+typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
+extern int32x4_t c;
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
+void
+test (int32x4_t a, int32x4_t b) /* { dg-warning "SSE vector argument without SSE enabled" } */
+{
+  c = a + b;
+}
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-4.c b/gcc/testsuite/gcc.target/i386/pr96744-4.c
new file mode 100644
index 00000000000..005329f95e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-msse2" } */
+
+typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
+
+__attribute__((__target__("general-regs-only")))
+int32x4_t
+test (int32x4_t a, int32x4_t b) /* { dg-warning "SSE vector argument without SSE enabled" } */
+{ /* { dg-warning "SSE vector return without SSE enabled" } */
+  return a + b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-5.c b/gcc/testsuite/gcc.target/i386/pr96744-5.c
new file mode 100644
index 00000000000..18f2132aa27
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-5.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse2" } */
+
+#include <stdarg.h>
+
+typedef int int32x2_t __attribute__ ((__vector_size__ ((8))));
+
+__attribute__((__target__("general-regs-only")))
+int
+test (int i, ...)
+{
+  va_list argp;
+  va_start (argp, i);
+  int32x2_t x = (int32x2_t) {0, 1};
+  x += va_arg (argp, int32x2_t); /* { dg-error "SSE register argument with SSE disabled" } */
+  return x[0] + x[1];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-6.c b/gcc/testsuite/gcc.target/i386/pr96744-6.c
new file mode 100644
index 00000000000..38a3cc7fa92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-6.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse2 -mfpmath=sse" } */
+
+extern float a, b, c;
+
+__attribute__((__target__("general-regs-only")))
+void
+foo (void)
+{
+  c = a * b; /* { dg-error "SSE register return with SSE disabled" } */
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-7.c b/gcc/testsuite/gcc.target/i386/pr96744-7.c
new file mode 100644
index 00000000000..5f55b6cbd33
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-7.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { ia32 && { ! *-*-darwin* } } } } */
+/* { dg-options "-msse2" } */
+
+extern float a, b, c;
+
+__attribute__((__target__("general-regs-only")))
+void
+foo (void)
+{
+  c = a * b;
+}
+
+/* { dg-final { scan-assembler-not "mulss" } } */
+/* { dg-final { scan-assembler "call\[ \t\]__mulsf3" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-8a.c b/gcc/testsuite/gcc.target/i386/pr96744-8a.c
new file mode 100644
index 00000000000..d264e1e01d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-8a.c
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+extern void abort ();
+
+__attribute__((__target__("general-regs-only")))
+int
+dec (int a, int b)
+{
+  return a + b;
+}
+
+__attribute__((__target__("general-regs-only")))
+int
+cal (int a, int b)
+{
+  int sum1 = a * b;
+  int sum2 = a / b;
+  int sum = dec (sum1, sum2);
+  return a + b + sum + sum1 + sum2;
+}
+
+__attribute__((__target__("general-regs-only")))
+int
+main (int argc, char **argv)
+{
+  int ret = cal (2, 1);
+
+  if (ret != 11)
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-8b.c b/gcc/testsuite/gcc.target/i386/pr96744-8b.c
new file mode 100644
index 00000000000..30e763d932e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-8b.c
@@ -0,0 +1,35 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
+extern void abort ();
+
+int
+dec (int a, int b)
+{
+  return a + b;
+}
+
+int
+cal (int a, int b)
+{
+  int sum1 = a * b;
+  int sum2 = a / b;
+  int sum = dec (sum1, sum2);
+  return a + b + sum + sum1 + sum2;
+}
+
+int
+main (int argc, char **argv)
+{
+  int ret = cal (2, 1);
+
+  if (ret != 11)
+    abort ();
+
+  return 0;
+}
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-9.c b/gcc/testsuite/gcc.target/i386/pr96744-9.c
new file mode 100644
index 00000000000..196e86f08e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-9.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+
+extern void abort ();
+
+__attribute__((__target__("general-regs-only")))
+int
+cal (int a, int b)
+{
+  int sum = a + b;
+  int sum1 = a * b;
+  return (a + b + sum + sum1);
+}
+
+__attribute__((__target__("general-regs-only")))
+int
+main (int argc, char **argv)
+{
+  int ret = cal (1, 2);
+
+  if (ret != 8)
+    abort ();
+
+  return 0;
+}
-- 
2.26.2


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Only use general-purpose registers during CPUID check
  2020-08-23  8:18                                 ` Uros Bizjak
  2020-08-23 15:07                                   ` [PATCH] x86: Add target("general-regs-only") function attribute H.J. Lu
@ 2020-08-23 15:23                                   ` H.J. Lu
  2020-08-23 16:02                                     ` Uros Bizjak
  2020-08-23 21:22                                     ` [PATCH] x86: Only use general-purpose registers during CPUID check Florian Weimer
  1 sibling, 2 replies; 39+ messages in thread
From: H.J. Lu @ 2020-08-23 15:23 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

On Sun, Aug 23, 2020 at 10:18:28AM +0200, Uros Bizjak wrote:
> On Sat, Aug 22, 2020 at 9:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> 
> > > > Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> > > > AVX512 during CPUID check to avoid vector and mask register operations.
> > >
> > > -mgeneral-regs-only ?
> > >
> >
> > Here is a patch to add target("general-regs-only") function
> > attribute and use it for CPUID check.   OK for master if there
> > are no regressions?
> 
> Please test it first, then ask for an approval.
> 
> Please submit the general-regs-only part as an independent patch. (I
> think this is the option linux should use for compilation).
> 
> OTOH, wrapping CPUID check in a target attribute is a bad idea. We
> should disable spills to mask registers for generic targets by either
> raising costs of moves between general and mask registers and/or (as
> suggested earlier) introducing TARGET_SPILL_TO_MASK_REGS tuning and
> use it in secondary_memory_needed to prevent inter register unit
> spills.
> 
> So, compiling with -mavx512bw would NOT enable spills by default,
> where compiling with -march=skylake-avx512 (or using equivalent
> -mtune) would. This is IMO the least surprising approach, and would
> avoid changing sources (as you now have to do for several testcases).

We have 2 orthogonal issues here:

1. When mask register spill should be enabled.
2. CPUID check should be done with general registers only.

As shown in GCC testcases, CPUID check may be done with arbitrary ISAs
or -march/-mtune options enabled.  We should either

1. Enable only general registers for CPUID check.  Or
2. Issue an error for CPUID check if non-general registers are used.

H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Add target("general-regs-only") function attribute
  2020-08-23 15:07                                   ` [PATCH] x86: Add target("general-regs-only") function attribute H.J. Lu
@ 2020-08-23 15:37                                     ` Uros Bizjak
  0 siblings, 0 replies; 39+ messages in thread
From: Uros Bizjak @ 2020-08-23 15:37 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

On Sun, Aug 23, 2020 at 5:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Aug 23, 2020 at 10:18:28AM +0200, Uros Bizjak wrote:
> > On Sat, Aug 22, 2020 at 9:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > > > > Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> > > > > AVX512 during CPUID check to avoid vector and mask register operations.
> > > >
> > > > -mgeneral-regs-only ?
> > > >
> > >
> > > Here is a patch to add target("general-regs-only") function
> > > attribute and use it for CPUID check.   OK for master if there
> > > are no regressions?
> >
> > Please test it first, then ask for an approval.
> >
> > Please submit the general-regs-only part as an independent patch. (I
> > think this is the option linux should use for compilation).
> >
>
> Tested on Linux/x86-64.  OK for master?
>
> Thanks.
>
> H.J.
> ---
> gcc/
>
>         PR target/96744
>         * config/i386/i386-options.c (IX86_ATTR_IX86_YES): New.
>         (IX86_ATTR_IX86_NO): Likewise.
>         (ix86_opt_type): Add ix86_opt_ix86_yes and ix86_opt_ix86_no.
>         (ix86_valid_target_attribute_inner_p): Handle general-regs-only,
>         ix86_opt_ix86_yes and ix86_opt_ix86_no.
>         (ix86_option_override_internal): Check opts->x_ix86_target_flags
>         instead of opts_set->x_ix86_target_flags.
>         * doc/extend.texi: Document target("general-regs-only") function
>         attribute.
>
> gcc/testsuite/
>
>         PR target/96744
>         * gcc.target/i386/pr96744-1.c: New test.
>         * gcc.target/i386/pr96744-2.c: Likewise.
>         * gcc.target/i386/pr96744-3a.c: Likewise.
>         * gcc.target/i386/pr96744-3b.c: Likewise.
>         * gcc.target/i386/pr96744-4.c: Likewise.
>         * gcc.target/i386/pr96744-5.c: Likewise.
>         * gcc.target/i386/pr96744-6.c: Likewise.
>         * gcc.target/i386/pr96744-7.c: Likewise.
>         * gcc.target/i386/pr96744-8a.c: Likewise.
>         * gcc.target/i386/pr96744-8b.c: Likewise.
>         * gcc.target/i386/pr96744-9.c: Likewise.

OK.

Thanks,
Uros.
> ---
>  gcc/config/i386/i386-options.c             | 44 ++++++++++++++++++++--
>  gcc/doc/extend.texi                        |  4 ++
>  gcc/testsuite/gcc.target/i386/pr96744-1.c  | 10 +++++
>  gcc/testsuite/gcc.target/i386/pr96744-2.c  | 11 ++++++
>  gcc/testsuite/gcc.target/i386/pr96744-3a.c | 12 ++++++
>  gcc/testsuite/gcc.target/i386/pr96744-3b.c | 16 ++++++++
>  gcc/testsuite/gcc.target/i386/pr96744-4.c  | 11 ++++++
>  gcc/testsuite/gcc.target/i386/pr96744-5.c  | 17 +++++++++
>  gcc/testsuite/gcc.target/i386/pr96744-6.c  | 11 ++++++
>  gcc/testsuite/gcc.target/i386/pr96744-7.c  | 14 +++++++
>  gcc/testsuite/gcc.target/i386/pr96744-8a.c | 33 ++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr96744-8b.c | 35 +++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr96744-9.c  | 25 ++++++++++++
>  13 files changed, 240 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-6.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-7.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-8a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-8b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-9.c
>
> diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
> index 26d1ea18ef1..e0fc68c27bf 100644
> --- a/gcc/config/i386/i386-options.c
> +++ b/gcc/config/i386/i386-options.c
> @@ -922,12 +922,18 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
>  #define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
>  #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
>  #define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
> +#define IX86_ATTR_IX86_YES(S,O,M) \
> +  { S, sizeof (S)-1, ix86_opt_ix86_yes, O, M }
> +#define IX86_ATTR_IX86_NO(S,O,M) \
> +  { S, sizeof (S)-1, ix86_opt_ix86_no,  O, M }
>
>    enum ix86_opt_type
>    {
>      ix86_opt_unknown,
>      ix86_opt_yes,
>      ix86_opt_no,
> +    ix86_opt_ix86_yes,
> +    ix86_opt_ix86_no,
>      ix86_opt_str,
>      ix86_opt_enum,
>      ix86_opt_isa
> @@ -1062,6 +1068,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
>      IX86_ATTR_YES ("recip",
>                    OPT_mrecip,
>                    MASK_RECIP),
> +
> +    IX86_ATTR_IX86_YES ("general-regs-only",
> +                       OPT_mgeneral_regs_only,
> +                       OPTION_MASK_GENERAL_REGS_ONLY),
>    };
>
>    location_t loc
> @@ -1175,6 +1185,33 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
>             opts->x_target_flags &= ~mask;
>         }
>
> +      else if (type == ix86_opt_ix86_yes || type == ix86_opt_ix86_no)
> +       {
> +         if (mask == OPTION_MASK_GENERAL_REGS_ONLY)
> +           {
> +             if (type != ix86_opt_ix86_yes)
> +               gcc_unreachable ();
> +
> +             opts->x_ix86_target_flags |= mask;
> +
> +             struct cl_decoded_option decoded;
> +             generate_option (opt, NULL, opt_set_p, CL_TARGET,
> +                              &decoded);
> +             ix86_handle_option (opts, opts_set, &decoded,
> +                                 input_location);
> +           }
> +         else
> +           {
> +             if (type == ix86_opt_ix86_no)
> +               opt_set_p = !opt_set_p;
> +
> +             if (opt_set_p)
> +               opts->x_ix86_target_flags |= mask;
> +             else
> +               opts->x_ix86_target_flags &= ~mask;
> +           }
> +       }
> +
>        else if (type == ix86_opt_str)
>         {
>           if (p_strings[opt])
> @@ -2260,9 +2297,10 @@ ix86_option_override_internal (bool main_args_p,
>             && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
>           opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
>
> -       /* Don't enable x87 instructions if only
> -          general registers are allowed.  */
> -       if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
> +       /* Don't enable x87 instructions if only general registers are
> +          allowed by target("general-regs-only") function attribute or
> +          -mgeneral-regs-only.  */
> +       if (!(opts->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
>             && !(opts_set->x_target_flags & MASK_80387))
>           {
>             if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> index fd794961e0a..2bb9b2f72f5 100644
> --- a/gcc/doc/extend.texi
> +++ b/gcc/doc/extend.texi
> @@ -6656,6 +6656,10 @@ Enable/disable the generation of RCPSS, RCPPS, RSQRTSS and RSQRTPS
>  instructions followed an additional Newton-Raphson step instead of
>  doing a floating-point division.
>
> +@item general-regs-only
> +@cindex @code{target("general-regs-only")} function attribute, x86
> +Generate code which uses only the general registers.
> +
>  @item arch=@var{ARCH}
>  @cindex @code{target("arch=@var{ARCH}")} function attribute, x86
>  Specify the architecture to generate code for in compiling the function.
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-1.c b/gcc/testsuite/gcc.target/i386/pr96744-1.c
> new file mode 100644
> index 00000000000..46f3ce6ddd4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-1.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-msse2" } */
> +
> +typedef int int32x2_t __attribute__ ((__vector_size__ ((8))));
> +
> +__attribute__((__target__("general-regs-only")))
> +int32x2_t test (int32x2_t a, int32x2_t b)
> +{ /* { dg-error "SSE register return with SSE disabled" } */
> +  return a + b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-2.c b/gcc/testsuite/gcc.target/i386/pr96744-2.c
> new file mode 100644
> index 00000000000..4a436d8ef04
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target ia32 } } */
> +/* { dg-options "-mmmx" } */
> +
> +typedef int int32x2_t __attribute__ ((__vector_size__ ((8))));
> +
> +__attribute__((__target__("general-regs-only")))
> +int32x2_t
> +test (int32x2_t a, int32x2_t b) /* { dg-warning "MMX vector argument without MMX enabled" } */
> +{ /* { dg-warning "MMX vector return without MMX enabled" } */
> +  return a + b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-3a.c b/gcc/testsuite/gcc.target/i386/pr96744-3a.c
> new file mode 100644
> index 00000000000..79191544eb4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-3a.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2" } */
> +
> +typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
> +extern int32x4_t c;
> +
> +__attribute__((__target__("general-regs-only")))
> +void
> +test (int32x4_t a, int32x4_t b) /* { dg-warning "SSE vector argument without SSE enabled" } */
> +{
> +  c = a + b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-3b.c b/gcc/testsuite/gcc.target/i386/pr96744-3b.c
> new file mode 100644
> index 00000000000..389b5cf9897
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-3b.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2" } */
> +
> +typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
> +extern int32x4_t c;
> +
> +#pragma GCC push_options
> +#pragma GCC target("general-regs-only")
> +
> +void
> +test (int32x4_t a, int32x4_t b) /* { dg-warning "SSE vector argument without SSE enabled" } */
> +{
> +  c = a + b;
> +}
> +
> +#pragma GCC pop_options
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-4.c b/gcc/testsuite/gcc.target/i386/pr96744-4.c
> new file mode 100644
> index 00000000000..005329f95e9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-4.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target ia32 } } */
> +/* { dg-options "-msse2" } */
> +
> +typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
> +
> +__attribute__((__target__("general-regs-only")))
> +int32x4_t
> +test (int32x4_t a, int32x4_t b) /* { dg-warning "SSE vector argument without SSE enabled" } */
> +{ /* { dg-warning "SSE vector return without SSE enabled" } */
> +  return a + b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-5.c b/gcc/testsuite/gcc.target/i386/pr96744-5.c
> new file mode 100644
> index 00000000000..18f2132aa27
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-5.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-msse2" } */
> +
> +#include <stdarg.h>
> +
> +typedef int int32x2_t __attribute__ ((__vector_size__ ((8))));
> +
> +__attribute__((__target__("general-regs-only")))
> +int
> +test (int i, ...)
> +{
> +  va_list argp;
> +  va_start (argp, i);
> +  int32x2_t x = (int32x2_t) {0, 1};
> +  x += va_arg (argp, int32x2_t); /* { dg-error "SSE register argument with SSE disabled" } */
> +  return x[0] + x[1];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-6.c b/gcc/testsuite/gcc.target/i386/pr96744-6.c
> new file mode 100644
> index 00000000000..38a3cc7fa92
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-6.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-msse2 -mfpmath=sse" } */
> +
> +extern float a, b, c;
> +
> +__attribute__((__target__("general-regs-only")))
> +void
> +foo (void)
> +{
> +  c = a * b; /* { dg-error "SSE register return with SSE disabled" } */
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-7.c b/gcc/testsuite/gcc.target/i386/pr96744-7.c
> new file mode 100644
> index 00000000000..5f55b6cbd33
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-7.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile { target { ia32 && { ! *-*-darwin* } } } } */
> +/* { dg-options "-msse2" } */
> +
> +extern float a, b, c;
> +
> +__attribute__((__target__("general-regs-only")))
> +void
> +foo (void)
> +{
> +  c = a * b;
> +}
> +
> +/* { dg-final { scan-assembler-not "mulss" } } */
> +/* { dg-final { scan-assembler "call\[ \t\]__mulsf3" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-8a.c b/gcc/testsuite/gcc.target/i386/pr96744-8a.c
> new file mode 100644
> index 00000000000..d264e1e01d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-8a.c
> @@ -0,0 +1,33 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +extern void abort ();
> +
> +__attribute__((__target__("general-regs-only")))
> +int
> +dec (int a, int b)
> +{
> +  return a + b;
> +}
> +
> +__attribute__((__target__("general-regs-only")))
> +int
> +cal (int a, int b)
> +{
> +  int sum1 = a * b;
> +  int sum2 = a / b;
> +  int sum = dec (sum1, sum2);
> +  return a + b + sum + sum1 + sum2;
> +}
> +
> +__attribute__((__target__("general-regs-only")))
> +int
> +main (int argc, char **argv)
> +{
> +  int ret = cal (2, 1);
> +
> +  if (ret != 11)
> +    abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-8b.c b/gcc/testsuite/gcc.target/i386/pr96744-8b.c
> new file mode 100644
> index 00000000000..30e763d932e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-8b.c
> @@ -0,0 +1,35 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +#pragma GCC push_options
> +#pragma GCC target("general-regs-only")
> +
> +extern void abort ();
> +
> +int
> +dec (int a, int b)
> +{
> +  return a + b;
> +}
> +
> +int
> +cal (int a, int b)
> +{
> +  int sum1 = a * b;
> +  int sum2 = a / b;
> +  int sum = dec (sum1, sum2);
> +  return a + b + sum + sum1 + sum2;
> +}
> +
> +int
> +main (int argc, char **argv)
> +{
> +  int ret = cal (2, 1);
> +
> +  if (ret != 11)
> +    abort ();
> +
> +  return 0;
> +}
> +
> +#pragma GCC pop_options
> diff --git a/gcc/testsuite/gcc.target/i386/pr96744-9.c b/gcc/testsuite/gcc.target/i386/pr96744-9.c
> new file mode 100644
> index 00000000000..196e86f08e9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr96744-9.c
> @@ -0,0 +1,25 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +extern void abort ();
> +
> +__attribute__((__target__("general-regs-only")))
> +int
> +cal (int a, int b)
> +{
> +  int sum = a + b;
> +  int sum1 = a * b;
> +  return (a + b + sum + sum1);
> +}
> +
> +__attribute__((__target__("general-regs-only")))
> +int
> +main (int argc, char **argv)
> +{
> +  int ret = cal (1, 2);
> +
> +  if (ret != 8)
> +    abort ();
> +
> +  return 0;
> +}
> --
> 2.26.2
>

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Only use general-purpose registers during CPUID check
  2020-08-23 15:23                                   ` [PATCH] x86: Only use general-purpose registers during CPUID check H.J. Lu
@ 2020-08-23 16:02                                     ` Uros Bizjak
  2020-08-24 13:22                                       ` [PATCH] x86: Use -march=x86-64/-march=i386 in <cpuid.h> H.J. Lu
  2020-08-23 21:22                                     ` [PATCH] x86: Only use general-purpose registers during CPUID check Florian Weimer
  1 sibling, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-23 16:02 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

On Sun, Aug 23, 2020 at 5:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Aug 23, 2020 at 10:18:28AM +0200, Uros Bizjak wrote:
> > On Sat, Aug 22, 2020 at 9:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > > > > Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> > > > > AVX512 during CPUID check to avoid vector and mask register operations.
> > > >
> > > > -mgeneral-regs-only ?
> > > >
> > >
> > > Here is a patch to add target("general-regs-only") function
> > > attribute and use it for CPUID check.   OK for master if there
> > > are no regressions?
> >
> > Please test it first, then ask for an approval.
> >
> > Please submit the general-regs-only part as an independent patch. (I
> > think this is the option linux should use for compilation).
> >
> > OTOH, wrapping CPUID check in a target attribute is a bad idea. We
> > should disable spills to mask registers for generic targets by either
> > raising costs of moves between general and mask registers and/or (as
> > suggested earlier) introducing TARGET_SPILL_TO_MASK_REGS tuning and
> > use it in secondary_memory_needed to prevent inter register unit
> > spills.
> >
> > So, compiling with -mavx512bw would NOT enable spills by default,
> > where compiling with -march=skylake-avx512 (or using equivalent
> > -mtune) would. This is IMO the least surprising approach, and would
> > avoid changing sources (as you now have to do for several testcases).
>
> We have 2 orthogonal issues here:
>
> 1. When mask register spill should be enabled.
> 2. CPUID check should be done with general registers only.
>
> As shown in GCC testcases, CPUID check may be done with arbitrary ISAs
> or -march/-mtune options enabled.  We should either
>
> 1. Enable only general registers for CPUID check.  Or
> 2. Issue an error for CPUID check if non-general registers are used.

We should follow the same approach as with SSE2, where DI/SImode
spills to XMM registers were effectively disabled for a generic
target. So, unless the tuning target is also specified, spills to mask
registers should not be generated. It was my oversight to approve the
patch that enables spills for a generic target, and without the tuning
flag, the patch will be reverted.

Now, we have -mgeneral-regs-only functionality in place, so if a
package wants to enable spills, the correct -mtune (or -march that
implies -mtune) should be used, and it is expected that the detection
code is amended with general-regs-only pragmas.

<footnote

Speaking of pragmas, these should be added outside cpuid.h, like:

#pragma GCC push_options
#pragma GCC target("general-regs-only")

#include <cpuid.h>

void cpuid_check ()
...

#pragma GCC pop_options

>footnote

Nowadays, -march=native is mostly used outside generic target
compilations, so for relevant avx512 targets, we still generate spills
to mask regs. In future, we can review the setting of the tuning flag
for a generic target in the same way as with SSE2 inter-reg moves.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Only use general-purpose registers during CPUID check
  2020-08-23 15:23                                   ` [PATCH] x86: Only use general-purpose registers during CPUID check H.J. Lu
  2020-08-23 16:02                                     ` Uros Bizjak
@ 2020-08-23 21:22                                     ` Florian Weimer
  1 sibling, 0 replies; 39+ messages in thread
From: Florian Weimer @ 2020-08-23 21:22 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches; +Cc: Uros Bizjak, H.J. Lu

* H. J. Lu via Gcc-patches:

> 2. CPUID check should be done with general registers only.

Is this really the concern here?  Isn't this about instructions, not
registers?  If there's a useful integer register instruction for
post-processing CPUID bits that's not in the baseline ABI, GCC still
shouldn't use it in the check, I assume.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH] x86: Use -march=x86-64/-march=i386 in <cpuid.h>
  2020-08-23 16:02                                     ` Uros Bizjak
@ 2020-08-24 13:22                                       ` H.J. Lu
  2020-08-24 14:55                                         ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-24 13:22 UTC (permalink / raw)
  To: Uros Bizjak, Florian Weimer; +Cc: Hongtao Liu, GCC Patches, Kirill Yukhin

On Sun, Aug 23, 2020 at 9:03 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Sun, Aug 23, 2020 at 5:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Aug 23, 2020 at 10:18:28AM +0200, Uros Bizjak wrote:
> > > On Sat, Aug 22, 2020 at 9:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > > > > Compile CPUID check with "-mno-sse -mfpmath=387" to disable SSE, AVX and
> > > > > > AVX512 during CPUID check to avoid vector and mask register operations.
> > > > >
> > > > > -mgeneral-regs-only ?
> > > > >
> > > >
> > > > Here is a patch to add target("general-regs-only") function
> > > > attribute and use it for CPUID check.   OK for master if there
> > > > are no regressions?
> > >
> > > Please test it first, then ask for an approval.
> > >
> > > Please submit the general-regs-only part as an independent patch. (I
> > > think this is the option linux should use for compilation).
> > >
> > > OTOH, wrapping CPUID check in a target attribute is a bad idea. We
> > > should disable spills to mask registers for generic targets by either
> > > raising costs of moves between general and mask registers and/or (as
> > > suggested earlier) introducing TARGET_SPILL_TO_MASK_REGS tuning and
> > > use it in secondary_memory_needed to prevent inter register unit
> > > spills.
> > >
> > > So, compiling with -mavx512bw would NOT enable spills by default,
> > > where compiling with -march=skylake-avx512 (or using equivalent
> > > -mtune) would. This is IMO the least surprising approach, and would
> > > avoid changing sources (as you now have to do for several testcases).
> >
> > We have 2 orthogonal issues here:
> >
> > 1. When mask register spill should be enabled.
> > 2. CPUID check should be done with general registers only.
> >
> > As shown in GCC testcases, CPUID check may be done with arbitrary ISAs
> > or -march/-mtune options enabled.  We should either
> >
> > 1. Enable only general registers for CPUID check.  Or
> > 2. Issue an error for CPUID check if non-general registers are used.
>
> We should follow the same approach as with SSE2, where DI/SImode
> spills to XMM registers were effectively disabled for a generic
> target. So, unless the tuning target is also specified, spills to mask
> registers should not be generated. It was my oversight to approve the
> patch that enables spills for a generic target, and without the tuning
> flag, the patch will be reverted.
>
> Now, we have -mgeneral-regs-only functionality in place, so if a
> package wants to enable spills, the correct -mtune (or -march that
> implies -mtune) should be used, and it is expected that the detection
> code is amended with general-regs-only pragmas.
>
> <footnote
>
> Speaking of pragmas, these should be added outside cpuid.h, like:
>
> #pragma GCC push_options
> #pragma GCC target("general-regs-only")
>
> #include <cpuid.h>
>
> void cpuid_check ()
> ...
>
> #pragma GCC pop_options
>
> >footnote
>
> Nowadays, -march=native is mostly used outside generic target
> compilations, so for relevant avx512 targets, we still generate spills
> to mask regs. In future, we can review the setting of the tuning flag
> for a generic target in the same way as with SSE2 inter-reg moves.
>

Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
<cpuid.h> should be handled similarly to other intrinsic header files.
That is <cpuid.h> should use

#pragma GCC push_options
#ifdef __x86_64__
#pragma GCC target("arch=x86-64")
#else
#pragma GCC target("arch=i386")
...
#pragma GCC pop_options

Here is a patch.  OK for master?

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Use -march=x86-64/-march=i386 in <cpuid.h>
  2020-08-24 13:22                                       ` [PATCH] x86: Use -march=x86-64/-march=i386 in <cpuid.h> H.J. Lu
@ 2020-08-24 14:55                                         ` Uros Bizjak
  2020-08-24 16:16                                           ` [PATCH] x86: Use target("baseline-isas-only") " H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-24 14:55 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

On Mon, Aug 24, 2020 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > Speaking of pragmas, these should be added outside cpuid.h, like:
> >
> > #pragma GCC push_options
> > #pragma GCC target("general-regs-only")
> >
> > #include <cpuid.h>
> >
> > void cpuid_check ()
> > ...
> >
> > #pragma GCC pop_options
> >
> > >footnote
> >
> > Nowadays, -march=native is mostly used outside generic target
> > compilations, so for relevant avx512 targets, we still generate spills
> > to mask regs. In future, we can review the setting of the tuning flag
> > for a generic target in the same way as with SSE2 inter-reg moves.
> >
>
> Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
> <cpuid.h> should be handled similarly to other intrinsic header files.
> That is <cpuid.h> should use
>
> #pragma GCC push_options
> #ifdef __x86_64__
> #pragma GCC target("arch=x86-64")
> #else
> #pragma GCC target("arch=i386")
> ...
> #pragma GCC pop_options
>
> Here is a patch.  OK for master?

-ENOPATCH

However, how will this affect inlining? Every single function in
cpuid.h is defined as static __inline, and due to target flags
mismatch, it won't be inlined anymore. These inline functions are used
in some bit testing functions, and to keep them inlined, these should
also use the same options to avoid non-basic ISAs. This is the reason
cpuid.h should be #included after pragma, together with bit testing
functions, as shown above.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH] x86: Use target("baseline-isas-only") in <cpuid.h>
  2020-08-24 14:55                                         ` Uros Bizjak
@ 2020-08-24 16:16                                           ` H.J. Lu
  2020-08-24 19:25                                             ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-24 16:16 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 1696 bytes --]

On Mon, Aug 24, 2020 at 7:55 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Aug 24, 2020 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> > > Speaking of pragmas, these should be added outside cpuid.h, like:
> > >
> > > #pragma GCC push_options
> > > #pragma GCC target("general-regs-only")
> > >
> > > #include <cpuid.h>
> > >
> > > void cpuid_check ()
> > > ...
> > >
> > > #pragma GCC pop_options
> > >
> > > >footnote
> > >
> > > Nowadays, -march=native is mostly used outside generic target
> > > compilations, so for relevant avx512 targets, we still generate spills
> > > to mask regs. In future, we can review the setting of the tuning flag
> > > for a generic target in the same way as with SSE2 inter-reg moves.
> > >
> >
> > Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
> > <cpuid.h> should be handled similarly to other intrinsic header files.
> > That is <cpuid.h> should use
> >
> > #pragma GCC push_options
> > #ifdef __x86_64__
> > #pragma GCC target("arch=x86-64")
> > #else
> > #pragma GCC target("arch=i386")
> > ...
> > #pragma GCC pop_options
> >
> > Here is a patch.  OK for master?
>
> -ENOPATCH
>
> However, how will this affect inlining? Every single function in
> cpuid.h is defined as static __inline, and due to target flags
> mismatch, it won't be inlined anymore. These inline functions are used
> in some bit testing functions, and to keep them inlined, these should
> also use the same options to avoid non-basic ISAs. This is the reason
> cpuid.h should be #included after pragma, together with bit testing
> functions, as shown above.
>

How about target("baseline-isas-only")? All CPUID functions are
inlined.


-- 
H.J.

[-- Attachment #2: 0001-x86-Use-target-baseline-isas-only-in-cpuid.h.patch --]
[-- Type: text/x-patch, Size: 7593 bytes --]

From 82efbfdc58e6bdcf11a9e09018db6bbc690f77b1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 21 Aug 2020 09:42:49 -0700
Subject: [PATCH] x86: Use target("baseline-isas-only") in <cpuid.h>

CPUID check should be done only with baseline ISAs, which include FXSR,
MMX, SSE and SSE2 in 64-bit mode.

gcc/

	PR target/96744
	* common/config/i386/i386-common.c (ix86_handle_option): Support
	-mbaseline-isas-only.
	* config/i386/cpuid.h: Add #pragma GCC target("baseline-isas-only").
	* config/i386/i386-options.c (ix86_valid_target_attribute_inner_p):
	Handle baseline-isas-only.
	* config/i386/i386.opt: Add -mbaseline-isas-only.
	* doc/extend.texi: Document target("baseline-isas-only") function
	attribute.
	* doc/invoke.texi: Document -mbaseline-isas-only.

gcc/testsuite/

	PR target/96744
	* gcc.target/i386/avx512-check.h: Add #pragma GCC
	target("baseline-isas-only") for CPUID check.
	* gcc.target/i386/pr96744-10.c: New test.
---
 gcc/common/config/i386/i386-common.c         | 20 +++++++++++++++
 gcc/config/i386/cpuid.h                      | 13 ++++++++++
 gcc/config/i386/i386-options.c               |  7 ++++-
 gcc/config/i386/i386.opt                     |  6 ++++-
 gcc/doc/extend.texi                          |  4 +++
 gcc/doc/invoke.texi                          |  5 ++++
 gcc/testsuite/gcc.target/i386/avx512-check.h |  5 ++++
 gcc/testsuite/gcc.target/i386/pr96744-10.c   | 27 ++++++++++++++++++++
 8 files changed, 85 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-10.c

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index bb14305ad7b..2c0b7f3fe0f 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -338,6 +338,26 @@ ix86_handle_option (struct gcc_options *opts,
 	gcc_unreachable ();
       return true;
 
+    case OPT_mbaseline_isas_only:
+      if (value)
+	{
+	  /* Only enable baseline ISAs.  */
+	  if ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT))
+	    opts->x_ix86_isa_flags = (OPTION_MASK_ISA_64BIT
+				      | OPTION_MASK_ISA_FXSR
+				      | OPTION_MASK_ISA_MMX
+				      | OPTION_MASK_ISA_SSE
+				      | OPTION_MASK_ISA_SSE2);
+	  else
+	    opts->x_ix86_isa_flags = 0;
+	  opts->x_ix86_isa_flags2 = 0;
+	  opts->x_ix86_isa_flags_explicit = -1;
+	  opts->x_ix86_isa_flags2_explicit = -1;
+	}
+      else
+	gcc_unreachable ();
+      return true;
+
     case OPT_mmmx:
       if (value)
 	{
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index bca61d620db..dd2ef8f9b30 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -24,6 +24,17 @@
 #ifndef _CPUID_H_INCLUDED
 #define _CPUID_H_INCLUDED
 
+#pragma GCC push_options
+#if __GNUC__ >= 11
+#pragma GCC target("baseline-isas-only")
+#else
+#ifdef __x86_64__
+#pragma GCC target("arch=x86-64")
+#else
+#pragma GCC target("arch=i386")
+#endif
+#endif
+
 /* %eax */
 #define bit_AVX512BF16	(1 << 5)
 
@@ -324,4 +335,6 @@ __cpuidex (int __cpuid_info[4], int __leaf, int __subleaf)
 		 __cpuid_info[2], __cpuid_info[3]);
 }
 
+#pragma GCC pop_options
+
 #endif /* _CPUID_H_INCLUDED */
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index e0fc68c27bf..4a09c1c93ee 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -1072,6 +1072,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
     IX86_ATTR_IX86_YES ("general-regs-only",
 			OPT_mgeneral_regs_only,
 			OPTION_MASK_GENERAL_REGS_ONLY),
+
+    IX86_ATTR_IX86_YES ("baseline-isas-only",
+			OPT_mbaseline_isas_only,
+			OPTION_MASK_BASELINE_ISAS_ONLY),
   };
 
   location_t loc
@@ -1187,7 +1191,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
 
       else if (type == ix86_opt_ix86_yes || type == ix86_opt_ix86_no)
 	{
-	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY)
+	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY
+	      || mask == OPTION_MASK_BASELINE_ISAS_ONLY)
 	    {
 	      if (type != ix86_opt_ix86_yes)
 		gcc_unreachable ();
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c9f7195d423..f3a088aaa28 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1015,6 +1015,10 @@ mgeneral-regs-only
 Target Report RejectNegative Mask(GENERAL_REGS_ONLY) Var(ix86_target_flags) Save
 Generate code which uses only the general registers.
 
+mbaseline-isas-only
+Target Report RejectNegative Mask(BASELINE_ISAS_ONLY) Var(ix86_target_flags) Save
+Generate code which uses only the baseline ISAs.
+
 mshstk
 Target Report Mask(ISA_SHSTK) Var(ix86_isa_flags) Save
 Enable shadow stack built-in functions from Control-flow Enforcement
@@ -1114,4 +1118,4 @@ Support SERIALIZE built-in functions and code generation.
 
 mtsxldtrk
 Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save
-Support TSXLDTRK built-in functions and code generation.
\ No newline at end of file
+Support TSXLDTRK built-in functions and code generation.
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 2bb9b2f72f5..eadb8dd71a4 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -6660,6 +6660,10 @@ doing a floating-point division.
 @cindex @code{target("general-regs-only")} function attribute, x86
 Generate code which uses only the general registers.
 
+@item baseline-isas-only
+@cindex @code{target("baseline-isas-only")} function attribute, x86
+Generate code which uses only the baseline ISAs.
+
 @item arch=@var{ARCH}
 @cindex @code{target("arch=@var{ARCH}")} function attribute, x86
 Specify the architecture to generate code for in compiling the function.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4cf6b204b56..5499bbe809e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -30579,6 +30579,11 @@ Generate code that uses only the general-purpose registers.  This
 prevents the compiler from using floating-point, vector, mask and bound
 registers.
 
+@item -mbaseline-isas-only
+@opindex mbaseline-isas-only
+Generate code that uses only the baseline ISAs which include FXSR, MMX,
+SSE and SSE2 in 64-bit mode.
+
 @item -mindirect-branch=@var{choice}
 @opindex mindirect-branch
 Convert indirect call and jump with @var{choice}.  The default is
diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..396a18d377b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -25,6 +25,9 @@ do_test (void)
 }
 #endif
 
+#pragma GCC push_options
+#pragma GCC target("baseline-isas-only")
+
 static int
 check_osxsave (void)
 {
@@ -110,3 +113,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-10.c b/gcc/testsuite/gcc.target/i386/pr96744-10.c
new file mode 100644
index 00000000000..f6f9badde1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-10.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+#include <cpuid.h>
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
-- 
2.26.2


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Use target("baseline-isas-only") in <cpuid.h>
  2020-08-24 16:16                                           ` [PATCH] x86: Use target("baseline-isas-only") " H.J. Lu
@ 2020-08-24 19:25                                             ` Uros Bizjak
  2020-08-24 19:40                                               ` H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-24 19:25 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

On Mon, Aug 24, 2020 at 6:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Aug 24, 2020 at 7:55 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, Aug 24, 2020 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > > > Speaking of pragmas, these should be added outside cpuid.h, like:
> > > >
> > > > #pragma GCC push_options
> > > > #pragma GCC target("general-regs-only")
> > > >
> > > > #include <cpuid.h>
> > > >
> > > > void cpuid_check ()
> > > > ...
> > > >
> > > > #pragma GCC pop_options
> > > >
> > > > >footnote
> > > >
> > > > Nowadays, -march=native is mostly used outside generic target
> > > > compilations, so for relevant avx512 targets, we still generate spills
> > > > to mask regs. In future, we can review the setting of the tuning flag
> > > > for a generic target in the same way as with SSE2 inter-reg moves.
> > > >
> > >
> > > Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
> > > <cpuid.h> should be handled similarly to other intrinsic header files.
> > > That is <cpuid.h> should use
> > >
> > > #pragma GCC push_options
> > > #ifdef __x86_64__
> > > #pragma GCC target("arch=x86-64")
> > > #else
> > > #pragma GCC target("arch=i386")
> > > ...
> > > #pragma GCC pop_options
> > >
> > > Here is a patch.  OK for master?
> >
> > -ENOPATCH
> >
> > However, how will this affect inlining? Every single function in
> > cpuid.h is defined as static __inline, and due to target flags
> > mismatch, it won't be inlined anymore. These inline functions are used
> > in some bit testing functions, and to keep them inlined, these should
> > also use the same options to avoid non-basic ISAs. This is the reason
> > cpuid.h should be #included after pragma, together with bit testing
> > functions, as shown above.
> >
>
> How about target("baseline-isas-only")? All CPUID functions are
> inlined.

No, I don't think this is a good idea. Now consider the situation where
caller functions are compiled with e.g. -mgeneral-regs-only. Due to the
#pragmas, CPUID functions are compiled with a superset of the caller's
ISAs, so they again won't be inlined. The ISAs of the caller functions
and the CPUID functions should match; the best way to achieve this is to
include <cpuid.h> after the #pragma. And IMO, the general-regs-only
target #pragma is an excellent setting for both: cpuid.h and the caller
bit testing functions.

So, if we care about inlining, decorating cpuid.h with target pragmas
is a bad idea.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Use target("baseline-isas-only") in <cpuid.h>
  2020-08-24 19:25                                             ` Uros Bizjak
@ 2020-08-24 19:40                                               ` H.J. Lu
  2020-08-25 12:12                                                 ` [PATCH] x86: Use target("general-regs-only, baseline-isas-only") " H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-24 19:40 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

On Mon, Aug 24, 2020 at 12:25 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Aug 24, 2020 at 6:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Aug 24, 2020 at 7:55 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Mon, Aug 24, 2020 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > > > Speaking of pragmas, these should be added outside cpuid.h, like:
> > > > >
> > > > > #pragma GCC push_options
> > > > > #pragma GCC target("general-regs-only")
> > > > >
> > > > > #include <cpuid.h>
> > > > >
> > > > > void cpuid_check ()
> > > > > ...
> > > > >
> > > > > #pragma GCC pop_options
> > > > >
> > > > > >footnote
> > > > >
> > > > > Nowadays, -march=native is mostly used outside generic target
> > > > > compilations, so for relevant avx512 targets, we still generate spills
> > > > > to mask regs. In future, we can review the setting of the tuning flag
> > > > > for a generic target in the same way as with SSE2 inter-reg moves.
> > > > >
> > > >
> > > > Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
> > > > <cpuid.h> should be handled similarly to other intrinsic header files.
> > > > That is <cpuid.h> should use
> > > >
> > > > #pragma GCC push_options
> > > > #ifdef __x86_64__
> > > > #pragma GCC target("arch=x86-64")
> > > > #else
> > > > #pragma GCC target("arch=i386")
> > > > ...
> > > > #pragma GCC pop_options
> > > >
> > > > Here is a patch.  OK for master?
> > >
> > > -ENOPATCH
> > >
> > > However, how will this affect inlining? Every single function in
> > > cpuid.h is defined as static __inline, and due to target flags
> > > mismatch, it won't be inlined anymore. These inline functions are used
> > > in some bit testing functions, and to keep them inlined, these should
> > > also use the same options to avoid non-basic ISAs. This is the reason
> > > cpuid.h should be #included after pragma, together with bit testing
> > > functions, as shown above.
> > >
> >
> > How about target("baseline-isas-only")? All CPUID functions are
> > inlined.
>
> No, I don't think this is a good idea. Now consider the situation that
> caller functions are compiled with e.g. -mgeneral-regs-only. Due to
> #pragmas, CPUID functions are compiled with a superset ISAs, so they
> again won't be inlined. ISAs of caller functions and CPUID should
> match, the best way is to include <cpuid.h> after the #pragma. And
> IMO, general-regs-only target #pragma is an excellent setting for
> both: cpuid.h and caller bit testing functions.
>
> So, if we care about inlining, decorating cpuid.h with target pragmas
> is a bad idea.

This can be done with #pragma in <cpuid.h>.

-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH] x86: Use target("general-regs-only, baseline-isas-only") in <cpuid.h>
  2020-08-24 19:40                                               ` H.J. Lu
@ 2020-08-25 12:12                                                 ` H.J. Lu
  2020-08-25 12:27                                                   ` [PATCH] x86: Use target("general-regs-only,baseline-isas-only") " Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-25 12:12 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 3057 bytes --]

On Mon, Aug 24, 2020 at 12:40 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Aug 24, 2020 at 12:25 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, Aug 24, 2020 at 6:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Aug 24, 2020 at 7:55 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Mon, Aug 24, 2020 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > > > Speaking of pragmas, these should be added outside cpuid.h, like:
> > > > > >
> > > > > > #pragma GCC push_options
> > > > > > #pragma GCC target("general-regs-only")
> > > > > >
> > > > > > #include <cpuid.h>
> > > > > >
> > > > > > void cpuid_check ()
> > > > > > ...
> > > > > >
> > > > > > #pragma GCC pop_options
> > > > > >
> > > > > > >footnote
> > > > > >
> > > > > > Nowadays, -march=native is mostly used outside generic target
> > > > > > compilations, so for relevant avx512 targets, we still generate spills
> > > > > > to mask regs. In future, we can review the setting of the tuning flag
> > > > > > for a generic target in the same way as with SSE2 inter-reg moves.
> > > > > >
> > > > >
> > > > > Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
> > > > > <cpuid.h> should be handled similarly to other intrinsic header files.
> > > > > That is <cpuid.h> should use
> > > > >
> > > > > #pragma GCC push_options
> > > > > #ifdef __x86_64__
> > > > > #pragma GCC target("arch=x86-64")
> > > > > #else
> > > > > #pragma GCC target("arch=i386")
> > > > > ...
> > > > > #pragma GCC pop_options
> > > > >
> > > > > Here is a patch.  OK for master?
> > > >
> > > > -ENOPATCH
> > > >
> > > > However, how will this affect inlining? Every single function in
> > > > cpuid.h is defined as static __inline, and due to target flags
> > > > mismatch, it won't be inlined anymore. These inline functions are used
> > > > in some bit testing functions, and to keep them inlined, these should
> > > > also use the same options to avoid non-basic ISAs. This is the reason
> > > > cpuid.h should be #included after pragma, together with bit testing
> > > > functions, as shown above.
> > > >
> > >
> > > How about target("baseline-isas-only")? All CPUID functions are
> > > inlined.
> >
> > No, I don't think this is a good idea. Now consider the situation that
> > caller functions are compiled with e.g. -mgeneral-regs-only. Due to
> > #pragmas, CPUID functions are compiled with a superset ISAs, so they
> > again won't be inlined. ISAs of caller functions and CPUID should
> > match, the best way is to include <cpuid.h> after the #pragma. And
> > IMO, general-regs-only target #pragma is an excellent setting for
> > both: cpuid.h and caller bit testing functions.
> >
> > So, if we care about inlining, decorating cpuid.h with target pragmas
> > is a bad idea.
>
> This can be done with #pragma in <cpuid.h>.
>

We just need to update ix86_can_inline_p to allow inlining of functions
with the baseline-isas-only and general-regs-only attributes if the
caller supports the same set of ISAs.

Here is the updated patch.

-- 
H.J.

[-- Attachment #2: 0001-x86-Use-target-general-regs-only-baseline-isas-only-.patch --]
[-- Type: text/x-patch, Size: 17456 bytes --]

From 78eb1a4c4938494349032f0e10017ce553fb8fdd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 21 Aug 2020 09:42:49 -0700
Subject: [PATCH] x86: Use target("general-regs-only,baseline-isas-only") in
 <cpuid.h>

Add -mbaseline-isas-only and the target("baseline-isas-only") attribute
to support baseline ISAs, which include FXSR, MMX, SSE and SSE2 in 64-bit
mode.  Use only general registers and baseline ISAs to perform the CPUID
check.  Functions with the general-regs-only and baseline-isas-only
attributes can be inlined if the caller supports the same set of ISAs.

gcc/

	PR target/96744
	* common/config/i386/i386-common.c (ix86_handle_option): Support
	-mbaseline-isas-only.
	* config/i386/cpuid.h: Add #pragma GCC
	target("general-regs-only,baseline-isas-only").
	* config/i386/i386-options.c (ix86_valid_target_attribute_inner_p):
	Handle baseline-isas-only.
	* config/i386/i386.c (ix86_can_inline_p): Allow inline functions
	with baseline-isas-only and general-regs-only attributes if caller
	supports the same set of ISAs.
	* config/i386/i386.h (TARGET_64BIT_BASELINE_ISAS): New.
	* config/i386/i386.opt: Add -mbaseline-isas-only.
	* doc/extend.texi: Document target("baseline-isas-only") function
	attribute.
	* doc/invoke.texi: Document -mbaseline-isas-only.

gcc/testsuite/

	PR target/96744
	* gcc.target/i386/avx512-check.h: Add #pragma GCC
	target("baseline-isas-only") for CPUID check.
	* gcc.target/i386/pr96744-10.c: New test.
	* gcc.target/i386/pr96744-11.c: Likewise.
	* gcc.target/i386/pr96744-12.c: Likewise.
	* gcc.target/i386/pr96744-13.c: Likewise.
	* gcc.target/i386/pr96744-14.c: Likewise.
	* gcc.target/i386/pr96744-15.c: Likewise.
---
 gcc/common/config/i386/i386-common.c         | 28 +++++++++++++
 gcc/config/i386/cpuid.h                      | 13 ++++++
 gcc/config/i386/i386-options.c               |  7 +++-
 gcc/config/i386/i386.c                       | 34 ++++++++-------
 gcc/config/i386/i386.h                       |  4 ++
 gcc/config/i386/i386.opt                     |  6 ++-
 gcc/doc/extend.texi                          |  4 ++
 gcc/doc/invoke.texi                          |  5 +++
 gcc/testsuite/gcc.target/i386/avx512-check.h |  5 +++
 gcc/testsuite/gcc.target/i386/pr96744-10.c   | 27 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-11.c   | 27 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-12.c   | 27 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-13.c   | 32 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-14.c   | 44 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-15.c   | 44 ++++++++++++++++++++
 15 files changed, 290 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-15.c

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index bb14305ad7b..46ee70e1e47 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -338,6 +338,34 @@ ix86_handle_option (struct gcc_options *opts,
 	gcc_unreachable ();
       return true;
 
+    case OPT_mbaseline_isas_only:
+      if (value)
+	{
+	  /* Only enable baseline ISAs.  */
+	  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	    {
+	      HOST_WIDE_INT x_ix86_isa_flags;
+	      if (TARGET_LP64_P (opts->x_ix86_isa_flags))
+		x_ix86_isa_flags = (OPTION_MASK_ISA_64BIT
+				    | OPTION_MASK_ABI_64);
+	      else
+		x_ix86_isa_flags = (OPTION_MASK_ISA_64BIT
+				    | OPTION_MASK_ABI_X32);
+	      if (!TARGET_GENERAL_REGS_ONLY_P (opts->x_ix86_target_flags))
+		x_ix86_isa_flags |= TARGET_64BIT_BASELINE_ISAS;
+	      opts->x_ix86_isa_flags = x_ix86_isa_flags;
+	    }
+	  else
+	    {
+	      opts->x_ix86_isa_flags = 0;
+	      opts->x_ix86_fpmath = FPMATH_387;
+	    }
+	  opts->x_ix86_isa_flags2 = 0;
+	}
+      else
+	gcc_unreachable ();
+      return true;
+
     case OPT_mmmx:
       if (value)
 	{
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h
index bca61d620db..55415c3366a 100644
--- a/gcc/config/i386/cpuid.h
+++ b/gcc/config/i386/cpuid.h
@@ -24,6 +24,17 @@
 #ifndef _CPUID_H_INCLUDED
 #define _CPUID_H_INCLUDED
 
+#pragma GCC push_options
+#if __GNUC__ >= 11
+#pragma GCC target("general-regs-only,baseline-isas-only")
+#else
+#ifdef __x86_64__
+#pragma GCC target("arch=x86-64")
+#else
+#pragma GCC target("arch=i386")
+#endif
+#endif
+
 /* %eax */
 #define bit_AVX512BF16	(1 << 5)
 
@@ -324,4 +335,6 @@ __cpuidex (int __cpuid_info[4], int __leaf, int __subleaf)
 		 __cpuid_info[2], __cpuid_info[3]);
 }
 
+#pragma GCC pop_options
+
 #endif /* _CPUID_H_INCLUDED */
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index e0fc68c27bf..4a09c1c93ee 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -1072,6 +1072,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
     IX86_ATTR_IX86_YES ("general-regs-only",
 			OPT_mgeneral_regs_only,
 			OPTION_MASK_GENERAL_REGS_ONLY),
+
+    IX86_ATTR_IX86_YES ("baseline-isas-only",
+			OPT_mbaseline_isas_only,
+			OPTION_MASK_BASELINE_ISAS_ONLY),
   };
 
   location_t loc
@@ -1187,7 +1191,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
 
       else if (type == ix86_opt_ix86_yes || type == ix86_opt_ix86_no)
 	{
-	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY)
+	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY
+	      || mask == OPTION_MASK_BASELINE_ISAS_ONLY)
 	    {
 	      if (type != ix86_opt_ix86_yes)
 		gcc_unreachable ();
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 37e77ea9fdd..91b535f1a95 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -551,18 +551,6 @@ ix86_can_inline_p (tree caller, tree callee)
   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
 
-  /* Changes of those flags can be tolerated for always inlines. Lets hope
-     user knows what he is doing.  */
-  const unsigned HOST_WIDE_INT always_inline_safe_mask
-	 = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
-	    | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
-	    | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
-	    | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
-	    | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
-	    | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
-	    | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER);
-
-
   if (!callee_tree)
     callee_tree = target_option_default_node;
   if (!caller_tree)
@@ -573,10 +561,26 @@ ix86_can_inline_p (tree caller, tree callee)
   struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
   bool ret = false;
+  /* NB: Also tolerate those flags like always inlines for target
+    ("general-regs-only") attribute.  */
   bool always_inline
-    = (DECL_DISREGARD_INLINE_LIMITS (callee)
-       && lookup_attribute ("always_inline",
-			    DECL_ATTRIBUTES (callee)));
+    = ((DECL_DISREGARD_INLINE_LIMITS (callee)
+	&& lookup_attribute ("always_inline",
+			     DECL_ATTRIBUTES (callee)))
+       || TARGET_BASELINE_ISAS_ONLY_P (callee_opts->x_ix86_target_flags));
+
+  /* Changes of those flags can be tolerated for always inlines. Lets hope
+     user knows what he is doing.  */
+  const unsigned HOST_WIDE_INT always_inline_safe_mask
+	 = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
+	    | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
+	    | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
+	    | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
+	    | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
+	    | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
+	    | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER
+	    | (TARGET_GENERAL_REGS_ONLY_P (callee_opts->x_ix86_target_flags)
+	       ? MASK_80387 : 0));
 
   cgraph_node *callee_node = cgraph_node::get (callee);
   /* Callee's isa options should be a subset of the caller's, i.e. a SSE4
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 852dd017aa4..ffe96809632 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -211,6 +211,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_16BIT	TARGET_CODE16
 #define TARGET_16BIT_P(x)	TARGET_CODE16_P(x)
 
+#define TARGET_64BIT_BASELINE_ISAS \
+  (OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_SSE \
+   | OPTION_MASK_ISA_SSE2)
+
 #define TARGET_MMX_WITH_SSE	(TARGET_64BIT && TARGET_SSE2)
 
 #include "config/vxworks-dummy.h"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c9f7195d423..f3a088aaa28 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1015,6 +1015,10 @@ mgeneral-regs-only
 Target Report RejectNegative Mask(GENERAL_REGS_ONLY) Var(ix86_target_flags) Save
 Generate code which uses only the general registers.
 
+mbaseline-isas-only
+Target Report RejectNegative Mask(BASELINE_ISAS_ONLY) Var(ix86_target_flags) Save
+Generate code which uses only the baseline ISAs.
+
 mshstk
 Target Report Mask(ISA_SHSTK) Var(ix86_isa_flags) Save
 Enable shadow stack built-in functions from Control-flow Enforcement
@@ -1114,4 +1118,4 @@ Support SERIALIZE built-in functions and code generation.
 
 mtsxldtrk
 Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save
-Support TSXLDTRK built-in functions and code generation.
\ No newline at end of file
+Support TSXLDTRK built-in functions and code generation.
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 2bb9b2f72f5..eadb8dd71a4 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -6660,6 +6660,10 @@ doing a floating-point division.
 @cindex @code{target("general-regs-only")} function attribute, x86
 Generate code which uses only the general registers.
 
+@item baseline-isas-only
+@cindex @code{target("baseline-isas-only")} function attribute, x86
+Generate code which uses only the baseline ISAs.
+
 @item arch=@var{ARCH}
 @cindex @code{target("arch=@var{ARCH}")} function attribute, x86
 Specify the architecture to generate code for in compiling the function.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4cf6b204b56..5499bbe809e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -30579,6 +30579,11 @@ Generate code that uses only the general-purpose registers.  This
 prevents the compiler from using floating-point, vector, mask and bound
 registers.
 
+@item -mbaseline-isas-only
+@opindex mbaseline-isas-only
+Generate code that uses only the baseline ISAs which include FXSR, MMX,
+SSE and SSE2 in 64-bit mode.
+
 @item -mindirect-branch=@var{choice}
 @opindex mindirect-branch
 Convert indirect call and jump with @var{choice}.  The default is
diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..396a18d377b 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -25,6 +25,9 @@ do_test (void)
 }
 #endif
 
+#pragma GCC push_options
+#pragma GCC target("baseline-isas-only")
+
 static int
 check_osxsave (void)
 {
@@ -110,3 +113,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-10.c b/gcc/testsuite/gcc.target/i386/pr96744-10.c
new file mode 100644
index 00000000000..14bcc06f121
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-10.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <cpuid.h>
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-11.c b/gcc/testsuite/gcc.target/i386/pr96744-11.c
new file mode 100644
index 00000000000..7fec1a8612b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-11.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mgeneral-regs-only" } */
+
+#include <cpuid.h>
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-12.c b/gcc/testsuite/gcc.target/i386/pr96744-12.c
new file mode 100644
index 00000000000..54ca127e75a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-12.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mfpmath=sse" } */
+
+#include <cpuid.h>
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-13.c b/gcc/testsuite/gcc.target/i386/pr96744-13.c
new file mode 100644
index 00000000000..4a4e32fab55
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-13.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mfpmath=sse" } */
+
+#include <cpuid.h>
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only,baseline-isas-only")
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+#pragma GCC pop_options
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-14.c b/gcc/testsuite/gcc.target/i386/pr96744-14.c
new file mode 100644
index 00000000000..a307d9b1e7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-14.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+#include <cpuid.h>
+
+#pragma GCC push_options
+#pragma GCC target("baseline-isas-only")
+
+static __inline void
+foo (int __cpuid_info[4], int __leaf, int __subleaf)
+{
+  __cpuid_count (__leaf, __subleaf, __cpuid_info[0], __cpuid_info[1],
+		 __cpuid_info[2], __cpuid_info[3]);
+}
+
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
+int
+bar (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  foo (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+#pragma GCC pop_options
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler {call[ \t]+_?foo} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-15.c b/gcc/testsuite/gcc.target/i386/pr96744-15.c
new file mode 100644
index 00000000000..ad19e187279
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-15.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+#include <cpuid.h>
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
+static __inline void
+foo (int __cpuid_info[4], int __leaf, int __subleaf)
+{
+  __cpuid_count (__leaf, __subleaf, __cpuid_info[0], __cpuid_info[1],
+		 __cpuid_info[2], __cpuid_info[3]);
+}
+
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target("baseline-isas-only")
+
+int
+bar (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  foo (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+#pragma GCC pop_options
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler {call[ \t]+_?foo} } } */
-- 
2.26.2


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Use target("general-regs-only,baseline-isas-only") in <cpuid.h>
  2020-08-25 12:12                                                 ` [PATCH] x86: Use target("general-regs-only, baseline-isas-only") " H.J. Lu
@ 2020-08-25 12:27                                                   ` Uros Bizjak
  2020-08-27 14:44                                                     ` [PATCH] x86: Add -mbaseline-isas-only/target("baseline-isas-only") H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-25 12:27 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

On Tue, Aug 25, 2020 at 2:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Aug 24, 2020 at 12:40 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Aug 24, 2020 at 12:25 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Mon, Aug 24, 2020 at 6:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Aug 24, 2020 at 7:55 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Mon, Aug 24, 2020 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > > > Speaking of pragmas, these should be added outside cpuid.h, like:
> > > > > > >
> > > > > > > #pragma GCC push_options
> > > > > > > #pragma GCC target("general-regs-only")
> > > > > > >
> > > > > > > #include <cpuid.h>
> > > > > > >
> > > > > > > void cpuid_check ()
> > > > > > > ...
> > > > > > >
> > > > > > > #pragma GCC pop_options
> > > > > > >
> > > > > > > >footnote
> > > > > > >
> > > > > > > Nowadays, -march=native is mostly used outside generic target
> > > > > > > compilations, so for relevant avx512 targets, we still generate spills
> > > > > > > to mask regs. In future, we can review the setting of the tuning flag
> > > > > > > for a generic target in the same way as with SSE2 inter-reg moves.
> > > > > > >
> > > > > >
> > > > > > Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
> > > > > > <cpuid.h> should be handled similarly to other intrinsic header files.
> > > > > > That is <cpuid.h> should use
> > > > > >
> > > > > > #pragma GCC push_options
> > > > > > #ifdef __x86_64__
> > > > > > #pragma GCC target("arch=x86-64")
> > > > > > #else
> > > > > > #pragma GCC target("arch=i386")
> > > > > > ...
> > > > > > #pragma GCC pop_options
> > > > > >
> > > > > > Here is a patch.  OK for master?
> > > > >
> > > > > -ENOPATCH
> > > > >
> > > > > However, how will this affect inlining? Every single function in
> > > > > cpuid.h is defined as static __inline, and due to target flags
> > > > > mismatch, it won't be inlined anymore. These inline functions are used
> > > > > in some bit testing functions, and to keep them inlined, these should
> > > > > also use the same options to avoid non-basic ISAs. This is the reason
> > > > > cpuid.h should be #included after pragma, together with bit testing
> > > > > functions, as shown above.
> > > > >
> > > >
> > > > How about target("baseline-isas-only")? All CPUID functions are
> > > > inlined.
> > >
> > > No, I don't think this is a good idea. Now consider the situation that
> > > caller functions are compiled with e.g. -mgeneral-regs-only. Due to
> > > #pragmas, CPUID functions are compiled with a superset ISAs, so they
> > > again won't be inlined. ISAs of caller functions and CPUID should
> > > match, the best way is to include <cpuid.h> after the #pragma. And
> > > IMO, general-regs-only target #pragma is an excellent setting for
> > > both: cpuid.h and caller bit testing functions.
> > >
> > > So, if we care about inlining, decorating cpuid.h with target pragmas
> > > is a bad idea.
> >
> > This can be done with #pragma in <cpuid.h>.
> >
>
> We just need to update ix86_can_inline_p to allow inline functions
> with baseline-isas-only and general-regs-only attributes if caller
> supports the same set of ISAs.
>
> Here is the updated patch.

I'm not against it, but I don't plan to approve the attached patch.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH] x86: Add -mbaseline-isas-only/target("baseline-isas-only")
  2020-08-25 12:27                                                   ` [PATCH] x86: Use target("general-regs-only,baseline-isas-only") " Uros Bizjak
@ 2020-08-27 14:44                                                     ` H.J. Lu
  2020-08-27 17:00                                                       ` Uros Bizjak
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2020-08-27 14:44 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

[-- Attachment #1: Type: text/plain, Size: 3606 bytes --]

On Tue, Aug 25, 2020 at 5:27 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Aug 25, 2020 at 2:13 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Aug 24, 2020 at 12:40 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Aug 24, 2020 at 12:25 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Mon, Aug 24, 2020 at 6:17 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Mon, Aug 24, 2020 at 7:55 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Aug 24, 2020 at 3:23 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > > > Speaking of pragmas, these should be added outside cpuid.h, like:
> > > > > > > >
> > > > > > > > #pragma GCC push_options
> > > > > > > > #pragma GCC target("general-regs-only")
> > > > > > > >
> > > > > > > > #include <cpuid.h>
> > > > > > > >
> > > > > > > > void cpuid_check ()
> > > > > > > > ...
> > > > > > > >
> > > > > > > > #pragma GCC pop_options
> > > > > > > >
> > > > > > > > >footnote
> > > > > > > >
> > > > > > > > Nowadays, -march=native is mostly used outside generic target
> > > > > > > > compilations, so for relevant avx512 targets, we still generate spills
> > > > > > > > to mask regs. In future, we can review the setting of the tuning flag
> > > > > > > > for a generic target in the same way as with SSE2 inter-reg moves.
> > > > > > > >
> > > > > > >
> > > > > > > Florian raised an issue that we need to limit <cpuid.h> to the basic ISAs.
> > > > > > > <cpuid.h> should be handled similarly to other intrinsic header files.
> > > > > > > That is <cpuid.h> should use
> > > > > > >
> > > > > > > #pragma GCC push_options
> > > > > > > #ifdef __x86_64__
> > > > > > > #pragma GCC target("arch=x86-64")
> > > > > > > #else
> > > > > > > #pragma GCC target("arch=i386")
> > > > > > > ...
> > > > > > > #pragma GCC pop_options
> > > > > > >
> > > > > > > Here is a patch.  OK for master?
> > > > > >
> > > > > > -ENOPATCH
> > > > > >
> > > > > > However, how will this affect inlining? Every single function in
> > > > > > cpuid.h is defined as static __inline, and due to target flags
> > > > > > mismatch, it won't be inlined anymore. These inline functions are used
> > > > > > in some bit testing functions, and to keep them inlined, these should
> > > > > > also use the same options to avoid non-basic ISAs. This is the reason
> > > > > > cpuid.h should be #included after pragma, together with bit testing
> > > > > > functions, as shown above.
> > > > > >
> > > > >
> > > > > How about target("baseline-isas-only")? All CPUID functions are
> > > > > inlined.
> > > >
> > > > No, I don't think this is a good idea. Now consider the situation that
> > > > caller functions are compiled with e.g. -mgeneral-regs-only. Due to
> > > > #pragmas, CPUID functions are compiled with a superset ISAs, so they
> > > > again won't be inlined. ISAs of caller functions and CPUID should
> > > > match, the best way is to include <cpuid.h> after the #pragma. And
> > > > IMO, general-regs-only target #pragma is an excellent setting for
> > > > both: cpuid.h and caller bit testing functions.
> > > >
> > > > So, if we care about inlining, decorating cpuid.h with target pragmas
> > > > is a bad idea.
> > >
> > > This can be done with #pragma in <cpuid.h>.
> > >
> >
> > We just need to update ix86_can_inline_p to allow inline functions
> > with baseline-isas-only and general-regs-only attributes if caller
> > supports the same set of ISAs.
> >
> > Here is the updated patch.
>
> I'm not against it, but I don't plan to approve the attached patch.
>

How about this one?

-- 
H.J.

[-- Attachment #2: 0001-x86-Add-mbaseline-isas-only-target-baseline-isas-onl.patch --]
[-- Type: text/x-patch, Size: 18520 bytes --]

From a6ffd2914535d23eb916b4684d5edef132912f72 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 21 Aug 2020 09:42:49 -0700
Subject: [PATCH] x86: Add -mbaseline-isas-only/target("baseline-isas-only")

Add -mbaseline-isas-only and target("baseline-isas-only") attribute to
support baseline ISAs, which include FXSR, MMX, SSE and SSE2 in 64-bit
mode.  Use only general registers and baseline ISAs to perform CPUID
check.  We can inline functions with general registers and baseline
ISAs attributes if caller supports the same set of ISAs.

gcc/

	PR target/96744
	* common/config/i386/i386-common.c (ix86_handle_option): Support
	-mbaseline-isas-only.
	* config/i386/i386-options.c (ix86_valid_target_attribute_inner_p):
	Handle baseline-isas-only.
	* config/i386/i386.c (ix86_can_inline_p): Allow inline functions
	with baseline-isas-only and general-regs-only attributes if caller
	supports the same set of ISAs.
	* config/i386/i386.h (TARGET_64BIT_BASELINE_ISAS): New.
	* config/i386/i386.opt: Add -mbaseline-isas-only.
	* doc/extend.texi: Document target("baseline-isas-only") function
	attribute.
	* doc/invoke.texi: Document -mbaseline-isas-only.

gcc/testsuite/

	PR target/96744
	* gcc.target/i386/avx512-check.h: Add #pragma GCC
	target("general-regs-only,baseline-isas-only") for CPUID check.
	* gcc.target/i386/pr96744-10.c: New test.
	* gcc.target/i386/pr96744-11.c: Likewise.
	* gcc.target/i386/pr96744-12.c: Likewise.
	* gcc.target/i386/pr96744-13.c: Likewise.
	* gcc.target/i386/pr96744-14.c: Likewise.
	* gcc.target/i386/pr96744-15.c: Likewise.
	* gcc.target/i386/pr96744-16.c: Likewise.
	* gcc.target/i386/pr96744-17.c: Likewise.
---
 gcc/common/config/i386/i386-common.c         | 28 ++++++++++++
 gcc/config/i386/i386-options.c               |  7 ++-
 gcc/config/i386/i386.c                       | 34 +++++++-------
 gcc/config/i386/i386.h                       |  4 ++
 gcc/config/i386/i386.opt                     |  6 ++-
 gcc/doc/extend.texi                          |  4 ++
 gcc/doc/invoke.texi                          |  5 +++
 gcc/testsuite/gcc.target/i386/avx512-check.h | 10 +++++
 gcc/testsuite/gcc.target/i386/pr96744-10.c   | 27 +++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-11.c   | 27 +++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-12.c   | 27 +++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-13.c   | 35 +++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-14.c   | 47 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-15.c   | 47 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr96744-16.c   | 12 +++++
 gcc/testsuite/gcc.target/i386/pr96744-17.c   | 16 +++++++
 16 files changed, 319 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr96744-17.c

diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c
index bb14305ad7b..46ee70e1e47 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -338,6 +338,34 @@ ix86_handle_option (struct gcc_options *opts,
 	gcc_unreachable ();
       return true;
 
+    case OPT_mbaseline_isas_only:
+      if (value)
+	{
+	  /* Only enable baseline ISAs.  */
+	  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	    {
+	      HOST_WIDE_INT x_ix86_isa_flags;
+	      if (TARGET_LP64_P (opts->x_ix86_isa_flags))
+		x_ix86_isa_flags = (OPTION_MASK_ISA_64BIT
+				    | OPTION_MASK_ABI_64);
+	      else
+		x_ix86_isa_flags = (OPTION_MASK_ISA_64BIT
+				    | OPTION_MASK_ABI_X32);
+	      if (!TARGET_GENERAL_REGS_ONLY_P (opts->x_ix86_target_flags))
+		x_ix86_isa_flags |= TARGET_64BIT_BASELINE_ISAS;
+	      opts->x_ix86_isa_flags = x_ix86_isa_flags;
+	    }
+	  else
+	    {
+	      opts->x_ix86_isa_flags = 0;
+	      opts->x_ix86_fpmath = FPMATH_387;
+	    }
+	  opts->x_ix86_isa_flags2 = 0;
+	}
+      else
+	gcc_unreachable ();
+      return true;
+
     case OPT_mmmx:
       if (value)
 	{
diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index b93c338346f..199cf20e9da 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -1072,6 +1072,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
     IX86_ATTR_IX86_YES ("general-regs-only",
 			OPT_mgeneral_regs_only,
 			OPTION_MASK_GENERAL_REGS_ONLY),
+
+    IX86_ATTR_IX86_YES ("baseline-isas-only",
+			OPT_mbaseline_isas_only,
+			OPTION_MASK_BASELINE_ISAS_ONLY),
   };
 
   location_t loc
@@ -1187,7 +1191,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
 
       else if (type == ix86_opt_ix86_yes || type == ix86_opt_ix86_no)
 	{
-	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY)
+	  if (mask == OPTION_MASK_GENERAL_REGS_ONLY
+	      || mask == OPTION_MASK_BASELINE_ISAS_ONLY)
 	    {
 	      if (!opt_set_p)
 		{
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 37e77ea9fdd..91b535f1a95 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -551,18 +551,6 @@ ix86_can_inline_p (tree caller, tree callee)
   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
 
-  /* Changes of those flags can be tolerated for always inlines. Lets hope
-     user knows what he is doing.  */
-  const unsigned HOST_WIDE_INT always_inline_safe_mask
-	 = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
-	    | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
-	    | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
-	    | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
-	    | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
-	    | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
-	    | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER);
-
-
   if (!callee_tree)
     callee_tree = target_option_default_node;
   if (!caller_tree)
@@ -573,10 +561,26 @@ ix86_can_inline_p (tree caller, tree callee)
   struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
   bool ret = false;
+  /* NB: Also tolerate those flags, as for always inlines, when the callee
+     has the target ("baseline-isas-only") attribute.  */
   bool always_inline
-    = (DECL_DISREGARD_INLINE_LIMITS (callee)
-       && lookup_attribute ("always_inline",
-			    DECL_ATTRIBUTES (callee)));
+    = ((DECL_DISREGARD_INLINE_LIMITS (callee)
+	&& lookup_attribute ("always_inline",
+			     DECL_ATTRIBUTES (callee)))
+       || TARGET_BASELINE_ISAS_ONLY_P (callee_opts->x_ix86_target_flags));
+
+  /* Changes of those flags can be tolerated for always inlines. Lets hope
+     user knows what he is doing.  */
+  const unsigned HOST_WIDE_INT always_inline_safe_mask
+	 = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
+	    | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
+	    | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
+	    | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
+	    | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
+	    | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
+	    | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER
+	    | (TARGET_GENERAL_REGS_ONLY_P (callee_opts->x_ix86_target_flags)
+	       ? MASK_80387 : 0));
 
   cgraph_node *callee_node = cgraph_node::get (callee);
   /* Callee's isa options should be a subset of the caller's, i.e. a SSE4
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 92b7475a7bf..334ff966e6f 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -211,6 +211,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_16BIT	TARGET_CODE16
 #define TARGET_16BIT_P(x)	TARGET_CODE16_P(x)
 
+#define TARGET_64BIT_BASELINE_ISAS \
+  (OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_SSE \
+   | OPTION_MASK_ISA_SSE2)
+
 #define TARGET_MMX_WITH_SSE	(TARGET_64BIT && TARGET_SSE2)
 
 #include "config/vxworks-dummy.h"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c9f7195d423..f3a088aaa28 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1015,6 +1015,10 @@ mgeneral-regs-only
 Target Report RejectNegative Mask(GENERAL_REGS_ONLY) Var(ix86_target_flags) Save
 Generate code which uses only the general registers.
 
+mbaseline-isas-only
+Target Report RejectNegative Mask(BASELINE_ISAS_ONLY) Var(ix86_target_flags) Save
+Generate code which uses only the baseline ISAs.
+
 mshstk
 Target Report Mask(ISA_SHSTK) Var(ix86_isa_flags) Save
 Enable shadow stack built-in functions from Control-flow Enforcement
@@ -1114,4 +1118,4 @@ Support SERIALIZE built-in functions and code generation.
 
 mtsxldtrk
 Target Report Mask(ISA2_TSXLDTRK) Var(ix86_isa_flags2) Save
-Support TSXLDTRK built-in functions and code generation.
\ No newline at end of file
+Support TSXLDTRK built-in functions and code generation.
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 3b37aba5795..cd8ceaa25c6 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -6666,6 +6666,10 @@ doing a floating-point division.
 @cindex @code{target("general-regs-only")} function attribute, x86
 Generate code which uses only the general registers.
 
+@item baseline-isas-only
+@cindex @code{target("baseline-isas-only")} function attribute, x86
+Generate code which uses only the baseline ISAs.
+
 @item arch=@var{ARCH}
 @cindex @code{target("arch=@var{ARCH}")} function attribute, x86
 Specify the architecture to generate code for in compiling the function.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index c99924214a7..c28448dd946 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -30603,6 +30603,11 @@ Generate code that uses only the general-purpose registers.  This
 prevents the compiler from using floating-point, vector, mask and bound
 registers.
 
+@item -mbaseline-isas-only
+@opindex mbaseline-isas-only
+Generate code that uses only the baseline ISAs.  In 64-bit mode these are
+FXSR, MMX, SSE and SSE2; in 32-bit mode no ISA extensions are enabled.
+
 @item -mindirect-branch=@var{choice}
 @opindex mindirect-branch
 Convert indirect call and jump with @var{choice}.  The default is
diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0a377dba1d5..1f1f59a37a3 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -1,5 +1,10 @@
 #include <stdlib.h>
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only,baseline-isas-only")
 #include "cpuid.h"
+#pragma GCC pop_options
+
 #include "m512-check.h"
 #include "avx512f-os-support.h"
 
@@ -25,6 +30,9 @@ do_test (void)
 }
 #endif
 
+#pragma GCC push_options
+#pragma GCC target("baseline-isas-only")
+
 static int
 check_osxsave (void)
 {
@@ -110,3 +118,5 @@ main ()
 #endif
   return 0;
 }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-10.c b/gcc/testsuite/gcc.target/i386/pr96744-10.c
new file mode 100644
index 00000000000..14bcc06f121
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-10.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <cpuid.h>
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-11.c b/gcc/testsuite/gcc.target/i386/pr96744-11.c
new file mode 100644
index 00000000000..7fec1a8612b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-11.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mgeneral-regs-only" } */
+
+#include <cpuid.h>
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-12.c b/gcc/testsuite/gcc.target/i386/pr96744-12.c
new file mode 100644
index 00000000000..54ca127e75a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-12.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mfpmath=sse" } */
+
+#include <cpuid.h>
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-13.c b/gcc/testsuite/gcc.target/i386/pr96744-13.c
new file mode 100644
index 00000000000..e38ed3ccabb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-13.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mfpmath=sse" } */
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only,baseline-isas-only")
+#include <cpuid.h>
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only,baseline-isas-only")
+
+int
+main ()
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  __cpuidex (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+#pragma GCC pop_options
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler-not {call[ \t]+_?__cpuidex} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-14.c b/gcc/testsuite/gcc.target/i386/pr96744-14.c
new file mode 100644
index 00000000000..577ffc76d1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-14.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only,baseline-isas-only")
+#include <cpuid.h>
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target("baseline-isas-only")
+
+static __inline void
+foo (int __cpuid_info[4], int __leaf, int __subleaf)
+{
+  __cpuid_count (__leaf, __subleaf, __cpuid_info[0], __cpuid_info[1],
+		 __cpuid_info[2], __cpuid_info[3]);
+}
+
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
+int
+bar (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  foo (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+#pragma GCC pop_options
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler {call[ \t]+_?foo} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-15.c b/gcc/testsuite/gcc.target/i386/pr96744-15.c
new file mode 100644
index 00000000000..3a6fb9d0124
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-15.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only,baseline-isas-only")
+#include <cpuid.h>
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+
+static __inline void
+foo (int __cpuid_info[4], int __leaf, int __subleaf)
+{
+  __cpuid_count (__leaf, __subleaf, __cpuid_info[0], __cpuid_info[1],
+		 __cpuid_info[2], __cpuid_info[3]);
+}
+
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target("baseline-isas-only")
+
+int
+bar (void)
+{
+  unsigned int eax, ebx, ecx, edx;
+  int cpuid_info[4];
+
+  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+    return 0;
+
+  foo (cpuid_info, 7, 0);
+
+  if (cpuid_info[0] != eax
+      || cpuid_info[1] != ebx
+      || cpuid_info[2] != ecx
+      || cpuid_info[3] != edx)
+    __builtin_abort ();
+
+  return 0;
+}
+
+#pragma GCC pop_options
+
+/* { dg-final { scan-assembler-not {call[ \t]+_?__get_cpuid_count} } } */
+/* { dg-final { scan-assembler {call[ \t]+_?foo} } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-16.c b/gcc/testsuite/gcc.target/i386/pr96744-16.c
new file mode 100644
index 00000000000..06064a74a46
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-16.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+
+/* Reject the negated form of non-negatable attributes.  */
+
+__attribute__ ((target ("no-baseline-isas-only")))
+int
+foo (int a)
+{
+  return a + 1;
+}
+
+/* { dg-error "does not allow a negated form" "" { target *-*-* } 0 } */
diff --git a/gcc/testsuite/gcc.target/i386/pr96744-17.c b/gcc/testsuite/gcc.target/i386/pr96744-17.c
new file mode 100644
index 00000000000..65be5a2951b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr96744-17.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+
+/* Reject the negated form of non-negatable pragma target.  */
+
+#pragma GCC push_options
+#pragma GCC target("no-baseline-isas-only")
+
+int
+foo (int a)
+{
+  return a + 1;
+}
+
+#pragma GCC pop_options
+
+/* { dg-error "does not allow a negated form" "" { target *-*-* } 0 } */
-- 
2.26.2


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Add -mbaseline-isas-only/target("baseline-isas-only")
  2020-08-27 14:44                                                     ` [PATCH] x86: Add -mbaseline-isas-only/target("baseline-isas-only") H.J. Lu
@ 2020-08-27 17:00                                                       ` Uros Bizjak
  2020-08-28 13:41                                                         ` H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: Uros Bizjak @ 2020-08-27 17:00 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

On Thu, Aug 27, 2020 at 4:45 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> > > > > > How about target("baseline-isas-only")? All CPUID functions are
> > > > > > inlined.
> > > > >
> > > > > No, I don't think this is a good idea. Now consider the situation that
> > > > > caller functions are compiled with e.g. -mgeneral-regs-only. Due to
> > > > > #pragmas, CPUID functions are compiled with a superset ISAs, so they
> > > > > again won't be inlined. ISAs of caller functions and CPUID should
> > > > > match, the best way is to include <cpuid.h> after the #pragma. And
> > > > > IMO, general-regs-only target #pragma is an excellent setting for
> > > > > both: cpuid.h and caller bit testing functions.
> > > > >
> > > > > So, if we care about inlining, decorating cpuid.h with target pragmas
> > > > > is a bad idea.
> > > >
> > > > This can be done with #pragma in <cpuid.h>.
> > > >
> > >
> > > We just need to update ix86_can_inline_p to allow inline functions
> > > with baseline-isas-only and general-regs-only attributes if caller
> > > supports the same set of ISAs.
> > >
> > > Here is the updated patch.
> >
> > I'm not against it, but I don't plan to approve the attached patch.
> >
>
> How about this one?

I really don't see any benefit in introducing baseline-isas-only
#pragma, when we have general-regs-only #pragma. We may want (for
whatever reason) to avoid SSE2 movd/movq instructions also for 64bit
targets in functions that test bits returned by cpuid. And since
cpuid.h functions are extremely simple (because we want them to be
inlined!), we can simply #include them after the mentioned #pragma,
together with bit testing functions. This way, all problems involving
inlining are avoided.

Uros.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] x86: Add -mbaseline-isas-only/target("baseline-isas-only")
  2020-08-27 17:00                                                       ` Uros Bizjak
@ 2020-08-28 13:41                                                         ` H.J. Lu
  0 siblings, 0 replies; 39+ messages in thread
From: H.J. Lu @ 2020-08-28 13:41 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Florian Weimer, Hongtao Liu, GCC Patches, Kirill Yukhin

On Thu, Aug 27, 2020 at 10:00 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Thu, Aug 27, 2020 at 4:45 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> > > > > > > How about target("baseline-isas-only")? All CPUID functions are
> > > > > > > inlined.
> > > > > >
> > > > > > No, I don't think this is a good idea. Now consider the situation that
> > > > > > caller functions are compiled with e.g. -mgeneral-regs-only. Due to
> > > > > > #pragmas, CPUID functions are compiled with a superset ISAs, so they
> > > > > > again won't be inlined. ISAs of caller functions and CPUID should
> > > > > > match, the best way is to include <cpuid.h> after the #pragma. And
> > > > > > IMO, general-regs-only target #pragma is an excellent setting for
> > > > > > both: cpuid.h and caller bit testing functions.
> > > > > >
> > > > > > So, if we care about inlining, decorating cpuid.h with target pragmas
> > > > > > is a bad idea.
> > > > >
> > > > > This can be done with #pragma in <cpuid.h>.
> > > > >
> > > >
> > > > We just need to update ix86_can_inline_p to allow inline functions
> > > > with baseline-isas-only and general-regs-only attributes if caller
> > > > supports the same set of ISAs.
> > > >
> > > > Here is the updated patch.
> > >
> > > I'm not against it, but I don't plan to approve the attached patch.
> > >
> >
> > How about this one?
>
> I really don't see any benefit in introducing baseline-isas-only
> #pragma, when we have general-regs-only #pragma. We may want (for
> whatever reason) to avoid SSE2 movd/movq instructions also for 64bit
> targets in functions that test bits returned by cpuid. And since
> cpuid.h functions are extremely simple (because we want them to be
> inlined!), we can simply #include them after the mentioned #pragma,
> together with bit testing functions. This way, all problems involving
> inlining are avoided.
>

Baseline ISAs are related to -march=x86-64. The differences are:

1. Baseline ISAs apply to both 32-bit and 64-bit.
2. Baseline ISAs don't change -mtune.

It is guaranteed to work for both 32-bit and 64-bit as well as any
-mXXX options at command-line.

-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2020-08-28 13:42 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-14  8:27 [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks Hongtao Liu
2020-08-17 10:08 ` Uros Bizjak
2020-08-19  2:26   ` Hongtao Liu
2020-08-19  7:05     ` Uros Bizjak
2020-08-20  7:24       ` Hongtao Liu
2020-08-20  7:32         ` Hongtao Liu
2020-08-20  7:40           ` Uros Bizjak
2020-08-20  7:45             ` Hongtao Liu
2020-08-21 13:15               ` Uros Bizjak
2020-08-21 15:41                 ` Hongtao Liu
2020-08-21 15:50                   ` H.J. Lu
2020-08-21 15:50                   ` Uros Bizjak
2020-08-21 16:29                     ` Hongtao Liu
2020-08-21 16:35                       ` H.J. Lu
2020-08-21 16:45                         ` H.J. Lu
2020-08-22 16:26                           ` [PATCH] x86: Disable SSE, AVX and AVX512 during CPUID check H.J. Lu
2020-08-22 17:11                             ` Uros Bizjak
2020-08-22 19:08                               ` [PATCH] x86: Only use general-purpose registers " H.J. Lu
2020-08-23  8:18                                 ` Uros Bizjak
2020-08-23 15:07                                   ` [PATCH] x86: Add target("general-regs-only") function attribute H.J. Lu
2020-08-23 15:37                                     ` Uros Bizjak
2020-08-23 15:23                                   ` [PATCH] x86: Only use general-purpose registers during CPUID check H.J. Lu
2020-08-23 16:02                                     ` Uros Bizjak
2020-08-24 13:22                                       ` [PATCH] x86: Use -march=x86-64/-march=i386 in <cpuid.h> H.J. Lu
2020-08-24 14:55                                         ` Uros Bizjak
2020-08-24 16:16                                           ` [PATCH] x86: Use target("baseline-isas-only") " H.J. Lu
2020-08-24 19:25                                             ` Uros Bizjak
2020-08-24 19:40                                               ` H.J. Lu
2020-08-25 12:12                                                 ` [PATCH] x86: Use target("general-regs-only, baseline-isas-only") " H.J. Lu
2020-08-25 12:27                                                   ` [PATCH] x86: Use target("general-regs-only,baseline-isas-only") " Uros Bizjak
2020-08-27 14:44                                                     ` [PATCH] x86: Add -mbaseline-isas-only/target("baseline-isas-only") H.J. Lu
2020-08-27 17:00                                                       ` Uros Bizjak
2020-08-28 13:41                                                         ` H.J. Lu
2020-08-23 21:22                                     ` [PATCH] x86: Only use general-purpose registers during CPUID check Florian Weimer
2020-08-21 16:46                         ` [PATCH 4/4][PR target/88808]Enable bitwise operator for AVX512 masks Hongtao Liu
2020-08-21 17:02                           ` H.J. Lu
2020-08-21 17:07                             ` H.J. Lu
2020-08-21 17:29                               ` Hongtao Liu
2020-08-21 18:25                       ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).