public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [committed] amdgcn: Adjust GFX10/GFX11 cache coherency
@ 2024-03-22 15:54 Andrew Stubbs
  2024-04-04 10:04 ` Thomas Schwinge
  0 siblings, 1 reply; 2+ messages in thread
From: Andrew Stubbs @ 2024-03-22 15:54 UTC (permalink / raw)
  To: gcc-patches

The RDNA devices have different cache architectures to the CDNA devices, and
the differences go deeper than just the assembler mnemonics, so we
probably need to generate different code to maintain coherency across
the whole device.

I believe this patch is correct according to the documentation in the LLVM
AMDGPU user guide (the ISA manual is less instructive), but I hadn't observed
any real problems before (or after).

Committed to mainline.

Andrew

gcc/ChangeLog:

	* config/gcn/gcn.md (*memory_barrier): Split into RDNA and !RDNA.
	(atomic_load<mode>): Adjust RDNA cache settings.
	(atomic_store<mode>): Likewise.
	(atomic_exchange<mode>): Likewise.
---
 gcc/config/gcn/gcn.md | 86 +++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 3b51453aaca..574c2f87e8c 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -1960,11 +1960,19 @@
 (define_insn "*memory_barrier"
   [(set (match_operand:BLK 0)
 	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
-  ""
-  "{buffer_wbinvl1_vol|buffer_gl0_inv}"
+  "!TARGET_RDNA2_PLUS"
+  "buffer_wbinvl1_vol"
   [(set_attr "type" "mubuf")
    (set_attr "length" "4")])
 
+(define_insn "*memory_barrier"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+  "TARGET_RDNA2_PLUS"
+  "buffer_gl1_inv\;buffer_gl0_inv"
+  [(set_attr "type" "mult")
+   (set_attr "length" "8")])
+
 ; FIXME: These patterns have been disabled as they do not seem to work
 ; reliably - they can cause hangs or incorrect results.
 ; TODO: flush caches according to memory model
@@ -2094,9 +2102,13 @@
 	  case 0:
 	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
 	  case 1:
-	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
+	    return (TARGET_RDNA2 /* Not GFX11.  */
+		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0"
+		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0");
 	  case 2:
-	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
+	    return (TARGET_RDNA2 /* Not GFX11.  */
+		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)"
+		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)");
 	  }
 	break;
       case MEMMODEL_CONSUME:
@@ -2108,15 +2120,21 @@
 	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
 		   "s_dcache_wb_vol";
 	  case 1:
-	    return (TARGET_RDNA2_PLUS
+	    return (TARGET_RDNA2
+		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0\;"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
 		    ? "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
-		      "buffer_gl0_inv"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
 		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
 		      "buffer_wbinvl1_vol");
 	  case 2:
-	    return (TARGET_RDNA2_PLUS
+	    return (TARGET_RDNA2
+		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)\;"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
 		    ? "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
-		      "buffer_gl0_inv"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
 		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
 		      "buffer_wbinvl1_vol");
 	  }
@@ -2130,15 +2148,21 @@
 	    return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
 		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
-	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
-		      "s_waitcnt\t0\;buffer_gl0_inv"
+	    return (TARGET_RDNA2
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc dlc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
 	  case 2:
-	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+	    return (TARGET_RDNA2
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc dlc\;"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
+		    : TARGET_RDNA3
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
 	  }
@@ -2147,7 +2171,7 @@
     gcc_unreachable ();
   }
   [(set_attr "type" "smem,flat,flat")
-   (set_attr "length" "20")
+   (set_attr "length" "28")
    (set_attr "gcn_version" "gcn5,*,gcn5")
    (set_attr "rdna" "no,*,*")])
 
@@ -2180,11 +2204,11 @@
 	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
 		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
 		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc");
 	  }
 	break;
@@ -2198,14 +2222,14 @@
 		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
-		      "s_waitcnt\t0\;buffer_gl0_inv"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
 	  }
@@ -2214,7 +2238,7 @@
     gcc_unreachable ();
   }
   [(set_attr "type" "smem,flat,flat")
-   (set_attr "length" "20")
+   (set_attr "length" "28")
    (set_attr "gcn_version" "gcn5,*,gcn5")
    (set_attr "rdna" "no,*,*")])
 
@@ -2253,13 +2277,13 @@
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
 		    ? "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
-		      "buffer_gl0_inv"
+		      "buffer_gl1_inv\;buffer_gl0_inv"
 		    : "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
 		      "buffer_wbinvl1_vol");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
 		    ? "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
 	  }
@@ -2273,13 +2297,13 @@
 		   "s_waitcnt\tlgkmcnt(0)";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
 		      "s_waitcnt\t0"
 		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
 		      "s_waitcnt\t0");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)"
 		    : "buffer_wbinvl1_vol\;"
@@ -2297,15 +2321,15 @@
 		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
 	  case 1:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
-		      "s_waitcnt\t0\;buffer_gl0_inv"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
+		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
 		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
 	  case 2:
 	    return (TARGET_RDNA2_PLUS
-		    ? "buffer_gl0_inv\;"
+		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
-		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
+		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
 		    : "buffer_wbinvl1_vol\;"
 		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
 		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
@@ -2315,7 +2339,7 @@
     gcc_unreachable ();
   }
   [(set_attr "type" "smem,flat,flat")
-   (set_attr "length" "20")
+   (set_attr "length" "28")
    (set_attr "gcn_version" "gcn5,*,gcn5")
    (set_attr "rdna" "no,*,*")])
 
-- 
2.41.0


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [committed] amdgcn: Adjust GFX10/GFX11 cache coherency
  2024-03-22 15:54 [committed] amdgcn: Adjust GFX10/GFX11 cache coherency Andrew Stubbs
@ 2024-04-04 10:04 ` Thomas Schwinge
  0 siblings, 0 replies; 2+ messages in thread
From: Thomas Schwinge @ 2024-04-04 10:04 UTC (permalink / raw)
  To: Andrew Stubbs, gcc-patches, Richard Biener

Hi!

To again state this in public:

On 2024-03-22T15:54:49+0000, Andrew Stubbs <ams@baylibre.com> wrote:
> The RDNA devices have different cache architectures to the CDNA devices, and
> the differences go deeper than just the assembler mnemonics, so we
> probably need to generate different code to maintain coherency across
> the whole device.
>
> I believe this patch is correct according to the documentation in the LLVM
> AMDGPU user guide (the ISA manual is less instructive), but I hadn't observed
> any real problems before (or after).
>
> Committed to mainline.

Thanks!  This commit does repair a lot of the GCN offloading damage noted
in <https://inbox.sourceware.org/878r44l00i.fsf@euler.schwinge.ddns.net>
"libgomp GCN gfx1030/gfx1100 offloading status" and thereabouts, that is,
this recovers to PASS a lot of twinkling libgomp/OpenMP/GCN execution
test cases, and their even more annyoing random timeouts.

(The commit doesn't affect GCN target testing.)


I still have a number of stabilization hacks applied to my sources -- but
I've, for example, not seen any 'HSA_STATUS_ERROR_OUT_OF_RESOURCES' or
random timeouts in my current GCN offloading test results.


Grüße
 Thomas


> gcc/ChangeLog:
>
> 	* config/gcn/gcn.md (*memory_barrier): Split into RDNA and !RDNA.
> 	(atomic_load<mode>): Adjust RDNA cache settings.
> 	(atomic_store<mode>): Likewise.
> 	(atomic_exchange<mode>): Likewise.
> ---
>  gcc/config/gcn/gcn.md | 86 +++++++++++++++++++++++++++----------------
>  1 file changed, 55 insertions(+), 31 deletions(-)
>
> diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
> index 3b51453aaca..574c2f87e8c 100644
> --- a/gcc/config/gcn/gcn.md
> +++ b/gcc/config/gcn/gcn.md
> @@ -1960,11 +1960,19 @@
>  (define_insn "*memory_barrier"
>    [(set (match_operand:BLK 0)
>  	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
> -  ""
> -  "{buffer_wbinvl1_vol|buffer_gl0_inv}"
> +  "!TARGET_RDNA2_PLUS"
> +  "buffer_wbinvl1_vol"
>    [(set_attr "type" "mubuf")
>     (set_attr "length" "4")])
>  
> +(define_insn "*memory_barrier"
> +  [(set (match_operand:BLK 0)
> +	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
> +  "TARGET_RDNA2_PLUS"
> +  "buffer_gl1_inv\;buffer_gl0_inv"
> +  [(set_attr "type" "mult")
> +   (set_attr "length" "8")])
> +
>  ; FIXME: These patterns have been disabled as they do not seem to work
>  ; reliably - they can cause hangs or incorrect results.
>  ; TODO: flush caches according to memory model
> @@ -2094,9 +2102,13 @@
>  	  case 0:
>  	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
>  	  case 1:
> -	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
> +	    return (TARGET_RDNA2 /* Not GFX11.  */
> +		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0"
> +		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0");
>  	  case 2:
> -	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
> +	    return (TARGET_RDNA2 /* Not GFX11.  */
> +		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)"
> +		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)");
>  	  }
>  	break;
>        case MEMMODEL_CONSUME:
> @@ -2108,15 +2120,21 @@
>  	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
>  		   "s_dcache_wb_vol";
>  	  case 1:
> -	    return (TARGET_RDNA2_PLUS
> +	    return (TARGET_RDNA2
> +		    ? "flat_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\t0\;"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
>  		    ? "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
> -		      "buffer_gl0_inv"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
>  		      "buffer_wbinvl1_vol");
>  	  case 2:
> -	    return (TARGET_RDNA2_PLUS
> +	    return (TARGET_RDNA2
> +		    ? "global_load%o0\t%0, %A1%O1 glc dlc\;s_waitcnt\tvmcnt(0)\;"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
>  		    ? "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
> -		      "buffer_gl0_inv"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
>  		      "buffer_wbinvl1_vol");
>  	  }
> @@ -2130,15 +2148,21 @@
>  	    return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
>  		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
>  	  case 1:
> -	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
> -		      "s_waitcnt\t0\;buffer_gl0_inv"
> +	    return (TARGET_RDNA2
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc dlc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_load%o0\t%0, %A1%O1 glc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
>  		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
>  	  case 2:
> -	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +	    return (TARGET_RDNA2
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc dlc\;"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
> +		    : TARGET_RDNA3
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 glc\;"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
>  	  }
> @@ -2147,7 +2171,7 @@
>      gcc_unreachable ();
>    }
>    [(set_attr "type" "smem,flat,flat")
> -   (set_attr "length" "20")
> +   (set_attr "length" "28")
>     (set_attr "gcn_version" "gcn5,*,gcn5")
>     (set_attr "rdna" "no,*,*")])
>  
> @@ -2180,11 +2204,11 @@
>  	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc"
>  		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc"
>  		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc");
>  	  }
>  	break;
> @@ -2198,14 +2222,14 @@
>  		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
> -		      "s_waitcnt\t0\;buffer_gl0_inv"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_store%o1\t%A0, %1%O0 glc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
>  		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 glc\;"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
>  	  }
> @@ -2214,7 +2238,7 @@
>      gcc_unreachable ();
>    }
>    [(set_attr "type" "smem,flat,flat")
> -   (set_attr "length" "20")
> +   (set_attr "length" "28")
>     (set_attr "gcn_version" "gcn5,*,gcn5")
>     (set_attr "rdna" "no,*,*")])
>  
> @@ -2253,13 +2277,13 @@
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
>  		    ? "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
> -		      "buffer_gl0_inv"
> +		      "buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
>  		      "buffer_wbinvl1_vol");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
>  		    ? "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
>  	  }
> @@ -2273,13 +2297,13 @@
>  		   "s_waitcnt\tlgkmcnt(0)";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
>  		      "s_waitcnt\t0"
>  		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
>  		      "s_waitcnt\t0");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
>  		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)"
>  		    : "buffer_wbinvl1_vol\;"
> @@ -2297,15 +2321,15 @@
>  		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
>  	  case 1:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
> -		      "s_waitcnt\t0\;buffer_gl0_inv"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
> +		      "s_waitcnt\t0\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
>  		      "s_waitcnt\t0\;buffer_wbinvl1_vol");
>  	  case 2:
>  	    return (TARGET_RDNA2_PLUS
> -		    ? "buffer_gl0_inv\;"
> +		    ? "buffer_gl1_inv\;buffer_gl0_inv\;"
>  		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
> -		      "s_waitcnt\tvmcnt(0)\;buffer_gl0_inv"
> +		      "s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
>  		    : "buffer_wbinvl1_vol\;"
>  		      "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
>  		      "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
> @@ -2315,7 +2339,7 @@
>      gcc_unreachable ();
>    }
>    [(set_attr "type" "smem,flat,flat")
> -   (set_attr "length" "20")
> +   (set_attr "length" "28")
>     (set_attr "gcn_version" "gcn5,*,gcn5")
>     (set_attr "rdna" "no,*,*")])
>  
> -- 
> 2.41.0

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2024-04-04 10:04 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-22 15:54 [committed] amdgcn: Adjust GFX10/GFX11 cache coherency Andrew Stubbs
2024-04-04 10:04 ` Thomas Schwinge

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).