diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 861044e77f0..d7862b21a2a 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int regno); extern rtx get_exec (int64_t); extern rtx get_exec (machine_mode mode); extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int); +extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int); +extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *, + int unspec); +extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *, + int unspec); extern void gcn_expand_epilogue (); extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale, bool unsigned_p, rtx exec); diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 47d9d87d58a..cb650bca3ff 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -1224,6 +1224,45 @@ [(set_attr "type" "vop_dpp") (set_attr "length" "16")]) +(define_insn "@dpp_swap_pairs" + [(set (match_operand:V_noHI 0 "register_operand" "=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_SWAP_PAIRS))] + "" + { + return gcn_expand_dpp_swap_pairs_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_SWAP_PAIRS); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + +(define_insn "@dpp_distribute_even" + [(set (match_operand:V_noHI 0 "register_operand" "=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))] + "" + { + return gcn_expand_dpp_distribute_even_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + +(define_insn "@dpp_distribute_odd" + [(set (match_operand:V_noHI 0 "register_operand" "=v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" " v")] + 
UNSPEC_MOV_DPP_DISTRIBUTE_ODD))] + "" + { + return gcn_expand_dpp_distribute_odd_insn (mode, "v_mov_b32", + UNSPEC_MOV_DPP_DISTRIBUTE_ODD); + } + [(set_attr "type" "vop_dpp") + (set_attr "length" "16")]) + ;; }}} ;; {{{ ALU special case: add/sub @@ -2185,6 +2224,194 @@ DONE; }) +(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ]) +(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")]) +(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")]) +(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")]) + +(define_expand "cmul3" + [(set (match_operand:V_noHI 0 "register_operand" "= &v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v")] + UNSPEC_CMUL_OP))] + "" + { + // operands[1] a b + // operands[2] c d + rtx t1 = gen_reg_rtx (mode); + emit_insn (gen_mul3 (t1, operands[1], operands[2])); // a*c b*d + + rtx s2_perm = gen_reg_rtx (mode); + emit_insn (gen_dpp_swap_pairs (s2_perm, operands[2])); // d c + + rtx t2 = gen_reg_rtx (mode); + emit_insn (gen_mul3 (t2, operands[1], s2_perm)); // a*d b*c + + rtx t1_perm = gen_reg_rtx (mode); + emit_insn (gen_dpp_swap_pairs (t1_perm, t1)); // b*d a*c + + rtx even = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (even, get_exec (0x5555555555555555UL)); + rtx dest = operands[0]; + emit_insn (gen_3_exec (dest, t1, t1_perm, dest, even)); + // a*c-b*d 0 + + rtx t2_perm = gen_reg_rtx (mode); + emit_insn (gen_dpp_swap_pairs (t2_perm, t2)); // b*c a*d + + rtx odd = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL)); + emit_insn (gen_3_exec (dest, t2, t2_perm, dest, odd)); + // 0 a*d+b*c + DONE; + }) + +(define_code_iterator addsub [plus minus]) +(define_code_attr addsub_as [(plus "a") (minus "s")]) + +(define_expand "cml4" + [(set (match_operand:V_FP 0 "register_operand" "= &v") + (addsub:V_FP + (unspec:V_FP + [(match_operand:V_FP 1 "register_operand" "v") + 
(match_operand:V_FP 2 "register_operand" "v")] + UNSPEC_CMUL) + (match_operand:V_FP 3 "register_operand" "v")))] + "" + { + rtx a = gen_reg_rtx (mode); + emit_insn (gen_dpp_distribute_even (a, operands[1])); // a a + + rtx t1 = gen_reg_rtx (mode); + emit_insn (gen_fm4 (t1, a, operands[2], operands[3])); + // a*c a*d + + rtx b = gen_reg_rtx (mode); + emit_insn (gen_dpp_distribute_odd (b, operands[1])); // b b + + rtx t2 = gen_reg_rtx (mode); + emit_insn (gen_mul3 (t2, b, operands[2])); // b*c b*d + + rtx t2_perm = gen_reg_rtx (mode); + emit_insn (gen_dpp_swap_pairs (t2_perm, t2)); // b*d b*c + + rtx even = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (even, get_exec (0x5555555555555555UL)); + rtx dest = operands[0]; + emit_insn (gen_sub3_exec (dest, t1, t2_perm, dest, even)); + + rtx odd = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL)); + emit_insn (gen_add3_exec (dest, t1, t2_perm, dest, odd)); + + DONE; + }) + +(define_expand "vec_addsub3" + [(set (match_operand:V_noHI 0 "register_operand" "= &v") + (vec_merge:V_noHI + (minus:V_noHI + (match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v")) + (plus:V_noHI (match_dup 1) (match_dup 2)) + (const_int 6148914691236517205)))] + "" + { + rtx even = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (even, get_exec (0x5555555555555555UL)); + rtx dest = operands[0]; + rtx x = operands[1]; + rtx y = operands[2]; + emit_insn (gen_sub3_exec (dest, x, y, dest, even)); + rtx odd = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL)); + emit_insn (gen_add3_exec (dest, x, y, dest, odd)); + + DONE; + }) + +(define_int_iterator CADD [UNSPEC_CADD90 UNSPEC_CADD270]) +(define_int_attr rot [(UNSPEC_CADD90 "90") (UNSPEC_CADD270 "270")]) +(define_int_attr cadd_subadd [(UNSPEC_CADD90 "sub") (UNSPEC_CADD270 "add")]) +(define_int_attr cadd_addsub [(UNSPEC_CADD90 "add") (UNSPEC_CADD270 "sub")]) + +(define_expand "cadd3" + 
[(set (match_operand:V_noHI 0 "register_operand" "=&v") + (unspec:V_noHI [(match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v")] + CADD))] + "" + { + rtx dest = operands[0]; + rtx x = operands[1]; + rtx y = gen_reg_rtx (mode); + emit_insn (gen_dpp_swap_pairs (y, operands[2])); + + rtx even = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (even, get_exec (0x5555555555555555UL)); + emit_insn (gen_3_exec (dest, x, y, dest, even)); + rtx odd = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL)); + emit_insn (gen_3_exec (dest, x, y, dest, odd)); + + DONE; + }) + +;; It would be possible to represent these without the UNSPEC as +;; +;; (vec_merge +;; (fma op1 op2 op3) +;; (fma op1 op2 (neg op3)) +;; (merge-const)) +;; +;; But this doesn't seem useful in practice. + +(define_expand "vec_fmaddsub4" + [(set (match_operand:V_noHI 0 "register_operand" "=&v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v") + (match_operand:V_noHI 3 "register_operand" "v")] + UNSPEC_FMADDSUB))] + "" + { + rtx t1 = gen_reg_rtx (mode); + emit_insn (gen_mul3 (t1, operands[1], operands[2])); + rtx even = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (even, get_exec (0x5555555555555555UL)); + rtx dest = operands[0]; + emit_insn (gen_sub3_exec (dest, t1, operands[3], dest, even)); + rtx odd = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL)); + emit_insn (gen_add3_exec (dest, t1, operands[3], dest, odd)); + + DONE; + }) + +(define_expand "vec_fmsubadd4" + [(set (match_operand:V_noHI 0 "register_operand" "=&v") + (unspec:V_noHI + [(match_operand:V_noHI 1 "register_operand" "v") + (match_operand:V_noHI 2 "register_operand" "v") + (neg:V_noHI + (match_operand:V_noHI 3 "register_operand" "v"))] + UNSPEC_FMADDSUB))] + "" + { + rtx t1 = gen_reg_rtx (mode); + emit_insn (gen_mul3 (t1, operands[1], operands[2])); + rtx 
even = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (even, get_exec (0x5555555555555555UL)); + rtx dest = operands[0]; + emit_insn (gen_add3_exec (dest, t1, operands[3], dest, even)); + rtx odd = gen_rtx_REG (DImode, EXEC_REG); + emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL)); + emit_insn (gen_sub3_exec (dest, t1, operands[3], dest, odd)); + + DONE; + }) + ;; }}} ;; {{{ ALU generic case @@ -2768,6 +2995,56 @@ [(set_attr "type" "vop3a") (set_attr "length" "8")]) +(define_insn "fms4" + [(set (match_operand:V_FP 0 "register_operand" "= v, v") + (fma:V_FP + (match_operand:V_FP 1 "gcn_alu_operand" "% vA, vA") + (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA") + (neg:V_FP + (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA"))))] + "" + "v_fma%i0\t%0, %1, %2, -%3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "fms4_negop2" + [(set (match_operand:V_FP 0 "register_operand" "= v, v, v") + (fma:V_FP + (match_operand:V_FP 1 "gcn_alu_operand" " vA, vA,vSvA") + (neg:V_FP + (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA, vA")) + (neg:V_FP + (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))] + "" + "v_fma%i0\t%0, %1, -%2, -%3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "fms4" + [(set (match_operand:FP 0 "register_operand" "= v, v") + (fma:FP + (match_operand:FP 1 "gcn_alu_operand" "% vA, vA") + (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA") + (neg:FP + (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA"))))] + "" + "v_fma%i0\t%0, %1, %2, -%3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + +(define_insn "fms4_negop2" + [(set (match_operand:FP 0 "register_operand" "= v, v, v") + (fma:FP + (match_operand:FP 1 "gcn_alu_operand" " vA, vA,vSvA") + (neg:FP + (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA, vA")) + (neg:FP + (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))] + "" + "v_fma%i0\t%0, %1, -%2, -%3" + [(set_attr "type" "vop3a") + (set_attr "length" "8")]) + ;; }}} ;; {{{ FP 
division diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 23ab01e75d8..c04fae2650f 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -5012,6 +5012,72 @@ gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed) return !is_packed; } +/* Generate DPP pairwise swap instruction. + The opcode is given by INSN. */ + +char * +gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn, + int ARG_UNUSED (unspec)) +{ + static char buf[128]; + const char *dpp; + + /* Add the DPP modifiers. */ + dpp = "quad_perm:[1,0,3,2]"; + + if (vgpr_2reg_mode_p (mode)) + sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s", + insn, dpp, insn, dpp); + else + sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp); + + return buf; +} + +/* Generate DPP distribute even instruction. + The opcode is given by INSN. */ + +char * +gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn, + int ARG_UNUSED (unspec)) +{ + static char buf[128]; + const char *dpp; + + /* Add the DPP modifiers. */ + dpp = "quad_perm:[0,0,2,2]"; + + if (vgpr_2reg_mode_p (mode)) + sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s", + insn, dpp, insn, dpp); + else + sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp); + + return buf; +} + +/* Generate DPP distribute odd instruction. + The opcode is given by INSN. */ + +char * +gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn, + int ARG_UNUSED (unspec)) +{ + static char buf[128]; + const char *dpp; + + /* Add the DPP modifiers. */ + dpp = "quad_perm:[1,1,3,3]"; + + if (vgpr_2reg_mode_p (mode)) + sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s", + insn, dpp, insn, dpp); + else + sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp); + + return buf; +} + /* Generate DPP instructions used for vector reductions. The opcode is given by INSN. 
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 10d2b874cce..dc14da6a058 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -78,6 +78,14 @@ UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR UNSPEC_MOV_DPP_SHR + UNSPEC_MOV_DPP_SWAP_PAIRS + UNSPEC_MOV_DPP_DISTRIBUTE_EVEN + UNSPEC_MOV_DPP_DISTRIBUTE_ODD + UNSPEC_CMUL UNSPEC_CMUL_CONJ + UNSPEC_CMUL_ADD UNSPEC_CMUL_SUB + UNSPEC_FMADDSUB + UNSPEC_CADD90 + UNSPEC_CADD270 UNSPEC_GATHER UNSPEC_SCATTER UNSPEC_RCP diff --git a/gcc/testsuite/gcc.target/gcn/complex.c b/gcc/testsuite/gcc.target/gcn/complex.c new file mode 100755 index 00000000000..3b8a6cc854c --- /dev/null +++ b/gcc/testsuite/gcc.target/gcn/complex.c @@ -0,0 +1,640 @@ +// { dg-do run } +// { dg-options "-O -fopenmp-simd -ftree-loop-if-convert -fno-ssa-phiopt" } + +#include +#include +#include + +#define COUNT 1000 +#define MAX 1000 +#define ALIGNMENT (2*1024*1024) // 2MB + +_Complex double conj(_Complex double); +_Complex float conjf(_Complex float); + +unsigned int device = 0; + +// cmul + +void cmulF(float *td, float *te, float *tf, float *tg, int tas) +{ + typedef _Complex float complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * f[i]; + } +} + +bool cmulFcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c) + return false; + } + return true; +} + +void cmulD(double *td, double *te, double *tf, double *tg, int tas) +{ + typedef _Complex double complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); +#pragma omp 
target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * f[i]; + } +} + +bool cmulDcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c) + return false; + } + return true; +} + + +// cmul_conj + +void cmul_conjF(float *td, float *te, float *tf, float *tg, int tas) +{ + typedef _Complex float complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * conj(f[i]); + } +} + +bool cmul_conjFcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d) + return false; + } + return true; +} + +void cmul_conjD(double *td, double *te, double *tf, double *tg, int tas) +{ + typedef _Complex double complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * conj(f[i]); + } +} + +bool cmul_conjDcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d) + return false; + } + return true; +} + + +// addsub + +void addsubF(float *td, float *te, float *tf, float *tg, int tas) +{ + typedef _Complex float complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = 
(complexT*)(tf); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] - conjf(f[i]); + } +} + +bool addsubFcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + if (td[i*2] != a-c || td[i*2+1] != b+d) + return false; + } + return true; +} + +void addsubD(double *td, double *te, double *tf, double *tg, int tas) +{ + typedef _Complex double complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] - conj(f[i]); + } +} + +bool addsubDcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + if (td[i*2] != a-c || td[i*2+1] != b+d) + return false; + } + return true; +} + + +// fmaddsub + +void fmaddsubF(float *td, float *te, float *tf, float *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2]*tf[i*2]-tg[i*2]; + td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1]; + } +} + +bool fmaddsubFcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + float e = tg[i*2]; + float f = tg[i*2+1]; + if (td[i*2] != a*c-e || td[i*2+1] != b*d+f) + return false; + } + return true; +} + +void fmaddsubD(double *td, double *te, double *tf, double *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2]*tf[i*2]-tg[i*2]; + td[i*2+1] = 
te[i*2+1]*tf[i*2+1]+tg[i*2+1]; + } +} + +bool fmaddsubDcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + double e = tg[i*2]; + double f = tg[i*2+1]; + if (td[i*2] != a*c-e || td[i*2+1] != b*d+f) + return false; + } + return true; +} + + +// fmsubadd + +void fmsubaddF(float *td, float *te, float *tf, float *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2]*tf[i*2]+tg[i*2]; + td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1]; + } +} + +bool fmsubaddFcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + float e = tg[i*2]; + float f = tg[i*2+1]; + if (td[i*2] != a*c+e || td[i*2+1] != b*d-f) + return false; + } + return true; +} + +void fmsubaddD(double *td, double *te, double *tf, double *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2]*tf[i*2]+tg[i*2]; + td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1]; + } +} + +bool fmsubaddDcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + double e = tg[i*2]; + double f = tg[i*2+1]; + if (td[i*2] != a*c+e || td[i*2+1] != b*d-f) + return false; + } + return true; +} + + +// cadd90 + +void cadd90F(float *td, float *te, float *tf, float *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2] - tf[i*2+1]; + td[i*2+1] = te[i*2+1] + tf[i*2]; + } +} + +bool cadd90Fcheck(float *td, float *te, float *tf, float 
*tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + if (td[i*2] != a-d || td[i*2+1] != b+c) + return false; + } + return true; +} + +void cadd90D(double *td, double *te, double *tf, double *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2] - tf[i*2+1]; + td[i*2+1] = te[i*2+1] + tf[i*2]; + } +} + +bool cadd90Dcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + if (td[i*2] != a-d || td[i*2+1] != b+c) + return false; + } + return true; +} + +// cadd270 + +void cadd270F(float *td, float *te, float *tf, float *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2] + tf[i*2+1]; + td[i*2+1] = te[i*2+1] - tf[i*2]; + } +} + +bool cadd270Fcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + if (td[i*2] != a+d || td[i*2+1] != b-c) + return false; + } + return true; +} + +void cadd270D(double *td, double *te, double *tf, double *tg, int tas) +{ + int array_size = tas/2; +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + td[i*2] = te[i*2] + tf[i*2+1]; + td[i*2+1] = te[i*2+1] - tf[i*2]; + } +} + +bool cadd270Dcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + if (td[i*2] != a+d || td[i*2+1] != b-c) + return false; + } + return true; +} + + +// cmla + +void cmlaF(float *td, float *te, float *tf, float *tg, 
int tas) +{ + typedef _Complex float complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); + complexT *g = (complexT*)(tg); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * f[i] + g[i]; + } +} + +bool cmlaFcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float c = tf[i*2]; + float d = tf[i*2+1]; + float e = tg[i*2]; + float f = tg[i*2+1]; + if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f) + return false; + } + return true; +} + +void cmlaD(double *td, double *te, double *tf, double *tg, int tas) +{ + typedef _Complex double complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); + complexT *g = (complexT*)(tg); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * f[i] + g[i]; + } +} + +bool cmlaDcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + double e = tg[i*2]; + double f = tg[i*2+1]; + if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f) + return false; + } + return true; +} + + +// cmls + +void cmlsF(float *td, float *te, float *tf, float *tg, int tas) +{ + typedef _Complex float complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); + complexT *g = (complexT*)(tg); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * f[i] - g[i]; + } +} + +bool cmlsFcheck(float *td, float *te, float *tf, float *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + float a = te[i*2]; + float b = te[i*2+1]; + float 
c = tf[i*2]; + float d = tf[i*2+1]; + float e = tg[i*2]; + float f = tg[i*2+1]; + if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f) + return false; + } + return true; +} + +void cmlsD(double *td, double *te, double *tf, double *tg, int tas) +{ + typedef _Complex double complexT; + int array_size = tas/2; + complexT *d = (complexT*)(td); + complexT *e = (complexT*)(te); + complexT *f = (complexT*)(tf); + complexT *g = (complexT*)(tg); +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < array_size; i++) + { + d[i] = e[i] * f[i] - g[i]; + } +} + +bool cmlsDcheck(double *td, double *te, double *tf, double *tg, int tas) +{ + for (int i = 0; i < tas/2; ++i) + { + double a = te[i*2]; + double b = te[i*2+1]; + double c = tf[i*2]; + double d = tf[i*2+1]; + double e = tg[i*2]; + double f = tg[i*2+1]; + if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f) + return false; + } + return true; +} + + +typedef void(*runF)(float *td, float *te, float *tf, float *tg, int tas); +typedef void(*runD)(double *td, double *te, double *tf, double *tg, int tas); +typedef bool(*checkF)(float *td, float *te, float *tf, float *tg, int tas); +typedef bool(*checkD)(double *td, double *te, double *tf, double *tg, int tas); + +typedef struct +{ + runF rF; + runD rD; + checkF cF; + checkD cD; +} operation; + +operation ops[] = { + {cmulF, cmulD, cmulFcheck, cmulDcheck}, + {cmul_conjF, cmul_conjD, cmul_conjFcheck, cmul_conjDcheck}, + {addsubF, addsubD, addsubFcheck, addsubDcheck}, + {fmaddsubF, fmaddsubD, fmaddsubFcheck, fmaddsubDcheck}, + {fmsubaddF, fmsubaddD, fmsubaddFcheck, fmsubaddDcheck}, + {cadd90F, cadd90D, cadd90Fcheck, cadd90Dcheck}, + {cadd270F, cadd270D, cadd270Fcheck, cadd270Dcheck}, + {cmlaF, cmlaD, cmlaFcheck, cmlaDcheck}, + {cmlsF, cmlsD, cmlsFcheck, cmlsDcheck} +}; + +void testF(operation* op) +{ + float* td; + float* te; + float* tf; + float* tg; + int array_size = COUNT; + td = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, 
omp_default_mem_alloc); + te = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc); + tf = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc); + tg = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc); + omp_set_default_device(device); + float* dd = td; + float* ee = te; + float* ff = tf; + float* gg = tg; + for (int i = 0; i < COUNT; ++i) + { + te[i] = (float)(rand() % MAX); + tf[i] = (float)(rand() % MAX); + tg[i] = (float)(rand() % MAX); + } + // Set up data region on device +#pragma omp target enter data map(to: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size]) + {} + op->rF(td, te, tf, tg, COUNT); +#pragma omp target exit data map(from: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size]) + {} + if (!op->cF(td, te, tf, tg, COUNT)) + abort(); +} + +void testD(operation* op) +{ + double* td; + double* te; + double* tf; + double* tg; + int array_size = COUNT; + td = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc); + te = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc); + tf = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc); + tg = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc); + omp_set_default_device(device); + double* dd = td; + double* ee = te; + double* ff = tf; + double* gg = tg; + for (int i = 0; i < COUNT; ++i) + { + te[i] = (double)(rand() % MAX); + tf[i] = (double)(rand() % MAX); + tg[i] = (double)(rand() % MAX); + } + // Set up data region on device +#pragma omp target enter data map(to: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size]) + {} + op->rD(td, te, tf, tg, COUNT); +#pragma omp target exit data map(from: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size]) + {} + if (!op->cD(td, te, tf, tg, COUNT)) + 
abort(); +} + +int main() +{ + for (int i = 0; i < 9; ++i) + { + testF(&ops[i]); + testD(&ops[i]); + } +} +