public inbox for gcc-patches@gcc.gnu.org
* [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
@ 2014-04-02  0:55 Thomas Preud'homme
  2014-04-02  6:17 ` Marc Glisse
From: Thomas Preud'homme @ 2014-04-02  0:55 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1823 bytes --]

Here is the second patch in the series. Its purpose is to extend the current bswap
optimization to handle bitwise operations on memory sources (arrays and
structures). This patch extends the concept of symbolic number used in the
bswap pass to memorize some extra information for bitwise operations on
memory sources: the base of the access (beginning of the array or structure),
the offset to the base and the size of the access. All the logic to work on
memory sources is in how to merge two symbolic numbers into one according to
the offsets of the two initial symbolic numbers and the target endianness: the
symbolic number with the bigger offset has its byte-mark values incremented on
a little-endian machine.
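
To make the merge rule concrete, consider a little-endian 32-bit read written
byte by byte (modeled on the attached testcases; the byte-mark values in the
comment are my illustration of the scheme, not output from the pass):

  #include <stdint.h>

  extern unsigned char data[4];

  uint32_t read_le32 (void)
  {
    /* Each byte load starts as symbolic number 0x01 with its own offset.
       Shifting data[1] left by 8 turns its number into 0x0100; merging it
       with data[0] (whose offset is smaller by 1) increments its byte mark
       on little endian, giving 0x0201.  Repeating this for all four bytes
       yields 0x04030201, the identity permutation, so the whole expression
       can be replaced by a single 32-bit load.  */
    return (uint32_t) data[0] | ((uint32_t) data[1] << 8)
	   | ((uint32_t) data[2] << 16) | ((uint32_t) data[3] << 24);
  }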

Note that as it stands the patch does not work for arrays indexed by a variable
(such as tab[a] | (tab[a+1] << 8)) because fold_const does not fold
(a + 1) - a. If it turns out that some software uses this pattern, adding a new
folding would let this code cover more cases. This patch also adds a few
testcases to check both (i) that the optimization works as expected and (ii)
that the results are correct.

The ChangeLog entries are as follows:

*** gcc/ChangeLog ***

2014-04-01  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* tree-ssa-math-opts.c (find_bswap_1): Add support for memory source.
	(find_bswap): Likewise.
	(execute_optimize_bswap): Likewise.

*** gcc/testsuite/ChangeLog ***

2014-04-01  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise.

Is this ok for stage 1?

Best regards,

Thomas

[-- Attachment #2: gcc32rm-84.3.part2.diff --]
[-- Type: application/octet-stream, Size: 16741 bytes --]

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..3181c5f
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,57 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef unsigned uint32_t;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_be32 (bfin);
+  if (out != 0x03050709 && out != 0x82848688)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..803d50c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,59 @@
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..19f37cd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target { ! alpha*-*-* } } } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..07ad73a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,48 @@
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 9ff857c..576502a 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1616,11 +1616,21 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a
+   symbolic number is also associated with a base address (the array or
+   structure the load is made from), an offset from the base address and a
+   range giving the range of memory represented by this symbolic number.
+   The range differs from the size, as size reflects the type of the current
+   expression and can thus be bigger when casting to a bigger type.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  unsigned HOST_WIDE_INT range;
 };
 
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
@@ -1733,6 +1743,51 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	 to initialize the symbolic number.  */
       if (!source_expr1)
 	{
+	  n->base_addr = n->offset = NULL_TREE;
+	  if (is_gimple_assign (rhs1_stmt))
+	    {
+	      unsigned bit_offset;
+	      tree var, elt_size, index, field, bit_offset_expr;
+
+	      var = gimple_assign_rhs1 (rhs1_stmt);
+	      switch (TREE_CODE (var))
+		{
+		/* Leaf node is an array: memorize its base, element size and
+		   offset from base to compare to other array leaf nodes.  */
+		case ARRAY_RANGE_REF:
+		case ARRAY_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  elt_size = array_ref_element_size (var);
+		  if (TREE_CODE (elt_size) != INTEGER_CST)
+		    return NULL_TREE;
+		  index = TREE_OPERAND (var, 1);
+		  if (TREE_THIS_VOLATILE (var) || TREE_THIS_VOLATILE (index))
+		    return NULL_TREE;
+		  n->offset = fold_build2 (MULT_EXPR, sizetype,
+					   index, elt_size);
+		  rhs1 = var;
+		  break;
+		/* Leaf node is a record field: memorize its base, field size
+		   and offset from base to compare to other field leaf nodes.  */
+		case COMPONENT_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  field = TREE_OPERAND (var, 1);
+		  bit_offset = TREE_INT_CST_LOW (DECL_FIELD_BIT_OFFSET (field));
+		  if (bit_offset % BITS_PER_UNIT)
+		    return NULL_TREE;
+		  bit_offset /= BITS_PER_UNIT;
+		  n->offset = component_ref_field_offset (var);
+		  bit_offset_expr = build_int_cst (TREE_TYPE (n->offset),
+						   bit_offset);
+		  n->offset = fold_build2 (PLUS_EXPR, TREE_TYPE (n->offset),
+					  n->offset, bit_offset_expr);
+		  if (TYPE_PRECISION (TREE_TYPE (field)) != BITS_PER_UNIT)
+		    return NULL_TREE;
+		  rhs1 = var;
+		  break;
+		default:;
+		}
+	    }
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
 	     order byte is set to n->size and the lowest order
@@ -1741,6 +1796,7 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
+	  n->range = n->size;
 	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
 		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
 
@@ -1824,10 +1880,69 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
 	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (n1.size != n2.size || !source_expr2)
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      tree off_sub_expr, off_cmp_expr;
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || n1.base_addr != n2.base_addr)
+		return NULL_TREE;
+
+	      off_cmp_expr = fold_build2 (LT_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (TREE_CODE (off_cmp_expr) != INTEGER_CST)
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (TREE_INT_CST_LOW (off_cmp_expr))
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub_expr = fold_build2 (MINUS_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (!cst_and_fits_in_hwi (off_sub_expr))
+		return NULL_TREE;
+
+	      off_sub = TREE_INT_CST_LOW (off_sub_expr);
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int)sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to target endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1853,14 +1968,15 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 }
 
 /* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+   SHIFTs and ANDs.  Set *MEMSRC if the source lies in memory, in which
+   case also set *SIZE to the size of the load needed.  Return the source
+   tree expression on which the byte swap is performed, or NULL if none.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap (gimple stmt, bool *memsrc, int *size)
 {
 /* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
+   have a full byte swap.  The number is shifted to the right according
    to the size of the symbolic number before using it.  */
   unsigned HOST_WIDEST_INT cmp =
     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
@@ -1868,12 +1984,12 @@ find_bswap (gimple stmt)
 
   struct symbolic_number n;
   tree source_expr;
-  int limit;
+  int limit, rsize;
 
   /* The last parameter determines the depth search limit.  It usually
      correlates directly to the number of bytes to be touched.  We
      increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
@@ -1885,18 +2001,34 @@ find_bswap (gimple stmt)
   /* Zero out the extra bits of N and CMP.  */
   if (n.size < (int)sizeof (HOST_WIDEST_INT))
     {
+      int tmpn;
       unsigned HOST_WIDEST_INT mask =
 	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
 
       n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      /* Find real size of result (highest non-zero byte).  */
+      for (tmpn = n.n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      cmp >>= (sizeof (HOST_WIDEST_INT) - rsize) * BITS_PER_UNIT;
     }
+  else
+    rsize = n.size;
 
   /* A complete byte swap should make the symbolic number to start
      with the largest digit in the highest order byte.  */
   if (cmp != n.n)
     return NULL_TREE;
 
+  *memsrc = 0;
+
+  if (n.base_addr)
+    {
+      *memsrc = 1;
+      n.size = rsize;
+    }
+  else if (rsize != n.size)
+    return NULL_TREE;
+
+  *size = n.size * BITS_PER_UNIT;
   return source_expr;
 }
 
@@ -1961,21 +2093,26 @@ execute_optimize_bswap (void)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
+	  tree bswap_src, bswap_type, load_type;
 	  tree bswap_tmp;
 	  tree fndecl = NULL_TREE;
 	  int type_size;
+	  bool memsrc;
 	  gimple call;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  bswap_src = find_bswap (stmt, &memsrc, &type_size);
+
+	  if (!bswap_src)
+	    continue;
 
 	  switch (type_size)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -1983,6 +2120,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -1990,6 +2128,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2003,10 +2142,35 @@ execute_optimize_bswap (void)
 	  if (!fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
 
-	  if (!bswap_src)
-	    continue;
+	  /* Need to load the value from memory first.  */
+	  if (memsrc)
+	    {
+	      tree addr_expr, addr_tmp, val_expr, val_tmp;
+	      tree load_ptr_type, load_offset_ptr;
+	      gimple addr_stmt, load_stmt;
+
+	      changed = true;
+
+	      /*  Compute address to load from and cast according to the size
+		  of the load.  */
+	      load_ptr_type = build_pointer_type (load_type);
+	      addr_expr = build1 (ADDR_EXPR, load_ptr_type, bswap_src);
+	      addr_tmp = make_temp_ssa_name (load_ptr_type, NULL, "load_src");
+	      addr_stmt = gimple_build_assign_with_ops
+			 (NOP_EXPR, addr_tmp, addr_expr, NULL);
+	      gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+
+	      /* Perform the load.  */
+	      load_offset_ptr = build_int_cst (load_ptr_type, 0);
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+	      load_stmt = gimple_build_assign_with_ops
+			 (MEM_REF, val_tmp, val_expr, NULL);
+
+	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
+	      bswap_src = val_tmp;
+	    }
 
 	  changed = true;
 	  if (type_size == 16)


* Re: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-02  0:55 [PATCH][2/3] Fix PR54733 Optimize endian independent load/store Thomas Preud'homme
@ 2014-04-02  6:17 ` Marc Glisse
  2014-04-02  7:04   ` Thomas Preud'homme
From: Marc Glisse @ 2014-04-02  6:17 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: gcc-patches

On Wed, 2 Apr 2014, Thomas Preud'homme wrote:

> Note that as it stands the patch does not work for arrays indexed by a
> variable (such as tab[a] | (tab[a+1] << 8)) because fold_const does not
> fold (a + 1) - a.

Uh? It does fold a+1-a for me. What it doesn't do is look through the 
definition of b in b-a. Richard+GSoC will supposedly soon provide a 
function that does that.
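
In other words (my illustration, not fold's actual output):

  a + 1 - a;        /* folds to 1 */
  b = a + 1;
  /* ... */ b - a;  /* b's definition is not looked through */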

-- 
Marc Glisse


* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-02  6:17 ` Marc Glisse
@ 2014-04-02  7:04   ` Thomas Preud'homme
  2014-04-02  8:17     ` Richard Biener
From: Thomas Preud'homme @ 2014-04-02  7:04 UTC (permalink / raw)
  To: gcc-patches

> From: Marc Glisse [mailto:marc.glisse@inria.fr]
> 
> Uh? It does fold a+1-a for me. What it doesn't do is look through the
> definition of b in b-a. Richard+GSoC will supposedly soon provide a
> function that does that.

Oh right, it's a bit more complex here since the array index is converted
to an offset first. So the operation is more like ((a+1)*cst) - (a*cst).
Any chance this might be handled at some point? Note that this might
not be very frequent, so it's not very important for this patch.
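
For reference, a sketch of what the patch builds in this case (based on the
ARRAY_REF handling in the patch; the tree variable names are hypothetical):

  /* For data[a] and data[a + 1] with element size 1:  */
  n1.offset = fold_build2 (MULT_EXPR, sizetype, a, elt_size);
  n2.offset = fold_build2 (MULT_EXPR, sizetype, a_plus_1, elt_size);
  /* Later, when merging the two symbolic numbers:  */
  off_sub_expr = fold_build2 (MINUS_EXPR, TREE_TYPE (n1.offset),
			      n2.offset, n1.offset);
  /* off_sub_expr does not simplify to 1 because the two MULT_EXPRs were
     built separately.  */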

Thanks for the comment.

Best regards,

Thomas



* Re: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-02  7:04   ` Thomas Preud'homme
@ 2014-04-02  8:17     ` Richard Biener
  2014-04-03  5:42       ` Thomas Preud'homme
From: Richard Biener @ 2014-04-02  8:17 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Wed, Apr 2, 2014 at 9:04 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
>> From: Marc Glisse [mailto:marc.glisse@inria.fr]
>>
>> Uh? It does fold a+1-a for me. What it doesn't do is look through the
>> definition of b in b-a. Richard+GSoC will supposedly soon provide a
>> function that does that.
>
> Oh right, it's a bit more complex here since the array index is converted
> to an offset first. So the operation is more like ((a+1)*cst) - (a*cst).
> Any chance this might be handled at some point? Note that this might
> not be very frequent, so it's not very important for this patch.

"More like" isn't enough to answer this - do you have a testcase?  (usually
these end up in undefined-overflow and/or conversion-to-sizetype issues)

Richard.

> Thanks for the comment.
>
> Best regards,
>
> Thomas
>
>


* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-02  8:17     ` Richard Biener
@ 2014-04-03  5:42       ` Thomas Preud'homme
  2014-04-04  5:49         ` Thomas Preud'homme
From: Thomas Preud'homme @ 2014-04-03  5:42 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 717 bytes --]

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> 
> "More like" isn't enough to answer this - do you have a testcase?  (usually
> these end up in undefined-overflow and/or conversion-to-sizetype issues)

I do. See attachment. This testcase needs to be compiled with patch 2/3
applied. As you can see from the patch, data[a] and data[a+1] will be
converted to offsets by multiplying the index with the element size. Then
later, when analyzing the ORing, a subtraction of these two indices is done.
So you have two fold_build2 calls and not one. I can't reproduce it with a
simple expression such as (a+1)*1 - a*1, so maybe being done in two parts is
the reason; you know better.

Best regards,

Thomas

[-- Attachment #2: missed_folding.c --]
[-- Type: application/octet-stream, Size: 207 bytes --]

extern unsigned char data[20];

int read_le16_1 (int a)
{
  a = (a + 1) & ~1;
  return data[a] | (data[a + 1] << 8);
}

int read_be16_1 (int a)
{
  a = (a + 1) & ~1;
  return data[a + 1] | (data[a] << 8);
}


* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-03  5:42       ` Thomas Preud'homme
@ 2014-04-04  5:49         ` Thomas Preud'homme
  2014-04-15  9:07           ` Richard Biener
From: Thomas Preud'homme @ 2014-04-04  5:49 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 247 bytes --]

> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Rainer Orth
> 
> Just omit the { target *-*-* } completely, also a few more times.

Please find attached an updated patch.

Best regards,

Thomas

[-- Attachment #2: gcc32rm-84.3.2.part2.diff --]
[-- Type: application/octet-stream, Size: 16679 bytes --]

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..3181c5f
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,57 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef unsigned uint32_t;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_be32 (bfin);
+  if (out != 0x03050709 && out != 0x82848688)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..a419c8d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..fe4d1ad
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..d3006c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 9ff857c..576502a 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1616,11 +1616,21 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a
+   symbolic number is also associated with a base address (the array or
+   structure the load is made from), an offset from the base address and a
+   range giving the range of memory represented by this symbolic number.
+   The range differs from the size, as size reflects the type of the current
+   expression and can thus be bigger when casting to a bigger type.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  unsigned HOST_WIDE_INT range;
 };
 
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
@@ -1733,6 +1743,51 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	 to initialize the symbolic number.  */
       if (!source_expr1)
 	{
+	  n->base_addr = n->offset = NULL_TREE;
+	  if (is_gimple_assign (rhs1_stmt))
+	    {
+	      unsigned bit_offset;
+	      tree var, elt_size, index, field, bit_offset_expr;
+
+	      var = gimple_assign_rhs1 (rhs1_stmt);
+	      switch (TREE_CODE (var))
+		{
+		/* Leaf node is an array: memorize its base, element size and
+		   offset from base to compare to other array leaf nodes.  */
+		case ARRAY_RANGE_REF:
+		case ARRAY_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  elt_size = array_ref_element_size (var);
+		  if (TREE_CODE (elt_size) != INTEGER_CST)
+		    return NULL_TREE;
+		  index = TREE_OPERAND (var, 1);
+		  if (TREE_THIS_VOLATILE (var) || TREE_THIS_VOLATILE (index))
+		    return NULL_TREE;
+		  n->offset = fold_build2 (MULT_EXPR, sizetype,
+					   index, elt_size);
+		  rhs1 = var;
+		  break;
+		/* Leaf node is a record field: memorize its base, field size
+		   and offset from base to compare to other field leaf nodes.  */
+		case COMPONENT_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  field = TREE_OPERAND (var, 1);
+		  bit_offset = TREE_INT_CST_LOW (DECL_FIELD_BIT_OFFSET (field));
+		  if (bit_offset % BITS_PER_UNIT)
+		    return NULL_TREE;
+		  bit_offset /= BITS_PER_UNIT;
+		  n->offset = component_ref_field_offset (var);
+		  bit_offset_expr = build_int_cst (TREE_TYPE (n->offset),
+						   bit_offset);
+		  n->offset = fold_build2 (PLUS_EXPR, TREE_TYPE (n->offset),
+					  n->offset, bit_offset_expr);
+		  if (TYPE_PRECISION (TREE_TYPE (field)) != BITS_PER_UNIT)
+		    return NULL_TREE;
+		  rhs1 = var;
+		  break;
+		default:;
+		}
+	    }
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
 	     order byte is set to n->size and the lowest order
@@ -1741,6 +1796,7 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
+	  n->range = n->size;
 	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
 		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
 
@@ -1824,10 +1880,69 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
 	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (n1.size != n2.size || !source_expr2)
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      tree off_sub_expr, off_cmp_expr;
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || n1.base_addr != n2.base_addr)
+		return NULL_TREE;
+
+	      off_cmp_expr = fold_build2 (LT_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (TREE_CODE (off_cmp_expr) != INTEGER_CST)
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (TREE_INT_CST_LOW (off_cmp_expr))
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub_expr = fold_build2 (MINUS_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (!cst_and_fits_in_hwi (off_sub_expr))
+		return NULL_TREE;
+
+	      off_sub = TREE_INT_CST_LOW (off_sub_expr);
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int)sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to target endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1853,14 +1968,15 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 }
 
 /* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+   SHIFTs and ANDs.  Set *MEMSRC if the source lies in memory, in which
+   case also set *SIZE to the size of the load needed.  Return the source
+   tree expression on which the byte swap is performed, or NULL if none.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap (gimple stmt, bool *memsrc, int *size)
 {
 /* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
+   have a full byte swap.  The number is shifted to the right according
    to the size of the symbolic number before using it.  */
   unsigned HOST_WIDEST_INT cmp =
     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
@@ -1868,12 +1984,12 @@ find_bswap (gimple stmt)
 
   struct symbolic_number n;
   tree source_expr;
-  int limit;
+  int limit, rsize;
 
   /* The last parameter determines the depth search limit.  It usually
      correlates directly to the number of bytes to be touched.  We
      increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
@@ -1885,18 +2001,34 @@ find_bswap (gimple stmt)
   /* Zero out the extra bits of N and CMP.  */
   if (n.size < (int)sizeof (HOST_WIDEST_INT))
     {
+      int tmpn;
       unsigned HOST_WIDEST_INT mask =
 	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
 
       n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      /* Find real size of result (highest non-zero byte).  */
+      for (tmpn = n.n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      cmp >>= (sizeof (HOST_WIDEST_INT) - rsize) * BITS_PER_UNIT;
     }
+  else
+    rsize = n.size;
 
   /* A complete byte swap should make the symbolic number to start
      with the largest digit in the highest order byte.  */
   if (cmp != n.n)
     return NULL_TREE;
 
+  *memsrc = 0;
+
+  if (n.base_addr)
+    {
+      *memsrc = 1;
+      n.size = rsize;
+    }
+  else if (rsize != n.size)
+    return NULL_TREE;
+
+  *size = n.size * BITS_PER_UNIT;
   return source_expr;
 }
 
@@ -1961,21 +2093,26 @@ execute_optimize_bswap (void)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
+	  tree bswap_src, bswap_type, load_type;
 	  tree bswap_tmp;
 	  tree fndecl = NULL_TREE;
 	  int type_size;
+	  bool memsrc;
 	  gimple call;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  bswap_src = find_bswap (stmt, &memsrc, &type_size);
+
+	  if (!bswap_src)
+	    continue;
 
 	  switch (type_size)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -1983,6 +2120,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -1990,6 +2128,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2003,10 +2142,35 @@ execute_optimize_bswap (void)
 	  if (!fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
 
-	  if (!bswap_src)
-	    continue;
+	  /* Need to load the value from memory first.  */
+	  if (memsrc)
+	    {
+	      tree addr_expr, addr_tmp, val_expr, val_tmp;
+	      tree load_ptr_type, load_offset_ptr;
+	      gimple addr_stmt, load_stmt;
+
+	      changed = true;
+
+	      /*  Compute address to load from and cast according to the size
+		  of the load.  */
+	      load_ptr_type = build_pointer_type (load_type);
+	      addr_expr = build1 (ADDR_EXPR, load_ptr_type, bswap_src);
+	      addr_tmp = make_temp_ssa_name (load_ptr_type, NULL, "load_src");
+	      addr_stmt = gimple_build_assign_with_ops
+			 (NOP_EXPR, addr_tmp, addr_expr, NULL);
+	      gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+
+	      /* Perform the load.  */
+	      load_offset_ptr = build_int_cst (load_ptr_type, 0);
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+	      load_stmt = gimple_build_assign_with_ops
+			 (MEM_REF, val_tmp, val_expr, NULL);
+
+	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
+	      bswap_src = val_tmp;
+	    }
 
 	  changed = true;
 	  if (type_size == 16)


* Re: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-04  5:49         ` Thomas Preud'homme
@ 2014-04-15  9:07           ` Richard Biener
  2014-04-17  5:30             ` Thomas Preud'homme
From: Richard Biener @ 2014-04-15  9:07 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Fri, Apr 4, 2014 at 7:49 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>> owner@gcc.gnu.org] On Behalf Of Rainer Orth
>>
>> Just omit the { target *-*-* } completely, also a few more times.
>
> Please find attached an updated patch.

@@ -1733,6 +1743,51 @@ find_bswap_1 (gimple stmt, struct
symbolic_number *n, int limit)
         to initialize the symbolic number.  */
       if (!source_expr1)
        {
+         n->base_addr = n->offset = NULL_TREE;
+         if (is_gimple_assign (rhs1_stmt))

you want
  gimple_assign_load_p (rhs1_stmt) && !gimple_has_volatile_ops (rhs1_stmt)
here.

+               case ARRAY_RANGE_REF:

For ARRAY_RANGE_REF this doesn't make much sense IMHO.

+               case ARRAY_REF:
+                 n->base_addr = TREE_OPERAND (var, 0);
+                 elt_size = array_ref_element_size (var);
+                 if (TREE_CODE (elt_size) != INTEGER_CST)
+                   return NULL_TREE;
+                 index = TREE_OPERAND (var, 1);
+                 if (TREE_THIS_VOLATILE (var) || TREE_THIS_VOLATILE (index))
+                   return NULL_TREE;
+                 n->offset = fold_build2 (MULT_EXPR, sizetype,
+                                          index, elt_size);

You fail to honor array_ref_low_bound.

With handling only the outermost handled-component and then only a
selected subset you'll catch many but not all cases.  Why not simply
use get_inner_reference () here (plus stripping the constant offset
from an innermost MEM_REF) and get the best of both worlds (not
duplicate parts of its logic and handle more cases)?  Eventually
using tree-affine.c and get_inner_reference_aff is even more appropriate
so you can compute the address differences without decomposing
them yourselves.

Btw, I think for the recursion to work properly you need to handle loads
for the toplevel stmt, not for rhs1_stmt.  Eventually you need to split
out a find_bswap_2 that handles the recursive case allowing loads,
separate from the variant that doesn't, which is called from find_bswap.

+             /*  Compute address to load from and cast according to the size
+                 of the load.  */
+             load_ptr_type = build_pointer_type (load_type);
+             addr_expr = build1 (ADDR_EXPR, load_ptr_type, bswap_src);
+             addr_tmp = make_temp_ssa_name (load_ptr_type, NULL, "load_src");
+             addr_stmt = gimple_build_assign_with_ops
+                        (NOP_EXPR, addr_tmp, addr_expr, NULL);
+             gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+
+             /* Perform the load.  */
+             load_offset_ptr = build_int_cst (load_ptr_type, 0);
+             val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+             val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+             load_stmt = gimple_build_assign_with_ops
+                        (MEM_REF, val_tmp, val_expr, NULL);

this is unnecessarily complex and has TBAA issues.  You don't need to
create a "correct" pointer type, so doing

    addr_expr = fold_build_addr_expr (bswap_src);

is enough.  Now, to fix the TBAA issues you either need to remember
and "combine" the reference_alias_ptr_type of each original load and
use that for the load_offset_ptr value or decide that isn't worth it and
use alias-set zero (use ptr_type_node).

Can you also expand the comment about "size vs. range"?  Is it
that range can be bigger than size if you have (short)a[0] |
((short)a[3] << 1), where size == 2 but range == 3?  Thus range can also
be smaller than size, for example for (short)a[0] | ((short)a[0] << 1),
where range would be 1 and size == 2?  I suppose adding two examples like
this to the comment, together with the expected value of 'n', would help
here.

Otherwise the patch looks good.  Now we're only missing the addition
of trying to match to a VEC_PERM_EXPR with a constant mask
using can_vec_perm_p ;)

Thanks,
Richard.



> Best regards,
>
> Thomas


* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-15  9:07           ` Richard Biener
@ 2014-04-17  5:30             ` Thomas Preud'homme
  2014-04-17 10:36               ` Richard Biener
From: Thomas Preud'homme @ 2014-04-17  5:30 UTC (permalink / raw)
  To: GCC Patches

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> 
> With handling only the outermost handled-component and then only a
> selected subset you'll catch many but not all cases.  Why not simply
> use get_inner_reference () here (plus stripping the constant offset
> from an innermost MEM_REF) and get the best of both worlds (not
> duplicate parts of its logic and handle more cases)?  Eventually
> using tree-affine.c and get_inner_reference_aff is even more appropriate
> so you can compute the address differences without decomposing
> them yourselves.

Why does the constant offset from an innermost MEM_REF need to be
stripped? Shouldn't that be part of the offset in the symbolic number?

> 
> +             /*  Compute address to load from and cast according to the size
> +                 of the load.  */
> +             load_ptr_type = build_pointer_type (load_type);
> +             addr_expr = build1 (ADDR_EXPR, load_ptr_type, bswap_src);
> +             addr_tmp = make_temp_ssa_name (load_ptr_type, NULL,
> "load_src");
> +             addr_stmt = gimple_build_assign_with_ops
> +                        (NOP_EXPR, addr_tmp, addr_expr, NULL);
> +             gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
> +
> +             /* Perform the load.  */
> +             load_offset_ptr = build_int_cst (load_ptr_type, 0);
> +             val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
> +             val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
> +             load_stmt = gimple_build_assign_with_ops
> +                        (MEM_REF, val_tmp, val_expr, NULL);
> 
> this is unnecessarily complex and has TBAA issues.  You don't need to
> create a "correct" pointer type, so doing
> 
>     addr_expr = fold_build_addr_expr (bswap_src);
> 
> is enough.  Now, to fix the TBAA issues you either need to remember
> and "combine" the reference_alias_ptr_type of each original load and
> use that for the load_offset_ptr value or decide that isn't worth it and
> use alias-set zero (use ptr_type_node).

Sorry, this is only my second patch [1] to GCC so it's not all clear to me. The
TBAA issue you mention comes from val_expr referring to a memory area that
overlaps with the smaller memory areas used in the bitwise OR operation, am I
right? Now, I have no idea how to do the combination of the values
returned by reference_alias_ptr_type () for each individual small memory
area. Can you advise me on this? And what is the effect of not doing it and
using ptr_type_node for the alias set?

[1] First one was a fix on the existing implementation of the bswap pass.

> 
> Can you also expand the comment about "size vs. range"?  Is it
> that range can be bigger than size if you have (short)a[0] |
> ((short)a[3] << 1), where size == 2 but range == 3?  Thus range can also
> be smaller than size, for example for (short)a[0] | ((short)a[0] << 1),
> where range would be 1 and size == 2?  I suppose adding two examples like
> this to the comment, together with the expected value of 'n', would help
> here.

You understood correctly. I will add the suggested example.

> Otherwise the patch looks good.  Now we're only missing the addition
> of trying to match to a VEC_PERM_EXPR with a constant mask
> using can_vec_perm_p ;)

Is that the vector shuffle engine you were mentioning in PR54733? If I
understand correctly it is a generalization of the check against CMPNOP and
CMPXCHG in find_bswap in this new patchset. I will check whether ARM could
benefit from this and if yes I might take a look (yep, two conditions).

Thanks a lot for such quick and detailed comments after my ping.

Best regards,

Thomas




* Re: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-17  5:30             ` Thomas Preud'homme
@ 2014-04-17 10:36               ` Richard Biener
  2014-04-24  1:37                 ` Thomas Preud'homme
From: Richard Biener @ 2014-04-17 10:36 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Thu, Apr 17, 2014 at 7:19 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>>
>> With handling only the outermost handled-component and then only a
>> selected subset you'll catch many but not all cases.  Why not simply
>> use get_inner_reference () here (plus stripping the constant offset
>> from an innermost MEM_REF) and get the best of both worlds (not
>> duplicate parts of its logic and handle more cases)?  Eventually
>> using tree-affine.c and get_inner_reference_aff is even more appropriate
>> so you can compute the address differences without decomposing
>> them yourselves.
>
> Why does the constant offset from an innermost MEM_REF need to be
> stripped? Shouldn't that be part of the offset in the symbolic number?

Yes, but get_inner_reference returns MEM[ptr, constant-offset] as base,
thus it doesn't move the constant offset therein to bitpos and doesn't
return MEM[ptr, 0].  You have to do that yourselves.
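
Something like this, untested (the variable names follow
get_inner_reference's internals, and I assume the offset fits in a host
wide int):

  if (TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) != ADDR_EXPR)
    {
      /* Fold the constant offset of MEM[ptr, off] into the byte offset
	 and keep only the pointer as the base.  */
      tree off = TREE_OPERAND (base, 1);
      if (!integer_zerop (off))
	bitpos += TREE_INT_CST_LOW (off) * BITS_PER_UNIT;
      base = TREE_OPERAND (base, 0);
    }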

(as you are really interested in the _address_ of the memory reference
instead of the reference itself it would be appropriate to introduce a
variant of get_inner_reference that returns 'ptr' in this case and
&x for x.field1 for example)

>>
>> +             /*  Compute address to load from and cast according to the size
>> +                 of the load.  */
>> +             load_ptr_type = build_pointer_type (load_type);
>> +             addr_expr = build1 (ADDR_EXPR, load_ptr_type, bswap_src);
>> +             addr_tmp = make_temp_ssa_name (load_ptr_type, NULL,
>> "load_src");
>> +             addr_stmt = gimple_build_assign_with_ops
>> +                        (NOP_EXPR, addr_tmp, addr_expr, NULL);
>> +             gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
>> +
>> +             /* Perform the load.  */
>> +             load_offset_ptr = build_int_cst (load_ptr_type, 0);
>> +             val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
>> +             val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
>> +             load_stmt = gimple_build_assign_with_ops
>> +                        (MEM_REF, val_tmp, val_expr, NULL);
>>
>> this is unnecessarily complex and has TBAA issues.  You don't need to
>> create a "correct" pointer type, so doing
>>
>>     addr_expr = fold_build_addr_expr (bswap_src);
>>
>> is enough.  Now, to fix the TBAA issues you either need to remember
>> and "combine" the reference_alias_ptr_type of each original load and
>> use that for the load_offset_ptr value or decide that isn't worth it and
>> use alias-set zero (use ptr_type_node).
>
> Sorry, this is only my second patch [1] to GCC so it's not all clear to me. The
> TBAA issue you mention comes from val_expr referring to a memory area that
> overlaps with the smaller memory areas used in the bitwise OR operation, am I
> right? Now, I have no idea how to do the combination of the values
> returned by reference_alias_ptr_type () for each individual small memory
> area. Can you advise me on this? And what is the effect of not doing it and
> using ptr_type_node for the alias set?

You can "combine" two reference_alias_ptr_type()s with

  if (alias_ptr_types_compatible_p (type1, type2))
    return type1;
  else
    return ptr_type_node;

using ptr_type_node for the alias-set will make it alias with all memory
references (that is, type-based disambiguation will be disabled).  That's
required for example if you combine four loads with type 'short' using
a single load with type 'long'.
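
A hypothetical illustration of the hazard:

  unsigned int f (unsigned short *p)
  {
    p[0] = 1;		/* short store */
    /* The two short loads below may be combined into a single 32-bit
       load.  If that load carried the alias type 'unsigned int *',
       TBAA would wrongly allow it to be moved across the short store;
       alias set zero keeps the conflict.  */
    return p[0] | ((unsigned int) p[1] << 16);
  }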

> [1] First one was a fix on the existing implementation of the bswap pass.
>
>>
>> Can you also expand the comment about "size vs. range"?  Is it
>> that range can be bigger than size if you have (short)a[0] |
>> ((short)a[3] << 1), where size == 2 but range == 3?  Thus range can also
>> be smaller than size, for example for (short)a[0] | ((short)a[0] << 1),
>> where range would be 1 and size == 2?  I suppose adding two examples like
>> this to the comment, together with the expected value of 'n', would help
>> here.
>
> You understood correctly. I will add the suggested example.
>
>> Otherwise the patch looks good.  Now we're only missing the addition
>> of trying to match to a VEC_PERM_EXPR with a constant mask
>> using can_vec_perm_p ;)
>
> Is that the vector shuffle engine you were mentioning in PR54733? If I
> understand correctly it is a generalization of the check again CMPNOP and
> CMPXCHG in find_bswap in this new patchset. I will look if ARM could
> Benefit from this and if yes I might take a look (yep, two conditions).

Yep.  For example it might match on things like

int foo (char *x)
{
   return ((((x[0] << 1 | x[0]) << 1) | x[1]) << 1) | x[0];
}

not sure if target support for shuffles on small vectors (or vector parts)
is working well.  Thus on v1si as in the example.

Richard.

> Thanks a lot for such quick and detailed comments after my ping.
>
> Best regards,
>
> Thomas
>
>
>


* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-17 10:36               ` Richard Biener
@ 2014-04-24  1:37                 ` Thomas Preud'homme
  2014-04-24  8:40                   ` Thomas Preud'homme
  2014-04-29 10:47                   ` Richard Biener
From: Thomas Preud'homme @ 2014-04-24  1:37 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1339 bytes --]

See the updated part 2 of the patch in attachment. Part 1 is unchanged. The new ChangeLog entries are as follows:

*** gcc/ChangeLog ***

2014-04-23  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* tree-ssa-math-opts.c (find_bswap_load): New.
	(find_bswap_1): Add support for memory source.
	(find_bswap): Likewise.
	(execute_optimize_bswap): Likewise.
	* expr.c (get_inner_reference): Renamed to ...
	(get_inner_reference_1): This. Also add a parameter to control whether
	a MEM_REF should be split into base + offset.
	(get_inner_reference): New wrapper around get_inner_reference_1 for
	compatibility.
	(get_inner_reference_base): New wrapper around get_inner_reference_1
	to access its new facet.
	* tree.h (get_inner_reference_base): Declare.


*** gcc/testsuite/ChangeLog ***

2014-04-23  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise.

Bootstrapped on x86_64-linux-gnu with no testsuite regression. Also did an arm-none-eabi cross build with no regression after running the testsuite via QEMU.

Best regards,

Thomas

[-- Attachment #2: gcc32rm-84.4.0.part2.diff --]
[-- Type: application/octet-stream, Size: 23475 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index 72e4401..486aacf 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -6657,7 +6657,9 @@ store_field (rtx target, HOST_WIDE_INT bitsize, HOST_WIDE_INT bitpos,
 \f
 /* Given an expression EXP that may be a COMPONENT_REF, a BIT_FIELD_REF,
    an ARRAY_REF, or an ARRAY_RANGE_REF, look for nested operations of these
-   codes and find the ultimate containing object, which we return.
+   codes and find the ultimate containing object, which we return. If that
+   object is a MEM_REF that is not the address of an object, PTR_BASE decides
+   whether the pointer is returned instead of the memory reference.
 
    We set *PBITSIZE to the size in bits that we want, *PBITPOS to the
    bit position, and *PUNSIGNEDP to the signedness of the field.
@@ -6691,10 +6693,10 @@ store_field (rtx target, HOST_WIDE_INT bitsize, HOST_WIDE_INT bitpos,
    the normal operating mode in the RTL expanders.  */
 
 tree
-get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
+get_inner_reference_1 (tree exp, HOST_WIDE_INT *pbitsize,
 		     HOST_WIDE_INT *pbitpos, tree *poffset,
 		     enum machine_mode *pmode, int *punsignedp,
-		     int *pvolatilep, bool keep_aligning)
+		     int *pvolatilep, bool keep_aligning, bool ptr_base)
 {
   tree size_tree = 0;
   enum machine_mode mode = VOIDmode;
@@ -6826,7 +6828,7 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 
 	case MEM_REF:
 	  /* Hand back the decl for MEM[&decl, off].  */
-	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR || ptr_base)
 	    {
 	      tree off = TREE_OPERAND (exp, 1);
 	      if (!integer_zerop (off))
@@ -6836,7 +6838,10 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 				      ? 3 : exact_log2 (BITS_PER_UNIT));
 		  bit_offset += boff;
 		}
-	      exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+		exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      else
+		exp = TREE_OPERAND (exp, 0);
 	    }
 	  goto done;
 
@@ -6904,6 +6909,34 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
   return exp;
 }
 
+/* Given an expression EXP that is a handled_component_p,
+   look for the ultimate containing object, which is returned and specify the
+   access position and size.  */
+
+tree
+get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
+		     HOST_WIDE_INT *pbitpos, tree *poffset,
+		     enum machine_mode *pmode, int *punsignedp,
+		     int *pvolatilep, bool keep_aligning)
+{
+  return get_inner_reference_1 (exp, pbitsize, pbitpos, poffset, pmode,
+				punsignedp, pvolatilep, keep_aligning, false);
+}
+
+/* Given an expression EXP that is a handled_component_p,
+   look for the base of the ultimate containing object, which is returned and
+   specify the access position and size.  */
+
+tree
+get_inner_reference_base (tree exp, HOST_WIDE_INT *pbitsize,
+			  HOST_WIDE_INT *pbitpos, tree *poffset,
+			  enum machine_mode *pmode, int *punsignedp,
+			  int *pvolatilep, bool keep_aligning)
+{
+  return get_inner_reference_1 (exp, pbitsize, pbitpos, poffset, pmode,
+				punsignedp, pvolatilep, keep_aligning, true);
+}
+
 /* Return a tree of sizetype representing the size, in bytes, of the element
    of EXP, an ARRAY_REF or an ARRAY_RANGE_REF.  */
 
diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..3181c5f
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,57 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef unsigned uint32_t;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_be32 (bfin);
+  if (out != 0x03050709 && out != 0x82848688)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..bca7b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_le64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *data | ((uint64_t) *(data + 1) << 8)
+	 | ((uint64_t) *(data + 2) << 16) | ((uint64_t) *(data + 3) << 24)
+	 | ((uint64_t) *(data + 4) << 32) | ((uint64_t) *(data + 5) << 40)
+	 | ((uint64_t) *(data + 6) << 48) | ((uint64_t) *(data + 7) << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+uint64_t read_be64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *(data + 7) | ((uint64_t) *(data + 6) << 8)
+	 | ((uint64_t) *(data + 5) << 16) | ((uint64_t) *(data + 4) << 24)
+	 | ((uint64_t) *(data + 3) << 32) | ((uint64_t) *(data + 2) << 40)
+	 | ((uint64_t) *(data + 1) << 48) | ((uint64_t) *data << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..4dcd3e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_le16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *data | (*(data + 1) << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+uint32_t read_be16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *(data + 1) | (*data << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..b365b96
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_le32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *data | (*(data + 1) << 8) | (*(data + 2) << 16)
+	 | (*(data + 3) << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+uint32_t read_be32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *(data + 3) | (*(data + 2) << 8) | (*(data + 1) << 16)
+	 | (*data << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 9ff857c..bcc52be 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -98,6 +98,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "is-a.h"
 #include "gimple.h"
 #include "gimple-iterator.h"
+#include "gimplify.h"
 #include "gimplify-me.h"
 #include "stor-layout.h"
 #include "gimple-ssa.h"
@@ -1616,11 +1617,26 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number is also associated with a base address (the array or structure the
+   load is made from), an offset from the base address and a range which gives
+   the difference between the highest and lowest accessed memory location to
+   make such a symbolic number.  The range is thus different from size, which
+   reflects the size of the type of the current expression.
+
+   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
+   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
+   still have a size of 2 but this time a range of 1.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  tree alias_set;
+  unsigned HOST_WIDE_INT range;
 };
 
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
@@ -1682,6 +1698,45 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
   return true;
 }
 
+/* Check if STMT might be a byte swap from a memory source and return the
+   answer.  If so, REF is that memory source, and the base of the memory area
+   accessed and the offset of the access from that base are recorded in N.  */
+
+bool
+find_bswap_load (gimple stmt, tree ref, struct symbolic_number *n)
+{
+  /* Leaf node is an array or component ref. Memorize its base and
+     offset from base to compare to other such leaf node.  */
+  HOST_WIDE_INT bitsize, bitpos;
+  enum machine_mode mode;
+  int unsignedp, volatilep;
+  tree bitpos_expr;
+
+  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
+    return false;
+
+  n->base_addr = get_inner_reference_base (ref, &bitsize, &bitpos, &n->offset,
+					   &mode, &unsignedp, &volatilep,
+					   false);
+
+  if (bitpos % BITS_PER_UNIT)
+    return false;
+  if (bitsize % BITS_PER_UNIT)
+    return false;
+
+  n->alias_set = reference_alias_ptr_type (ref);
+  bitpos /= BITS_PER_UNIT;
+  bitpos_expr = build_int_cst (sizetype, bitpos);
+  if (n->offset)
+    {
+      bitpos_expr = build_int_cst (sizetype, bitpos);
+      n->offset = fold_build2 (PLUS_EXPR, sizetype, n->offset, bitpos_expr);
+    }
+  else
+    n->offset = build_int_cst (sizetype, bitpos);
+  return true;
+}
+
 /* find_bswap_1 invokes itself recursively with N and tries to perform
    the operation given by the rhs of STMT on the result.  If the
    operation could successfully be executed the function returns the
@@ -1701,6 +1756,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
   rhs1 = gimple_assign_rhs1 (stmt);
 
+  if (find_bswap_load (stmt, rhs1, n))
+    return rhs1;
+
   if (TREE_CODE (rhs1) != SSA_NAME)
     return NULL_TREE;
 
@@ -1729,9 +1787,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
 
-      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
+      /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
 	 to initialize the symbolic number.  */
-      if (!source_expr1)
+      if (!source_expr1 || gimple_assign_load_p (rhs1_stmt))
 	{
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
@@ -1741,6 +1799,7 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
+	  n->range = n->size;
 	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
 		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
 
@@ -1748,7 +1807,11 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
 		     (n->size * BITS_PER_UNIT)) - 1;
 
-	  source_expr1 = rhs1;
+	  if (!source_expr1)
+	    {
+	      n->base_addr = n->offset = n->alias_set = NULL_TREE;
+	      source_expr1 = rhs1;
+	    }
 	}
 
       switch (code)
@@ -1824,10 +1887,74 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
 	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (n1.size != n2.size || !source_expr2)
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      tree off_sub_expr, off_cmp_expr;
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || n1.base_addr != n2.base_addr)
+		return NULL_TREE;
+
+	      off_cmp_expr = fold_build2 (LT_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (TREE_CODE (off_cmp_expr) != INTEGER_CST)
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (TREE_INT_CST_LOW (off_cmp_expr))
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub_expr = fold_build2 (MINUS_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (!cst_and_fits_in_hwi (off_sub_expr))
+		return NULL_TREE;
+
+	      off_sub = TREE_INT_CST_LOW (off_sub_expr);
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int)sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to host endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  if (!n1.alias_set
+	      || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
+	    n->alias_set = n1.alias_set;
+	  else
+	    n->alias_set = ptr_type_node;
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1853,14 +1980,16 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 }
 
 /* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+   SHIFTs and ANDs.  If the source lies in memory, it also sets
+   *alias_set to the alias-set of the memory reference and *size to
+   the size of the load needed.  At last, the function returns the
+   source tree expression.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap (gimple stmt, tree *alias_set, int *size)
 {
 /* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
+   have a full byte swap.  The number is shifted to the right according
    to the size of the symbolic number before using it.  */
   unsigned HOST_WIDEST_INT cmp =
     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
@@ -1868,12 +1997,12 @@ find_bswap (gimple stmt)
 
   struct symbolic_number n;
   tree source_expr;
-  int limit;
+  int limit, rsize;
 
   /* The last parameter determines the depth search limit.  It usually
      correlates directly to the number of bytes to be touched.  We
      increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
@@ -1885,18 +2014,34 @@ find_bswap (gimple stmt)
   /* Zero out the extra bits of N and CMP.  */
   if (n.size < (int)sizeof (HOST_WIDEST_INT))
     {
+      int tmpn;
       unsigned HOST_WIDEST_INT mask =
 	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
 
       n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      /* Find real size of result (highest non zero byte).  */
+      for (tmpn = n.n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      cmp >>= (sizeof (HOST_WIDEST_INT) - rsize) * BITS_PER_UNIT;
     }
+  else
+    rsize = n.size;
 
   /* A complete byte swap should make the symbolic number to start
      with the largest digit in the highest order byte.  */
   if (cmp != n.n)
     return NULL_TREE;
 
+  *alias_set = NULL_TREE;
+
+  if (n.base_addr)
+    {
+      *alias_set = n.alias_set;
+      n.size = rsize;
+    }
+  else if (rsize != n.size)
+    return NULL_TREE;
+
+  *size = n.size * BITS_PER_UNIT;
   return source_expr;
 }
 
@@ -1961,7 +2106,7 @@ execute_optimize_bswap (void)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
+	  tree bswap_src, bswap_type, load_type, alias_type;
 	  tree bswap_tmp;
 	  tree fndecl = NULL_TREE;
 	  int type_size;
@@ -1971,11 +2116,15 @@ execute_optimize_bswap (void)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  bswap_src = find_bswap (stmt, &alias_type, &type_size);
+
+	  if (!bswap_src)
+	    continue;
 
 	  switch (type_size)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -1983,6 +2132,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -1990,6 +2140,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2003,10 +2154,34 @@ execute_optimize_bswap (void)
 	  if (!fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
 
-	  if (!bswap_src)
-	    continue;
+	  /* Need to load the value from memory first.  */
+	  if (alias_type)
+	    {
+	      tree addr_expr, addr_tmp, val_expr, val_tmp;
+	      tree load_offset_ptr;
+	      gimple addr_stmt, load_stmt;
+
+	      changed = true;
+
+	      /*  Compute address to load from and cast according to the size
+		  of the load.  */
+	      addr_expr = build_fold_addr_expr (unshare_expr (bswap_src));
+	      addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
+					     "load_src");
+	      addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
+	      gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+
+	      /* Perform the load.  */
+	      load_offset_ptr = build_int_cst (alias_type, 0);
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+	      load_stmt = gimple_build_assign_with_ops
+			 (MEM_REF, val_tmp, val_expr, NULL);
+
+	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
+	      bswap_src = val_tmp;
+	    }
 
 	  changed = true;
 	  if (type_size == 16)
diff --git a/gcc/tree.h b/gcc/tree.h
index 9fbc5c4..efa4f36 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -4533,6 +4533,13 @@ extern tree get_inner_reference (tree, HOST_WIDE_INT *, HOST_WIDE_INT *,
 				 tree *, enum machine_mode *, int *, int *,
 				 bool);
 
+/* Given an expression EXP that is a handled_component_p,
+   look for the base of the ultimate containing object, which is returned and
+   specify the access position and size.  */
+extern tree get_inner_reference_base (tree, HOST_WIDE_INT *, HOST_WIDE_INT *,
+				      tree *, enum machine_mode *, int *,
+				      int *, bool);
+
 /* Return a tree representing the lower bound of the array mentioned in
    EXP, an ARRAY_REF or an ARRAY_RANGE_REF.  */
 extern tree array_ref_low_bound (tree);

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-24  1:37                 ` Thomas Preud'homme
@ 2014-04-24  8:40                   ` Thomas Preud'homme
  2014-04-29 10:47                   ` Richard Biener
  1 sibling, 0 replies; 14+ messages in thread
From: Thomas Preud'homme @ 2014-04-24  8:40 UTC (permalink / raw)
  To: Thomas Preud'homme, GCC Patches

> 
> Bootstrapped on x86_64-linux-gnu with no testsuite regression. Also did an
> arm-none-eabi cross build with no regression after running the testsuite via
> qemu.

Forgot to ask if it's ok for trunk. Same question for part 1 and 3.

Best regards,

Thomas



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-24  1:37                 ` Thomas Preud'homme
  2014-04-24  8:40                   ` Thomas Preud'homme
@ 2014-04-29 10:47                   ` Richard Biener
  2014-05-04  9:51                     ` Thomas Preud'homme
  1 sibling, 1 reply; 14+ messages in thread
From: Richard Biener @ 2014-04-29 10:47 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Thu, Apr 24, 2014 at 3:34 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
> See the updated part 2 of the patch in attachment. Part 1 is unchanged. The new ChangeLogs are as follows:
>
> *** gcc/ChangeLog ***
>
> 2014-04-23  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>
>         PR tree-optimization/54733
>         * tree-ssa-math-opts.c (find_bswap_load): New.
>         (find_bswap_1): Add support for memory source.
>         (find_bswap): Likewise.
>         (execute_optimize_bswap): Likewise.
>         * expr.c (get_inner_reference): Renamed to ...
>         (get_inner_reference_1): This. Also add a parameter to control whether
>         a MEM_REF should be split into base + offset.
>         (get_inner_reference): New wrapper around get_inner_reference_1 for
>         compatibility.
>         (get_inner_reference_base): New wrapper around get_inner_reference_1
>         to access its new facet.
>         * tree.h (get_inner_reference_base): Declare.
>
>
> *** gcc/testsuite/ChangeLog ***
>
> 2014-04-23  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>
>         PR tree-optimization/54733
>         * gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
>         optimization to support memory sources.
>         * gcc.dg/optimize-bswaphi-1.c: Likewise.
>         * gcc.dg/optimize-bswapsi-2.c: Likewise.
>         * gcc.c-torture/execute/bswap-2.c: Likewise.
>
> Bootstrapped on x86_64-linux-gnu with no testsuite regression. Also did an arm-none-eabi cross build with no regression after running the testsuite via qemu.

+/* Given an expression EXP that is a handled_component_p,
+   look for the base of the ultimate containing object, which is returned and
+   specify the access position and size.  */
+
+tree
+get_inner_reference_base (tree exp, HOST_WIDE_INT *pbitsize,
+                         HOST_WIDE_INT *pbitpos, tree *poffset,
+                         enum machine_mode *pmode, int *punsignedp,
+                         int *pvolatilep, bool keep_aligning)
+{

this name isn't exactly spot-on and its behavior doesn't appear to be
consistent for TARGET_MEM_REF (which get_inner_reference_1
simply returns unchanged).  It returns the base object or address
of the ultimate containing object (not handling TARGET_MEM_REF
should simply be documented), so instead of inventing a new name
I'd add an overload instead (with the extra flag).

I'd name the flag 'allow_address', not 'ptr_base' (as you don't return
a pointer to a decl but the decl itself if the base is a decl).
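
Something like the following declaration would do (a sketch using a
C++ default argument; it is what the updated patch below ends up with):

extern tree get_inner_reference (tree, HOST_WIDE_INT *, HOST_WIDE_INT *,
                                 tree *, enum machine_mode *, int *, int *,
                                 bool, bool = false);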

+  bitpos /= BITS_PER_UNIT;
+  bitpos_expr = build_int_cst (sizetype, bitpos);

bitpos_expr = size_int (bitpos);

and it's really bytepos_expr now, no?

+  if (n->offset)
+    {
+      bitpos_expr = build_int_cst (sizetype, bitpos);
+      n->offset = fold_build2 (PLUS_EXPR, sizetype, n->offset, bitpos_expr);
+    }
+  else
+    n->offset = build_int_cst (sizetype, bitpos);

n->offset = bitpos_expr;

btw, I wonder why you glob together bitpos and offset again here.
I'd simply record and compare n->bitpos.

Which means

+             off_cmp_expr = fold_build2 (LT_EXPR, TREE_TYPE (n1.offset),
+                                         n2.offset, n1.offset);
+             if (TREE_CODE (off_cmp_expr) != INTEGER_CST)
+               return NULL_TREE;

you'd simply compare n1.offset and n2.offset like you compare the
bases (use operand_equal_p ()) and reject differences; then the
constant offset part (tracked in bitpos) can be operated on with
integer math instead of trees.
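
That is, something along these lines (a sketch, assuming a new bytepos
field in struct symbolic_number; this is what the updated patch below
ends up doing):

   if (!n1.offset != !n2.offset
       || (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
     return NULL_TREE;

   /* The constant parts of the offsets are now plain integers.  */
   off_sub = n2.bytepos - n1.bytepos;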

+             /*  Compute address to load from and cast according to the size
+                 of the load.  */
+             addr_expr = build_fold_addr_expr (unshare_expr (bswap_src));
+             addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
+                                            "load_src");
+             addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
+             gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+
+             /* Perform the load.  */
+             load_offset_ptr = build_int_cst (alias_type, 0);
+             val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+             val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+             load_stmt = gimple_build_assign_with_ops
+                        (MEM_REF, val_tmp, val_expr, NULL);

to not cause spurious TREE_ADDRESSABLE bases you should
avoid the load_src temporary if possible:

   if (is_gimple_min_invariant (addr_expr))
    addr_tmp = addr_expr;
   else
      {
         addr_tmp = make_temp ....

and then use fold_build2 to build the MEM_REF.  Also use
gimple_build_assign for the load, not the with_ops variant.
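
Putting this together, the emission sequence could look like the
following (a sketch; it matches what the updated patch below ends up
doing):

   addr_expr = build_fold_addr_expr (unshare_expr (bswap_src));
   if (is_gimple_min_invariant (addr_expr))
     addr_tmp = addr_expr;
   else
     {
       addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
                                      "load_src");
       addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
       gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
     }

   /* Folding lets MEM[&decl, 0] collapse to the decl itself.  */
   val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
                           build_int_cst (alias_type, 0));
   load_stmt = gimple_build_assign (val_tmp, val_expr);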

The rest of the patch looks good to me now, sorry for the delay
in reviewing.

Thanks,
Richard.


> Best regards,
>
> Thomas

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-04-29 10:47                   ` Richard Biener
@ 2014-05-04  9:51                     ` Thomas Preud'homme
  2014-05-05 11:29                       ` Thomas Preud'homme
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Preud'homme @ 2014-05-04  9:51 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1172 bytes --]

Please find attached the new version of this patch addressing all your comments.

The ChangeLogs are now as follows:

*** gcc/ChangeLog ***

2014-05-04  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* expr.c (get_inner_reference): Add a parameter to control whether a
	MEM_REF should be split into base + offset.
	* tree.h (get_inner_reference): Default new parameter to false.
	* tree-ssa-math-opts.c (find_bswap_load): New.
	(find_bswap_1): Add support for memory source.
	(find_bswap): Likewise.
	(execute_optimize_bswap): Likewise.

*** gcc/testsuite/ChangeLog ***

2014-05-04  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise.

Ok for stage1? Note that I haven't committed part 1 despite your approval as I'm waiting for the other parts to be approved. I don't know if that's the usual procedure for patch sets.

Best regards,

Thomas Preud'homme

[-- Attachment #2: gcc32rm-84.5.1.part2.diff --]
[-- Type: application/octet-stream, Size: 22542 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index fec6194..864ee20 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -6669,7 +6669,10 @@ store_field (rtx target, HOST_WIDE_INT bitsize, HOST_WIDE_INT bitpos,
 \f
 /* Given an expression EXP that may be a COMPONENT_REF, a BIT_FIELD_REF,
    an ARRAY_REF, or an ARRAY_RANGE_REF, look for nested operations of these
-   codes and find the ultimate containing object, which we return.
+   codes and find the ultimate containing object, which we return. If that
+   object is a MEM_REF that is not the address of an object, ALLOW_ADDRESS
+   decides whether the pointer is returned instead of the memory reference.
+   Note that the behavior for TARGET_MEM_REF is not affected by this parameter.
 
    We set *PBITSIZE to the size in bits that we want, *PBITPOS to the
    bit position, and *PUNSIGNEDP to the signedness of the field.
@@ -6706,7 +6709,7 @@ tree
 get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 		     HOST_WIDE_INT *pbitpos, tree *poffset,
 		     enum machine_mode *pmode, int *punsignedp,
-		     int *pvolatilep, bool keep_aligning)
+		     int *pvolatilep, bool keep_aligning, bool allow_address)
 {
   tree size_tree = 0;
   enum machine_mode mode = VOIDmode;
@@ -6838,7 +6841,7 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 
 	case MEM_REF:
 	  /* Hand back the decl for MEM[&decl, off].  */
-	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR || allow_address)
 	    {
 	      tree off = TREE_OPERAND (exp, 1);
 	      if (!integer_zerop (off))
@@ -6848,7 +6851,10 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 				      ? 3 : exact_log2 (BITS_PER_UNIT));
 		  bit_offset += boff;
 		}
-	      exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+		exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      else
+		exp = TREE_OPERAND (exp, 0);
 	    }
 	  goto done;
 
diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..e91b487
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,90 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef unsigned uint32_t;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_le32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c0 | c1 << 8 | c2 << 16 | c3 << 24;
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_be32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c3 | c2 << 8 | c1 << 16 | c0 << 24;
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+  char cin[] = { 0x83, 0x85, 0x87, 0x89 };
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_be32 (bfin);
+  if (out != 0x03050709 && out != 0x82848688)
+    __builtin_abort ();
+  out = fake_read_le32 (cin, &cin[2]);
+  if (out != 0x89018583)
+    __builtin_abort ();
+  out = fake_read_be32 (cin, &cin[2]);
+  if (out != 0x83850189)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..bca7b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_le64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *data | ((uint64_t) *(data + 1) << 8)
+	 | ((uint64_t) *(data + 2) << 16) | ((uint64_t) *(data + 3) << 24)
+	 | ((uint64_t) *(data + 4) << 32) | ((uint64_t) *(data + 5) << 40)
+	 | ((uint64_t) *(data + 6) << 48) | ((uint64_t) *(data + 7) << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+uint64_t read_be64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *(data + 7) | ((uint64_t) *(data + 6) << 8)
+	 | ((uint64_t) *(data + 5) << 16) | ((uint64_t) *(data + 4) << 24)
+	 | ((uint64_t) *(data + 3) << 32) | ((uint64_t) *(data + 2) << 40)
+	 | ((uint64_t) *(data + 1) << 48) | ((uint64_t) *data << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..4dcd3e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_le16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *data | (*(data + 1) << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+uint32_t read_be16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *(data + 1) | (*data << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..b365b96
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_le32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *data | (*(data + 1) << 8) | (*(data + 2) << 16)
+	 | (*(data + 3) << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+uint32_t read_be32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *(data + 3) | (*(data + 2) << 8) | (*(data + 1) << 16)
+	 | (*data << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index b965ad1..65e7ffe 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -98,6 +98,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "is-a.h"
 #include "gimple.h"
 #include "gimple-iterator.h"
+#include "gimplify.h"
 #include "gimplify-me.h"
 #include "stor-layout.h"
 #include "gimple-ssa.h"
@@ -1606,11 +1607,28 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number is also associated with a base address (the array or structure the
+   load is made from), an offset from the base address and a range which gives
+   the difference between the highest and lowest accessed memory location to
+   make such a symbolic number.  The range is thus different from size, which
+   reflects the size of the type of the current expression.
+
+   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
+   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
+   still have a size of 2 but this time a range of 1.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  HOST_WIDE_INT bytepos;
+  tree alias_set;
+  tree vuse;
+  unsigned HOST_WIDE_INT range;
 };
 
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
@@ -1672,6 +1690,37 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
   return true;
 }
 
+/* Check if STMT might be a byte swap from a memory source and return the
+   answer.  If so, REF is that memory source, and the base of the memory area
+   accessed and the offset of the access from that base are recorded in N.  */
+
+bool
+find_bswap_load (gimple stmt, tree ref, struct symbolic_number *n)
+{
+  /* Leaf node is an array or component ref. Memorize its base and
+     offset from base to compare to other such leaf node.  */
+  HOST_WIDE_INT bitsize, bitpos;
+  enum machine_mode mode;
+  int unsignedp, volatilep;
+
+  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
+    return false;
+
+  n->base_addr = get_inner_reference (ref, &bitsize, &bitpos, &n->offset,
+				      &mode, &unsignedp, &volatilep, false,
+				      true);
+
+  if (bitpos % BITS_PER_UNIT)
+    return false;
+  if (bitsize % BITS_PER_UNIT)
+    return false;
+
+  n->bytepos = bitpos / BITS_PER_UNIT;
+  n->alias_set = reference_alias_ptr_type (ref);
+  n->vuse = gimple_vuse (stmt);
+  return true;
+}
+
 /* find_bswap_1 invokes itself recursively with N and tries to perform
    the operation given by the rhs of STMT on the result.  If the
    operation could successfully be executed the function returns the
@@ -1691,6 +1740,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
   rhs1 = gimple_assign_rhs1 (stmt);
 
+  if (find_bswap_load (stmt, rhs1, n))
+    return rhs1;
+
   if (TREE_CODE (rhs1) != SSA_NAME)
     return NULL_TREE;
 
@@ -1719,9 +1771,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
 
-      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
+      /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
 	 to initialize the symbolic number.  */
-      if (!source_expr1)
+      if (!source_expr1 || gimple_assign_load_p (rhs1_stmt))
 	{
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
@@ -1731,6 +1783,7 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
+	  n->range = n->size;
 	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
 		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
 
@@ -1738,7 +1791,11 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
 		     (n->size * BITS_PER_UNIT)) - 1;
 
-	  source_expr1 = rhs1;
+	  if (!source_expr1)
+	    {
+	      n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
+	      source_expr1 = rhs1;
+	    }
 	}
 
       switch (code)
@@ -1814,10 +1871,72 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
 	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (n1.size != n2.size || !source_expr2)
 	    return NULL_TREE;
 
+	  if (!n1.vuse != !n2.vuse
+	      || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
+	    return NULL_TREE;
+
+	  if (source_expr1 != source_expr2)
+	    {
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
+		return NULL_TREE;
+	      if (!n1.offset != !n2.offset
+		  || (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (n2.bytepos < n1.bytepos)
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub = n2.bytepos - n1.bytepos;
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int) sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to host endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  if (!n1.alias_set
+	      || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
+	    n->alias_set = n1.alias_set;
+	  else
+	    n->alias_set = ptr_type_node;
+	  n->vuse = n1.vuse;
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
+	  n->bytepos = n1.bytepos;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1843,14 +1962,16 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 }
 
 /* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+   SHIFTs and ANDs.  If the source lies in memory, it also sets
+   *ALIAS_SET to the alias-set of the memory reference, *VUSE to its
+   VUSE and *SIZE to the size of the load needed.  At last, the
+   function returns the source tree expression.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap (gimple stmt, tree *alias_set, tree *vuse, int *size)
 {
 /* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
+   have a full byte swap.  The number is shifted to the right according
    to the size of the symbolic number before using it.  */
   unsigned HOST_WIDEST_INT cmp =
     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
@@ -1858,12 +1979,12 @@ find_bswap (gimple stmt)
 
   struct symbolic_number n;
   tree source_expr;
-  int limit;
+  int limit, rsize;
 
   /* The last parameter determines the depth search limit.  It usually
      correlates directly to the number of bytes to be touched.  We
      increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
@@ -1875,18 +1996,35 @@ find_bswap (gimple stmt)
   /* Zero out the extra bits of N and CMP.  */
   if (n.size < (int)sizeof (HOST_WIDEST_INT))
     {
+      int tmpn;
       unsigned HOST_WIDEST_INT mask =
 	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
 
       n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      /* Find real size of result (highest non zero byte).  */
+      for (tmpn = n.n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      cmp >>= (sizeof (HOST_WIDEST_INT) - rsize) * BITS_PER_UNIT;
     }
+  else
+    rsize = n.size;
 
   /* A complete byte swap should make the symbolic number to start
      with the largest digit in the highest order byte.  */
   if (cmp != n.n)
     return NULL_TREE;
 
+  *alias_set = NULL_TREE;
+
+  if (n.base_addr)
+    {
+      *alias_set = n.alias_set;
+      *vuse = n.vuse;
+      n.size = rsize;
+    }
+  else if (rsize != n.size)
+    return NULL_TREE;
+
+  *size = n.size * BITS_PER_UNIT;
   return source_expr;
 }
 
@@ -1984,7 +2122,7 @@ pass_optimize_bswap::execute (function *fun)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
+	  tree bswap_src, bswap_type, load_type, alias_type, vuse = NULL_TREE;
 	  tree bswap_tmp;
 	  tree fndecl = NULL_TREE;
 	  int type_size;
@@ -1994,11 +2132,15 @@ pass_optimize_bswap::execute (function *fun)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  bswap_src = find_bswap (stmt, &alias_type, &vuse, &type_size);
+
+	  if (!bswap_src)
+	    continue;
 
 	  switch (type_size)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -2006,6 +2148,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -2013,6 +2156,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2026,10 +2170,39 @@ pass_optimize_bswap::execute (function *fun)
 	  if (!fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
 
-	  if (!bswap_src)
-	    continue;
+	  /* Need to load the value from memory first.  */
+	  if (alias_type)
+	    {
+	      tree addr_expr, addr_tmp, val_expr, val_tmp;
+	      tree load_offset_ptr;
+	      gimple addr_stmt, load_stmt;
+
+	      changed = true;
+
+	      /*  Compute address to load from and cast according to the size
+		  of the load.  */
+	      addr_expr = build_fold_addr_expr (unshare_expr (bswap_src));
+	      if (is_gimple_min_invariant (addr_expr))
+		addr_tmp = addr_expr;
+	      else
+		{
+		  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
+						 "load_src");
+		  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
+		  gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+		}
+
+	      /* Perform the load.  */
+	      load_offset_ptr = build_int_cst (alias_type, 0);
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
+				      load_offset_ptr);
+	      load_stmt = gimple_build_assign (val_tmp, val_expr);
+	      gimple_set_vuse (load_stmt, vuse);
+	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
+	      bswap_src = val_tmp;
+	    }
 
 	  changed = true;
 	  if (type_size == 16)
diff --git a/gcc/tree.h b/gcc/tree.h
index ae4876d..0cb2abe 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -4539,7 +4539,7 @@ extern tree build_personality_function (const char *);
    the access position and size.  */
 extern tree get_inner_reference (tree, HOST_WIDE_INT *, HOST_WIDE_INT *,
 				 tree *, enum machine_mode *, int *, int *,
-				 bool);
+				 bool, bool = false);
 
 /* Return a tree representing the lower bound of the array mentioned in
    EXP, an ARRAY_REF or an ARRAY_RANGE_REF.  */

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH][2/3] Fix PR54733 Optimize endian independent load/store
  2014-05-04  9:51                     ` Thomas Preud'homme
@ 2014-05-05 11:29                       ` Thomas Preud'homme
  0 siblings, 0 replies; 14+ messages in thread
From: Thomas Preud'homme @ 2014-05-05 11:29 UTC (permalink / raw)
  To: GCC Patches

I found a way to improve the function find_bswap/find_bswap_or_nop
and reduce its size. Please hold off on the review; I will post an updated
version as soon as I finish testing.

Best regards,

Thomas Preud'homme



^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-05-05 11:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-04-02  0:55 [PATCH][2/3] Fix PR54733 Optimize endian independent load/store Thomas Preud'homme
2014-04-02  6:17 ` Marc Glisse
2014-04-02  7:04   ` Thomas Preud'homme
2014-04-02  8:17     ` Richard Biener
2014-04-03  5:42       ` Thomas Preud'homme
2014-04-04  5:49         ` Thomas Preud'homme
2014-04-15  9:07           ` Richard Biener
2014-04-17  5:30             ` Thomas Preud'homme
2014-04-17 10:36               ` Richard Biener
2014-04-24  1:37                 ` Thomas Preud'homme
2014-04-24  8:40                   ` Thomas Preud'homme
2014-04-29 10:47                   ` Richard Biener
2014-05-04  9:51                     ` Thomas Preud'homme
2014-05-05 11:29                       ` Thomas Preud'homme

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).