public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
@ 2014-05-09 10:26 Thomas Preud'homme
  2014-05-16 10:07 ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-09 10:26 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2127 bytes --]

Sorry, took longer than expected as I got distracted by some other patch.
I merged the whole patchset in a single patch as I was told the current setup
is actually more difficult to read.

Here are the updated ChangeLogs:

*** gcc/ChangeLog ***

2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* expr.c (get_inner_reference): Add a parameter to control whether a
	MEM_REF should be split into base + offset.
	* tree.h (get_inner_reference): Default new parameter to false.
	* tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
	(CMPNOP): Define.
	(find_bswap_or_nop_load): New.
	(find_bswap_1): Renamed to ...
	(find_bswap_or_nop_1): This. Also add support for memory source.
	(find_bswap): Renamed to ...
	(find_bswap_or_nop): This. Also add support for memory source and
	detection of bitwise operations equivalent to load in host endianness.
	(execute_optimize_bswap): Likewise. Also move its leading comment back
	in place and split statement transformation into ...
	(bswap_replace): This. Add assert when updating bswap_stats.

*** gcc/testsuite/ChangeLog ***

2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources and bitwise operations
	equivalent to load in host endianness.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise.

Ok for trunk?

Best regards,

Thomas

> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
> Sent: Monday, May 05, 2014 7:30 PM
> To: GCC Patches
> Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
> load/store
> 
> I found a way to improve the function find_bswap/find_bswap_or_nop
> and reduce its size. Please hold for the review, I will post an updated
> version as soon as I finish testing.
> 
> Best regards,
> 
> Thomas Preud'homme
> 
> 
> 

[-- Attachment #2: gcc32rm-84.5.4.diff --]
[-- Type: application/octet-stream, Size: 32901 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index fec6194..864ee20 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -6669,7 +6669,10 @@ store_field (rtx target, HOST_WIDE_INT bitsize, HOST_WIDE_INT bitpos,
 \f
 /* Given an expression EXP that may be a COMPONENT_REF, a BIT_FIELD_REF,
    an ARRAY_REF, or an ARRAY_RANGE_REF, look for nested operations of these
-   codes and find the ultimate containing object, which we return.
+   codes and find the ultimate containing object, which we return. If that
+   object is a MEM_REF that is not the address of an object, ALLOW_ADDRESS
+   decides whether the pointer is returned instead of the memory reference.
+   Note that the behavior for TARGET_MEM_REF is not affected by this parameter.
 
    We set *PBITSIZE to the size in bits that we want, *PBITPOS to the
    bit position, and *PUNSIGNEDP to the signedness of the field.
@@ -6706,7 +6709,7 @@ tree
 get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 		     HOST_WIDE_INT *pbitpos, tree *poffset,
 		     enum machine_mode *pmode, int *punsignedp,
-		     int *pvolatilep, bool keep_aligning)
+		     int *pvolatilep, bool keep_aligning, bool allow_address)
 {
   tree size_tree = 0;
   enum machine_mode mode = VOIDmode;
@@ -6838,7 +6841,7 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 
 	case MEM_REF:
 	  /* Hand back the decl for MEM[&decl, off].  */
-	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR || allow_address)
 	    {
 	      tree off = TREE_OPERAND (exp, 1);
 	      if (!integer_zerop (off))
@@ -6848,7 +6851,10 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 				      ? 3 : exact_log2 (BITS_PER_UNIT));
 		  bit_offset += boff;
 		}
-	      exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+		exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      else
+		exp = TREE_OPERAND (exp, 0);
 	    }
 	  goto done;
 
diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..e91b487
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,90 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef __UINT32_TYPE__ unsigned;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_le32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c0 | c1 << 8 | c2 << 16 | c3 << 24;
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_be32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c3 | c2 << 8 | c1 << 16 | c0 << 24;
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+  char cin[] = { 0x83, 0x85, 0x87, 0x89 };
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_be32 (bfin);
+  if (out != 0x03050709 && out != 0x82848688)
+    __builtin_abort ();
+  out = fake_read_le32 (cin, &cin[2]);
+  if (out != 0x89018583)
+    __builtin_abort ();
+  out = fake_read_be32 (cin, &cin[2]);
+  if (out != 0x83850189)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..a10c23f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,82 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_le64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *data | ((uint64_t) *(data + 1) << 8)
+	 | ((uint64_t) *(data + 2) << 16) | ((uint64_t) *(data + 3) << 24)
+	 | ((uint64_t) *(data + 4) << 32) | ((uint64_t) *(data + 5) << 40)
+	 | ((uint64_t) *(data + 6) << 48) | ((uint64_t) *(data + 7) << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+uint64_t read_be64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *(data + 7) | ((uint64_t) *(data + 6) << 8)
+	 | ((uint64_t) *(data + 5) << 16) | ((uint64_t) *(data + 4) << 24)
+	 | ((uint64_t) *(data + 3) << 32) | ((uint64_t) *(data + 2) << 40)
+	 | ((uint64_t) *(data + 1) << 48) | ((uint64_t) *data << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..b57cd92
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_le16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *data | (*(data + 1) << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+uint32_t read_be16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *(data + 1) | (*data << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..e59b310
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,67 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_le32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *data | (*(data + 1) << 8) | (*(data + 2) << 16)
+	 | (*(data + 3) << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+uint32_t read_be32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *(data + 3) | (*(data + 2) << 8) | (*(data + 1) << 16)
+	 | (*data << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index b965ad1..7e0212d 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -98,6 +98,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "is-a.h"
 #include "gimple.h"
 #include "gimple-iterator.h"
+#include "gimplify.h"
 #include "gimplify-me.h"
 #include "stor-layout.h"
 #include "gimple-ssa.h"
@@ -170,15 +171,15 @@ static struct
 
 static struct
 {
-  /* Number of hand-written 16-bit bswaps found.  */
+  /* Number of hand-written 16-bit nop / bswaps found.  */
   int found_16bit;
 
-  /* Number of hand-written 32-bit bswaps found.  */
+  /* Number of hand-written 32-bit nop / bswaps found.  */
   int found_32bit;
 
-  /* Number of hand-written 64-bit bswaps found.  */
+  /* Number of hand-written 64-bit nop / bswaps found.  */
   int found_64bit;
-} bswap_stats;
+} nop_stats, bswap_stats;
 
 static struct
 {
@@ -1606,13 +1607,43 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number is also associated a base address (the array or structure the load is
+   made from), an offset from the base address and a range which gives the
+   difference between the highest and lowest accessed memory location to make
+   such a symbolic number. The range is thus different from size which reflects
+   the size of the type of current expression. Note that for non memory source,
+   range holds the same value as size.
+
+   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
+   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
+   still have a size of 2 but this time a range of 1.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  HOST_WIDE_INT bytepos;
+  tree alias_set;
+  tree vuse;
+  unsigned HOST_WIDE_INT range;
 };
 
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a nop.  The number is masked according to the size of
+   the symbolic number before using it.  */
+#define CMPNOP (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201)
+
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a byte swap.  The number is masked according to the
+   size of the symbolic number before using it.  */
+#define CMPXCHG (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708)
+
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
    number N.  Return false if the requested operation is not permitted
    on a symbolic number.  */
@@ -1672,13 +1703,44 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
   return true;
 }
 
-/* find_bswap_1 invokes itself recursively with N and tries to perform
-   the operation given by the rhs of STMT on the result.  If the
-   operation could successfully be executed the function returns the
-   tree expression of the source operand and NULL otherwise.  */
+/* Check if STMT might be a byte swap or a nop from a memory source and returns
+   the answer. If so, REF is that memory source and the base of the memory area
+   accessed and the offset of the access from that base are recorded in N.  */
+
+bool
+find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
+{
+  /* Leaf node is an array or component ref. Memorize its base and
+     offset from base to compare to other such leaf node.  */
+  HOST_WIDE_INT bitsize, bitpos;
+  enum machine_mode mode;
+  int unsignedp, volatilep;
+
+  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
+    return false;
+
+  n->base_addr = get_inner_reference (ref, &bitsize, &bitpos, &n->offset,
+				      &mode, &unsignedp, &volatilep, false,
+				      true);
+
+  if (bitpos % BITS_PER_UNIT)
+    return false;
+  if (bitsize % BITS_PER_UNIT)
+    return false;
+
+  n->bytepos = bitpos / BITS_PER_UNIT;
+  n->alias_set = reference_alias_ptr_type (ref);
+  n->vuse = gimple_vuse (stmt);
+  return true;
+}
+
+/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
+   the operation given by the rhs of STMT on the result.  If the operation
+   could successfully be executed the function returns the tree expression of
+   the source operand and NULL otherwise.  */
 
 static tree
-find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
+find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
 {
   enum tree_code code;
   tree rhs1, rhs2 = NULL;
@@ -1691,6 +1753,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
   rhs1 = gimple_assign_rhs1 (stmt);
 
+  if (find_bswap_or_nop_load (stmt, rhs1, n))
+    return rhs1;
+
   if (TREE_CODE (rhs1) != SSA_NAME)
     return NULL_TREE;
 
@@ -1717,11 +1782,11 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  && code != CONVERT_EXPR)
 	return NULL_TREE;
 
-      source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
+      source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
 
-      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
-	 to initialize the symbolic number.  */
-      if (!source_expr1)
+      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
+	 we have to initialize the symbolic number.  */
+      if (!source_expr1 || gimple_assign_load_p (rhs1_stmt))
 	{
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
@@ -1731,14 +1796,18 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
-	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
+	  n->range = n->size;
+	  n->n = CMPNOP;
 
 	  if (n->size < (int)sizeof (HOST_WIDEST_INT))
 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
 		     (n->size * BITS_PER_UNIT)) - 1;
 
-	  source_expr1 = rhs1;
+	  if (!source_expr1)
+	    {
+	      n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
+	      source_expr1 = rhs1;
+	    }
 	}
 
       switch (code)
@@ -1779,6 +1848,8 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 		n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
 	      }
 	    n->size = type_size / BITS_PER_UNIT;
+	    if (!n->base_addr)
+	      n->range = n->size;
 	  }
 	  break;
 	default:
@@ -1807,17 +1878,79 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
       switch (code)
 	{
 	case BIT_IOR_EXPR:
-	  source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
+	  source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
 
 	  if (!source_expr1)
 	    return NULL_TREE;
 
-	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
+	  source_expr2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
+
+	  if (n1.size != n2.size || !source_expr2)
+	    return NULL_TREE;
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (!n1.vuse != !n2.vuse ||
+	  (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
+		return NULL_TREE;
+	      if (!n1.offset != !n2.offset ||
+	          (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (n2.bytepos < n1.bytepos)
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub = n2.bytepos - n1.bytepos;
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int) sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to host endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  if (!n1.alias_set
+	      || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
+	    n->alias_set = n1.alias_set;
+	  else
+	    n->alias_set = ptr_type_node;
+	  n->vuse = n1.vuse;
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
+	  n->bytepos = n1.bytepos;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1842,57 +1975,75 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
   return NULL_TREE;
 }
 
-/* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+/* Check if STMT completes a bswap implementation or a read in a given
+   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
+   accordingly.  It also sets N to represent the kind of operations
+   performed: size of the resulting expression and whether it works on
+   a memory source, and if so alias-set and vuse.  At last, the
+   function returns the source tree expression.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
 {
-/* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
-   to the size of the symbolic number before using it.  */
-  unsigned HOST_WIDEST_INT cmp =
-    sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-    (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
-
-  struct symbolic_number n;
+/* The number which the find_bswap_or_nop_1 result should match in order
+   to have a full byte swap.  The number is shifted to the right
+   according to the size of the symbolic number before using it.  */
+  unsigned HOST_WIDEST_INT cmpxchg = CMPXCHG;
+  unsigned HOST_WIDEST_INT cmpnop = CMPNOP;
+
   tree source_expr;
   int limit;
 
   /* The last parameter determines the depth search limit.  It usually
-     correlates directly to the number of bytes to be touched.  We
-     increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     correlates directly to the number n of bytes to be touched.  We
+     increase that number by log2(n) + 1 here in order to also
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
-  source_expr =  find_bswap_1 (stmt, &n, limit);
+  source_expr =  find_bswap_or_nop_1 (stmt, n, limit);
 
   if (!source_expr)
     return NULL_TREE;
 
-  /* Zero out the extra bits of N and CMP.  */
-  if (n.size < (int)sizeof (HOST_WIDEST_INT))
+  /* Find real size of result (highest non zero byte).  */
+  if (n->base_addr)
     {
-      unsigned HOST_WIDEST_INT mask =
-	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
+      int rsize;
+      unsigned HOST_WIDEST_INT tmpn;
 
-      n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      n->range = rsize;
     }
 
-  /* A complete byte swap should make the symbolic number to start
-     with the largest digit in the highest order byte.  */
-  if (cmp != n.n)
+  /* Zero out the extra bits of N and CMP*.  */
+  if (n->range < (int)sizeof (HOST_WIDEST_INT))
+    {
+      unsigned HOST_WIDEST_INT mask;
+
+      mask = ((unsigned HOST_WIDEST_INT)1 << (n->range * BITS_PER_UNIT)) - 1;
+      cmpxchg >>= (sizeof (HOST_WIDEST_INT) - n->range) * BITS_PER_UNIT;
+      cmpnop &= mask;
+    }
+
+  /* A complete byte swap should make the symbolic number to start with
+     the largest digit in the highest order byte. Unchanged symbolic
+     number indicates a read with same endianness as host architecture.  */
+  if (n->n == cmpnop)
+    *bswap = false;
+  else if (n->n == cmpxchg)
+    *bswap = true;
+  else
+    return NULL_TREE;
+
+  /* Useless bit manipulation performed by code.  */
+  if (!n->base_addr && n->n == cmpnop)
     return NULL_TREE;
 
+  n->range *= BITS_PER_UNIT;
   return source_expr;
 }
 
-/* Find manual byte swap implementations and turn them into a bswap
-   builtin invokation.  */
-
 namespace {
 
 const pass_data pass_data_optimize_bswap =
@@ -1926,6 +2077,146 @@ public:
 
 }; // class pass_optimize_bswap
 
+/* Perform the bswap optimization: replace the statement STMT at GSI
+   with load type, VUSE and set-alias as described by N if a memory
+   source is involved (N->base_addr is non null), followed by the
+   builtin bswap invocation in FNDECL if BSWAP is true.  SRC gives
+   the source on which STMT is operating and N->range gives the
+   size of the expression involved for maintaining some statistics.  */
+
+static void
+bswap_replace (gimple stmt, gimple_stmt_iterator *gsi, tree src, tree fndecl,
+	       tree bswap_type, tree load_type, struct symbolic_number *n,
+	       bool bswap)
+{
+  tree tmp, tgt;
+  gimple call;
+
+  tgt = gimple_assign_lhs (stmt);
+
+  /* Need to load the value from memory first.  */
+  if (n->base_addr)
+    {
+      tree addr_expr, addr_tmp, val_expr, val_tmp;
+      tree load_offset_ptr;
+      gimple addr_stmt, load_stmt;
+
+      /*  Compute address to load from and cast according to the size
+	  of the load.  */
+      addr_expr = build_fold_addr_expr (unshare_expr (src));
+      if (is_gimple_min_invariant (addr_expr))
+	addr_tmp = addr_expr;
+      else
+	{
+	  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
+					 "load_src");
+	  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
+	  gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT);
+	}
+
+      /* Perform the load.  */
+      load_offset_ptr = build_int_cst (n->alias_set, 0);
+      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
+			      load_offset_ptr);
+
+      if (!bswap)
+	{
+	  if (n->range == 16)
+	    nop_stats.found_16bit++;
+	  else if (n->range == 32)
+	    nop_stats.found_32bit++;
+	  else
+	    {
+	      gcc_assert (n->range == 64);
+	      nop_stats.found_64bit++;
+	    }
+
+	  /* Convert the result of load if necessary.  */
+	  if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
+	    {
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      load_stmt = gimple_build_assign (val_tmp, val_expr);
+	      gimple_set_vuse (load_stmt, n->vuse);
+	      gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
+	      gimple_assign_set_rhs_with_ops_1 (gsi, NOP_EXPR, val_tmp,
+						NULL_TREE, NULL_TREE);
+	    }
+	  else
+	    gimple_assign_set_rhs_with_ops_1 (gsi, MEM_REF, val_expr,
+					      NULL_TREE, NULL_TREE);
+	  update_stmt (gsi_stmt (*gsi));
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file,
+		       "%d bit load in host endianness found at: ",
+		       (int)n->range);
+	      print_gimple_stmt (dump_file, stmt, 0, 0);
+	    }
+	  return;
+	}
+      else
+	{
+	  val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	  load_stmt = gimple_build_assign (val_tmp, val_expr);
+	  gimple_set_vuse (load_stmt, n->vuse);
+	  gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
+	}
+      src = val_tmp;
+    }
+
+  if (n->range == 16)
+    bswap_stats.found_16bit++;
+  else if (n->range == 32)
+    bswap_stats.found_32bit++;
+  else
+    {
+      gcc_assert (n->range == 64);
+      bswap_stats.found_64bit++;
+    }
+
+  tmp = src;
+
+  /* Convert the src expression if necessary.  */
+  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
+    {
+      gimple convert_stmt;
+      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
+      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src, NULL);
+      gsi_insert_before (gsi, convert_stmt, GSI_SAME_STMT);
+    }
+
+  call = gimple_build_call (fndecl, 1, tmp);
+
+  tmp = tgt;
+
+  /* Convert the result if necessary.  */
+  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
+    {
+      gimple convert_stmt;
+      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
+      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
+      gsi_insert_after (gsi, convert_stmt, GSI_SAME_STMT);
+    }
+
+  gimple_call_set_lhs (call, tmp);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "%d bit bswap implementation found at: ",
+	       (int)n->range);
+      print_gimple_stmt (dump_file, stmt, 0, 0);
+    }
+
+  gsi_insert_after (gsi, call, GSI_SAME_STMT);
+  gsi_remove (gsi, true);
+}
+
+/* Find manual byte swap implementations as well as load in a given
+   endianness. Byte swaps are turned into a bswap builtin invokation
+   while endian loads are converted to bswap builtin invokation or
+   simple load according to the host endianness.  */
+
 unsigned int
 pass_optimize_bswap::execute (function *fun)
 {
@@ -1948,9 +2239,6 @@ pass_optimize_bswap::execute (function *fun)
 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
 		   || (bswap32_p && word_mode == SImode)));
 
-  if (!bswap16_p && !bswap32_p && !bswap64_p)
-    return 0;
-
   /* Determine the argument type of the builtins.  The code later on
      assumes that the return and argument type are the same.  */
   if (bswap16_p)
@@ -1971,6 +2259,7 @@ pass_optimize_bswap::execute (function *fun)
       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
     }
 
+  memset (&nop_stats, 0, sizeof (nop_stats));
   memset (&bswap_stats, 0, sizeof (bswap_stats));
 
   FOR_EACH_BB_FN (bb, fun)
@@ -1984,21 +2273,24 @@ pass_optimize_bswap::execute (function *fun)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
-	  tree bswap_tmp;
-	  tree fndecl = NULL_TREE;
-	  int type_size;
-	  gimple call;
+	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE;
+	  tree src, load_type;
+	  struct symbolic_number n;
+	  bool bswap;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  src = find_bswap_or_nop (stmt, &n, &bswap);
 
-	  switch (type_size)
+	  if (!src)
+	    continue;
+
+	  switch (n.range)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -2006,6 +2298,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -2013,6 +2306,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2023,62 +2317,21 @@ pass_optimize_bswap::execute (function *fun)
 	      continue;
 	    }
 
-	  if (!fndecl)
-	    continue;
-
-	  bswap_src = find_bswap (stmt);
-
-	  if (!bswap_src)
+	  if (bswap && !fndecl)
 	    continue;
 
+	  bswap_replace (stmt, &gsi, src, fndecl, bswap_type, load_type, &n,
+			 bswap);
 	  changed = true;
-	  if (type_size == 16)
-	    bswap_stats.found_16bit++;
-	  else if (type_size == 32)
-	    bswap_stats.found_32bit++;
-	  else
-	    bswap_stats.found_64bit++;
-
-	  bswap_tmp = bswap_src;
-
-	  /* Convert the src expression if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
-	    {
-	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
-	      convert_stmt = gimple_build_assign_with_ops
-		  		(NOP_EXPR, bswap_tmp, bswap_src, NULL);
-	      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
-	    }
-
-	  call = gimple_build_call (fndecl, 1, bswap_tmp);
-
-	  bswap_tmp = gimple_assign_lhs (stmt);
-
-	  /* Convert the result if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
-	    {
-	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
-	      convert_stmt = gimple_build_assign_with_ops
-			(NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
-	      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
-	    }
-
-	  gimple_call_set_lhs (call, bswap_tmp);
-
-	  if (dump_file)
-	    {
-	      fprintf (dump_file, "%d bit bswap implementation found at: ",
-		       (int)type_size);
-	      print_gimple_stmt (dump_file, stmt, 0, 0);
-	    }
-
-	  gsi_insert_after (&gsi, call, GSI_SAME_STMT);
-	  gsi_remove (&gsi, true);
 	}
     }
 
+  statistics_counter_event (fun, "16-bit nop implementations found",
+			    nop_stats.found_16bit);
+  statistics_counter_event (fun, "32-bit nop implementations found",
+			    nop_stats.found_32bit);
+  statistics_counter_event (fun, "64-bit nop implementations found",
+			    nop_stats.found_64bit);
   statistics_counter_event (fun, "16-bit bswap implementations found",
 			    bswap_stats.found_16bit);
   statistics_counter_event (fun, "32-bit bswap implementations found",
diff --git a/gcc/tree.h b/gcc/tree.h
index ae4876d..0cb2abe 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -4539,7 +4539,7 @@ extern tree build_personality_function (const char *);
    the access position and size.  */
 extern tree get_inner_reference (tree, HOST_WIDE_INT *, HOST_WIDE_INT *,
 				 tree *, enum machine_mode *, int *, int *,
-				 bool);
+				 bool, bool = false);
 
 /* Return a tree representing the lower bound of the array mentioned in
    EXP, an ARRAY_REF or an ARRAY_RANGE_REF.  */

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-09 10:26 [PATCH] Fix PR54733 Optimize endian independent load/store Thomas Preud'homme
@ 2014-05-16 10:07 ` Thomas Preud'homme
  2014-05-16 10:48   ` Richard Biener
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-16 10:07 UTC (permalink / raw)
  To: GCC Patches

Ping?

Best regards,

Thomas Preud'homme

> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
> Sent: Friday, May 09, 2014 6:26 PM
> To: GCC Patches
> Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
> 
> Sorry, took longer than expected as I got distracted by some other patch.
> I merged the whole patchset in a single patch as I was told the current setup
> is actually more difficult to read.
> 
> Here are the updated ChangeLogs:
> 
> *** gcc/ChangeLog ***
> 
> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
> 
> 	PR tree-optimization/54733
> 	* expr.c (get_inner_reference): Add a parameter to control whether
> a
> 	MEM_REF should be split into base + offset.
> 	* tree.h (get_inner_reference): Default new parameter to false.
> 	* tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
> 	(CMPNOP): Define.
> 	(find_bswap_or_nop_load): New.
> 	(find_bswap_1): Renamed to ...
> 	(find_bswap_or_nop_1): This. Also add support for memory source.
> 	(find_bswap): Renamed to ...
> 	(find_bswap_or_nop): This. Also add support for memory source and
> 	detection of bitwise operations equivalent to load in host endianness.
> 	(execute_optimize_bswap): Likewise. Also move its leading
> comment back
> 	in place and split statement transformation into ...
> 	(bswap_replace): This. Add assert when updating bswap_stats.
> 
> *** gcc/testsuite/ChangeLog ***
> 
> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
> 
> 	PR tree-optimization/54733
> 	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of
> bswap
> 	optimization to support memory sources and bitwise operations
> 	equivalent to load in host endianness.
> 	* gcc.dg/optimize-bswaphi-1.c: Likewise.
> 	* gcc.dg/optimize-bswapsi-2.c: Likewise.
> 	* gcc.c-torture/execute/bswap-2.c: Likewise.
> 
> Ok for trunk?
> 
> Best regards,
> 
> Thomas
> 
> > -----Original Message-----
> > From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> > owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
> > Sent: Monday, May 05, 2014 7:30 PM
> > To: GCC Patches
> > Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
> > load/store
> >
> > I found a way to improve the function find_bswap/find_bswap_or_nop
> > and reduce its size. Please hold for the review, I will post an updated
> > version as soon as I finish testing.
> >
> > Best regards,
> >
> > Thomas Preud'homme
> >
> >
> >


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 10:07 ` Thomas Preud'homme
@ 2014-05-16 10:48   ` Richard Biener
  2014-05-16 10:56     ` pinskia
  0 siblings, 1 reply; 31+ messages in thread
From: Richard Biener @ 2014-05-16 10:48 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
> Ping?

Sorry ...

> Best regards,
>
> Thomas Preud'homme
>
>> -----Original Message-----
>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>> Sent: Friday, May 09, 2014 6:26 PM
>> To: GCC Patches
>> Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
>>
>> Sorry, took longer than expected as I got distracted by some other patch.
>> I merged the whole patchset in a single patch as I was told the current setup
>> is actually more difficult to read.
>>
>> Here are the updated ChangeLogs:
>>
>> *** gcc/ChangeLog ***
>>
>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>
>>       PR tree-optimization/54733
>>       * expr.c (get_inner_reference): Add a parameter to control whether
>> a
>>       MEM_REF should be split into base + offset.
>>       * tree.h (get_inner_reference): Default new parameter to false.
>>       * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>>       (CMPNOP): Define.
>>       (find_bswap_or_nop_load): New.
>>       (find_bswap_1): Renamed to ...
>>       (find_bswap_or_nop_1): This. Also add support for memory source.
>>       (find_bswap): Renamed to ...
>>       (find_bswap_or_nop): This. Also add support for memory source and
>>       detection of bitwise operations equivalent to load in host endianness.
>>       (execute_optimize_bswap): Likewise. Also move its leading
>> comment back
>>       in place and split statement transformation into ...
>>       (bswap_replace): This. Add assert when updating bswap_stats.
>>
>> *** gcc/testsuite/ChangeLog ***
>>
>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>
>>       PR tree-optimization/54733
>>       * gcc.dg/optimize-bswapdi-3.c: New test to check extension of
>> bswap
>>       optimization to support memory sources and bitwise operations
>>       equivalent to load in host endianness.
>>       * gcc.dg/optimize-bswaphi-1.c: Likewise.
>>       * gcc.dg/optimize-bswapsi-2.c: Likewise.
>>       * gcc.c-torture/execute/bswap-2.c: Likewise.
>>
>> Ok for trunk?

Ok, I now decided otherwise and dislike the new parameter to
get_inner_reference.  Can you please revert that part and just
deal with a MEM_REF result in your only caller?

And (of course) I found another possible issue.  The way you
compute load_type and use it here:

+      /* Perform the load.  */
+      load_offset_ptr = build_int_cst (n->alias_set, 0);
+      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
+                             load_offset_ptr);

makes the load always appear aligned according to the mode of
load_type.  On strict-alignment targets this may cause faults.

So what you have to do is either (simpler)

   unsigned int align = get_pointer_alignment (addr_tmp);
   tree al_load_type = load_type;
   if (align < TYPE_ALIGN (load_type))
     al_load_type = build_aligned_type (load_type, align);
...
    val_expr = fold_build2 (MEM_REF, al_load_type, addr_tmp,
                                     load_offset_ptr);

or keep track of the "first" actual load and use

   unsigned int align = get_object_alignment (that_first_load);

"first" in the one that corresponds to addr_tmp.  From that there
is a much better chance to derive good alignment values.

Of course on STRICT_ALIGNMENT targets a not aligned load
will be decomposed again, so eventually doing the transformation
may no longer be profitable(?).

Thanks and sorry again for the delay.

Otherwise the patch looks good to me.

Richard.

>> Best regards,
>>
>> Thomas
>>
>> > -----Original Message-----
>> > From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>> > owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>> > Sent: Monday, May 05, 2014 7:30 PM
>> > To: GCC Patches
>> > Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
>> > load/store
>> >
>> > I found a way to improve the function find_bswap/find_bswap_or_nop
>> > and reduce its size. Please hold for the review, I will post an updated
>> > version as soon as I finish testing.
>> >
>> > Best regards,
>> >
>> > Thomas Preud'homme
>> >
>> >
>> >
>
>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 10:48   ` Richard Biener
@ 2014-05-16 10:56     ` pinskia
  2014-05-16 11:03       ` Richard Biener
  0 siblings, 1 reply; 31+ messages in thread
From: pinskia @ 2014-05-16 10:56 UTC (permalink / raw)
  To: Richard Biener; +Cc: Thomas Preud'homme, GCC Patches



> On May 16, 2014, at 3:48 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> 
> On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
> <thomas.preudhomme@arm.com> wrote:
>> Ping?
> 
> Sorry ...
> 
>> Best regards,
>> 
>> Thomas Preud'homme
>> 
>>> -----Original Message-----
>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>> Sent: Friday, May 09, 2014 6:26 PM
>>> To: GCC Patches
>>> Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
>>> 
>>> Sorry, took longer than expected as I got distracted by some other patch.
>>> I merged the whole patchset in a single patch as I was told the current setup
>>> is actually more difficult to read.
>>> 
>>> Here are the updated ChangeLogs:
>>> 
>>> *** gcc/ChangeLog ***
>>> 
>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>> 
>>>      PR tree-optimization/54733
>>>      * expr.c (get_inner_reference): Add a parameter to control whether
>>> a
>>>      MEM_REF should be split into base + offset.
>>>      * tree.h (get_inner_reference): Default new parameter to false.
>>>      * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>>>      (CMPNOP): Define.
>>>      (find_bswap_or_nop_load): New.
>>>      (find_bswap_1): Renamed to ...
>>>      (find_bswap_or_nop_1): This. Also add support for memory source.
>>>      (find_bswap): Renamed to ...
>>>      (find_bswap_or_nop): This. Also add support for memory source and
>>>      detection of bitwise operations equivalent to load in host endianness.
>>>      (execute_optimize_bswap): Likewise. Also move its leading
>>> comment back
>>>      in place and split statement transformation into ...
>>>      (bswap_replace): This. Add assert when updating bswap_stats.
>>> 
>>> *** gcc/testsuite/ChangeLog ***
>>> 
>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>> 
>>>      PR tree-optimization/54733
>>>      * gcc.dg/optimize-bswapdi-3.c: New test to check extension of
>>> bswap
>>>      optimization to support memory sources and bitwise operations
>>>      equivalent to load in host endianness.
>>>      * gcc.dg/optimize-bswaphi-1.c: Likewise.
>>>      * gcc.dg/optimize-bswapsi-2.c: Likewise.
>>>      * gcc.c-torture/execute/bswap-2.c: Likewise.
>>> 
>>> Ok for trunk?
> 
> Ok, I now decided otherwise and dislike the new parameter to
> get_inner_reference.  Can you please revert that part and just
> deal with a MEM_REF result in your only caller?
> 
> And (of course) I found another possible issue.  The way you
> compute load_type and use it here:
> 
> +      /* Perform the load.  */
> +      load_offset_ptr = build_int_cst (n->alias_set, 0);
> +      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
> +                             load_offset_ptr);
> 
> makes the load always appear aligned according to the mode of
> load_type.  On strict-alignment targets this may cause faults.
> 
> So what you have to do is either (simpler)
> 
>   unsigned int align = get_pointer_alignment (addr_tmp);
>   tree al_load_type = load_type;
>   if (align < TYPE_ALIGN (load_type))
>     al_load_type = build_aligned_type (load_type, align);
> ...
>    val_expr = fold_build2 (MEM_REF, al_load_type, addr_tmp,
>                                     load_offset_ptr);
> 
> or keep track of the "first" actual load and use
> 
>   unsigned int align = get_object_alignment (that_first_load);
> 
> "first" in the one that corresponds to addr_tmp.  From that there
> is a much better chance to derive good alignment values.
> 
> Of course on STRICT_ALIGNMENT targets a not aligned load
> will be decomposed again, so eventually doing the transformation
> may no longer be profitable(?).

Not always decomposed. On MIPS, it should using the load/store left/right instructions for unaligned load/stores which is normally better than decomposed load/stores. So having a cost model would be nice. 

Thanks,
Andrew

> 
> Thanks and sorry again for the delay.
> 
> Otherwise the patch looks good to me.
> 
> Richard.
> 
>>> Best regards,
>>> 
>>> Thomas
>>> 
>>>> -----Original Message-----
>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>> Sent: Monday, May 05, 2014 7:30 PM
>>>> To: GCC Patches
>>>> Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
>>>> load/store
>>>> 
>>>> I found a way to improve the function find_bswap/find_bswap_or_nop
>>>> and reduce its size. Please hold for the review, I will post an updated
>>>> version as soon as I finish testing.
>>>> 
>>>> Best regards,
>>>> 
>>>> Thomas Preud'homme
>> 
>> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 10:56     ` pinskia
@ 2014-05-16 11:03       ` Richard Biener
  2014-05-16 11:14         ` Richard Biener
  2014-05-20  2:46         ` Thomas Preud'homme
  0 siblings, 2 replies; 31+ messages in thread
From: Richard Biener @ 2014-05-16 11:03 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Thomas Preud'homme, GCC Patches

On Fri, May 16, 2014 at 12:56 PM,  <pinskia@gmail.com> wrote:
>
>
>> On May 16, 2014, at 3:48 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>
>> On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
>> <thomas.preudhomme@arm.com> wrote:
>>> Ping?
>>
>> Sorry ...
>>
>>> Best regards,
>>>
>>> Thomas Preud'homme
>>>
>>>> -----Original Message-----
>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>> Sent: Friday, May 09, 2014 6:26 PM
>>>> To: GCC Patches
>>>> Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
>>>>
>>>> Sorry, took longer than expected as I got distracted by some other patch.
>>>> I merged the whole patchset in a single patch as I was told the current setup
>>>> is actually more difficult to read.
>>>>
>>>> Here are the updated ChangeLogs:
>>>>
>>>> *** gcc/ChangeLog ***
>>>>
>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>
>>>>      PR tree-optimization/54733
>>>>      * expr.c (get_inner_reference): Add a parameter to control whether
>>>> a
>>>>      MEM_REF should be split into base + offset.
>>>>      * tree.h (get_inner_reference): Default new parameter to false.
>>>>      * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>>>>      (CMPNOP): Define.
>>>>      (find_bswap_or_nop_load): New.
>>>>      (find_bswap_1): Renamed to ...
>>>>      (find_bswap_or_nop_1): This. Also add support for memory source.
>>>>      (find_bswap): Renamed to ...
>>>>      (find_bswap_or_nop): This. Also add support for memory source and
>>>>      detection of bitwise operations equivalent to load in host endianness.
>>>>      (execute_optimize_bswap): Likewise. Also move its leading
>>>> comment back
>>>>      in place and split statement transformation into ...
>>>>      (bswap_replace): This. Add assert when updating bswap_stats.
>>>>
>>>> *** gcc/testsuite/ChangeLog ***
>>>>
>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>
>>>>      PR tree-optimization/54733
>>>>      * gcc.dg/optimize-bswapdi-3.c: New test to check extension of
>>>> bswap
>>>>      optimization to support memory sources and bitwise operations
>>>>      equivalent to load in host endianness.
>>>>      * gcc.dg/optimize-bswaphi-1.c: Likewise.
>>>>      * gcc.dg/optimize-bswapsi-2.c: Likewise.
>>>>      * gcc.c-torture/execute/bswap-2.c: Likewise.
>>>>
>>>> Ok for trunk?
>>
>> Ok, I now decided otherwise and dislike the new parameter to
>> get_inner_reference.  Can you please revert that part and just
>> deal with a MEM_REF result in your only caller?
>>
>> And (of course) I found another possible issue.  The way you
>> compute load_type and use it here:
>>
>> +      /* Perform the load.  */
>> +      load_offset_ptr = build_int_cst (n->alias_set, 0);
>> +      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
>> +                             load_offset_ptr);
>>
>> makes the load always appear aligned according to the mode of
>> load_type.  On strict-alignment targets this may cause faults.
>>
>> So what you have to do is either (simpler)
>>
>>   unsigned int align = get_pointer_alignment (addr_tmp);
>>   tree al_load_type = load_type;
>>   if (align < TYPE_ALIGN (load_type))
>>     al_load_type = build_aligned_type (load_type, align);
>> ...
>>    val_expr = fold_build2 (MEM_REF, al_load_type, addr_tmp,
>>                                     load_offset_ptr);
>>
>> or keep track of the "first" actual load and use
>>
>>   unsigned int align = get_object_alignment (that_first_load);
>>
>> "first" in the one that corresponds to addr_tmp.  From that there
>> is a much better chance to derive good alignment values.
>>
>> Of course on STRICT_ALIGNMENT targets a not aligned load
>> will be decomposed again, so eventually doing the transformation
>> may no longer be profitable(?).
>
> Not always decomposed. On MIPS, it should using the load/store left/right instructions for unaligned load/stores which is normally better than decomposed load/stores. So having a cost model would be nice.

Agreed, but I am happy with doing that as a followup.  Btw,
a very simple one would be to reject unaligned
SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align).
[of course that may be true on MIPS even for the cases where
a "reasonable" fast unalgined variant exists - nearly no target
defines that macro in a too fancy way]

Richard.

> Thanks,
> Andrew
>
>>
>> Thanks and sorry again for the delay.
>>
>> Otherwise the patch looks good to me.
>>
>> Richard.
>>
>>>> Best regards,
>>>>
>>>> Thomas
>>>>
>>>>> -----Original Message-----
>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>>> Sent: Monday, May 05, 2014 7:30 PM
>>>>> To: GCC Patches
>>>>> Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
>>>>> load/store
>>>>>
>>>>> I found a way to improve the function find_bswap/find_bswap_or_nop
>>>>> and reduce its size. Please hold for the review, I will post an updated
>>>>> version as soon as I finish testing.
>>>>>
>>>>> Best regards,
>>>>>
>>>>> Thomas Preud'homme
>>>
>>>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 11:03       ` Richard Biener
@ 2014-05-16 11:14         ` Richard Biener
  2014-05-16 16:59           ` pinskia
  2014-05-19  9:30           ` Thomas Preud'homme
  2014-05-20  2:46         ` Thomas Preud'homme
  1 sibling, 2 replies; 31+ messages in thread
From: Richard Biener @ 2014-05-16 11:14 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Thomas Preud'homme, GCC Patches

On Fri, May 16, 2014 at 1:03 PM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Fri, May 16, 2014 at 12:56 PM,  <pinskia@gmail.com> wrote:
>>
>>
>>> On May 16, 2014, at 3:48 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>
>>> On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
>>> <thomas.preudhomme@arm.com> wrote:
>>>> Ping?
>>>
>>> Sorry ...
>>>
>>>> Best regards,
>>>>
>>>> Thomas Preud'homme
>>>>
>>>>> -----Original Message-----
>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>>> Sent: Friday, May 09, 2014 6:26 PM
>>>>> To: GCC Patches
>>>>> Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
>>>>>
>>>>> Sorry, took longer than expected as I got distracted by some other patch.
>>>>> I merged the whole patchset in a single patch as I was told the current setup
>>>>> is actually more difficult to read.
>>>>>
>>>>> Here are the updated ChangeLogs:
>>>>>
>>>>> *** gcc/ChangeLog ***
>>>>>
>>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>>
>>>>>      PR tree-optimization/54733
>>>>>      * expr.c (get_inner_reference): Add a parameter to control whether
>>>>> a
>>>>>      MEM_REF should be split into base + offset.
>>>>>      * tree.h (get_inner_reference): Default new parameter to false.
>>>>>      * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>>>>>      (CMPNOP): Define.
>>>>>      (find_bswap_or_nop_load): New.
>>>>>      (find_bswap_1): Renamed to ...
>>>>>      (find_bswap_or_nop_1): This. Also add support for memory source.
>>>>>      (find_bswap): Renamed to ...
>>>>>      (find_bswap_or_nop): This. Also add support for memory source and
>>>>>      detection of bitwise operations equivalent to load in host endianness.
>>>>>      (execute_optimize_bswap): Likewise. Also move its leading
>>>>> comment back
>>>>>      in place and split statement transformation into ...
>>>>>      (bswap_replace): This. Add assert when updating bswap_stats.
>>>>>
>>>>> *** gcc/testsuite/ChangeLog ***
>>>>>
>>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>>
>>>>>      PR tree-optimization/54733
>>>>>      * gcc.dg/optimize-bswapdi-3.c: New test to check extension of
>>>>> bswap
>>>>>      optimization to support memory sources and bitwise operations
>>>>>      equivalent to load in host endianness.
>>>>>      * gcc.dg/optimize-bswaphi-1.c: Likewise.
>>>>>      * gcc.dg/optimize-bswapsi-2.c: Likewise.
>>>>>      * gcc.c-torture/execute/bswap-2.c: Likewise.
>>>>>
>>>>> Ok for trunk?
>>>
>>> Ok, I now decided otherwise and dislike the new parameter to
>>> get_inner_reference.  Can you please revert that part and just
>>> deal with a MEM_REF result in your only caller?
>>>
>>> And (of course) I found another possible issue.  The way you
>>> compute load_type and use it here:
>>>
>>> +      /* Perform the load.  */
>>> +      load_offset_ptr = build_int_cst (n->alias_set, 0);
>>> +      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
>>> +                             load_offset_ptr);
>>>
>>> makes the load always appear aligned according to the mode of
>>> load_type.  On strict-alignment targets this may cause faults.
>>>
>>> So what you have to do is either (simpler)
>>>
>>>   unsigned int align = get_pointer_alignment (addr_tmp);
>>>   tree al_load_type = load_type;
>>>   if (align < TYPE_ALIGN (load_type))
>>>     al_load_type = build_aligned_type (load_type, align);
>>> ...
>>>    val_expr = fold_build2 (MEM_REF, al_load_type, addr_tmp,
>>>                                     load_offset_ptr);
>>>
>>> or keep track of the "first" actual load and use
>>>
>>>   unsigned int align = get_object_alignment (that_first_load);
>>>
>>> "first" in the one that corresponds to addr_tmp.  From that there
>>> is a much better chance to derive good alignment values.
>>>
>>> Of course on STRICT_ALIGNMENT targets a not aligned load
>>> will be decomposed again, so eventually doing the transformation
>>> may no longer be profitable(?).
>>
>> Not always decomposed. On MIPS, it should using the load/store left/right instructions for unaligned load/stores which is normally better than decomposed load/stores. So having a cost model would be nice.
>
> Agreed, but I am happy with doing that as a followup.  Btw,
> a very simple one would be to reject unaligned
> SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align).
> [of course that may be true on MIPS even for the cases where
> a "reasonable" fast unalgined variant exists - nearly no target
> defines that macro in a too fancy way]

Oh, and what happens for

unsigned foo (unsigned char *x)
{
  return x[0] << 24 | x[2] << 8 | x[3];
}

?  We could do an unsigned int load from x and zero byte 3
with an AND.  Enhancement for a followup, similar to also
considering vector types for the load (also I'm not sure
that uint64_type_node always has non-BLKmode for all
targets).

Richard.

> Richard.
>
>> Thanks,
>> Andrew
>>
>>>
>>> Thanks and sorry again for the delay.
>>>
>>> Otherwise the patch looks good to me.
>>>
>>> Richard.
>>>
>>>>> Best regards,
>>>>>
>>>>> Thomas
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>>>> Sent: Monday, May 05, 2014 7:30 PM
>>>>>> To: GCC Patches
>>>>>> Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
>>>>>> load/store
>>>>>>
>>>>>> I found a way to improve the function find_bswap/find_bswap_or_nop
>>>>>> and reduce its size. Please hold for the review, I will post an updated
>>>>>> version as soon as I finish testing.
>>>>>>
>>>>>> Best regards,
>>>>>>
>>>>>> Thomas Preud'homme
>>>>
>>>>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 11:14         ` Richard Biener
@ 2014-05-16 16:59           ` pinskia
  2014-05-19  2:49             ` Thomas Preud'homme
  2014-05-20  2:50             ` Andrew Pinski
  2014-05-19  9:30           ` Thomas Preud'homme
  1 sibling, 2 replies; 31+ messages in thread
From: pinskia @ 2014-05-16 16:59 UTC (permalink / raw)
  To: Richard Biener; +Cc: Thomas Preud'homme, GCC Patches



> On May 16, 2014, at 4:13 AM, Richard Biener <richard.guenther@gmail.com> wrote:
> 
> On Fri, May 16, 2014 at 1:03 PM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Fri, May 16, 2014 at 12:56 PM,  <pinskia@gmail.com> wrote:
>>> 
>>> 
>>>> On May 16, 2014, at 3:48 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>> 
>>>> On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
>>>> <thomas.preudhomme@arm.com> wrote:
>>>>> Ping?
>>>> 
>>>> Sorry ...
>>>> 
>>>>> Best regards,
>>>>> 
>>>>> Thomas Preud'homme
>>>>> 
>>>>>> -----Original Message-----
>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>>>> Sent: Friday, May 09, 2014 6:26 PM
>>>>>> To: GCC Patches
>>>>>> Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
>>>>>> 
>>>>>> Sorry, took longer than expected as I got distracted by some other patch.
>>>>>> I merged the whole patchset in a single patch as I was told the current setup
>>>>>> is actually more difficult to read.
>>>>>> 
>>>>>> Here are the updated ChangeLogs:
>>>>>> 
>>>>>> *** gcc/ChangeLog ***
>>>>>> 
>>>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>>> 
>>>>>>     PR tree-optimization/54733
>>>>>>     * expr.c (get_inner_reference): Add a parameter to control whether
>>>>>> a
>>>>>>     MEM_REF should be split into base + offset.
>>>>>>     * tree.h (get_inner_reference): Default new parameter to false.
>>>>>>     * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>>>>>>     (CMPNOP): Define.
>>>>>>     (find_bswap_or_nop_load): New.
>>>>>>     (find_bswap_1): Renamed to ...
>>>>>>     (find_bswap_or_nop_1): This. Also add support for memory source.
>>>>>>     (find_bswap): Renamed to ...
>>>>>>     (find_bswap_or_nop): This. Also add support for memory source and
>>>>>>     detection of bitwise operations equivalent to load in host endianness.
>>>>>>     (execute_optimize_bswap): Likewise. Also move its leading
>>>>>> comment back
>>>>>>     in place and split statement transformation into ...
>>>>>>     (bswap_replace): This. Add assert when updating bswap_stats.
>>>>>> 
>>>>>> *** gcc/testsuite/ChangeLog ***
>>>>>> 
>>>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>>> 
>>>>>>     PR tree-optimization/54733
>>>>>>     * gcc.dg/optimize-bswapdi-3.c: New test to check extension of
>>>>>> bswap
>>>>>>     optimization to support memory sources and bitwise operations
>>>>>>     equivalent to load in host endianness.
>>>>>>     * gcc.dg/optimize-bswaphi-1.c: Likewise.
>>>>>>     * gcc.dg/optimize-bswapsi-2.c: Likewise.
>>>>>>     * gcc.c-torture/execute/bswap-2.c: Likewise.
>>>>>> 
>>>>>> Ok for trunk?
>>>> 
>>>> Ok, I now decided otherwise and dislike the new parameter to
>>>> get_inner_reference.  Can you please revert that part and just
>>>> deal with a MEM_REF result in your only caller?
>>>> 
>>>> And (of course) I found another possible issue.  The way you
>>>> compute load_type and use it here:
>>>> 
>>>> +      /* Perform the load.  */
>>>> +      load_offset_ptr = build_int_cst (n->alias_set, 0);
>>>> +      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
>>>> +                             load_offset_ptr);
>>>> 
>>>> makes the load always appear aligned according to the mode of
>>>> load_type.  On strict-alignment targets this may cause faults.
>>>> 
>>>> So what you have to do is either (simpler)
>>>> 
>>>>  unsigned int align = get_pointer_alignment (addr_tmp);
>>>>  tree al_load_type = load_type;
>>>>  if (align < TYPE_ALIGN (load_type))
>>>>    al_load_type = build_aligned_type (load_type, align);
>>>> ...
>>>>   val_expr = fold_build2 (MEM_REF, al_load_type, addr_tmp,
>>>>                                    load_offset_ptr);
>>>> 
>>>> or keep track of the "first" actual load and use
>>>> 
>>>>  unsigned int align = get_object_alignment (that_first_load);
>>>> 
>>>> "first" in the one that corresponds to addr_tmp.  From that there
>>>> is a much better chance to derive good alignment values.
>>>> 
>>>> Of course on STRICT_ALIGNMENT targets a not aligned load
>>>> will be decomposed again, so eventually doing the transformation
>>>> may no longer be profitable(?).
>>> 
>>> Not always decomposed. On MIPS, it should using the load/store left/right instructions for unaligned load/stores which is normally better than decomposed load/stores. So having a cost model would be nice.
>> 
>> Agreed, but I am happy with doing that as a followup.  Btw,
>> a very simple one would be to reject unaligned
>> SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align).
>> [of course that may be true on MIPS even for the cases where
>> a "reasonable" fast unalgined variant exists - nearly no target
>> defines that macro in a too fancy way]
> 
> Oh, and what happens for
> 
> unsigned foo (unsigned char *x)
> {
>  return x[0] << 24 | x[2] << 8 | x[3];
> }
> 
> ?  We could do an unsigned int load from x and zero byte 3
> with an AND.  Enhancement for a followup, similar to also
> considering vector types for the load (also I'm not sure
> that uint64_type_node always has non-BLKmode for all
> targets).

No we cannot if x[4] is on a different page which is not mapped in, we get a fault. Not something we want. 

Thanks,
Andrew


> 
> Richard.
> 
>> Richard.
>> 
>>> Thanks,
>>> Andrew
>>> 
>>>> 
>>>> Thanks and sorry again for the delay.
>>>> 
>>>> Otherwise the patch looks good to me.
>>>> 
>>>> Richard.
>>>> 
>>>>>> Best regards,
>>>>>> 
>>>>>> Thomas
>>>>>> 
>>>>>>> -----Original Message-----
>>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>>>>> Sent: Monday, May 05, 2014 7:30 PM
>>>>>>> To: GCC Patches
>>>>>>> Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
>>>>>>> load/store
>>>>>>> 
>>>>>>> I found a way to improve the function find_bswap/find_bswap_or_nop
>>>>>>> and reduce its size. Please hold for the review, I will post an updated
>>>>>>> version as soon as I finish testing.
>>>>>>> 
>>>>>>> Best regards,
>>>>>>> 
>>>>>>> Thomas Preud'homme
>>>>> 
>>>>> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 16:59           ` pinskia
@ 2014-05-19  2:49             ` Thomas Preud'homme
  2014-05-20  2:50             ` Andrew Pinski
  1 sibling, 0 replies; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-19  2:49 UTC (permalink / raw)
  To: GCC Patches

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
> <thomas.preudhomme@arm.com> wrote:
> > Ping?
> 
> Sorry ...

> 
> Thanks and sorry again for the delay.
> 

No need to be sorry, it was really not meant as a complaint. I understand very
well that patches sometimes go under the radar and I just wanted to make
sure someone saw it.


> From: pinskia@gmail.com [mailto:pinskia@gmail.com]
> 
> Not always decomposed. On MIPS, it should using the load/store left/right
> instructions for unaligned load/stores which is normally better than
> decomposed load/stores. So having a cost model would be nice.
> 

This makes me think about the following situation:

uint32_t
read_le (unsigned char data[])
{
  uint16_t lo, hi;

  hi = data[0] | (data[1] << 8);
  lo = (data[2] << 16) | (data[3] << 24);
  printf ("lo + hi: %d\n", lo + hi);
  return hi | lo;
}

Currently this will do a load of 4 bytes and do a bswap on big endian target but
It would be better to just handle hi and lo separately doing two 2 bytes load
and doing a bitwise OR of these two. So check if two SSA_NAME are used by
other statements and if yes stop there.


> From: Richard Biener [mailto:richard.guenther@gmail.com]
> 
> Oh, and what happens for
> 
> unsigned foo (unsigned char *x)
> {
>   return x[0] << 24 | x[2] << 8 | x[3];
> }
> 
> ?  We could do an unsigned int load from x and zero byte 3
> with an AND.  Enhancement for a followup, similar to also
> considering vector types for the load (also I'm not sure
> that uint64_type_node always has non-BLKmode for all
> targets).
> 

Looks like a nice improvement to the patch indeed.


> From: pinskia@gmail.com [mailto:pinskia@gmail.com]
> 
> No we cannot if x[4] is on a different page which is not mapped in, we get a
> fault. Not something we want.
> 

Why would x[4] be loaded? I guess Richard was only suggesting doing a single
load + zeroing only when the untouched array entry is neither the first nor the
last, that is when there is a load.

Best regards,

Thomas Preud'homme



^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 11:14         ` Richard Biener
  2014-05-16 16:59           ` pinskia
@ 2014-05-19  9:30           ` Thomas Preud'homme
  2014-05-19  9:46             ` Richard Biener
  1 sibling, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-19  9:30 UTC (permalink / raw)
  To: GCC Patches

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> 
> Oh, and what happens for
> 
> unsigned foo (unsigned char *x)
> {
>   return x[0] << 24 | x[2] << 8 | x[3];
> }
> 
> ?  We could do an unsigned int load from x and zero byte 3
> with an AND.  Enhancement for a followup, similar to also
> considering vector types for the load (also I'm not sure
> that uint64_type_node always has non-BLKmode for all
> targets).

I implemented this but it makes the statistics more difficult to read
and thus the test more difficult to write. Indeed, when doing a full
bswap many of the intermediate computations are partial bswap.
I can get on with it by using a different string to notify of partial
bswap than full bswap and test the partial bswap feature in a
separate file to not get noise from the full bswap. However I still
don't like the idea of notifying for partial bswap in statements that
will eventually be eliminated as dead statements. Besides, this
occur because some recursion is made for each statement even
if they were already examined which is not very nice in itself.

The solution I see is to keep a map of visited statements but I
don't know if that's fine in this case. After all, it's not strictly
necessary as the pass would work without this. It would just serve
to avoid redundant computation and confusion statistics.

Best regards,

Thomas


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-19  9:30           ` Thomas Preud'homme
@ 2014-05-19  9:46             ` Richard Biener
  0 siblings, 0 replies; 31+ messages in thread
From: Richard Biener @ 2014-05-19  9:46 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Mon, May 19, 2014 at 11:30 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>>
>> Oh, and what happens for
>>
>> unsigned foo (unsigned char *x)
>> {
>>   return x[0] << 24 | x[2] << 8 | x[3];
>> }
>>
>> ?  We could do an unsigned int load from x and zero byte 3
>> with an AND.  Enhancement for a followup, similar to also
>> considering vector types for the load (also I'm not sure
>> that uint64_type_node always has non-BLKmode for all
>> targets).
>
> I implemented this but it makes the statistics more difficult to read
> and thus the test more difficult to write. Indeed, when doing a full
> bswap many of the intermediate computations are partial bswap.
> I can get on with it by using a different string to notify of partial
> bswap than full bswap and test the partial bswap feature in a
> separate file to not get noise from the full bswap. However I still
> don't like the idea of notifying for partial bswap in statements that
> will eventually be eliminated as dead statements. Besides, this
> occur because some recursion is made for each statement even
> if they were already examined which is not very nice in itself.

Ah, indeed ...

> The solution I see is to keep a map of visited statements but I
> don't know if that's fine in this case. After all, it's not strictly
> necessary as the pass would work without this. It would just serve
> to avoid redundant computation and confusion statistics.

... having bswap cleanup after itself (thus remove dead code itself
would be nice).  Let's just keep the above as a possibility for
further enhancements and focus on getting the current patch
correct and committed.

Btw, another enhancement is memory target.  Thus recognize

void bswap_mem (char *m1, char *m2)
{
  m2[0] = m1[3];
  m2[1] = m1[2];
  m2[2] = m1[1];
  m2[3] = m1[0];
}

with all the complication that arises due to aliasing issues.

Thanks,
Richard.

> Best regards,
>
> Thomas
>
>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 11:03       ` Richard Biener
  2014-05-16 11:14         ` Richard Biener
@ 2014-05-20  2:46         ` Thomas Preud'homme
  2014-05-20  9:06           ` Richard Biener
  1 sibling, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-20  2:46 UTC (permalink / raw)
  To: GCC Patches

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> 
> Agreed, but I am happy with doing that as a followup.  Btw,
> a very simple one would be to reject unaligned
> SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align).
> [of course that may be true on MIPS even for the cases where
> a "reasonable" fast unalgined variant exists - nearly no target
> defines that macro in a too fancy way]

Indeed, it's defined to 1 without consideration of the mode or alignment
At least ARM, alpha, tilegx, tilepro and all target with STRICT_ALIGNMENT
since that's the default value for SLOW_UNALIGNED_ACCESS macro. Thus
mips should be in there too for instance.

However, I fail to see how the code produced to do an unaligned load
could be worse than the manual load done in the original bitwise
expression. It might be worse for load + bswap though. Maybe I could
skip the optimization based on this macro only for bswap?

Best regards,

Thomas


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-16 16:59           ` pinskia
  2014-05-19  2:49             ` Thomas Preud'homme
@ 2014-05-20  2:50             ` Andrew Pinski
  1 sibling, 0 replies; 31+ messages in thread
From: Andrew Pinski @ 2014-05-20  2:50 UTC (permalink / raw)
  To: Richard Biener; +Cc: Thomas Preud'homme, GCC Patches

On Fri, May 16, 2014 at 9:58 AM,  <pinskia@gmail.com> wrote:
>
>
>> On May 16, 2014, at 4:13 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>
>> On Fri, May 16, 2014 at 1:03 PM, Richard Biener
>> <richard.guenther@gmail.com> wrote:
>>> On Fri, May 16, 2014 at 12:56 PM,  <pinskia@gmail.com> wrote:
>>>>
>>>>
>>>>> On May 16, 2014, at 3:48 AM, Richard Biener <richard.guenther@gmail.com> wrote:
>>>>>
>>>>> On Fri, May 16, 2014 at 12:07 PM, Thomas Preud'homme
>>>>> <thomas.preudhomme@arm.com> wrote:
>>>>>> Ping?
>>>>>
>>>>> Sorry ...
>>>>>
>>>>>> Best regards,
>>>>>>
>>>>>> Thomas Preud'homme
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>>>>> Sent: Friday, May 09, 2014 6:26 PM
>>>>>>> To: GCC Patches
>>>>>>> Subject: RE: [PATCH] Fix PR54733 Optimize endian independent load/store
>>>>>>>
>>>>>>> Sorry, took longer than expected as I got distracted by some other patch.
>>>>>>> I merged the whole patchset in a single patch as I was told the current setup
>>>>>>> is actually more difficult to read.
>>>>>>>
>>>>>>> Here are the updated ChangeLogs:
>>>>>>>
>>>>>>> *** gcc/ChangeLog ***
>>>>>>>
>>>>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>>>>
>>>>>>>     PR tree-optimization/54733
>>>>>>>     * expr.c (get_inner_reference): Add a parameter to control whether
>>>>>>> a
>>>>>>>     MEM_REF should be split into base + offset.
>>>>>>>     * tree.h (get_inner_reference): Default new parameter to false.
>>>>>>>     * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>>>>>>>     (CMPNOP): Define.
>>>>>>>     (find_bswap_or_nop_load): New.
>>>>>>>     (find_bswap_1): Renamed to ...
>>>>>>>     (find_bswap_or_nop_1): This. Also add support for memory source.
>>>>>>>     (find_bswap): Renamed to ...
>>>>>>>     (find_bswap_or_nop): This. Also add support for memory source and
>>>>>>>     detection of bitwise operations equivalent to load in host endianness.
>>>>>>>     (execute_optimize_bswap): Likewise. Also move its leading
>>>>>>> comment back
>>>>>>>     in place and split statement transformation into ...
>>>>>>>     (bswap_replace): This. Add assert when updating bswap_stats.
>>>>>>>
>>>>>>> *** gcc/testsuite/ChangeLog ***
>>>>>>>
>>>>>>> 2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>>>>>>>
>>>>>>>     PR tree-optimization/54733
>>>>>>>     * gcc.dg/optimize-bswapdi-3.c: New test to check extension of
>>>>>>> bswap
>>>>>>>     optimization to support memory sources and bitwise operations
>>>>>>>     equivalent to load in host endianness.
>>>>>>>     * gcc.dg/optimize-bswaphi-1.c: Likewise.
>>>>>>>     * gcc.dg/optimize-bswapsi-2.c: Likewise.
>>>>>>>     * gcc.c-torture/execute/bswap-2.c: Likewise.
>>>>>>>
>>>>>>> Ok for trunk?
>>>>>
>>>>> Ok, I now decided otherwise and dislike the new parameter to
>>>>> get_inner_reference.  Can you please revert that part and just
>>>>> deal with a MEM_REF result in your only caller?
>>>>>
>>>>> And (of course) I found another possible issue.  The way you
>>>>> compute load_type and use it here:
>>>>>
>>>>> +      /* Perform the load.  */
>>>>> +      load_offset_ptr = build_int_cst (n->alias_set, 0);
>>>>> +      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
>>>>> +                             load_offset_ptr);
>>>>>
>>>>> makes the load always appear aligned according to the mode of
>>>>> load_type.  On strict-alignment targets this may cause faults.
>>>>>
>>>>> So what you have to do is either (simpler)
>>>>>
>>>>>  unsigned int align = get_pointer_alignment (addr_tmp);
>>>>>  tree al_load_type = load_type;
>>>>>  if (align < TYPE_ALIGN (load_type))
>>>>>    al_load_type = build_aligned_type (load_type, align);
>>>>> ...
>>>>>   val_expr = fold_build2 (MEM_REF, al_load_type, addr_tmp,
>>>>>                                    load_offset_ptr);
>>>>>
>>>>> or keep track of the "first" actual load and use
>>>>>
>>>>>  unsigned int align = get_object_alignment (that_first_load);
>>>>>
>>>>> "first" in the one that corresponds to addr_tmp.  From that there
>>>>> is a much better chance to derive good alignment values.
>>>>>
>>>>> Of course on STRICT_ALIGNMENT targets a not aligned load
>>>>> will be decomposed again, so eventually doing the transformation
>>>>> may no longer be profitable(?).
>>>>
>>>> Not always decomposed. On MIPS, it should using the load/store left/right instructions for unaligned load/stores which is normally better than decomposed load/stores. So having a cost model would be nice.
>>>
>>> Agreed, but I am happy with doing that as a followup.  Btw,
>>> a very simple one would be to reject unaligned
>>> SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align).
>>> [of course that may be true on MIPS even for the cases where
>>> a "reasonable" fast unalgined variant exists - nearly no target
>>> defines that macro in a too fancy way]
>>
>> Oh, and what happens for
>>
>> unsigned foo (unsigned char *x)
>> {
>>  return x[0] << 24 | x[2] << 8 | x[3];
>> }
>>
>> ?  We could do an unsigned int load from x and zero byte 3
>> with an AND.  Enhancement for a followup, similar to also
>> considering vector types for the load (also I'm not sure
>> that uint64_type_node always has non-BLKmode for all
>> targets).
>
> No we cannot if x[4] is on a different page which is not mapped in, we get a fault. Not something we want.

I was reading that code wrong.  I trying to say if we don't load from
x[3] then we can't do it.  But with the example above we can.

Thanks,
Andrew Pinski


>
> Thanks,
> Andrew
>
>
>>
>> Richard.
>>
>>> Richard.
>>>
>>>> Thanks,
>>>> Andrew
>>>>
>>>>>
>>>>> Thanks and sorry again for the delay.
>>>>>
>>>>> Otherwise the patch looks good to me.
>>>>>
>>>>> Richard.
>>>>>
>>>>>>> Best regards,
>>>>>>>
>>>>>>> Thomas
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>>>>>>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>>>>>>> Sent: Monday, May 05, 2014 7:30 PM
>>>>>>>> To: GCC Patches
>>>>>>>> Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
>>>>>>>> load/store
>>>>>>>>
>>>>>>>> I found a way to improve the function find_bswap/find_bswap_or_nop
>>>>>>>> and reduce its size. Please hold for the review, I will post an updated
>>>>>>>> version as soon as I finish testing.
>>>>>>>>
>>>>>>>> Best regards,
>>>>>>>>
>>>>>>>> Thomas Preud'homme
>>>>>>
>>>>>>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-20  2:46         ` Thomas Preud'homme
@ 2014-05-20  9:06           ` Richard Biener
  2014-05-20 10:29             ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Richard Biener @ 2014-05-20  9:06 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Tue, May 20, 2014 at 4:46 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>>
>> Agreed, but I am happy with doing that as a followup.  Btw,
>> a very simple one would be to reject unaligned
>> SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align).
>> [of course that may be true on MIPS even for the cases where
>> a "reasonable" fast unalgined variant exists - nearly no target
>> defines that macro in a too fancy way]
>
> Indeed, it's defined to 1 without consideration of the mode or alignment
> At least ARM, alpha, tilegx, tilepro and all target with STRICT_ALIGNMENT
> since that's the default value for SLOW_UNALIGNED_ACCESS macro. Thus
> mips should be in there too for instance.
>
> However, I fail to see how the code produced to do an unaligned load
> could be worse than the manual load done in the original bitwise
> expression. It might be worse for load + bswap though. Maybe I could
> skip the optimization based on this macro only for bswap?

It may do three aligned loads, char, short, char and combine them
while doing an unaligned int load may end up being slower.  Though
very probable the RTL expansion machinery for unaligned loads
is way more clever to emit an optimal sequence than a programmer is.

Anyway, as said before please consider addressing any cost issues
as followup - just make sure to properly emit unaligned loads via
a sequence I suggested.

Thanks,
Richard.

> Best regards,
>
> Thomas
>
>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-20  9:06           ` Richard Biener
@ 2014-05-20 10:29             ` Thomas Preud'homme
  2014-05-21  1:00               ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-20 10:29 UTC (permalink / raw)
  To: GCC Patches

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> 
> It may do three aligned loads, char, short, char and combine them
> while doing an unaligned int load may end up being slower.  Though
> very probable the RTL expansion machinery for unaligned loads
> is way more clever to emit an optimal sequence than a programmer is.

That's what I meant. I expect the RTL machinery to emit the optimal sequence
to do an unaligned load. If it doesn't, it should probably be fixed.

> 
> Anyway, as said before please consider addressing any cost issues
> as followup - just make sure to properly emit unaligned loads via
> a sequence I suggested.

That's what I did with the addition of skipping the optimization for target
with slow unaligned access for bswap permutation. Since this affects ARM
you can be sure I'll follow up on the cost model. I already started thinking
about it.

I'll send the new patch as soon as all the tests are done.

Best regards,

Thomas



^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-20 10:29             ` Thomas Preud'homme
@ 2014-05-21  1:00               ` Thomas Preud'homme
  2014-05-22 10:07                 ` Richard Biener
  2014-05-24 18:56                 ` Andreas Schwab
  0 siblings, 2 replies; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-21  1:00 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1397 bytes --]

> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> 
> I'll send the new patch as soon as all the tests are done.

Here you are. I also simplified the tests a bit having the reference as a function
parameter instead of a local one.

Updated ChangeLogs:

*** gcc/ChangeLog ***

2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
	(CMPNOP): Define.
	(find_bswap_or_nop_load): New.
	(find_bswap_1): Renamed to ...
	(find_bswap_or_nop_1): This. Also add support for memory source.
	(find_bswap): Renamed to ...
	(find_bswap_or_nop): This. Also add support for memory source and
	detection of bitwise operations equivalent to load in host endianness.
	(execute_optimize_bswap): Likewise. Also move its leading comment back
	in place and split statement transformation into ...
	(bswap_replace): This.

*** gcc/testsuite/ChangeLog ***

2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources and bitwise operations
	equivalent to load in host endianness.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise. 

Best regards,

Thomas

[-- Attachment #2: gcc32rm-84.6.0.diff --]
[-- Type: application/octet-stream, Size: 31189 bytes --]

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..e91b487
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,90 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef __UINT32_TYPE__ unsigned;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_le32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c0 | c1 << 8 | c2 << 16 | c3 << 24;
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_be32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c3 | c2 << 8 | c1 << 16 | c0 << 24;
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+  char cin[] = { 0x83, 0x85, 0x87, 0x89 };
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_be32 (bfin);
+  if (out != 0x03050709 && out != 0x82848688)
+    __builtin_abort ();
+  out = fake_read_le32 (cin, &cin[2]);
+  if (out != 0x89018583)
+    __builtin_abort ();
+  out = fake_read_be32 (cin, &cin[2]);
+  if (out != 0x83850189)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..0a8bf2e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+unsigned char data[8];
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint64_t read_le64_1 (void)
+{
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (struct uint64_st data)
+{
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_le64_3 (unsigned char *data)
+{
+  return (uint64_t) *data | ((uint64_t) *(data + 1) << 8)
+	 | ((uint64_t) *(data + 2) << 16) | ((uint64_t) *(data + 3) << 24)
+	 | ((uint64_t) *(data + 4) << 32) | ((uint64_t) *(data + 5) << 40)
+	 | ((uint64_t) *(data + 6) << 48) | ((uint64_t) *(data + 7) << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (struct uint64_st data)
+{
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+uint64_t read_be64_3 (unsigned char *data)
+{
+  return (uint64_t) *(data + 7) | ((uint64_t) *(data + 6) << 8)
+	 | ((uint64_t) *(data + 5) << 16) | ((uint64_t) *(data + 4) << 24)
+	 | ((uint64_t) *(data + 3) << 32) | ((uint64_t) *(data + 2) << 40)
+	 | ((uint64_t) *(data + 1) << 48) | ((uint64_t) *data << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 3 "bswap" { xfail alpha*-*-* arm*-*-* } } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..65bff98
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+unsigned char data[2];
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_le16_1 (void)
+{
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (struct uint16_st data)
+{
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_le16_3 (unsigned char *data)
+{
+  return *data | (*(data + 1) << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (struct uint16_st data)
+{
+  return data.u1 | (data.u0 << 8);
+}
+
+uint32_t read_be16_3 (unsigned char *data)
+{
+  return *(data + 1) | (*data << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 3 "bswap" { xfail alpha*-*-* arm*-*-* } } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..518b510
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+extern unsigned char data[4];
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_le32_1 (void)
+{
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (struct uint32_st data)
+{
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_le32_3 (unsigned char *data)
+{
+  return *data | (*(data + 1) << 8) | (*(data + 2) << 16)
+	 | (*(data + 3) << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (struct uint32_st data)
+{
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+uint32_t read_be32_3 (unsigned char *data)
+{
+  return *(data + 3) | (*(data + 2) << 8) | (*(data + 1) << 16)
+	 | (*data << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 3 "bswap" { xfail alpha*-*-* arm*-*-* } } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 336626d..7c8c63a 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -98,6 +98,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "is-a.h"
 #include "gimple.h"
 #include "gimple-iterator.h"
+#include "gimplify.h"
 #include "gimplify-me.h"
 #include "stor-layout.h"
 #include "gimple-ssa.h"
@@ -170,15 +171,15 @@ static struct
 
 static struct
 {
-  /* Number of hand-written 16-bit bswaps found.  */
+  /* Number of hand-written 16-bit nop / bswaps found.  */
   int found_16bit;
 
-  /* Number of hand-written 32-bit bswaps found.  */
+  /* Number of hand-written 32-bit nop / bswaps found.  */
   int found_32bit;
 
-  /* Number of hand-written 64-bit bswaps found.  */
+  /* Number of hand-written 64-bit nop / bswaps found.  */
   int found_64bit;
-} bswap_stats;
+} nop_stats, bswap_stats;
 
 static struct
 {
@@ -1604,13 +1605,43 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number is also associated a base address (the array or structure the load is
+   made from), an offset from the base address and a range which gives the
+   difference between the highest and lowest accessed memory location to make
+   such a symbolic number. The range is thus different from size which reflects
+   the size of the type of current expression. Note that for non memory source,
+   range holds the same value as size.
+
+   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
+   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
+   still have a size of 2 but this time a range of 1.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  HOST_WIDE_INT bytepos;
+  tree alias_set;
+  tree vuse;
+  unsigned HOST_WIDE_INT range;
 };
 
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a nop.  The number is masked according to the size of
+   the symbolic number before using it.  */
+#define CMPNOP (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201)
+
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a byte swap.  The number is masked according to the
+   size of the symbolic number before using it.  */
+#define CMPXCHG (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708)
+
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
    number N.  Return false if the requested operation is not permitted
    on a symbolic number.  */
@@ -1670,13 +1701,76 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
   return true;
 }
 
-/* find_bswap_1 invokes itself recursively with N and tries to perform
-   the operation given by the rhs of STMT on the result.  If the
-   operation could successfully be executed the function returns the
-   tree expression of the source operand and NULL otherwise.  */
+/* Check if STMT might be a byte swap or a nop from a memory source and returns
+   the answer. If so, REF is that memory source and the base of the memory area
+   accessed and the offset of the access from that base are recorded in N.  */
+
+bool
+find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
+{
+  /* Leaf node is an array or component ref. Memorize its base and
+     offset from base to compare to other such leaf node.  */
+  HOST_WIDE_INT bitsize, bitpos;
+  enum machine_mode mode;
+  int unsignedp, volatilep;
+
+  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
+    return false;
+
+  n->base_addr = get_inner_reference (ref, &bitsize, &bitpos, &n->offset,
+				      &mode, &unsignedp, &volatilep, false);
+
+  if (TREE_CODE (n->base_addr) == MEM_REF)
+    {
+      offset_int bit_offset = 0;
+      tree off = TREE_OPERAND (n->base_addr, 1);
+
+      if (!integer_zerop (off))
+	{
+	  offset_int boff, coff = mem_ref_offset (n->base_addr);
+	  boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
+	  bit_offset += boff;
+	}
+
+      n->base_addr = TREE_OPERAND (n->base_addr, 0);
+
+      /* Avoid returning a negative bitpos as this may wreak havoc later.  */
+      if (wi::neg_p (bit_offset))
+	{
+	  offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
+	  offset_int tem = bit_offset.and_not (mask);
+	  /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
+	     Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
+	  bit_offset -= tem;
+	  tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
+	  if (n->offset)
+	    n->offset = size_binop (PLUS_EXPR, n->offset,
+				    wide_int_to_tree (sizetype, tem));
+	  else
+	    n->offset = wide_int_to_tree (sizetype, tem);
+	}
+
+      bitpos += bit_offset.to_shwi ();
+    }
+
+  if (bitpos % BITS_PER_UNIT)
+    return false;
+  if (bitsize % BITS_PER_UNIT)
+    return false;
+
+  n->bytepos = bitpos / BITS_PER_UNIT;
+  n->alias_set = reference_alias_ptr_type (ref);
+  n->vuse = gimple_vuse (stmt);
+  return true;
+}
+
+/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
+   the operation given by the rhs of STMT on the result.  If the operation
+   could successfully be executed the function returns the tree expression of
+   the source operand and NULL otherwise.  */
 
 static tree
-find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
+find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
 {
   enum tree_code code;
   tree rhs1, rhs2 = NULL;
@@ -1689,6 +1783,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
   rhs1 = gimple_assign_rhs1 (stmt);
 
+  if (find_bswap_or_nop_load (stmt, rhs1, n))
+    return rhs1;
+
   if (TREE_CODE (rhs1) != SSA_NAME)
     return NULL_TREE;
 
@@ -1715,11 +1812,11 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  && code != CONVERT_EXPR)
 	return NULL_TREE;
 
-      source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
+      source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
 
-      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
-	 to initialize the symbolic number.  */
-      if (!source_expr1)
+      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
+	 we have to initialize the symbolic number.  */
+      if (!source_expr1 || gimple_assign_load_p (rhs1_stmt))
 	{
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
@@ -1729,14 +1826,18 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
-	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
+	  n->range = n->size;
+	  n->n = CMPNOP;
 
 	  if (n->size < (int)sizeof (HOST_WIDEST_INT))
 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
 		     (n->size * BITS_PER_UNIT)) - 1;
 
-	  source_expr1 = rhs1;
+	  if (!source_expr1)
+	    {
+	      n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
+	      source_expr1 = rhs1;
+	    }
 	}
 
       switch (code)
@@ -1777,6 +1878,8 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 		n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
 	      }
 	    n->size = type_size / BITS_PER_UNIT;
+	    if (!n->base_addr)
+	      n->range = n->size;
 	  }
 	  break;
 	default:
@@ -1805,17 +1908,79 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
       switch (code)
 	{
 	case BIT_IOR_EXPR:
-	  source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
+	  source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
 
 	  if (!source_expr1)
 	    return NULL_TREE;
 
-	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
+	  source_expr2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
+
+	  if (n1.size != n2.size || !source_expr2)
+	    return NULL_TREE;
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (!n1.vuse != !n2.vuse ||
+	  (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
+		return NULL_TREE;
+	      if (!n1.offset != !n2.offset ||
+	          (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (n2.bytepos < n1.bytepos)
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub = n2.bytepos - n1.bytepos;
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int) sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to host endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  if (!n1.alias_set
+	      || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
+	    n->alias_set = n1.alias_set;
+	  else
+	    n->alias_set = ptr_type_node;
+	  n->vuse = n1.vuse;
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
+	  n->bytepos = n1.bytepos;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1840,57 +2005,75 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
   return NULL_TREE;
 }
 
-/* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+/* Check if STMT completes a bswap implementation or a read in a given
+   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
+   accordingly.  It also sets N to represent the kind of operations
+   performed: size of the resulting expression and whether it works on
+   a memory source, and if so alias-set and vuse.  At last, the
+   function returns the source tree expression.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
 {
-/* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
-   to the size of the symbolic number before using it.  */
-  unsigned HOST_WIDEST_INT cmp =
-    sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-    (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
-
-  struct symbolic_number n;
+/* The number which the find_bswap_or_nop_1 result should match in order
+   to have a full byte swap.  The number is shifted to the right
+   according to the size of the symbolic number before using it.  */
+  unsigned HOST_WIDEST_INT cmpxchg = CMPXCHG;
+  unsigned HOST_WIDEST_INT cmpnop = CMPNOP;
+
   tree source_expr;
   int limit;
 
   /* The last parameter determines the depth search limit.  It usually
-     correlates directly to the number of bytes to be touched.  We
-     increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     correlates directly to the number n of bytes to be touched.  We
+     increase that number by log2(n) + 1 here in order to also
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
-  source_expr =  find_bswap_1 (stmt, &n, limit);
+  source_expr =  find_bswap_or_nop_1 (stmt, n, limit);
 
   if (!source_expr)
     return NULL_TREE;
 
-  /* Zero out the extra bits of N and CMP.  */
-  if (n.size < (int)sizeof (HOST_WIDEST_INT))
+  /* Find real size of result (highest non zero byte).  */
+  if (n->base_addr)
     {
-      unsigned HOST_WIDEST_INT mask =
-	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
+      int rsize;
+      unsigned HOST_WIDEST_INT tmpn;
 
-      n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      n->range = rsize;
     }
 
-  /* A complete byte swap should make the symbolic number to start
-     with the largest digit in the highest order byte.  */
-  if (cmp != n.n)
+  /* Zero out the extra bits of N and CMP*.  */
+  if (n->range < (int)sizeof (HOST_WIDEST_INT))
+    {
+      unsigned HOST_WIDEST_INT mask;
+
+      mask = ((unsigned HOST_WIDEST_INT)1 << (n->range * BITS_PER_UNIT)) - 1;
+      cmpxchg >>= (sizeof (HOST_WIDEST_INT) - n->range) * BITS_PER_UNIT;
+      cmpnop &= mask;
+    }
+
+  /* A complete byte swap should make the symbolic number to start with
+     the largest digit in the highest order byte. Unchanged symbolic
+     number indicates a read with same endianness as host architecture.  */
+  if (n->n == cmpnop)
+    *bswap = false;
+  else if (n->n == cmpxchg)
+    *bswap = true;
+  else
     return NULL_TREE;
 
+  /* Useless bit manipulation performed by code.  */
+  if (!n->base_addr && n->n == cmpnop)
+    return NULL_TREE;
+
+  n->range *= BITS_PER_UNIT;
   return source_expr;
 }
 
-/* Find manual byte swap implementations and turn them into a bswap
-   builtin invokation.  */
-
 namespace {
 
 const pass_data pass_data_optimize_bswap =
@@ -1924,6 +2107,156 @@ public:
 
 }; // class pass_optimize_bswap
 
+/* Perform the bswap optimization: replace the statement STMT at GSI
+   with load type, VUSE and set-alias as described by N if a memory
+   source is involved (N->base_addr is non null), followed by the
+   builtin bswap invocation in FNDECL if BSWAP is true.  SRC gives
+   the source on which STMT is operating and N->range gives the
+   size of the expression involved for maintaining some statistics.  */
+
+static bool
+bswap_replace (gimple stmt, gimple_stmt_iterator *gsi, tree src, tree fndecl,
+	       tree bswap_type, tree load_type, struct symbolic_number *n,
+	       bool bswap)
+{
+  tree tmp, tgt;
+  gimple call;
+
+  tgt = gimple_assign_lhs (stmt);
+
+  /* Need to load the value from memory first.  */
+  if (n->base_addr)
+    {
+      tree addr_expr, addr_tmp, val_expr, val_tmp;
+      tree load_offset_ptr, aligned_load_type;
+      gimple addr_stmt, load_stmt;
+      unsigned align;
+
+      align = get_object_alignment (src);
+      if (bswap && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
+	return false;
+
+      /*  Compute address to load from and cast according to the size
+	  of the load.  */
+      addr_expr = build_fold_addr_expr (unshare_expr (src));
+      if (is_gimple_min_invariant (addr_expr))
+	addr_tmp = addr_expr;
+      else
+	{
+	  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
+					 "load_src");
+	  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
+	  gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT);
+	}
+
+      /* Perform the load.  */
+      aligned_load_type = load_type;
+      if (align < TYPE_ALIGN (load_type))
+	aligned_load_type = build_aligned_type (load_type, align);
+      load_offset_ptr = build_int_cst (n->alias_set, 0);
+      val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
+			      load_offset_ptr);
+
+      if (!bswap)
+	{
+	  if (n->range == 16)
+	    nop_stats.found_16bit++;
+	  else if (n->range == 32)
+	    nop_stats.found_32bit++;
+	  else
+	    {
+	      gcc_assert (n->range == 64);
+	      nop_stats.found_64bit++;
+	    }
+
+	  /* Convert the result of load if necessary.  */
+	  if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
+	    {
+	      val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
+					    "load_dst");
+	      load_stmt = gimple_build_assign (val_tmp, val_expr);
+	      gimple_set_vuse (load_stmt, n->vuse);
+	      gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
+	      gimple_assign_set_rhs_with_ops_1 (gsi, NOP_EXPR, val_tmp,
+						NULL_TREE, NULL_TREE);
+	    }
+	  else
+	    gimple_assign_set_rhs_with_ops_1 (gsi, MEM_REF, val_expr,
+					      NULL_TREE, NULL_TREE);
+	  update_stmt (gsi_stmt (*gsi));
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file,
+		       "%d bit load in host endianness found at: ",
+		       (int)n->range);
+	      print_gimple_stmt (dump_file, stmt, 0, 0);
+	    }
+	  return true;
+	}
+      else
+	{
+	  val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
+	  load_stmt = gimple_build_assign (val_tmp, val_expr);
+	  gimple_set_vuse (load_stmt, n->vuse);
+	  gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
+	}
+      src = val_tmp;
+    }
+
+  if (n->range == 16)
+    bswap_stats.found_16bit++;
+  else if (n->range == 32)
+    bswap_stats.found_32bit++;
+  else
+    {
+      gcc_assert (n->range == 64);
+      bswap_stats.found_64bit++;
+    }
+
+  tmp = src;
+
+  /* Convert the src expression if necessary.  */
+  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
+    {
+      gimple convert_stmt;
+      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
+      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src, NULL);
+      gsi_insert_before (gsi, convert_stmt, GSI_SAME_STMT);
+    }
+
+  call = gimple_build_call (fndecl, 1, tmp);
+
+  tmp = tgt;
+
+  /* Convert the result if necessary.  */
+  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
+    {
+      gimple convert_stmt;
+      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
+      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
+      gsi_insert_after (gsi, convert_stmt, GSI_SAME_STMT);
+    }
+
+  gimple_call_set_lhs (call, tmp);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "%d bit bswap implementation found at: ",
+	       (int)n->range);
+      print_gimple_stmt (dump_file, stmt, 0, 0);
+    }
+
+  gsi_insert_after (gsi, call, GSI_SAME_STMT);
+  gsi_remove (gsi, true);
+  return true;
+}
+
+/* Find manual byte swap implementations as well as load in a given
+   endianness. Byte swaps are turned into a bswap builtin invokation
+   while endian loads are converted to bswap builtin invokation or
+   simple load according to the host endianness.  */
+
 unsigned int
 pass_optimize_bswap::execute (function *fun)
 {
@@ -1946,9 +2279,6 @@ pass_optimize_bswap::execute (function *fun)
 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
 		   || (bswap32_p && word_mode == SImode)));
 
-  if (!bswap16_p && !bswap32_p && !bswap64_p)
-    return 0;
-
   /* Determine the argument type of the builtins.  The code later on
      assumes that the return and argument type are the same.  */
   if (bswap16_p)
@@ -1969,6 +2299,7 @@ pass_optimize_bswap::execute (function *fun)
       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
     }
 
+  memset (&nop_stats, 0, sizeof (nop_stats));
   memset (&bswap_stats, 0, sizeof (bswap_stats));
 
   FOR_EACH_BB_FN (bb, fun)
@@ -1982,21 +2313,24 @@ pass_optimize_bswap::execute (function *fun)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
-	  tree bswap_tmp;
-	  tree fndecl = NULL_TREE;
-	  int type_size;
-	  gimple call;
+	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE;
+	  tree src, load_type;
+	  struct symbolic_number n;
+	  bool bswap;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  src = find_bswap_or_nop (stmt, &n, &bswap);
+
+	  if (!src)
+	    continue;
 
-	  switch (type_size)
+	  switch (n.range)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -2004,6 +2338,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -2011,6 +2346,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2021,62 +2357,21 @@ pass_optimize_bswap::execute (function *fun)
 	      continue;
 	    }
 
-	  if (!fndecl)
+	  if (bswap && !fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
-
-	  if (!bswap_src)
-	    continue;
-
-	  changed = true;
-	  if (type_size == 16)
-	    bswap_stats.found_16bit++;
-	  else if (type_size == 32)
-	    bswap_stats.found_32bit++;
-	  else
-	    bswap_stats.found_64bit++;
-
-	  bswap_tmp = bswap_src;
-
-	  /* Convert the src expression if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
-	    {
-	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
-	      convert_stmt = gimple_build_assign_with_ops
-		  		(NOP_EXPR, bswap_tmp, bswap_src, NULL);
-	      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
-	    }
-
-	  call = gimple_build_call (fndecl, 1, bswap_tmp);
-
-	  bswap_tmp = gimple_assign_lhs (stmt);
-
-	  /* Convert the result if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
-	    {
-	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
-	      convert_stmt = gimple_build_assign_with_ops
-			(NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
-	      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
-	    }
-
-	  gimple_call_set_lhs (call, bswap_tmp);
-
-	  if (dump_file)
-	    {
-	      fprintf (dump_file, "%d bit bswap implementation found at: ",
-		       (int)type_size);
-	      print_gimple_stmt (dump_file, stmt, 0, 0);
-	    }
-
-	  gsi_insert_after (&gsi, call, GSI_SAME_STMT);
-	  gsi_remove (&gsi, true);
+	  if (bswap_replace (stmt, &gsi, src, fndecl, bswap_type, load_type,
+			     &n, bswap))
+	    changed = true;
 	}
     }
 
+  statistics_counter_event (fun, "16-bit nop implementations found",
+			    nop_stats.found_16bit);
+  statistics_counter_event (fun, "32-bit nop implementations found",
+			    nop_stats.found_32bit);
+  statistics_counter_event (fun, "64-bit nop implementations found",
+			    nop_stats.found_64bit);
   statistics_counter_event (fun, "16-bit bswap implementations found",
 			    bswap_stats.found_16bit);
   statistics_counter_event (fun, "32-bit bswap implementations found",

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-21  1:00               ` Thomas Preud'homme
@ 2014-05-22 10:07                 ` Richard Biener
  2014-05-23  3:36                   ` Thomas Preud'homme
  2014-05-24 18:56                 ` Andreas Schwab
  1 sibling, 1 reply; 31+ messages in thread
From: Richard Biener @ 2014-05-22 10:07 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

On Wed, May 21, 2014 at 3:00 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>>
>> I'll send the new patch as soon as all the tests are done.
>
> Here you are. I also simplified the tests a bit having the reference as a function
> parameter instead of a local one.
>
> Updated ChangeLogs:
>
> *** gcc/ChangeLog ***
>
> 2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>
>         PR tree-optimization/54733
>         * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>         (CMPNOP): Define.
>         (find_bswap_or_nop_load): New.
>         (find_bswap_1): Renamed to ...
>         (find_bswap_or_nop_1): This. Also add support for memory source.
>         (find_bswap): Renamed to ...
>         (find_bswap_or_nop): This. Also add support for memory source and
>         detection of bitwise operations equivalent to load in host endianness.
>         (execute_optimize_bswap): Likewise. Also move its leading comment back
>         in place and split statement transformation into ...
>         (bswap_replace): This.
>
> *** gcc/testsuite/ChangeLog ***
>
> 2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>
>         PR tree-optimization/54733
>         * gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
>         optimization to support memory sources and bitwise operations
>         equivalent to load in host endianness.
>         * gcc.dg/optimize-bswaphi-1.c: Likewise.
>         * gcc.dg/optimize-bswapsi-2.c: Likewise.
>         * gcc.c-torture/execute/bswap-2.c: Likewise.
>
> Best regards,

This is ok.

Thanks,
Richard.

> Thomas

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-22 10:07                 ` Richard Biener
@ 2014-05-23  3:36                   ` Thomas Preud'homme
  2014-05-26 10:57                     ` Christophe Lyon
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-23  3:36 UTC (permalink / raw)
  To: 'Richard Biener'; +Cc: GCC Patches

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> On Wed, May 21, 2014 at 3:00 AM, Thomas Preud'homme
> <thomas.preudhomme@arm.com> wrote:

> >
> > Updated ChangeLogs:
> >
> > *** gcc/ChangeLog ***
> >
> > 2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>
> >
> >         PR tree-optimization/54733
> >         * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
> >         (CMPNOP): Define.
> >         (find_bswap_or_nop_load): New.
> >         (find_bswap_1): Renamed to ...
> >         (find_bswap_or_nop_1): This. Also add support for memory source.
> >         (find_bswap): Renamed to ...
> >         (find_bswap_or_nop): This. Also add support for memory source and
> >         detection of bitwise operations equivalent to load in host endianness.
> >         (execute_optimize_bswap): Likewise. Also move its leading comment
> back
> >         in place and split statement transformation into ...
> >         (bswap_replace): This.
> >
> > *** gcc/testsuite/ChangeLog ***
> >
> > 2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>
> >
> >         PR tree-optimization/54733
> >         * gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
> >         optimization to support memory sources and bitwise operations
> >         equivalent to load in host endianness.
> >         * gcc.dg/optimize-bswaphi-1.c: Likewise.
> >         * gcc.dg/optimize-bswapsi-2.c: Likewise.
> >         * gcc.c-torture/execute/bswap-2.c: Likewise.
> >
> > Best regards,
> 
> This is ok.

Great. Commited.

Thanks a lot for your patience in mentoring me to improve this patch.

Best regards,

Thomas


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-21  1:00               ` Thomas Preud'homme
  2014-05-22 10:07                 ` Richard Biener
@ 2014-05-24 18:56                 ` Andreas Schwab
  1 sibling, 0 replies; 31+ messages in thread
From: Andreas Schwab @ 2014-05-24 18:56 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: GCC Patches

	* gcc.c-torture/execute/bswap-2.c (main): Handle more bitfield
	layouts.

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
index e91b487..38f18fd 100644
--- a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -74,11 +74,11 @@ main ()
     return 0;
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_le32 (bfin);
-  if (out != 0x09070503 && out != 0x88868482)
+  if (out != 0x09070503 && out != 0x88868482 && out != 0x78306141)
     __builtin_abort ();
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_be32 (bfin);
-  if (out != 0x03050709 && out != 0x82848688)
+  if (out != 0x03050709 && out != 0x82848688 && out != 0x41613078)
     __builtin_abort ();
   out = fake_read_le32 (cin, &cin[2]);
   if (out != 0x89018583)
-- 
1.9.3

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-23  3:36                   ` Thomas Preud'homme
@ 2014-05-26 10:57                     ` Christophe Lyon
  2014-05-26 11:06                       ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Christophe Lyon @ 2014-05-26 10:57 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: Richard Biener, GCC Patches

On 23 May 2014 05:36, Thomas Preud'homme <thomas.preudhomme@arm.com> wrote:
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>> On Wed, May 21, 2014 at 3:00 AM, Thomas Preud'homme
>> <thomas.preudhomme@arm.com> wrote:
>
>> >
>> > Updated ChangeLogs:
>> >
>> > *** gcc/ChangeLog ***
>> >
>> > 2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>> >
>> >         PR tree-optimization/54733
>> >         * tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
>> >         (CMPNOP): Define.
>> >         (find_bswap_or_nop_load): New.
>> >         (find_bswap_1): Renamed to ...
>> >         (find_bswap_or_nop_1): This. Also add support for memory source.
>> >         (find_bswap): Renamed to ...
>> >         (find_bswap_or_nop): This. Also add support for memory source and
>> >         detection of bitwise operations equivalent to load in host endianness.
>> >         (execute_optimize_bswap): Likewise. Also move its leading comment
>> back
>> >         in place and split statement transformation into ...
>> >         (bswap_replace): This.
>> >
>> > *** gcc/testsuite/ChangeLog ***
>> >
>> > 2014-05-20  Thomas Preud'homme  <thomas.preudhomme@arm.com>
>> >
>> >         PR tree-optimization/54733
>> >         * gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
>> >         optimization to support memory sources and bitwise operations
>> >         equivalent to load in host endianness.
>> >         * gcc.dg/optimize-bswaphi-1.c: Likewise.
>> >         * gcc.dg/optimize-bswapsi-2.c: Likewise.
>> >         * gcc.c-torture/execute/bswap-2.c: Likewise.
>> >
>> > Best regards,
>>
>> This is ok.
>
> Great. Commited.
>
> Thanks a lot for your patience in mentoring me to improve this patch.
>
> Best regards,
>
> Thomas
>

I have noticed that the new bswap-2.c test fails at execution on armeb targets.
See:
http://cbuild.validation.linaro.org/build/cross-validation/gcc/210843/report-build-info.html

Could you have a look?
Thanks,

Christophe.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-26 10:57                     ` Christophe Lyon
@ 2014-05-26 11:06                       ` Thomas Preud'homme
  2014-05-27  7:23                         ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-26 11:06 UTC (permalink / raw)
  To: 'Christophe Lyon'; +Cc: Richard Biener, GCC Patches

> From: Christophe Lyon [mailto:christophe.lyon@linaro.org]
> 
> I have noticed that the new bswap-2.c test fails at execution on armeb
> targets.
> See:
> http://cbuild.validation.linaro.org/build/cross-validation/gcc/210843/report-
> build-info.html
> 
> Could you have a look?

Sure.

I suspect it's the same kind of problem m68k run into. I already wrote a patch to
reduce the impact of different bitfield layout and it's in review right now. I'll
send you tomorrow for testing.

Thanks for the feedback.

Best regards,

Thomas


^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-26 11:06                       ` Thomas Preud'homme
@ 2014-05-27  7:23                         ` Thomas Preud'homme
  2014-05-27 14:35                           ` Christophe Lyon
  2014-05-27 16:45                           ` Andreas Schwab
  0 siblings, 2 replies; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-27  7:23 UTC (permalink / raw)
  To: 'Christophe Lyon', 'Andreas Schwab'
  Cc: Richard Biener, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 740 bytes --]

Hi Chistophe and Andreas,

> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
> 
> I suspect it's the same kind of problem m68k run into. I already wrote a patch
> to
> reduce the impact of different bitfield layout and it's in review right now. I'll
> send you tomorrow for testing.

Can you both try the attached patch to see if it solves the bug you experienced?
Andreas, I wrote it based on the correction you did so it should at least work for
m68k.

ChangeLog is:

2014-05-26  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	* gcc.c-torture/execute/bswap-2.c: Add alignment constraints to
	bitfield to make it more portable.

Best regards,

Thomas 

[-- Attachment #2: fix_bswap-2.diff --]
[-- Type: application/octet-stream, Size: 1051 bytes --]

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
index 38f18fd..4368d83 100644
--- a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -6,8 +6,11 @@ typedef __UINT32_TYPE__ unsigned;
 
 struct bitfield {
   unsigned char f0:7;
+  unsigned char   :0;
   unsigned char f1:7;
+  unsigned char   :0;
   unsigned char f2:7;
+  unsigned char   :0;
   unsigned char f3:7;
 };
 
@@ -74,11 +77,11 @@ main ()
     return 0;
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_le32 (bfin);
-  if (out != 0x09070503 && out != 0x88868482 && out != 0x78306141)
+  if (out != 0x09070503 && out != 0x88868482)
     __builtin_abort ();
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_be32 (bfin);
-  if (out != 0x03050709 && out != 0x82848688 && out != 0x41613078)
+  if (out != 0x03050709 && out != 0x82848688)
     __builtin_abort ();
   out = fake_read_le32 (cin, &cin[2]);
   if (out != 0x89018583)

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-27  7:23                         ` Thomas Preud'homme
@ 2014-05-27 14:35                           ` Christophe Lyon
  2014-05-27 16:45                           ` Andreas Schwab
  1 sibling, 0 replies; 31+ messages in thread
From: Christophe Lyon @ 2014-05-27 14:35 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: Andreas Schwab, Richard Biener, GCC Patches

On 27 May 2014 09:23, Thomas Preud'homme <thomas.preudhomme@arm.com> wrote:
> Hi Chistophe and Andreas,
>
>> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
>> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
>>
>> I suspect it's the same kind of problem m68k run into. I already wrote a patch
>> to
>> reduce the impact of different bitfield layout and it's in review right now. I'll
>> send you tomorrow for testing.
>
> Can you both try the attached patch to see if it solves the bug you experienced?
> Andreas, I wrote it based on the correction you did so it should at least work for
> m68k.
>
It still doesn't work on armeb.

Christophe.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-27  7:23                         ` Thomas Preud'homme
  2014-05-27 14:35                           ` Christophe Lyon
@ 2014-05-27 16:45                           ` Andreas Schwab
  2014-05-29  8:18                             ` Thomas Preud'homme
  1 sibling, 1 reply; 31+ messages in thread
From: Andreas Schwab @ 2014-05-27 16:45 UTC (permalink / raw)
  To: Thomas Preud'homme
  Cc: 'Christophe Lyon', Richard Biener, GCC Patches

"Thomas Preud'homme" <thomas.preudhomme@arm.com> writes:

> diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
> index 38f18fd..4368d83 100644
> --- a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
> +++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
> @@ -6,8 +6,11 @@ typedef __UINT32_TYPE__ unsigned;
>  
>  struct bitfield {
>    unsigned char f0:7;
> +  unsigned char   :0;
>    unsigned char f1:7;
> +  unsigned char   :0;
>    unsigned char f2:7;
> +  unsigned char   :0;
>    unsigned char f3:7;
>  };

This adds a full byte of padding between each bitfield.  If you want a
single padding bit you should use :1, but you also need to update the
test to check for 0x44434241 (0x88868482 is impossible, since that
requires at least 8 bits per bitfield).

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-27 16:45                           ` Andreas Schwab
@ 2014-05-29  8:18                             ` Thomas Preud'homme
  2014-05-29  8:52                               ` Andreas Schwab
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-29  8:18 UTC (permalink / raw)
  To: 'Andreas Schwab'
  Cc: 'Christophe Lyon', Richard Biener, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2532 bytes --]

> From: Andreas Schwab [mailto:schwab@linux-m68k.org]
> 
> This adds a full byte of padding between each bitfield.  If you want a
> single padding bit you should use :1, but you also need to update the
> test to check for 0x44434241 (0x88868482 is impossible, since that
> requires at least 8 bits per bitfield).

Actually if I understood C99 correctly it depends on the storage unit
allocated for the bitfield preceding the 0 length bitfield. Instead of
trying to cover all possible value read from this bitfield I rewrote the
test to check if bswap misinterpret the expression and replace it with
a load or load+bswap. This reduce the number of possible values to 2
and thus makes the test less fragile and easier to understand.

By the way, I couldn't understand how you reached the value
0x44434241. Can you explain me?

Here is the ChangeLog:

2014-05-29  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	* gcc.c-torture/execute/bswap-2.c: Add alignment constraints to
	bitfield and test wrong results instead of correct results to make the
	test more portable.

And the patch:

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
index 38f18fd..a47e01a 100644
--- a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -6,8 +6,11 @@ typedef __UINT32_TYPE__ unsigned;
 
 struct bitfield {
   unsigned char f0:7;
+  unsigned char   :1;
   unsigned char f1:7;
+  unsigned char   :1;
   unsigned char f2:7;
+  unsigned char   :1;
   unsigned char f3:7;
 };
 
@@ -74,11 +77,17 @@ main ()
     return 0;
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_le32 (bfin);
-  if (out != 0x09070503 && out != 0x88868482 && out != 0x78306141)
+  /* Test what bswap would do if its check are not strict enough instead of
+     what is the expected result as there is too many possible results with
+     bitfields.  */
+  if (out == 0x89878583)
     __builtin_abort ();
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_be32 (bfin);
-  if (out != 0x03050709 && out != 0x82848688 && out != 0x41613078)
+  /* Test what bswap would do if its check are not strict enough instead of
+     what is the expected result as there is too many possible results with
+     bitfields.  */
+  if (out == 0x83858789)
     __builtin_abort ();
   out = fake_read_le32 (cin, &cin[2]);
   if (out != 0x89018583)

Best regards,

Thomas

[-- Attachment #2: fix_bswap-2.diff --]
[-- Type: application/octet-stream, Size: 1359 bytes --]

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
index 38f18fd..a47e01a 100644
--- a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -6,8 +6,11 @@ typedef __UINT32_TYPE__ unsigned;
 
 struct bitfield {
   unsigned char f0:7;
+  unsigned char   :1;
   unsigned char f1:7;
+  unsigned char   :1;
   unsigned char f2:7;
+  unsigned char   :1;
   unsigned char f3:7;
 };
 
@@ -74,11 +77,17 @@ main ()
     return 0;
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_le32 (bfin);
-  if (out != 0x09070503 && out != 0x88868482 && out != 0x78306141)
+  /* Test what bswap would do if its check are not strict enough instead of
+     what is the expected result as there is too many possible results with
+     bitfields.  */
+  if (out == 0x89878583)
     __builtin_abort ();
   bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
   out = partial_read_be32 (bfin);
-  if (out != 0x03050709 && out != 0x82848688 && out != 0x41613078)
+  /* Test what bswap would do if its check are not strict enough instead of
+     what is the expected result as there is too many possible results with
+     bitfields.  */
+  if (out == 0x83858789)
     __builtin_abort ();
   out = fake_read_le32 (cin, &cin[2]);
   if (out != 0x89018583)

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-29  8:18                             ` Thomas Preud'homme
@ 2014-05-29  8:52                               ` Andreas Schwab
  2014-05-29  9:58                                 ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Andreas Schwab @ 2014-05-29  8:52 UTC (permalink / raw)
  To: Thomas Preud'homme
  Cc: 'Christophe Lyon', Richard Biener, GCC Patches

"Thomas Preud'homme" <thomas.preudhomme@arm.com> writes:

> By the way, I couldn't understand how you reached the value
> 0x44434241. Can you explain me?

Each byte is composed of the first 7 bits of the original byte.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-29  8:52                               ` Andreas Schwab
@ 2014-05-29  9:58                                 ` Thomas Preud'homme
  2014-06-02 13:57                                   ` Christophe Lyon
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-29  9:58 UTC (permalink / raw)
  To: 'Andreas Schwab'
  Cc: 'Christophe Lyon', Richard Biener, GCC Patches

> From: Andreas Schwab [mailto:schwab@linux-m68k.org]
> "Thomas Preud'homme" <thomas.preudhomme@arm.com> writes:
> 
> > By the way, I couldn't understand how you reached the value
> > 0x44434241. Can you explain me?
> 
> Each byte is composed of the first 7 bits of the original byte.

Sorry, it seems I wasn't very awake when I checked that. Makes sense now.
Thanks. 

Does the patch solve the problem you had? What about you Christophe?

Best regards,

Thomas



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-05-29  9:58                                 ` Thomas Preud'homme
@ 2014-06-02 13:57                                   ` Christophe Lyon
  2014-06-04  7:04                                     ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Christophe Lyon @ 2014-06-02 13:57 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: Andreas Schwab, Richard Biener, GCC Patches

On 29 May 2014 11:58, Thomas Preud'homme <thomas.preudhomme@arm.com> wrote:
>> From: Andreas Schwab [mailto:schwab@linux-m68k.org]
>> "Thomas Preud'homme" <thomas.preudhomme@arm.com> writes:
>>
>> > By the way, I couldn't understand how you reached the value
>> > 0x44434241. Can you explain me?
>>
>> Each byte is composed of the first 7 bits of the original byte.
>
> Sorry, it seems I wasn't very awake when I checked that. Makes sense now.
> Thanks.
>
> Does the patch solve the problem you had? What about you Christophe?
>
> Best regards,
>
> Thomas
>

Hi Thomas,

After a quick test, it looks OK to me.

Thanks

Christophe.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-06-02 13:57                                   ` Christophe Lyon
@ 2014-06-04  7:04                                     ` Thomas Preud'homme
  2014-06-04  9:52                                       ` Richard Biener
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-06-04  7:04 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: 'Christophe Lyon', Richard Biener, GCC Patches

> From: Christophe Lyon [mailto:christophe.lyon@linaro.org]
> On 29 May 2014 11:58, Thomas Preud'homme
> <thomas.preudhomme@arm.com> wrote:
> >
> > Does the patch solve the problem you had? What about you Christophe?
> >
> >
> 
> Hi Thomas,
> 
> After a quick test, it looks OK to me.

Great. What about you Andreas? Does it work fine for you? If yes, is this ok for trunk?

Best regards,

Thomas


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-06-04  7:04                                     ` Thomas Preud'homme
@ 2014-06-04  9:52                                       ` Richard Biener
  2014-06-06 10:45                                         ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Richard Biener @ 2014-06-04  9:52 UTC (permalink / raw)
  To: Thomas Preud'homme; +Cc: Andreas Schwab, Christophe Lyon, GCC Patches

On Wed, Jun 4, 2014 at 9:04 AM, Thomas Preud'homme
<thomas.preudhomme@arm.com> wrote:
>> From: Christophe Lyon [mailto:christophe.lyon@linaro.org]
>> On 29 May 2014 11:58, Thomas Preud'homme
>> <thomas.preudhomme@arm.com> wrote:
>> >
>> > Does the patch solve the problem you had? What about you Christophe?
>> >
>> >
>>
>> Hi Thomas,
>>
>> After a quick test, it looks OK to me.
>
> Great. What about you Andreas? Does it work fine for you? If yes, is this ok for trunk?

Ok.

Thanks,
Richard.

> Best regards,
>
> Thomas
>
>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
  2014-06-04  9:52                                       ` Richard Biener
@ 2014-06-06 10:45                                         ` Thomas Preud'homme
  0 siblings, 0 replies; 31+ messages in thread
From: Thomas Preud'homme @ 2014-06-06 10:45 UTC (permalink / raw)
  To: 'Richard Biener'; +Cc: Andreas Schwab, Christophe Lyon, GCC Patches

> From: Richard Biener [mailto:richard.guenther@gmail.com]
> On Wed, Jun 4, 2014 at 9:04 AM, Thomas Preud'homme
> <thomas.preudhomme@arm.com> wrote:
> >
> > Great. What about you Andreas? Does it work fine for you? If yes, is this ok
> for trunk?
> 
> Ok.
> 
> Thanks,
> Richard.

Commited since I got positive feedback from Christophe Lyon and
Rainer (on PR61320).

Best regards,

Thomas



^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH] Fix PR54733 Optimize endian independent load/store
@ 2014-03-19  7:24 Thomas Preud'homme
  0 siblings, 0 replies; 31+ messages in thread
From: Thomas Preud'homme @ 2014-03-19  7:24 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3693 bytes --]

Hi everybody,

*** Motivation ***

Currently gcc is capable of replacing hand-crafted implementation of byteswap by a suitable instruction thanks to the bswap optimization pass. The patch proposed here aims at extending this pass to also optimize load in a specific endianness, independent of the host endianness.

*** Methodology ***

The patch adds support for dealing with a memory source (array or structure) and detect whether the result of a bitwise operation happens to be equivalent to a big endian or little endian load and replace it by a load or a load and a byteswap according to the host endianness. The original code used the concept of symbolic number: a number where the value of each byte indicates its position (in terms of weight) before the bitwise manipulation. After performing the bit manipulation on that symbolic number, the result tells how the byte were shuffled (see variable cmp in function find_bswap). Detecting an operation resulting in a number in the host endianness is thus pretty straightforward: look if the symbolic number has *not* changed.

As to supporting read from array and structure, there is some logic to recognize the base of the array/structure and the offset of entries/fields accessed to check if the range of memory accessed would fit in an integer. Each entries is initially treated independently and when they are ORed together the values in the symbolic number are updated according to the host endianness: the entry of higher address would see its values incremented on a little endian machine.

Note that as it stands the patch does not work for arrays indexed with variable (such a tab[a] || (tab[a+1] << 8)) because fold_const does not fold (a + 1) - a. If such cases were folded, the number of cases detected would automatically be increased due to the use of fold_build2 to compare two offsets.

This patch also adds a few testcases to check both (i) that the optimization works as expected and (ii) that the result are correct. It also define new effective targets (bswap16, bswap32 and bswap64) to centralize the information about what target supports  byte swap instructions for the testsuite and modify existing tests to use these new effective targets.

The patch is quite big but could be split if necessary. A big part of the code added is for handling memory source and it would be difficult to split it but variable renaming and introduction of bwapXX effective target could be made separately to reduce the noise. The patch is too big so is only in attachment of this email.

The ChangeLog are as follows:

*** gcc/ChangeLog ***

2014-03-19  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* tree-ssa-math-opts.c (find_bswap_1): Renamed to ...
	(find_bswap_or_nop_1): This. Also add support for memory source.
	(find_bswap): Renamed to ...
	(find_bswap_or_nop): This. Also add support for memory source and
	detection of noop bitwise operations.
	(execute_optimize_bswap): Likewise.

*** gcc/testsuite/ChangeLog ***

2014-03-19  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* lib/target-supports.exp: New effective targets for architectures
	capable of performing byte swap.
	* gcc.dg/optimize-bswapdi-1.c: Convert to new bswap target.
	* gcc.dg/optimize-bswapdi-2.c: Likewise.
	* gcc.dg/optimize-bswapsi-1.c: Likewise.
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise.

Is this ok for stage 1?

Best regards,

Thomas

[-- Attachment #2: gcc32rm-84.3.diff --]
[-- Type: application/octet-stream, Size: 26868 bytes --]

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..35ca743
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,46 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef __UINT32_TYPE__ unsigned;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c
index 7d557f3..a9c3443 100644
--- a/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c
@@ -1,6 +1,6 @@
-/* { dg-do compile { target arm*-*-* alpha*-*-* ia64*-*-* x86_64-*-* s390x-*-* powerpc*-*-* rs6000-*-* } } */
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap64 } */
 /* { dg-require-effective-target stdint_types } */
-/* { dg-require-effective-target lp64 } */
 /* { dg-options "-O2 -fdump-tree-bswap" } */
 
 #include <stdint.h>
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c
index 6e2821d..277449b 100644
--- a/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c
@@ -1,6 +1,6 @@
-/* { dg-do compile { target arm*-*-* alpha*-*-* ia64*-*-* x86_64-*-* s390x-*-* powerpc*-*-* rs6000-*-* } } */
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap64 } */
 /* { dg-require-effective-target stdint_types } */
-/* { dg-require-effective-target lp64 } */
 /* { dg-options "-O2 -fdump-tree-bswap" } */
 
 #include <stdint.h>
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..9e19a11
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit load in host endianness found at" 2 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..a6bb4c7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,49 @@
+/* { dg-do compile { target { ! alpha*-*-* } } } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit load in host endianness found at" 2 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c
index 78238e3..2ad44fb 100644
--- a/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c
@@ -1,4 +1,5 @@
-/* { dg-do compile { target arm*-*-* alpha*-*-* i?86-*-* powerpc*-*-* rs6000-*-* x86_64-*-* s390*-*-* } } */
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap32 } */
 /* { dg-require-effective-target stdint_types } */
 /* { dg-options "-O2 -fdump-tree-bswap" } */
 /* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..82cb92f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,49 @@
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in host endianness found at" 2 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index bee8471..71a346b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4732,6 +4732,87 @@ proc check_effective_target_sync_long_long_runtime { } {
     }
 }
 
+# Return 1 if the target supports bit byte swap instructions.
+
+proc check_effective_target_bswap { } {
+    global et_bswap_saved
+
+    if [info exists et_bswap_saved] {
+        verbose "check_effective_target_bswap: using cached result" 2
+    } else {
+	set et_bswap_saved 0
+	if { [istarget aarch64-*-*]
+	     || [istarget alpha*-*-*]
+	     || [istarget arm*-*-*]
+	     || [istarget i?86-*-*]
+	     || [istarget powerpc*-*-*]
+	     || [istarget rs6000-*-*]
+	     || [istarget s390*-*-*]
+	     || [istarget x86_64-*-*] } {
+	    set et_bswap_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap: returning $et_bswap_saved" 2
+    return $et_bswap_saved
+}
+
+# Return 1 if the target supports 16-bit byte swap instructions.
+
+proc check_effective_target_bswap16 { } {
+    global et_bswap16_saved
+
+    if [info exists et_bswap16_saved] {
+	verbose "check_effective_target_bswap16: using cached result" 2
+    } else {
+	set et_bswap16_saved 0
+	if { [is-effective-target bswap]
+	     && ![istarget x86_64-*-*] } {
+	   set et_bswap16_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap16: returning $et_bswap16_saved" 2
+    return $et_bswap16_saved
+}
+
+# Return 1 if the target supports 32-bit byte swap instructions.
+
+proc check_effective_target_bswap32 { } {
+    global et_bswap32_saved
+
+    if [info exists et_bswap32_saved] {
+	verbose "check_effective_target_bswap32: using cached result" 2
+    } else {
+	set et_bswap32_saved 0
+	if { [is-effective-target bswap] } {
+	   set et_bswap32_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap32: returning $et_bswap32_saved" 2
+    return $et_bswap32_saved
+}
+
+# Return 1 if the target supports 64-bit byte swap instructions.
+
+proc check_effective_target_bswap64 { } {
+    global et_bswap64_saved
+
+    if [info exists et_bswap64_saved] {
+        verbose "check_effective_target_bswap64: using cached result" 2
+    } else {
+	set et_bswap64_saved 0
+	if { [is-effective-target bswap]
+	     && [is-effective-target lp64] } {
+	   set et_bswap64_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap64: returning $et_bswap64_saved" 2
+    return $et_bswap64_saved
+}
+
 # Return 1 if the target supports atomic operations on "int" and "long".
 
 proc check_effective_target_sync_int_long { } {
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 9ff857c..1d0a172 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1616,13 +1616,29 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number is also associated a base address (the array or structure the load is
+   made from), an offset from the base address and a range which gives the
+   range of memory represented by this symbolic number. The range is different
+   from size as size reflect the size of the type of current expression. It can
+   thus be bigger when casting to a type with bigger size.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  unsigned HOST_WIDE_INT range;
 };
 
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a nop.  The number is masked according to the size of
+   the symbolic number before using it.  */
+#define CMPNOP (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201)
+
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
    number N.  Return false if the requested operation is not permitted
    on a symbolic number.  */
@@ -1688,7 +1704,7 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
    tree expression of the source operand and NULL otherwise.  */
 
 static tree
-find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
+find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
 {
   enum tree_code code;
   tree rhs1, rhs2 = NULL;
@@ -1727,12 +1743,57 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  && code != CONVERT_EXPR)
 	return NULL_TREE;
 
-      source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
+      source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
 
-      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
-	 to initialize the symbolic number.  */
+      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
+	 we have to initialize the symbolic number.  */
       if (!source_expr1)
 	{
+	  n->base_addr = n->offset = NULL_TREE;
+	  if (is_gimple_assign (rhs1_stmt))
+	    {
+	      unsigned bit_offset;
+	      tree var, elt_size, index, field, bit_offset_expr;
+
+	      var = gimple_assign_rhs1 (rhs1_stmt);
+	      switch (TREE_CODE (var))
+		{
+		/* leaf node is an array, memorize its base, element size and
+		   offset from base to compare to other array leaf node.  */
+		case ARRAY_RANGE_REF:
+		case ARRAY_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  elt_size = array_ref_element_size (var);
+		  if (TREE_CODE (elt_size) != INTEGER_CST)
+		    return NULL_TREE;
+		  index = TREE_OPERAND (var, 1);
+		  if (TREE_THIS_VOLATILE (var) || TREE_THIS_VOLATILE (index))
+		    return NULL_TREE;
+		  n->offset = fold_build2 (MULT_EXPR, sizetype,
+					   index, elt_size);
+		  rhs1 = var;
+		  break;
+		/* leaf node is a record field, memorize its base, field size
+		   and offset from base to compare to other field leaf node.  */
+		case COMPONENT_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  field = TREE_OPERAND (var, 1);
+		  bit_offset = TREE_INT_CST_LOW (DECL_FIELD_BIT_OFFSET (field));
+		  if (bit_offset % BITS_PER_UNIT)
+		    return NULL_TREE;
+		  bit_offset /= BITS_PER_UNIT;
+		  n->offset = component_ref_field_offset (var);
+		  bit_offset_expr = build_int_cst (TREE_TYPE (n->offset),
+						   bit_offset);
+		  n->offset = fold_build2 (PLUS_EXPR, TREE_TYPE (n->offset),
+					  n->offset, bit_offset_expr);
+		  if (TYPE_PRECISION (TREE_TYPE (field)) != BITS_PER_UNIT)
+		    return NULL_TREE;
+		  rhs1 = var;
+		  break;
+		default:;
+		}
+	    }
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
 	     order byte is set to n->size and the lowest order
@@ -1741,8 +1802,8 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
-	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
+	  n->range = n->size;
+	  n->n = CMPNOP;
 
 	  if (n->size < (int)sizeof (HOST_WIDEST_INT))
 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
@@ -1817,17 +1878,76 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
       switch (code)
 	{
 	case BIT_IOR_EXPR:
-	  source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
+	  source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
 
 	  if (!source_expr1)
 	    return NULL_TREE;
 
-	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
+	  source_expr2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (n1.size != n2.size || !source_expr2)
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      tree off_sub_expr, off_cmp_expr;
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || n1.base_addr != n2.base_addr)
+		return NULL_TREE;
+
+	      off_cmp_expr = fold_build2 (LT_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (TREE_CODE (off_cmp_expr) != INTEGER_CST)
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (TREE_INT_CST_LOW (off_cmp_expr))
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub_expr = fold_build2 (MINUS_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (!cst_and_fits_in_hwi (off_sub_expr))
+		return NULL_TREE;
+
+	      off_sub = TREE_INT_CST_LOW (off_sub_expr);
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int)sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to host endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1852,56 +1972,88 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
   return NULL_TREE;
 }
 
-/* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+/* Check if STMT completes a bswap implementation or a read in a given
+   endianness consisting of ORs, SHIFTs and ANDs and sets *swap
+   accordingly. It also sets *memsrc if the source lies in memory and
+   in this case also sets *size to the size of the load needed. At
+   last, the function returns the source tree expression.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap_or_nop (gimple stmt, bool *memsrc, bool *swap, int *size)
 {
-/* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
-   to the size of the symbolic number before using it.  */
-  unsigned HOST_WIDEST_INT cmp =
+/* The number which the find_bswap_or_nop_1 result should match in order
+   to have a full byte swap.  The number is shifted to the right
+   according to the size of the symbolic number before using it.  */
+  unsigned HOST_WIDEST_INT cmpxchg =
     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
+  unsigned HOST_WIDEST_INT cmpnop = CMPNOP;
 
   struct symbolic_number n;
   tree source_expr;
-  int limit;
+  int limit, rsize;
 
   /* The last parameter determines the depth search limit.  It usually
      correlates directly to the number of bytes to be touched.  We
      increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
-  source_expr =  find_bswap_1 (stmt, &n, limit);
+  source_expr =  find_bswap_or_nop_1 (stmt, &n, limit);
 
   if (!source_expr)
     return NULL_TREE;
 
-  /* Zero out the extra bits of N and CMP.  */
+  /* Zero out the extra bits of N and CMP*.  */
   if (n.size < (int)sizeof (HOST_WIDEST_INT))
     {
+      int tmpn;
       unsigned HOST_WIDEST_INT mask =
 	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
 
       n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      /* Find real size of result (highest non zero byte).  */
+      for (tmpn = n.n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      cmpxchg >>= (sizeof (HOST_WIDEST_INT) - rsize) * BITS_PER_UNIT;
+      cmpnop &= mask >> ((n.size - rsize) * BITS_PER_UNIT);
     }
+  else
+    rsize = n.size;
 
   /* A complete byte swap should make the symbolic number to start
-     with the largest digit in the highest order byte.  */
-  if (cmp != n.n)
+     with the largest digit in the highest order byte. Unchanged symbolic
+     number indicates a read with same endianness as host architecture.  */
+  if (n.n != cmpnop && n.n != cmpxchg)
+    return NULL_TREE;
+
+  *memsrc = 0;
+
+  if (n.base_addr)
+    {
+      *memsrc = 1;
+      n.size = rsize;
+    }
+  else if (rsize != n.size)
+    return NULL_TREE;
+  else if (n.n == cmpnop)
+    /* Useless bit manipulation performed by code.  */
     return NULL_TREE;
 
+
+  if (n.n == cmpxchg)
+    *swap = 1;
+  else
+    *swap = 0;
+
+  *size = n.size * BITS_PER_UNIT;
   return source_expr;
 }
 
-/* Find manual byte swap implementations and turn them into a bswap
-   builtin invokation.  */
+/* Find manual byte swap implementations as well as load in a given
+   endianness. Byte swaps are turned into a bswap builtin invokation
+   while endian loads are converted to bswap builtin invokation or
+   simple load according to the host endianness.  */
 
 static unsigned int
 execute_optimize_bswap (void)
@@ -1925,9 +2077,6 @@ execute_optimize_bswap (void)
 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
 		   || (bswap32_p && word_mode == SImode)));
 
-  if (!bswap16_p && !bswap32_p && !bswap64_p)
-    return 0;
-
   /* Determine the argument type of the builtins.  The code later on
      assumes that the return and argument type are the same.  */
   if (bswap16_p)
@@ -1961,21 +2110,25 @@ execute_optimize_bswap (void)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
-	  tree bswap_tmp;
+	  tree src, tmp, tgt, load_type, bswap_type = NULL;
 	  tree fndecl = NULL_TREE;
 	  int type_size;
+	  bool memsrc, swap;
 	  gimple call;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  src = find_bswap_or_nop (stmt, &memsrc, &swap, &type_size);
+
+	  if (!src)
+	    continue;
 
 	  switch (type_size)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -1983,6 +2136,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -1990,6 +2144,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2000,13 +2155,67 @@ execute_optimize_bswap (void)
 	      continue;
 	    }
 
-	  if (!fndecl)
+	  if (swap && !fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
+	  tgt = gimple_assign_lhs (stmt);
 
-	  if (!bswap_src)
-	    continue;
+	  /* Need to load the value from memory first.  */
+	  if (memsrc)
+	    {
+	      tree addr_expr, addr_tmp, val_expr, val_tmp;
+	      tree load_ptr_type, load_offset_ptr;
+	      gimple addr_stmt, load_stmt;
+
+	      changed = true;
+
+	      /*  Compute address to load from and cast according to the size
+		  of the load.  */
+	      load_ptr_type = build_pointer_type (load_type);
+	      addr_expr = build1 (ADDR_EXPR, load_ptr_type, src);
+	      addr_tmp = make_temp_ssa_name (load_ptr_type, NULL, "load_src");
+	      addr_stmt = gimple_build_assign_with_ops
+			 (NOP_EXPR, addr_tmp, addr_expr, NULL);
+	      gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+
+	      /* Perform the load.  */
+	      load_offset_ptr = build_int_cst (load_ptr_type, 0);
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+	      load_stmt = gimple_build_assign_with_ops
+			 (MEM_REF, val_tmp, val_expr, NULL);
+
+	      if (!swap)
+		{
+		  /* Convert the result of load if necessary.  */
+		  if (!useless_type_conversion_p (TREE_TYPE (tgt),
+						  TREE_TYPE (val_tmp)))
+		    {
+		      gimple convert_stmt;
+		      convert_stmt = gimple_build_assign_with_ops
+			     (NOP_EXPR, tgt, val_tmp, NULL);
+		      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
+		    }
+		  else
+		    gimple_assign_set_lhs (load_stmt, tgt);
+
+		  if (dump_file)
+		    {
+		      fprintf (dump_file,
+			       "%d bit load in host endianness found at: ",
+			       (int)type_size);
+		      print_gimple_stmt (dump_file, stmt, 0, 0);
+		    }
+
+		  gsi_insert_after (&gsi, load_stmt, GSI_SAME_STMT);
+		  gsi_remove (&gsi, true);
+
+		  continue;
+		}
+	      else
+		gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
+	      src = val_tmp;
+	    }
 
 	  changed = true;
 	  if (type_size == 16)
@@ -2016,33 +2225,33 @@ execute_optimize_bswap (void)
 	  else
 	    bswap_stats.found_64bit++;
 
-	  bswap_tmp = bswap_src;
+	  tmp = src;
 
 	  /* Convert the src expression if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
+	  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
 	    {
 	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
+	      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
 	      convert_stmt = gimple_build_assign_with_ops
-		  		(NOP_EXPR, bswap_tmp, bswap_src, NULL);
+				(NOP_EXPR, tmp, src, NULL);
 	      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
 	    }
 
-	  call = gimple_build_call (fndecl, 1, bswap_tmp);
+	  call = gimple_build_call (fndecl, 1, tmp);
 
-	  bswap_tmp = gimple_assign_lhs (stmt);
+	  tmp = tgt;
 
 	  /* Convert the result if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
+	  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
 	    {
 	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
+	      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
 	      convert_stmt = gimple_build_assign_with_ops
-			(NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
+			(NOP_EXPR, tgt, tmp, NULL);
 	      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
 	    }
 
-	  gimple_call_set_lhs (call, bswap_tmp);
+	  gimple_call_set_lhs (call, tmp);
 
 	  if (dump_file)
 	    {

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2014-06-06 10:45 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-09 10:26 [PATCH] Fix PR54733 Optimize endian independent load/store Thomas Preud'homme
2014-05-16 10:07 ` Thomas Preud'homme
2014-05-16 10:48   ` Richard Biener
2014-05-16 10:56     ` pinskia
2014-05-16 11:03       ` Richard Biener
2014-05-16 11:14         ` Richard Biener
2014-05-16 16:59           ` pinskia
2014-05-19  2:49             ` Thomas Preud'homme
2014-05-20  2:50             ` Andrew Pinski
2014-05-19  9:30           ` Thomas Preud'homme
2014-05-19  9:46             ` Richard Biener
2014-05-20  2:46         ` Thomas Preud'homme
2014-05-20  9:06           ` Richard Biener
2014-05-20 10:29             ` Thomas Preud'homme
2014-05-21  1:00               ` Thomas Preud'homme
2014-05-22 10:07                 ` Richard Biener
2014-05-23  3:36                   ` Thomas Preud'homme
2014-05-26 10:57                     ` Christophe Lyon
2014-05-26 11:06                       ` Thomas Preud'homme
2014-05-27  7:23                         ` Thomas Preud'homme
2014-05-27 14:35                           ` Christophe Lyon
2014-05-27 16:45                           ` Andreas Schwab
2014-05-29  8:18                             ` Thomas Preud'homme
2014-05-29  8:52                               ` Andreas Schwab
2014-05-29  9:58                                 ` Thomas Preud'homme
2014-06-02 13:57                                   ` Christophe Lyon
2014-06-04  7:04                                     ` Thomas Preud'homme
2014-06-04  9:52                                       ` Richard Biener
2014-06-06 10:45                                         ` Thomas Preud'homme
2014-05-24 18:56                 ` Andreas Schwab
  -- strict thread matches above, loose matches on Subject: below --
2014-03-19  7:24 Thomas Preud'homme

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).