public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* RE: [PATCH] Fix PR54733 Optimize endian independent load/store
@ 2014-05-09 10:26 Thomas Preud'homme
  2014-05-16 10:07 ` Thomas Preud'homme
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Preud'homme @ 2014-05-09 10:26 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2127 bytes --]

Sorry, took longer than expected as I got distracted by some other patch.
I merged the whole patchset in a single patch as I was told the current setup
is actually more difficult to read.

Here are the updated ChangeLogs:

*** gcc/ChangeLog ***

2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* expr.c (get_inner_reference): Add a parameter to control whether a
	MEM_REF should be split into base + offset.
	* tree.h (get_inner_reference): Default new parameter to false.
	* tree-ssa-math-opts.c (nop_stats): New "bswap_stats" structure.
	(CMPNOP): Define.
	(find_bswap_or_nop_load): New.
	(find_bswap_1): Renamed to ...
	(find_bswap_or_nop_1): This. Also add support for memory source.
	(find_bswap): Renamed to ...
	(find_bswap_or_nop): This. Also add support for memory source and
	detection of bitwise operations equivalent to load in host endianness.
	(execute_optimize_bswap): Likewise. Also move its leading comment back
	in place and split statement transformation into ...
	(bswap_replace): This. Add assert when updating bswap_stats.

*** gcc/testsuite/ChangeLog ***

2014-05-09  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources and bitwise operations
	equivalent to load in host endianness.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise.

Ok for trunk?

Best regards,

Thomas

> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Thomas Preud'homme
> Sent: Monday, May 05, 2014 7:30 PM
> To: GCC Patches
> Subject: RE: [PATCH][2/3] Fix PR54733 Optimize endian independent
> load/store
> 
> I found a way to improve the function find_bswap/find_bswap_or_nop
> and reduce its size. Please hold for the review, I will post an updated
> version as soon as I finish testing.
> 
> Best regards,
> 
> Thomas Preud'homme
> 
> 
> 

[-- Attachment #2: gcc32rm-84.5.4.diff --]
[-- Type: application/octet-stream, Size: 32901 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index fec6194..864ee20 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -6669,7 +6669,10 @@ store_field (rtx target, HOST_WIDE_INT bitsize, HOST_WIDE_INT bitpos,
 \f
 /* Given an expression EXP that may be a COMPONENT_REF, a BIT_FIELD_REF,
    an ARRAY_REF, or an ARRAY_RANGE_REF, look for nested operations of these
-   codes and find the ultimate containing object, which we return.
+   codes and find the ultimate containing object, which we return. If that
+   object is a MEM_REF that is not the address of an object, ALLOW_ADDRESS
+   decides whether the pointer is returned instead of the memory reference.
+   Note that the behavior for TARGET_MEM_REF is not affected by this parameter.
 
    We set *PBITSIZE to the size in bits that we want, *PBITPOS to the
    bit position, and *PUNSIGNEDP to the signedness of the field.
@@ -6706,7 +6709,7 @@ tree
 get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 		     HOST_WIDE_INT *pbitpos, tree *poffset,
 		     enum machine_mode *pmode, int *punsignedp,
-		     int *pvolatilep, bool keep_aligning)
+		     int *pvolatilep, bool keep_aligning, bool allow_address)
 {
   tree size_tree = 0;
   enum machine_mode mode = VOIDmode;
@@ -6838,7 +6841,7 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 
 	case MEM_REF:
 	  /* Hand back the decl for MEM[&decl, off].  */
-	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+	  if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR || allow_address)
 	    {
 	      tree off = TREE_OPERAND (exp, 1);
 	      if (!integer_zerop (off))
@@ -6848,7 +6851,10 @@ get_inner_reference (tree exp, HOST_WIDE_INT *pbitsize,
 				      ? 3 : exact_log2 (BITS_PER_UNIT));
 		  bit_offset += boff;
 		}
-	      exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR)
+		exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
+	      else
+		exp = TREE_OPERAND (exp, 0);
 	    }
 	  goto done;
 
diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..e91b487
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,90 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef __UINT32_TYPE__ unsigned;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_be32 (union bf_or_uint32 in)
+{
+  return in.bfval.f3 | (in.bfval.f2 << 8)
+	 | (in.bfval.f1 << 16) | (in.bfval.f0 << 24);
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_le32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c0 | c1 << 8 | c2 << 16 | c3 << 24;
+}
+
+__attribute__ ((noinline, noclone)) uint32_t
+fake_read_be32 (char *x, char *y)
+{
+  unsigned char c0, c1, c2, c3;
+
+  c0 = x[0];
+  c1 = x[1];
+  *y = 1;
+  c2 = x[2];
+  c3 = x[3];
+  return c3 | c2 << 8 | c1 << 16 | c0 << 24;
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+  char cin[] = { 0x83, 0x85, 0x87, 0x89 };
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_be32 (bfin);
+  if (out != 0x03050709 && out != 0x82848688)
+    __builtin_abort ();
+  out = fake_read_le32 (cin, &cin[2]);
+  if (out != 0x89018583)
+    __builtin_abort ();
+  out = fake_read_be32 (cin, &cin[2]);
+  if (out != 0x83850189)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..a10c23f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,82 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_le64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *data | ((uint64_t) *(data + 1) << 8)
+	 | ((uint64_t) *(data + 2) << 16) | ((uint64_t) *(data + 3) << 24)
+	 | ((uint64_t) *(data + 4) << 32) | ((uint64_t) *(data + 5) << 40)
+	 | ((uint64_t) *(data + 6) << 48) | ((uint64_t) *(data + 7) << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+uint64_t read_be64_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 8);
+  return (uint64_t) *(data + 7) | ((uint64_t) *(data + 6) << 8)
+	 | ((uint64_t) *(data + 5) << 16) | ((uint64_t) *(data + 4) << 24)
+	 | ((uint64_t) *(data + 3) << 32) | ((uint64_t) *(data + 2) << 40)
+	 | ((uint64_t) *(data + 1) << 48) | ((uint64_t) *data << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..b57cd92
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_le16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *data | (*(data + 1) << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+uint32_t read_be16_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 2);
+  return *(data + 1) | (*data << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..e59b310
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,67 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_le32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *data | (*(data + 1) << 8) | (*(data + 2) << 16)
+	 | (*(data + 3) << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+uint32_t read_be32_3 (void)
+{
+  unsigned char *data;
+
+  read_aux (data, 4);
+  return *(data + 3) | (*(data + 2) << 8) | (*(data + 1) << 16)
+	 | (*data << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in host endianness found at" 3 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 3 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index b965ad1..7e0212d 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -98,6 +98,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "is-a.h"
 #include "gimple.h"
 #include "gimple-iterator.h"
+#include "gimplify.h"
 #include "gimplify-me.h"
 #include "stor-layout.h"
 #include "gimple-ssa.h"
@@ -170,15 +171,15 @@ static struct
 
 static struct
 {
-  /* Number of hand-written 16-bit bswaps found.  */
+  /* Number of hand-written 16-bit nop / bswaps found.  */
   int found_16bit;
 
-  /* Number of hand-written 32-bit bswaps found.  */
+  /* Number of hand-written 32-bit nop / bswaps found.  */
   int found_32bit;
 
-  /* Number of hand-written 64-bit bswaps found.  */
+  /* Number of hand-written 64-bit nop / bswaps found.  */
   int found_64bit;
-} bswap_stats;
+} nop_stats, bswap_stats;
 
 static struct
 {
@@ -1606,13 +1607,43 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number is also associated a base address (the array or structure the load is
+   made from), an offset from the base address and a range which gives the
+   difference between the highest and lowest accessed memory location to make
+   such a symbolic number. The range is thus different from size which reflects
+   the size of the type of current expression. Note that for non memory source,
+   range holds the same value as size.
+
+   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
+   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
+   still have a size of 2 but this time a range of 1.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  HOST_WIDE_INT bytepos;
+  tree alias_set;
+  tree vuse;
+  unsigned HOST_WIDE_INT range;
 };
 
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a nop.  The number is masked according to the size of
+   the symbolic number before using it.  */
+#define CMPNOP (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201)
+
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a byte swap.  The number is masked according to the
+   size of the symbolic number before using it.  */
+#define CMPXCHG (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708)
+
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
    number N.  Return false if the requested operation is not permitted
    on a symbolic number.  */
@@ -1672,13 +1703,44 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
   return true;
 }
 
-/* find_bswap_1 invokes itself recursively with N and tries to perform
-   the operation given by the rhs of STMT on the result.  If the
-   operation could successfully be executed the function returns the
-   tree expression of the source operand and NULL otherwise.  */
+/* Check if STMT might be a byte swap or a nop from a memory source and returns
+   the answer. If so, REF is that memory source and the base of the memory area
+   accessed and the offset of the access from that base are recorded in N.  */
+
+bool
+find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
+{
+  /* Leaf node is an array or component ref. Memorize its base and
+     offset from base to compare to other such leaf node.  */
+  HOST_WIDE_INT bitsize, bitpos;
+  enum machine_mode mode;
+  int unsignedp, volatilep;
+
+  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
+    return false;
+
+  n->base_addr = get_inner_reference (ref, &bitsize, &bitpos, &n->offset,
+				      &mode, &unsignedp, &volatilep, false,
+				      true);
+
+  if (bitpos % BITS_PER_UNIT)
+    return false;
+  if (bitsize % BITS_PER_UNIT)
+    return false;
+
+  n->bytepos = bitpos / BITS_PER_UNIT;
+  n->alias_set = reference_alias_ptr_type (ref);
+  n->vuse = gimple_vuse (stmt);
+  return true;
+}
+
+/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
+   the operation given by the rhs of STMT on the result.  If the operation
+   could successfully be executed the function returns the tree expression of
+   the source operand and NULL otherwise.  */
 
 static tree
-find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
+find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
 {
   enum tree_code code;
   tree rhs1, rhs2 = NULL;
@@ -1691,6 +1753,9 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 
   rhs1 = gimple_assign_rhs1 (stmt);
 
+  if (find_bswap_or_nop_load (stmt, rhs1, n))
+    return rhs1;
+
   if (TREE_CODE (rhs1) != SSA_NAME)
     return NULL_TREE;
 
@@ -1717,11 +1782,11 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  && code != CONVERT_EXPR)
 	return NULL_TREE;
 
-      source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
+      source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
 
-      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
-	 to initialize the symbolic number.  */
-      if (!source_expr1)
+      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
+	 we have to initialize the symbolic number.  */
+      if (!source_expr1 || gimple_assign_load_p (rhs1_stmt))
 	{
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
@@ -1731,14 +1796,18 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
-	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
+	  n->range = n->size;
+	  n->n = CMPNOP;
 
 	  if (n->size < (int)sizeof (HOST_WIDEST_INT))
 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
 		     (n->size * BITS_PER_UNIT)) - 1;
 
-	  source_expr1 = rhs1;
+	  if (!source_expr1)
+	    {
+	      n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
+	      source_expr1 = rhs1;
+	    }
 	}
 
       switch (code)
@@ -1779,6 +1848,8 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 		n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
 	      }
 	    n->size = type_size / BITS_PER_UNIT;
+	    if (!n->base_addr)
+	      n->range = n->size;
 	  }
 	  break;
 	default:
@@ -1807,17 +1878,79 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
       switch (code)
 	{
 	case BIT_IOR_EXPR:
-	  source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
+	  source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
 
 	  if (!source_expr1)
 	    return NULL_TREE;
 
-	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
+	  source_expr2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
+
+	  if (n1.size != n2.size || !source_expr2)
+	    return NULL_TREE;
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (!n1.vuse != !n2.vuse ||
+	  (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || !operand_equal_p (n1.base_addr, n2.base_addr, 0))
+		return NULL_TREE;
+	      if (!n1.offset != !n2.offset ||
+	          (n1.offset && !operand_equal_p (n1.offset, n2.offset, 0)))
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (n2.bytepos < n1.bytepos)
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub = n2.bytepos - n1.bytepos;
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int) sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to host endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  if (!n1.alias_set
+	      || alias_ptr_types_compatible_p (n1.alias_set, n2.alias_set))
+	    n->alias_set = n1.alias_set;
+	  else
+	    n->alias_set = ptr_type_node;
+	  n->vuse = n1.vuse;
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
+	  n->bytepos = n1.bytepos;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1842,57 +1975,75 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
   return NULL_TREE;
 }
 
-/* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+/* Check if STMT completes a bswap implementation or a read in a given
+   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
+   accordingly.  It also sets N to represent the kind of operations
+   performed: size of the resulting expression and whether it works on
+   a memory source, and if so alias-set and vuse.  At last, the
+   function returns the source tree expression.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
 {
-/* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
-   to the size of the symbolic number before using it.  */
-  unsigned HOST_WIDEST_INT cmp =
-    sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-    (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
-
-  struct symbolic_number n;
+/* The number which the find_bswap_or_nop_1 result should match in order
+   to have a full byte swap.  The number is shifted to the right
+   according to the size of the symbolic number before using it.  */
+  unsigned HOST_WIDEST_INT cmpxchg = CMPXCHG;
+  unsigned HOST_WIDEST_INT cmpnop = CMPNOP;
+
   tree source_expr;
   int limit;
 
   /* The last parameter determines the depth search limit.  It usually
-     correlates directly to the number of bytes to be touched.  We
-     increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     correlates directly to the number n of bytes to be touched.  We
+     increase that number by log2(n) + 1 here in order to also
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
-  source_expr =  find_bswap_1 (stmt, &n, limit);
+  source_expr =  find_bswap_or_nop_1 (stmt, n, limit);
 
   if (!source_expr)
     return NULL_TREE;
 
-  /* Zero out the extra bits of N and CMP.  */
-  if (n.size < (int)sizeof (HOST_WIDEST_INT))
+  /* Find real size of result (highest non zero byte).  */
+  if (n->base_addr)
     {
-      unsigned HOST_WIDEST_INT mask =
-	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
+      int rsize;
+      unsigned HOST_WIDEST_INT tmpn;
 
-      n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      n->range = rsize;
     }
 
-  /* A complete byte swap should make the symbolic number to start
-     with the largest digit in the highest order byte.  */
-  if (cmp != n.n)
+  /* Zero out the extra bits of N and CMP*.  */
+  if (n->range < (int)sizeof (HOST_WIDEST_INT))
+    {
+      unsigned HOST_WIDEST_INT mask;
+
+      mask = ((unsigned HOST_WIDEST_INT)1 << (n->range * BITS_PER_UNIT)) - 1;
+      cmpxchg >>= (sizeof (HOST_WIDEST_INT) - n->range) * BITS_PER_UNIT;
+      cmpnop &= mask;
+    }
+
+  /* A complete byte swap should make the symbolic number to start with
+     the largest digit in the highest order byte. Unchanged symbolic
+     number indicates a read with same endianness as host architecture.  */
+  if (n->n == cmpnop)
+    *bswap = false;
+  else if (n->n == cmpxchg)
+    *bswap = true;
+  else
+    return NULL_TREE;
+
+  /* Useless bit manipulation performed by code.  */
+  if (!n->base_addr && n->n == cmpnop)
     return NULL_TREE;
 
+  n->range *= BITS_PER_UNIT;
   return source_expr;
 }
 
-/* Find manual byte swap implementations and turn them into a bswap
-   builtin invokation.  */
-
 namespace {
 
 const pass_data pass_data_optimize_bswap =
@@ -1926,6 +2077,146 @@ public:
 
 }; // class pass_optimize_bswap
 
+/* Perform the bswap optimization: replace the statement STMT at GSI
+   with load type, VUSE and set-alias as described by N if a memory
+   source is involved (N->base_addr is non null), followed by the
+   builtin bswap invocation in FNDECL if BSWAP is true.  SRC gives
+   the source on which STMT is operating and N->range gives the
+   size of the expression involved for maintaining some statistics.  */
+
+static void
+bswap_replace (gimple stmt, gimple_stmt_iterator *gsi, tree src, tree fndecl,
+	       tree bswap_type, tree load_type, struct symbolic_number *n,
+	       bool bswap)
+{
+  tree tmp, tgt;
+  gimple call;
+
+  tgt = gimple_assign_lhs (stmt);
+
+  /* Need to load the value from memory first.  */
+  if (n->base_addr)
+    {
+      tree addr_expr, addr_tmp, val_expr, val_tmp;
+      tree load_offset_ptr;
+      gimple addr_stmt, load_stmt;
+
+      /*  Compute address to load from and cast according to the size
+	  of the load.  */
+      addr_expr = build_fold_addr_expr (unshare_expr (src));
+      if (is_gimple_min_invariant (addr_expr))
+	addr_tmp = addr_expr;
+      else
+	{
+	  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
+					 "load_src");
+	  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
+	  gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT);
+	}
+
+      /* Perform the load.  */
+      load_offset_ptr = build_int_cst (n->alias_set, 0);
+      val_expr = fold_build2 (MEM_REF, load_type, addr_tmp,
+			      load_offset_ptr);
+
+      if (!bswap)
+	{
+	  if (n->range == 16)
+	    nop_stats.found_16bit++;
+	  else if (n->range == 32)
+	    nop_stats.found_32bit++;
+	  else
+	    {
+	      gcc_assert (n->range == 64);
+	      nop_stats.found_64bit++;
+	    }
+
+	  /* Convert the result of load if necessary.  */
+	  if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
+	    {
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      load_stmt = gimple_build_assign (val_tmp, val_expr);
+	      gimple_set_vuse (load_stmt, n->vuse);
+	      gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
+	      gimple_assign_set_rhs_with_ops_1 (gsi, NOP_EXPR, val_tmp,
+						NULL_TREE, NULL_TREE);
+	    }
+	  else
+	    gimple_assign_set_rhs_with_ops_1 (gsi, MEM_REF, val_expr,
+					      NULL_TREE, NULL_TREE);
+	  update_stmt (gsi_stmt (*gsi));
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file,
+		       "%d bit load in host endianness found at: ",
+		       (int)n->range);
+	      print_gimple_stmt (dump_file, stmt, 0, 0);
+	    }
+	  return;
+	}
+      else
+	{
+	  val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	  load_stmt = gimple_build_assign (val_tmp, val_expr);
+	  gimple_set_vuse (load_stmt, n->vuse);
+	  gsi_insert_before (gsi, load_stmt, GSI_SAME_STMT);
+	}
+      src = val_tmp;
+    }
+
+  if (n->range == 16)
+    bswap_stats.found_16bit++;
+  else if (n->range == 32)
+    bswap_stats.found_32bit++;
+  else
+    {
+      gcc_assert (n->range == 64);
+      bswap_stats.found_64bit++;
+    }
+
+  tmp = src;
+
+  /* Convert the src expression if necessary.  */
+  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
+    {
+      gimple convert_stmt;
+      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
+      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tmp, src, NULL);
+      gsi_insert_before (gsi, convert_stmt, GSI_SAME_STMT);
+    }
+
+  call = gimple_build_call (fndecl, 1, tmp);
+
+  tmp = tgt;
+
+  /* Convert the result if necessary.  */
+  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
+    {
+      gimple convert_stmt;
+      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
+      convert_stmt = gimple_build_assign_with_ops (NOP_EXPR, tgt, tmp, NULL);
+      gsi_insert_after (gsi, convert_stmt, GSI_SAME_STMT);
+    }
+
+  gimple_call_set_lhs (call, tmp);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "%d bit bswap implementation found at: ",
+	       (int)n->range);
+      print_gimple_stmt (dump_file, stmt, 0, 0);
+    }
+
+  gsi_insert_after (gsi, call, GSI_SAME_STMT);
+  gsi_remove (gsi, true);
+}
+
+/* Find manual byte swap implementations as well as load in a given
+   endianness. Byte swaps are turned into a bswap builtin invokation
+   while endian loads are converted to bswap builtin invokation or
+   simple load according to the host endianness.  */
+
 unsigned int
 pass_optimize_bswap::execute (function *fun)
 {
@@ -1948,9 +2239,6 @@ pass_optimize_bswap::execute (function *fun)
 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
 		   || (bswap32_p && word_mode == SImode)));
 
-  if (!bswap16_p && !bswap32_p && !bswap64_p)
-    return 0;
-
   /* Determine the argument type of the builtins.  The code later on
      assumes that the return and argument type are the same.  */
   if (bswap16_p)
@@ -1971,6 +2259,7 @@ pass_optimize_bswap::execute (function *fun)
       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
     }
 
+  memset (&nop_stats, 0, sizeof (nop_stats));
   memset (&bswap_stats, 0, sizeof (bswap_stats));
 
   FOR_EACH_BB_FN (bb, fun)
@@ -1984,21 +2273,24 @@ pass_optimize_bswap::execute (function *fun)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
-	  tree bswap_tmp;
-	  tree fndecl = NULL_TREE;
-	  int type_size;
-	  gimple call;
+	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE;
+	  tree src, load_type;
+	  struct symbolic_number n;
+	  bool bswap;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  src = find_bswap_or_nop (stmt, &n, &bswap);
 
-	  switch (type_size)
+	  if (!src)
+	    continue;
+
+	  switch (n.range)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -2006,6 +2298,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -2013,6 +2306,7 @@ pass_optimize_bswap::execute (function *fun)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2023,62 +2317,21 @@ pass_optimize_bswap::execute (function *fun)
 	      continue;
 	    }
 
-	  if (!fndecl)
-	    continue;
-
-	  bswap_src = find_bswap (stmt);
-
-	  if (!bswap_src)
+	  if (bswap && !fndecl)
 	    continue;
 
+	  bswap_replace (stmt, &gsi, src, fndecl, bswap_type, load_type, &n,
+			 bswap);
 	  changed = true;
-	  if (type_size == 16)
-	    bswap_stats.found_16bit++;
-	  else if (type_size == 32)
-	    bswap_stats.found_32bit++;
-	  else
-	    bswap_stats.found_64bit++;
-
-	  bswap_tmp = bswap_src;
-
-	  /* Convert the src expression if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
-	    {
-	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
-	      convert_stmt = gimple_build_assign_with_ops
-		  		(NOP_EXPR, bswap_tmp, bswap_src, NULL);
-	      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
-	    }
-
-	  call = gimple_build_call (fndecl, 1, bswap_tmp);
-
-	  bswap_tmp = gimple_assign_lhs (stmt);
-
-	  /* Convert the result if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
-	    {
-	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
-	      convert_stmt = gimple_build_assign_with_ops
-			(NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
-	      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
-	    }
-
-	  gimple_call_set_lhs (call, bswap_tmp);
-
-	  if (dump_file)
-	    {
-	      fprintf (dump_file, "%d bit bswap implementation found at: ",
-		       (int)type_size);
-	      print_gimple_stmt (dump_file, stmt, 0, 0);
-	    }
-
-	  gsi_insert_after (&gsi, call, GSI_SAME_STMT);
-	  gsi_remove (&gsi, true);
 	}
     }
 
+  statistics_counter_event (fun, "16-bit nop implementations found",
+			    nop_stats.found_16bit);
+  statistics_counter_event (fun, "32-bit nop implementations found",
+			    nop_stats.found_32bit);
+  statistics_counter_event (fun, "64-bit nop implementations found",
+			    nop_stats.found_64bit);
   statistics_counter_event (fun, "16-bit bswap implementations found",
 			    bswap_stats.found_16bit);
   statistics_counter_event (fun, "32-bit bswap implementations found",
diff --git a/gcc/tree.h b/gcc/tree.h
index ae4876d..0cb2abe 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -4539,7 +4539,7 @@ extern tree build_personality_function (const char *);
    the access position and size.  */
 extern tree get_inner_reference (tree, HOST_WIDE_INT *, HOST_WIDE_INT *,
 				 tree *, enum machine_mode *, int *, int *,
-				 bool);
+				 bool, bool = false);
 
 /* Return a tree representing the lower bound of the array mentioned in
    EXP, an ARRAY_REF or an ARRAY_RANGE_REF.  */

^ permalink raw reply	[flat|nested] 31+ messages in thread
* [PATCH] Fix PR54733 Optimize endian independent load/store
@ 2014-03-19  7:24 Thomas Preud'homme
  0 siblings, 0 replies; 31+ messages in thread
From: Thomas Preud'homme @ 2014-03-19  7:24 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3693 bytes --]

Hi everybody,

*** Motivation ***

Currently gcc is capable of replacing hand-crafted implementation of byteswap by a suitable instruction thanks to the bswap optimization pass. The patch proposed here aims at extending this pass to also optimize load in a specific endianness, independent of the host endianness.

*** Methodology ***

The patch adds support for dealing with a memory source (array or structure) and detect whether the result of a bitwise operation happens to be equivalent to a big endian or little endian load and replace it by a load or a load and a byteswap according to the host endianness. The original code used the concept of symbolic number: a number where the value of each byte indicates its position (in terms of weight) before the bitwise manipulation. After performing the bit manipulation on that symbolic number, the result tells how the byte were shuffled (see variable cmp in function find_bswap). Detecting an operation resulting in a number in the host endianness is thus pretty straightforward: look if the symbolic number has *not* changed.

As to supporting read from array and structure, there is some logic to recognize the base of the array/structure and the offset of entries/fields accessed to check if the range of memory accessed would fit in an integer. Each entries is initially treated independently and when they are ORed together the values in the symbolic number are updated according to the host endianness: the entry of higher address would see its values incremented on a little endian machine.

Note that as it stands the patch does not work for arrays indexed with variable (such a tab[a] || (tab[a+1] << 8)) because fold_const does not fold (a + 1) - a. If such cases were folded, the number of cases detected would automatically be increased due to the use of fold_build2 to compare two offsets.

This patch also adds a few testcases to check both (i) that the optimization works as expected and (ii) that the result are correct. It also define new effective targets (bswap16, bswap32 and bswap64) to centralize the information about what target supports  byte swap instructions for the testsuite and modify existing tests to use these new effective targets.

The patch is quite big but could be split if necessary. A big part of the code added is for handling memory source and it would be difficult to split it but variable renaming and introduction of bwapXX effective target could be made separately to reduce the noise. The patch is too big so is only in attachment of this email.

The ChangeLog are as follows:

*** gcc/ChangeLog ***

2014-03-19  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* tree-ssa-math-opts.c (find_bswap_1): Renamed to ...
	(find_bswap_or_nop_1): This. Also add support for memory source.
	(find_bswap): Renamed to ...
	(find_bswap_or_nop): This. Also add support for memory source and
	detection of noop bitwise operations.
	(execute_optimize_bswap): Likewise.

*** gcc/testsuite/ChangeLog ***

2014-03-19  Thomas Preud'homme  <thomas.preudhomme@arm.com>

	PR tree-optimization/54733
	* lib/target-supports.exp: New effective targets for architectures
	capable of performing byte swap.
	* gcc.dg/optimize-bswapdi-1.c: Convert to new bswap target.
	* gcc.dg/optimize-bswapdi-2.c: Likewise.
	* gcc.dg/optimize-bswapsi-1.c: Likewise.
	* gcc.dg/optimize-bswapdi-3.c: New test to check extension of bswap
	optimization to support memory sources.
	* gcc.dg/optimize-bswaphi-1.c: Likewise.
	* gcc.dg/optimize-bswapsi-2.c: Likewise.
	* gcc.c-torture/execute/bswap-2.c: Likewise.

Is this ok for stage 1?

Best regards,

Thomas

[-- Attachment #2: gcc32rm-84.3.diff --]
[-- Type: application/octet-stream, Size: 26868 bytes --]

diff --git a/gcc/testsuite/gcc.c-torture/execute/bswap-2.c b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
new file mode 100644
index 0000000..35ca743
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/bswap-2.c
@@ -0,0 +1,46 @@
+#ifdef __UINT32_TYPE__
+typedef __UINT32_TYPE__ uint32_t;
+#else
+typedef __UINT32_TYPE__ unsigned;
+#endif
+
+struct bitfield {
+  unsigned char f0:7;
+  unsigned char f1:7;
+  unsigned char f2:7;
+  unsigned char f3:7;
+};
+
+struct ok {
+  unsigned char f0;
+  unsigned char f1;
+  unsigned char f2;
+  unsigned char f3;
+};
+
+union bf_or_uint32 {
+  struct ok inval;
+  struct bitfield bfval;
+};
+
+__attribute__ ((noinline, noclone)) uint32_t
+partial_read_le32 (union bf_or_uint32 in)
+{
+  return in.bfval.f0 | (in.bfval.f1 << 8)
+	 | (in.bfval.f2 << 16) | (in.bfval.f3 << 24);
+}
+
+int
+main ()
+{
+  union bf_or_uint32 bfin;
+  uint32_t out;
+
+  if (sizeof (uint32_t) * __CHAR_BIT__ != 32)
+    return 0;
+  bfin.inval = (struct ok) { 0x83, 0x85, 0x87, 0x89 };
+  out = partial_read_le32 (bfin);
+  if (out != 0x09070503 && out != 0x88868482)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c
index 7d557f3..a9c3443 100644
--- a/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-1.c
@@ -1,6 +1,6 @@
-/* { dg-do compile { target arm*-*-* alpha*-*-* ia64*-*-* x86_64-*-* s390x-*-* powerpc*-*-* rs6000-*-* } } */
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap64 } */
 /* { dg-require-effective-target stdint_types } */
-/* { dg-require-effective-target lp64 } */
 /* { dg-options "-O2 -fdump-tree-bswap" } */
 
 #include <stdint.h>
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c
index 6e2821d..277449b 100644
--- a/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-2.c
@@ -1,6 +1,6 @@
-/* { dg-do compile { target arm*-*-* alpha*-*-* ia64*-*-* x86_64-*-* s390x-*-* powerpc*-*-* rs6000-*-* } } */
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap64 } */
 /* { dg-require-effective-target stdint_types } */
-/* { dg-require-effective-target lp64 } */
 /* { dg-options "-O2 -fdump-tree-bswap" } */
 
 #include <stdint.h>
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
new file mode 100644
index 0000000..9e19a11
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapdi-3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap64 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+
+#include <stdint.h>
+
+struct uint64_st {
+  unsigned char u0, u1, u2, u3, u4, u5, u6, u7;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint64_t read_le64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[0] | ((uint64_t) data[1] << 8)
+	 | ((uint64_t) data[2] << 16) | ((uint64_t) data[3] << 24)
+	 | ((uint64_t) data[4] << 32) | ((uint64_t) data[5] << 40)
+	 | ((uint64_t) data[6] << 48) | ((uint64_t) data[7] << 56);
+}
+
+uint64_t read_le64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u0 | ((uint64_t) data.u1 << 8)
+	 | ((uint64_t) data.u2 << 16) | ((uint64_t) data.u3 << 24)
+	 | ((uint64_t) data.u4 << 32) | ((uint64_t) data.u5 << 40)
+	 | ((uint64_t) data.u6 << 48) | ((uint64_t) data.u7 << 56);
+}
+
+uint64_t read_be64_1 (void)
+{
+  unsigned char data[8];
+
+  read_aux (data, 8);
+  return (uint64_t) data[7] | ((uint64_t) data[6] << 8)
+	 | ((uint64_t) data[5] << 16) | ((uint64_t) data[4] << 24)
+	 | ((uint64_t) data[3] << 32) | ((uint64_t) data[2] << 40)
+	 | ((uint64_t) data[1] << 48) | ((uint64_t) data[0] << 56);
+}
+
+uint64_t read_be64_2 (void)
+{
+  struct uint64_st data;
+
+  read_aux (&data, 8);
+  return (uint64_t) data.u7 | ((uint64_t) data.u6 << 8)
+	 | ((uint64_t) data.u5 << 16) | ((uint64_t) data.u4 << 24)
+	 | ((uint64_t) data.u3 << 32) | ((uint64_t) data.u2 << 40)
+	 | ((uint64_t) data.u1 << 48) | ((uint64_t) data.u0 << 56);
+}
+
+/* { dg-final { scan-tree-dump-times "64 bit load in host endianness found at" 2 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "64 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
new file mode 100644
index 0000000..a6bb4c7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswaphi-1.c
@@ -0,0 +1,49 @@
+/* { dg-do compile { target { ! alpha*-*-* } } } */
+/* { dg-require-effective-target bswap16 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint16_st {
+  unsigned char u0, u1;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[0] | (data[1] << 8);
+}
+
+uint32_t read_le16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u0 | (data.u1 << 8);
+}
+
+uint32_t read_be16_1 (void)
+{
+  unsigned char data[2];
+
+  read_aux (data, 2);
+  return data[1] | (data[0] << 8);
+}
+
+uint32_t read_be16_2 (void)
+{
+  struct uint16_st data;
+
+  read_aux (&data, 2);
+  return data.u1 | (data.u0 << 8);
+}
+
+/* { dg-final { scan-tree-dump-times "16 bit load in host endianness found at" 2 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "16 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c
index 78238e3..2ad44fb 100644
--- a/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-1.c
@@ -1,4 +1,5 @@
-/* { dg-do compile { target arm*-*-* alpha*-*-* i?86-*-* powerpc*-*-* rs6000-*-* x86_64-*-* s390*-*-* } } */
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap32 } */
 /* { dg-require-effective-target stdint_types } */
 /* { dg-options "-O2 -fdump-tree-bswap" } */
 /* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
new file mode 100644
index 0000000..82cb92f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c
@@ -0,0 +1,49 @@
+/* { dg-do compile { target *-*-* } } */
+/* { dg-require-effective-target bswap32 } */
+/* { dg-require-effective-target stdint_types } */
+/* { dg-options "-O2 -fdump-tree-bswap" } */
+/* { dg-options "-O2 -fdump-tree-bswap -march=z900" { target s390-*-* } } */
+
+#include <stdint.h>
+
+struct uint32_st {
+  unsigned char u0, u1, u2, u3;
+};
+
+uint32_t read_aux (void *, uint32_t);
+
+uint32_t read_le32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
+}
+
+uint32_t read_le32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u0 | (data.u1 << 8) | (data.u2 << 16) | (data.u3 << 24);
+}
+
+uint32_t read_be32_1 (void)
+{
+  unsigned char data[4];
+
+  read_aux (data, 4);
+  return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24);
+}
+
+uint32_t read_be32_2 (void)
+{
+  struct uint32_st data;
+
+  read_aux (&data, 4);
+  return data.u3 | (data.u2 << 8) | (data.u1 << 16) | (data.u0 << 24);
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in host endianness found at" 2 "bswap" } } */
+/* { dg-final { scan-tree-dump-times "32 bit bswap implementation found at" 2 "bswap" } } */
+/* { dg-final { cleanup-tree-dump "bswap" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index bee8471..71a346b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4732,6 +4732,87 @@ proc check_effective_target_sync_long_long_runtime { } {
     }
 }
 
+# Return 1 if the target supports bit byte swap instructions.
+
+proc check_effective_target_bswap { } {
+    global et_bswap_saved
+
+    if [info exists et_bswap_saved] {
+        verbose "check_effective_target_bswap: using cached result" 2
+    } else {
+	set et_bswap_saved 0
+	if { [istarget aarch64-*-*]
+	     || [istarget alpha*-*-*]
+	     || [istarget arm*-*-*]
+	     || [istarget i?86-*-*]
+	     || [istarget powerpc*-*-*]
+	     || [istarget rs6000-*-*]
+	     || [istarget s390*-*-*]
+	     || [istarget x86_64-*-*] } {
+	    set et_bswap_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap: returning $et_bswap_saved" 2
+    return $et_bswap_saved
+}
+
+# Return 1 if the target supports 16-bit byte swap instructions.
+
+proc check_effective_target_bswap16 { } {
+    global et_bswap16_saved
+
+    if [info exists et_bswap16_saved] {
+	verbose "check_effective_target_bswap16: using cached result" 2
+    } else {
+	set et_bswap16_saved 0
+	if { [is-effective-target bswap]
+	     && ![istarget x86_64-*-*] } {
+	   set et_bswap16_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap16: returning $et_bswap16_saved" 2
+    return $et_bswap16_saved
+}
+
+# Return 1 if the target supports 32-bit byte swap instructions.
+
+proc check_effective_target_bswap32 { } {
+    global et_bswap32_saved
+
+    if [info exists et_bswap32_saved] {
+	verbose "check_effective_target_bswap32: using cached result" 2
+    } else {
+	set et_bswap32_saved 0
+	if { [is-effective-target bswap] } {
+	   set et_bswap32_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap32: returning $et_bswap32_saved" 2
+    return $et_bswap32_saved
+}
+
+# Return 1 if the target supports 64-bit byte swap instructions.
+
+proc check_effective_target_bswap64 { } {
+    global et_bswap64_saved
+
+    if [info exists et_bswap64_saved] {
+        verbose "check_effective_target_bswap64: using cached result" 2
+    } else {
+	set et_bswap64_saved 0
+	if { [is-effective-target bswap]
+	     && [is-effective-target lp64] } {
+	   set et_bswap64_saved 1
+	}
+    }
+
+    verbose "check_effective_target_bswap64: returning $et_bswap64_saved" 2
+    return $et_bswap64_saved
+}
+
 # Return 1 if the target supports atomic operations on "int" and "long".
 
 proc check_effective_target_sync_int_long { } {
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 9ff857c..1d0a172 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -1616,13 +1616,29 @@ make_pass_cse_sincos (gcc::context *ctxt)
 
    0    - byte has the value 0
    1..size - byte contains the content of the byte
-   number indexed with that value minus one  */
+   number indexed with that value minus one.
+
+   To detect permutations on memory sources (arrays and structures), a symbolic
+   number is also associated a base address (the array or structure the load is
+   made from), an offset from the base address and a range which gives the
+   range of memory represented by this symbolic number. The range is different
+   from size as size reflect the size of the type of current expression. It can
+   thus be bigger when casting to a type with bigger size.  */
 
 struct symbolic_number {
   unsigned HOST_WIDEST_INT n;
   int size;
+  tree base_addr;
+  tree offset;
+  unsigned HOST_WIDE_INT range;
 };
 
+/* The number which the find_bswap_or_nop_1 result should match in
+   order to have a nop.  The number is masked according to the size of
+   the symbolic number before using it.  */
+#define CMPNOP (sizeof (HOST_WIDEST_INT) < 8 ? 0 : \
+  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201)
+
 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
    number N.  Return false if the requested operation is not permitted
    on a symbolic number.  */
@@ -1688,7 +1704,7 @@ verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
    tree expression of the source operand and NULL otherwise.  */
 
 static tree
-find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
+find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
 {
   enum tree_code code;
   tree rhs1, rhs2 = NULL;
@@ -1727,12 +1743,57 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  && code != CONVERT_EXPR)
 	return NULL_TREE;
 
-      source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
+      source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
 
-      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
-	 to initialize the symbolic number.  */
+      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
+	 we have to initialize the symbolic number.  */
       if (!source_expr1)
 	{
+	  n->base_addr = n->offset = NULL_TREE;
+	  if (is_gimple_assign (rhs1_stmt))
+	    {
+	      unsigned bit_offset;
+	      tree var, elt_size, index, field, bit_offset_expr;
+
+	      var = gimple_assign_rhs1 (rhs1_stmt);
+	      switch (TREE_CODE (var))
+		{
+		/* leaf node is an array, memorize its base, element size and
+		   offset from base to compare to other array leaf node.  */
+		case ARRAY_RANGE_REF:
+		case ARRAY_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  elt_size = array_ref_element_size (var);
+		  if (TREE_CODE (elt_size) != INTEGER_CST)
+		    return NULL_TREE;
+		  index = TREE_OPERAND (var, 1);
+		  if (TREE_THIS_VOLATILE (var) || TREE_THIS_VOLATILE (index))
+		    return NULL_TREE;
+		  n->offset = fold_build2 (MULT_EXPR, sizetype,
+					   index, elt_size);
+		  rhs1 = var;
+		  break;
+		/* leaf node is a record field, memorize its base, field size
+		   and offset from base to compare to other field leaf node.  */
+		case COMPONENT_REF:
+		  n->base_addr = TREE_OPERAND (var, 0);
+		  field = TREE_OPERAND (var, 1);
+		  bit_offset = TREE_INT_CST_LOW (DECL_FIELD_BIT_OFFSET (field));
+		  if (bit_offset % BITS_PER_UNIT)
+		    return NULL_TREE;
+		  bit_offset /= BITS_PER_UNIT;
+		  n->offset = component_ref_field_offset (var);
+		  bit_offset_expr = build_int_cst (TREE_TYPE (n->offset),
+						   bit_offset);
+		  n->offset = fold_build2 (PLUS_EXPR, TREE_TYPE (n->offset),
+					  n->offset, bit_offset_expr);
+		  if (TYPE_PRECISION (TREE_TYPE (field)) != BITS_PER_UNIT)
+		    return NULL_TREE;
+		  rhs1 = var;
+		  break;
+		default:;
+		}
+	    }
 	  /* Set up the symbolic number N by setting each byte to a
 	     value between 1 and the byte size of rhs1.  The highest
 	     order byte is set to n->size and the lowest order
@@ -1741,8 +1802,8 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
 	  if (n->size % BITS_PER_UNIT != 0)
 	    return NULL_TREE;
 	  n->size /= BITS_PER_UNIT;
-	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
-		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
+	  n->range = n->size;
+	  n->n = CMPNOP;
 
 	  if (n->size < (int)sizeof (HOST_WIDEST_INT))
 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
@@ -1817,17 +1878,76 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
       switch (code)
 	{
 	case BIT_IOR_EXPR:
-	  source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
+	  source_expr1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
 
 	  if (!source_expr1)
 	    return NULL_TREE;
 
-	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
+	  source_expr2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
 
-	  if (source_expr1 != source_expr2
-	      || n1.size != n2.size)
+	  if (n1.size != n2.size || !source_expr2)
 	    return NULL_TREE;
 
+	  if (source_expr1 != source_expr2)
+	    {
+	      tree off_sub_expr, off_cmp_expr;
+	      HOST_WIDEST_INT inc, mask;
+	      unsigned i;
+	      HOST_WIDE_INT off_sub;
+	      struct symbolic_number *n_ptr;
+
+	      if (!n1.base_addr || !n2.base_addr
+		  || n1.base_addr != n2.base_addr)
+		return NULL_TREE;
+
+	      off_cmp_expr = fold_build2 (LT_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (TREE_CODE (off_cmp_expr) != INTEGER_CST)
+		return NULL_TREE;
+
+	      /* We swap n1 with n2 to have n1 < n2.  */
+	      if (TREE_INT_CST_LOW (off_cmp_expr))
+		{
+		  struct symbolic_number tmpn;
+
+		  tmpn = n2;
+		  n2 = n1;
+		  n1 = tmpn;
+		  source_expr1 = source_expr2;
+		}
+
+	      off_sub_expr = fold_build2 (MINUS_EXPR, TREE_TYPE (n1.offset),
+					  n2.offset, n1.offset);
+	      if (!cst_and_fits_in_hwi (off_sub_expr))
+		return NULL_TREE;
+
+	      off_sub = TREE_INT_CST_LOW (off_sub_expr);
+
+	      /* Check that the range of memory covered < biggest int size.  */
+	      if (off_sub + n2.range > (int)sizeof (HOST_WIDEST_INT))
+	        return NULL_TREE;
+	      n->range = n2.range + off_sub;
+
+	      /* Reinterpret byte marks in symbolic number holding the value of
+		 bigger weight according to host endianness.  */
+	      inc = BYTES_BIG_ENDIAN ? off_sub + n2.range - n1.range : off_sub;
+	      mask = 0xFF;
+	      if (BYTES_BIG_ENDIAN)
+		n_ptr = &n1;
+	      else
+		n_ptr = &n2;
+	      for (i = 0; i < sizeof (HOST_WIDEST_INT); i++, inc <<= 8,
+		   mask <<= 8)
+		{
+		  if (n_ptr->n & mask)
+		    n_ptr->n += inc;
+		}
+	    }
+	  else
+	    n->range = n1.range;
+
+	  n->base_addr = n1.base_addr;
+	  n->offset = n1.offset;
 	  n->size = n1.size;
 	  for (i = 0, mask = 0xff; i < n->size; i++, mask <<= BITS_PER_UNIT)
 	    {
@@ -1852,56 +1972,88 @@ find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
   return NULL_TREE;
 }
 
-/* Check if STMT completes a bswap implementation consisting of ORs,
-   SHIFTs and ANDs.  Return the source tree expression on which the
-   byte swap is performed and NULL if no bswap was found.  */
+/* Check if STMT completes a bswap implementation or a read in a given
+   endianness consisting of ORs, SHIFTs and ANDs and sets *swap
+   accordingly. It also sets *memsrc if the source lies in memory and
+   in this case also sets *size to the size of the load needed. At
+   last, the function returns the source tree expression.  */
 
 static tree
-find_bswap (gimple stmt)
+find_bswap_or_nop (gimple stmt, bool *memsrc, bool *swap, int *size)
 {
-/* The number which the find_bswap result should match in order to
-   have a full byte swap.  The number is shifted to the left according
-   to the size of the symbolic number before using it.  */
-  unsigned HOST_WIDEST_INT cmp =
+/* The number which the find_bswap_or_nop_1 result should match in order
+   to have a full byte swap.  The number is shifted to the right
+   according to the size of the symbolic number before using it.  */
+  unsigned HOST_WIDEST_INT cmpxchg =
     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
+  unsigned HOST_WIDEST_INT cmpnop = CMPNOP;
 
   struct symbolic_number n;
   tree source_expr;
-  int limit;
+  int limit, rsize;
 
   /* The last parameter determines the depth search limit.  It usually
      correlates directly to the number of bytes to be touched.  We
      increase that number by three  here in order to also
-     cover signed -> unsigned converions of the src operand as can be seen
+     cover signed -> unsigned conversions of the src operand as can be seen
      in libgcc, and for initial shift/and operation of the src operand.  */
   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
-  source_expr =  find_bswap_1 (stmt, &n, limit);
+  source_expr =  find_bswap_or_nop_1 (stmt, &n, limit);
 
   if (!source_expr)
     return NULL_TREE;
 
-  /* Zero out the extra bits of N and CMP.  */
+  /* Zero out the extra bits of N and CMP*.  */
   if (n.size < (int)sizeof (HOST_WIDEST_INT))
     {
+      int tmpn;
       unsigned HOST_WIDEST_INT mask =
 	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
 
       n.n &= mask;
-      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
+      /* Find real size of result (highest non zero byte).  */
+      for (tmpn = n.n, rsize = 0; tmpn; tmpn >>= BITS_PER_UNIT, rsize++);
+      cmpxchg >>= (sizeof (HOST_WIDEST_INT) - rsize) * BITS_PER_UNIT;
+      cmpnop &= mask >> ((n.size - rsize) * BITS_PER_UNIT);
     }
+  else
+    rsize = n.size;
 
   /* A complete byte swap should make the symbolic number to start
-     with the largest digit in the highest order byte.  */
-  if (cmp != n.n)
+     with the largest digit in the highest order byte. Unchanged symbolic
+     number indicates a read with same endianness as host architecture.  */
+  if (n.n != cmpnop && n.n != cmpxchg)
+    return NULL_TREE;
+
+  *memsrc = 0;
+
+  if (n.base_addr)
+    {
+      *memsrc = 1;
+      n.size = rsize;
+    }
+  else if (rsize != n.size)
+    return NULL_TREE;
+  else if (n.n == cmpnop)
+    /* Useless bit manipulation performed by code.  */
     return NULL_TREE;
 
+
+  if (n.n == cmpxchg)
+    *swap = 1;
+  else
+    *swap = 0;
+
+  *size = n.size * BITS_PER_UNIT;
   return source_expr;
 }
 
-/* Find manual byte swap implementations and turn them into a bswap
-   builtin invokation.  */
+/* Find manual byte swap implementations as well as load in a given
+   endianness. Byte swaps are turned into a bswap builtin invokation
+   while endian loads are converted to bswap builtin invokation or
+   simple load according to the host endianness.  */
 
 static unsigned int
 execute_optimize_bswap (void)
@@ -1925,9 +2077,6 @@ execute_optimize_bswap (void)
 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
 		   || (bswap32_p && word_mode == SImode)));
 
-  if (!bswap16_p && !bswap32_p && !bswap64_p)
-    return 0;
-
   /* Determine the argument type of the builtins.  The code later on
      assumes that the return and argument type are the same.  */
   if (bswap16_p)
@@ -1961,21 +2110,25 @@ execute_optimize_bswap (void)
       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
         {
 	  gimple stmt = gsi_stmt (gsi);
-	  tree bswap_src, bswap_type;
-	  tree bswap_tmp;
+	  tree src, tmp, tgt, load_type, bswap_type = NULL;
 	  tree fndecl = NULL_TREE;
 	  int type_size;
+	  bool memsrc, swap;
 	  gimple call;
 
 	  if (!is_gimple_assign (stmt)
 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
 	    continue;
 
-	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
+	  src = find_bswap_or_nop (stmt, &memsrc, &swap, &type_size);
+
+	  if (!src)
+	    continue;
 
 	  switch (type_size)
 	    {
 	    case 16:
+	      load_type = uint16_type_node;
 	      if (bswap16_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
@@ -1983,6 +2136,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 32:
+	      load_type = uint32_type_node;
 	      if (bswap32_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
@@ -1990,6 +2144,7 @@ execute_optimize_bswap (void)
 		}
 	      break;
 	    case 64:
+	      load_type = uint64_type_node;
 	      if (bswap64_p)
 		{
 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
@@ -2000,13 +2155,67 @@ execute_optimize_bswap (void)
 	      continue;
 	    }
 
-	  if (!fndecl)
+	  if (swap && !fndecl)
 	    continue;
 
-	  bswap_src = find_bswap (stmt);
+	  tgt = gimple_assign_lhs (stmt);
 
-	  if (!bswap_src)
-	    continue;
+	  /* Need to load the value from memory first.  */
+	  if (memsrc)
+	    {
+	      tree addr_expr, addr_tmp, val_expr, val_tmp;
+	      tree load_ptr_type, load_offset_ptr;
+	      gimple addr_stmt, load_stmt;
+
+	      changed = true;
+
+	      /*  Compute address to load from and cast according to the size
+		  of the load.  */
+	      load_ptr_type = build_pointer_type (load_type);
+	      addr_expr = build1 (ADDR_EXPR, load_ptr_type, src);
+	      addr_tmp = make_temp_ssa_name (load_ptr_type, NULL, "load_src");
+	      addr_stmt = gimple_build_assign_with_ops
+			 (NOP_EXPR, addr_tmp, addr_expr, NULL);
+	      gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
+
+	      /* Perform the load.  */
+	      load_offset_ptr = build_int_cst (load_ptr_type, 0);
+	      val_tmp = make_temp_ssa_name (load_type, NULL, "load_dst");
+	      val_expr = build2 (MEM_REF, load_type, addr_tmp, load_offset_ptr);
+	      load_stmt = gimple_build_assign_with_ops
+			 (MEM_REF, val_tmp, val_expr, NULL);
+
+	      if (!swap)
+		{
+		  /* Convert the result of load if necessary.  */
+		  if (!useless_type_conversion_p (TREE_TYPE (tgt),
+						  TREE_TYPE (val_tmp)))
+		    {
+		      gimple convert_stmt;
+		      convert_stmt = gimple_build_assign_with_ops
+			     (NOP_EXPR, tgt, val_tmp, NULL);
+		      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
+		    }
+		  else
+		    gimple_assign_set_lhs (load_stmt, tgt);
+
+		  if (dump_file)
+		    {
+		      fprintf (dump_file,
+			       "%d bit load in host endianness found at: ",
+			       (int)type_size);
+		      print_gimple_stmt (dump_file, stmt, 0, 0);
+		    }
+
+		  gsi_insert_after (&gsi, load_stmt, GSI_SAME_STMT);
+		  gsi_remove (&gsi, true);
+
+		  continue;
+		}
+	      else
+		gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
+	      src = val_tmp;
+	    }
 
 	  changed = true;
 	  if (type_size == 16)
@@ -2016,33 +2225,33 @@ execute_optimize_bswap (void)
 	  else
 	    bswap_stats.found_64bit++;
 
-	  bswap_tmp = bswap_src;
+	  tmp = src;
 
 	  /* Convert the src expression if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
+	  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
 	    {
 	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
+	      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
 	      convert_stmt = gimple_build_assign_with_ops
-		  		(NOP_EXPR, bswap_tmp, bswap_src, NULL);
+				(NOP_EXPR, tmp, src, NULL);
 	      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
 	    }
 
-	  call = gimple_build_call (fndecl, 1, bswap_tmp);
+	  call = gimple_build_call (fndecl, 1, tmp);
 
-	  bswap_tmp = gimple_assign_lhs (stmt);
+	  tmp = tgt;
 
 	  /* Convert the result if necessary.  */
-	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
+	  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
 	    {
 	      gimple convert_stmt;
-	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
+	      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
 	      convert_stmt = gimple_build_assign_with_ops
-			(NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
+			(NOP_EXPR, tgt, tmp, NULL);
 	      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
 	    }
 
-	  gimple_call_set_lhs (call, bswap_tmp);
+	  gimple_call_set_lhs (call, tmp);
 
 	  if (dump_file)
 	    {

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2014-06-06 10:45 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-09 10:26 [PATCH] Fix PR54733 Optimize endian independent load/store Thomas Preud'homme
2014-05-16 10:07 ` Thomas Preud'homme
2014-05-16 10:48   ` Richard Biener
2014-05-16 10:56     ` pinskia
2014-05-16 11:03       ` Richard Biener
2014-05-16 11:14         ` Richard Biener
2014-05-16 16:59           ` pinskia
2014-05-19  2:49             ` Thomas Preud'homme
2014-05-20  2:50             ` Andrew Pinski
2014-05-19  9:30           ` Thomas Preud'homme
2014-05-19  9:46             ` Richard Biener
2014-05-20  2:46         ` Thomas Preud'homme
2014-05-20  9:06           ` Richard Biener
2014-05-20 10:29             ` Thomas Preud'homme
2014-05-21  1:00               ` Thomas Preud'homme
2014-05-22 10:07                 ` Richard Biener
2014-05-23  3:36                   ` Thomas Preud'homme
2014-05-26 10:57                     ` Christophe Lyon
2014-05-26 11:06                       ` Thomas Preud'homme
2014-05-27  7:23                         ` Thomas Preud'homme
2014-05-27 14:35                           ` Christophe Lyon
2014-05-27 16:45                           ` Andreas Schwab
2014-05-29  8:18                             ` Thomas Preud'homme
2014-05-29  8:52                               ` Andreas Schwab
2014-05-29  9:58                                 ` Thomas Preud'homme
2014-06-02 13:57                                   ` Christophe Lyon
2014-06-04  7:04                                     ` Thomas Preud'homme
2014-06-04  9:52                                       ` Richard Biener
2014-06-06 10:45                                         ` Thomas Preud'homme
2014-05-24 18:56                 ` Andreas Schwab
  -- strict thread matches above, loose matches on Subject: below --
2014-03-19  7:24 Thomas Preud'homme

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).