From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: (qmail 4531 invoked by alias); 4 Apr 2004 12:45:57 -0000
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Archive:
List-Post:
List-Help:
Sender: gcc-bugs-owner@gcc.gnu.org
Received: (qmail 4509 invoked by uid 48); 4 Apr 2004 12:45:55 -0000
Date: Sun, 04 Apr 2004 12:45:00 -0000
Message-ID: <20040404124555.4508.qmail@sources.redhat.com>
From: "steven at gcc dot gnu dot org"
To: gcc-bugs@gcc.gnu.org
In-Reply-To: <20040120183908.13776.kgardas@objectsecurity.com>
References: <20040120183908.13776.kgardas@objectsecurity.com>
Reply-To: gcc-bugzilla@gcc.gnu.org
Subject: [Bug c++/13776] [tree-ssa] Many C++ compile-time regression in 3.5-tree-ssa 040120
X-Bugzilla-Reason: CC
X-SW-Source: 2004-04/txt/msg00326.txt.bz2
List-Id:

------- Additional Comments From steven at gcc dot gnu dot org  2004-04-04 12:45 -------
I did some profiling of iterative_hash on tree-ssa.  Not immediately
related to this PR, perhaps, but part of the problem.

  %   cumulative   self              self     total
 time   seconds   seconds    calls   s/call   s/call  name
 2.75      1.66     1.66  2329935     0.00     0.00  iterative_hash
 2.29      3.04     1.38   235027     0.00     0.00  walk_tree
 2.04      4.27     1.23  1419091     0.00     0.00  ggc_alloc_stat
 1.87      5.40     1.13  1020397     0.00     0.00  htab_find_slot_with_hash
 1.74      6.45     1.05  1295674     0.00     0.00  mark_set_1
 1.67      7.46     1.01   396445     0.00     0.00  iterative_hash_expr
 1.64      8.45     0.99  2947490     0.00     0.00  bitmap_bit_p
 1.59      9.41     0.96   321482     0.00     0.00  for_each_rtx
 1.54     10.34     0.93  1566242     0.00     0.00  bitmap_set_bit
 1.42     11.20     0.86   770792     0.00     0.00  et_splay

Right now, this function seems to be used only on the tree-ssa branch,
and mostly in the tree optimizers via iterative_hash_expr:

-----------------------------------------------
                              1423028             iterative_hash_expr [35]
                0.00    0.00      40/396445       pre_expression [433]
                0.00    0.00     162/396445       process_delayed_rename [971]
                0.03    0.04   10126/396445       gimple_tree_hash [516]
                0.39    0.67  151915/396445       avail_expr_hash [71]
                0.60    1.03  234202/396445       true_false_expr_hash [52]
[35]     4.6    1.01    1.74  396445+1423028      iterative_hash_expr [35]
                1.65    0.00 2308918/2329935      iterative_hash [53]
                0.06    0.00  383567/1028690      first_rtl_op [321]
                0.03    0.00  546018/635717       commutative_tree_code [699]
                              1423028             iterative_hash_expr [35]
-----------------------------------------------
                0.00    0.00     919/2329935      build_type_attribute_variant [1420]
                0.00    0.00     940/2329935      build_array_type [1299]
                0.00    0.00    4814/2329935      build_function_type [671]
                0.01    0.00   14344/2329935      type_hash_list [900]
                1.65    0.00 2308918/2329935      iterative_hash_expr [35]
[53]     2.8    1.66    0.00 2329935              iterative_hash [53]
-----------------------------------------------

So ~95% of all iterative_hash_expr calls are from DOM, which could use a
little help in terms of compilation speed: ~12% for this particular test
case, pt.i.
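To make the call graph above a bit more concrete: iterative_hash_expr
conceptually hashes an expression by feeding the tree code and then each
operand into iterative_hash (the first_rtl_op and commutative_tree_code
entries above are the operand-count and operand-ordering details of that
walk), so nearly every call ends up passing a tiny fixed-size key.  Below
is a simplified, self-contained sketch of that pattern; the toy_expr
structure, toy_hash_expr function, and the byte-at-a-time stand-in for
iterative_hash are made up for this illustration and are not GCC's
actual implementation.

/* Sketch of the hashing pattern behind iterative_hash_expr: recursively
   feed small fixed-size keys (the operation code of each node) into an
   iterative hash.  Everything here is a toy stand-in, not GCC code.  */

#include <stddef.h>
#include <stdio.h>

typedef unsigned int hashval_t;

/* Stand-in with the same interface as libiberty's iterative_hash:
   hash LENGTH bytes at K, chaining from the previous hash INITVAL.
   The real function uses Bob Jenkins' mixer; a trivial multiplier is
   enough to make the sketch runnable.  */
static hashval_t
iterative_hash (const void *k, size_t length, hashval_t initval)
{
  const unsigned char *p = (const unsigned char *) k;
  hashval_t h = initval;
  size_t i;

  for (i = 0; i < length; i++)
    h = h * 33 + p[i];
  return h;
}

/* Toy expression node: an operation code and up to two operands,
   loosely analogous to TREE_CODE and TREE_OPERAND.  */
struct toy_expr
{
  int code;
  struct toy_expr *ops[2];
};

/* Hash an expression the way iterative_hash_expr does conceptually:
   hash the code, then recurse into the operands.  Every iterative_hash
   call here passes a small key (4 bytes on typical hosts), which is
   exactly the "case 4" tail that dominates the coverage data below.  */
static hashval_t
toy_hash_expr (const struct toy_expr *t, hashval_t val)
{
  int i;

  if (t == NULL)
    return val;
  val = iterative_hash (&t->code, sizeof (t->code), val);
  for (i = 0; i < 2; i++)
    val = toy_hash_expr (t->ops[i], val);
  return val;
}

int
main (void)
{
  struct toy_expr a = { 1, { NULL, NULL } };
  struct toy_expr b = { 2, { NULL, NULL } };
  struct toy_expr sum = { 3, { &a, &b } };

  printf ("hash = %u\n", toy_hash_expr (&sum, 0));
  return 0;
}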
I also did some coverage testing on iterative_hash:

        -:  794:hashval_t iterative_hash (k_in, length, initval)
        -:  795:     const PTR k_in;               /* the key */
        -:  796:     register size_t  length;      /* the length of the key */
        -:  797:     register hashval_t initval;   /* the previous hash, or an arbitrary value */
 13721488:  798:{
 13721488:  799:  register const unsigned char *k = (const unsigned char *)k_in;
 13721488:  800:  register hashval_t a,b,c,len;
        -:  801:
        -:  802:  /* Set up the internal state */
 13721488:  803:  len = length;
 13721488:  804:  a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
 13721488:  805:  c = initval;         /* the previous hash value */
        -:  806:
        -:  807:  /*---------------------------------------- handle most of the key */
        -:  808:#ifndef WORDS_BIGENDIAN
        -:  809:  /* On a little-endian machine, if the data is 4-byte aligned we can hash
        -:  810:     by word for better speed.  This gives nondeterministic results on
        -:  811:     big-endian machines.  */
 13721488:  812:  if (sizeof (hashval_t) == 4 && (((size_t)k)&3) == 0)
branch 0 taken 0%
 13724520:  813:    while (len >= 12)    /* aligned */
branch 0 taken 1%
branch 1 taken 100%
        -:  814:      {
     3032:  815:        a += *(hashval_t *)(k+0);
     3032:  816:        b += *(hashval_t *)(k+4);
     3032:  817:        c += *(hashval_t *)(k+8);
     3032:  818:        mix(a,b,c);
     3032:  819:        k += 12; len -= 12;
branch 0 taken 100%
        -:  820:      }
        -:  821:  else /* unaligned */
        -:  822:#endif
    #####:  823:    while (len >= 12)
branch 0 never executed
branch 1 never executed
        -:  824:      {
    #####:  825:        a += (k[0] +((hashval_t)k[1]<<8) +((hashval_t)k[2]<<16) +((hashval_t)k[3]<<24));
    #####:  826:        b += (k[4] +((hashval_t)k[5]<<8) +((hashval_t)k[6]<<16) +((hashval_t)k[7]<<24));
    #####:  827:        c += (k[8] +((hashval_t)k[9]<<8) +((hashval_t)k[10]<<16)+((hashval_t)k[11]<<24));
    #####:  828:        mix(a,b,c);
    #####:  829:        k += 12; len -= 12;
branch 0 never executed
        -:  830:      }
        -:  831:
        -:  832:  /*------------------------------------- handle the last 11 bytes */
 13721488:  833:  c += length;
 13721488:  834:  switch(len)              /* all the case statements fall through */
branch 0 taken 0%
branch 1 taken 0%
branch 2 taken 0%
branch 3 taken 0%
branch 4 taken 0%
branch 5 taken 1%
branch 6 taken 0%
branch 7 taken 1%
branch 8 taken 99%
branch 9 taken 1%
branch 10 taken 1%
branch 11 taken 0%
branch 12 taken 1%
        -:  835:    {
    #####:  836:    case 11: c+=((hashval_t)k[10]<<24);
    #####:  837:    case 10: c+=((hashval_t)k[9]<<16);
    #####:  838:    case 9 : c+=((hashval_t)k[8]<<8);
        -:  839:      /* the first byte of c is reserved for the length */
    #####:  840:    case 8 : b+=((hashval_t)k[7]<<24);
      129:  841:    case 7 : b+=((hashval_t)k[6]<<16);
      129:  842:    case 6 : b+=((hashval_t)k[5]<<8);
      181:  843:    case 5 : b+=k[4];
 13719971:  844:    case 4 : a+=((hashval_t)k[3]<<24);
 13719977:  845:    case 3 : a+=((hashval_t)k[2]<<16);
 13719979:  846:    case 2 : a+=((hashval_t)k[1]<<8);
 13719979:  847:    case 1 : a+=k[0];
        -:  848:      /* case 0: nothing left to add */
        -:  849:    }
 13721488:  850:  mix(a,b,c);
        -:  851:  /*-------------------------------------------- report the result */
 13721488:  852:  return c;
        -:  853:}

So it seems that a specialized version for 4-byte objects really would
help here (a rough sketch of one possible helper is appended at the end
of this message).  The Xeon I profiled on is 32-bit; on 64-bit targets
the 8-byte case would presumably matter as well??

-- 

           What            |Removed                     |Added
----------------------------------------------------------------------------
                         CC|                            |law at redhat dot com


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=13776
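To make the suggestion above concrete, here is a hedged sketch of what a
4-byte specialization could look like: a helper that mixes a single
hashval_t into the running hash with one round of the mixer, skipping the
length loop, the alignment test, and the byte-wise switch entirely.  The
name hash_4bytes is made up for this illustration, and the mix() macro
below is modeled on (but not copied verbatim from) the one in
libiberty/hashtab.c; this is not necessarily the form an actual patch
would take.

/* Hedged sketch: hash a single 4-byte value without the generic
   bookkeeping of iterative_hash.  hash_4bytes is a hypothetical name.  */

#include <stdio.h>

typedef unsigned int hashval_t;

/* A 96-bit mixer in the style of the Bob Jenkins mix() used by
   iterative_hash.  */
#define mix(a,b,c) \
do { \
  a -= b; a -= c; a ^= (c >> 13); \
  b -= c; b -= a; b ^= (a << 8);  \
  c -= a; c -= b; c ^= (b >> 13); \
  a -= b; a -= c; a ^= (c >> 12); \
  b -= c; b -= a; b ^= (a << 16); \
  c -= a; c -= b; c ^= (b >> 5);  \
  a -= b; a -= c; a ^= (c >> 3);  \
  b -= c; b -= a; b ^= (a << 10); \
  c -= a; c -= b; c ^= (b >> 15); \
} while (0)

/* Mix the 4-byte value VAL into the running hash INITVAL.  On a
   little-endian host with a 32-bit hashval_t this should compute the
   same value as iterative_hash (&val, 4, initval), but exact
   equivalence is not the point of the sketch.  */
static hashval_t
hash_4bytes (hashval_t val, hashval_t initval)
{
  hashval_t a = 0x9e3779b9;   /* the golden ratio; an arbitrary value */
  hashval_t b = 0x9e3779b9;
  hashval_t c = initval;

  c += 4;       /* iterative_hash folds the key length into c.  */
  a += val;     /* the whole key in one shot; no loop, no switch.  */
  mix (a, b, c);
  return c;
}

int
main (void)
{
  /* Chain two small keys, the way iterative_hash_expr chains hashes.  */
  hashval_t h = hash_4bytes (42, 0);
  h = hash_4bytes (7, h);
  printf ("h = %u\n", h);
  return 0;
}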