From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 30691 invoked by alias); 13 Dec 2012 19:28:54 -0000 Received: (qmail 30683 invoked by uid 22791); 13 Dec 2012 19:28:54 -0000 X-SWARE-Spam-Status: No, hits=-1.7 required=5.0 tests=AWL,BAYES_00,KHOP_RCVD_UNTRUST,RCVD_IN_DNSWL_LOW,RP_MATCHES_RCVD,TW_RL X-Spam-Check-By: sourceware.org Received: from nikam.ms.mff.cuni.cz (HELO nikam.ms.mff.cuni.cz) (195.113.20.16) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Thu, 13 Dec 2012 19:28:46 +0000 Received: by nikam.ms.mff.cuni.cz (Postfix, from userid 16202) id 59B0D542481; Thu, 13 Dec 2012 20:28:44 +0100 (CET) Date: Thu, 13 Dec 2012 19:28:00 -0000 From: Jan Hubicka To: Xinliang David Li Cc: Jakub Jelinek , Jan Hubicka , GCC Patches , Teresa Johnson Subject: Re: [PATCH i386]: Enable push/pop in pro/epilogue for modern CPUs Message-ID: <20121213192844.GA26009@kam.mff.cuni.cz> References: <20121212163722.GA21037@atrey.karlin.mff.cuni.cz> <20121212183036.GB5303@atrey.karlin.mff.cuni.cz> <20121213011933.GB21037@atrey.karlin.mff.cuni.cz> <20121213062128.GK2315@tucnak.redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.5.20 (2009-06-14) Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org X-SW-Source: 2012-12/txt/msg00933.txt.bz2 > Try the following one. 1) -minline-all-stringops > -mstringop-strategy=rep_8byte -O2 vs 1) -mstringop_strategy=libcall > -O2. > > David > > > #include > #include > #include > #ifndef LEN > #define LEN 16 > #endif > > void copy(char* s1, char* s2,int len) __attribute__((noinline)); > void copy(char* s1, char* s2,int len) > { > memcpy(s2,s1,len); > } I guess the catch here is that you force the copy to be noinline and thus you eliminate the benefits of the inlined sequence.
With inline stringop one saves regalloc and often can get rid of the alignment tests. This is script I use to tune the tables. Honza test() { rm -f a.out cat <&1` echo -n " "$TIME echo $TIME $4 >>/tmp/accum } testrow() { echo -n "" >/tmp/accum printf "block size %7i" $3 test "$2" "$3" "-mstringop-strategy=libcall" libcall test "$2" "$3" "-mstringop-strategy=rep_byte -malign-stringops" rep1 test "$2" "$3" "-mstringop-strategy=rep_byte -mno-align-stringops" rep1noalign test "$2" "$3" "-mstringop-strategy=rep_4byte -malign-stringops" rep4 test "$2" "$3" "-mstringop-strategy=rep_4byte -mno-align-stringops" rep4noalign if [ "$mode" == 64 ] then test "$2" "$3" "-mstringop-strategy=rep_8byte -malign-stringops" rep8 test "$2" "$3" "-mstringop-strategy=rep_8byte -mno-align-stringops" rep8noalign fi test "$2" "$3" "-mstringop-strategy=loop -malign-stringops" loop test "$2" "$3" "-mstringop-strategy=loop -mno-align-stringops" loopnoalign test "$2" "$3" "-mstringop-strategy=unrolled_loop -malign-stringops" unrl test "$2" "$3" "-mstringop-strategy=unrolled_loop -mno-align-stringops" unrlnoalign test "$2" "$3" "-mstringop-strategy=sse_loop -malign-stringops" sse test "$2" "$3" "-mstringop-strategy=sse_loop -mno-align-stringops -msse2" ssenoalign test "$2" "$3" "-mstringop-strategy=byte_loop" byte best=`cat /tmp/accum | sort | head -1` test "$2" "$3" " -fprofile-generate" >/dev/null 2>&1 test "$2" "$3" " -fprofile-use" test "$2" "$3" " -minline-stringops-dynamically" echo " best: $best" } test_all_sizes() { if [ "$mode" == 64 ] then echo " libcall rep1 noalg rep4 noalg rep8 noalg loop noalg unrl noalg sse noalg byte profiled dynamic" else echo " libcall rep1 noalg rep4 noalg loop noalg unrl noalg sse noalg byte profiled dynamic" fi #for size in 1 2 3 4 6 8 10 12 14 16 24 32 48 64 128 256 512 1024 4096 8192 81920 819200 8192000 #for size in 8192000 819200 81920 8192 4096 2048 1024 512 256 128 64 48 32 24 16 14 12 10 8 6 5 4 3 2 1 for size in 8192000 819200 81920 20480 8192 4096 
2048 1024 512 256 128 64 48 32 24 16 14 12 10 8 6 4 1 #for size in 128 256 1024 4096 8192 81920 819200 do testrow "$1" "$2" $size done } mode=$1 shift export memsize=$1 shift cmdline=$* if [ "$mode" != 32 ] then if [ "$mode" != 64 ] then echo "Usage:" echo "test_stringop mode size cmdline" echo "mode is either 32 or 64" echo "size is amount of memory copied in each test. Should be chosed small enough so runtime is less than minute for each test and sorting works" echo "Example: test_stringop 32 640000000 ./xgcc -B ./ -march=pentium3" exit fi fi echo "memcpy mode:$mode size:$memsize" export STRINGOP="" type=char test_all_sizes $mode "$cmdline -m$mode" echo "Aligned" type=long test_all_sizes $mode "$cmdline -m$mode" echo "memset" type=char export STRINGOP="-Dtest_memset=1" test_all_sizes $mode "$cmdline -m$mode" echo "Aligned" type=long test_all_sizes $mode "$cmdline -m$mode"