From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 72167 invoked by alias); 9 Jul 2015 12:58:27 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Received: (qmail 72084 invoked by uid 89); 9 Jul 2015 12:58:26 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-2.3 required=5.0 tests=AWL,BAYES_00,FREEMAIL_FROM,RCVD_IN_DNSWL_LOW,SPF_PASS autolearn=ham version=3.3.2 X-HELO: mail-wg0-f47.google.com Received: from mail-wg0-f47.google.com (HELO mail-wg0-f47.google.com) (74.125.82.47) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-GCM-SHA256 encrypted) ESMTPS; Thu, 09 Jul 2015 12:58:16 +0000 Received: by wgxm20 with SMTP id m20so39509998wgx.3 for ; Thu, 09 Jul 2015 05:58:13 -0700 (PDT) X-Received: by 10.180.90.209 with SMTP id by17mr121975059wib.2.1436446693394; Thu, 09 Jul 2015 05:58:13 -0700 (PDT) Received: from s42.loc (91-119-213-131.dynamic.xdsl-line.inode.at. [91.119.213.131]) by smtp.gmail.com with ESMTPSA id l14sm8733339wjq.21.2015.07.09.05.58.12 (version=TLSv1.2 cipher=RC4-SHA bits=128/128); Thu, 09 Jul 2015 05:58:12 -0700 (PDT) Received: from cow by s42.loc with local (Exim 4.80) (envelope-from ) id 1ZDBP1-0003B0-F2; Thu, 09 Jul 2015 14:58:11 +0200 From: Bernhard Reutner-Fischer To: gcc-patches@gcc.gnu.org Cc: Bernhard Reutner-Fischer , "Joseph S . Myers" , Richard Biener Subject: [PATCH] fold builtin_tolower, builtin_toupper Date: Thu, 09 Jul 2015 12:58:00 -0000 Message-Id: <1436446689-12180-1-git-send-email-rep.dot.nop@gmail.com> X-IsSubscribed: yes X-SW-Source: 2015-07/txt/msg00737.txt.bz2 gcc/ChangeLog 2015-07-09 Bernhard Reutner-Fischer * builtins.c (fold_builtin_tolower, fold_builtin_toupper): New static functions. (fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER. 
Signed-off-by: Bernhard Reutner-Fischer --- gcc/builtins.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) Using the three testcases attached to PR66741, where the -1.c one uses builtins: $ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W -Wall -Wextra -pedantic -DMAIN -msse4.2;done pristine (trunk@225368): # tolower_strcpy-0 real 0m6.068s user 0m3.204s sys 0m2.840s # tolower_strcpy-1 real 0m8.097s user 0m5.548s sys 0m2.528s # tolower_strcpy-2 real 0m3.568s user 0m0.804s sys 0m2.748s trunk@225368 + fold tolower/toupper below # tolower_strcpy-0 real 0m6.055s user 0m3.212s sys 0m2.832s # tolower_strcpy-1 real 0m5.383s user 0m2.464s sys 0m2.900s # tolower_strcpy-2 real 0m3.605s user 0m0.668s sys 0m2.924s The tolower loop now ends up as .L5: movsbl (%rbx), %edx leal 32(%rdx), %ecx movl %edx, %eax subl $65, %edx cmpl $25, %edx cmovbe %ecx, %eax addq $1, %rbx movb %al, -1(%rbx) cmpq %rsi, %rbx jne .L5 instead of the former call .L5: movsbl (%rbx), %edi addq $1, %rbx call tolower movb %al, -1(%rbx) cmpq %rbp, %rbx jne .L5 Would something like the attached be ok for trunk after proper testing? Any advice on the questions inline WRT caching lang_hooks intermediate results? Hints on further steps towards fixing the PR? I think the next step would be to try to teach graphite to fuse the two loops in tolower_strcpy-0.c. Need to look at graphite. Then see how to classify builtins that could be expanded early and what breaks if doing so. This sounds like a potential disaster, fun. Next, see why the vectorizer (or something else) does not pave the way to use SSE instructions as tolower_strcpy-2.c does. 
thanks, diff --git a/gcc/builtins.c b/gcc/builtins.c index 5f53342..421c908 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree, tree); static tree fold_builtin_strspn (location_t, tree, tree); static tree fold_builtin_strcspn (location_t, tree, tree); +static tree fold_builtin_tolower (location_t, tree); +static tree fold_builtin_toupper (location_t, tree); + static rtx expand_builtin_object_size (tree); static rtx expand_builtin_memory_chk (tree, rtx, machine_mode, enum built_in_function); @@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree arg0) case BUILT_IN_ISDIGIT: return fold_builtin_isdigit (loc, arg0); + case BUILT_IN_TOLOWER: + return fold_builtin_tolower (loc, arg0); + + case BUILT_IN_TOUPPER: + return fold_builtin_toupper (loc, arg0); + CASE_FLT_FN (BUILT_IN_FINITE): case BUILT_IN_FINITED32: case BUILT_IN_FINITED64: @@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree s2) } } + +/* Simplify a call to the tolower builtin. ARG is the argument to the call; LOC is passed on to the fold_*_loc helpers. + + Return NULL_TREE if no simplification was possible (ARG is not of INTEGER_TYPE, or lang_hooks.to_target_charset yields 0 for 'A', 'Z' or 'a'), otherwise return the + simplified form of the call as a tree, converted back to integer_type_node. */ + +static tree +fold_builtin_tolower (location_t loc, tree arg) +{ + if (!validate_arg (arg, INTEGER_TYPE)) + return NULL_TREE; + + /* Transform tolower(c) into a range test plus a conditional add, both done on the argument converted to unsigned. + + More specifically: + unsigned tem = arg - 'A'; + if (tem <= ('Z' - 'A')) + arg += 'a' - 'A'; + return arg; + NOTE(review): this treats target 'A'..'Z' as one contiguous range with a constant offset to lowercase and ignores the runtime locale -- confirm both are acceptable. */ + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A'); + unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z'); + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a'); + if (target_A == 0 + || target_Z == 0 + || target_a == 0) + return NULL_TREE; + + arg = fold_convert_loc (loc, unsigned_type_node, arg); + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, target_A)); + /* ??? 
x19 ('Z' - 'A', the upper bound of the range test) and x20 ('a' - 'A', the case offset) would better live in static storage so the to_target_charset probes need not be repeated; Think: + * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done}; + */ + unsigned HOST_WIDE_INT x19 = target_Z - target_A; + unsigned HOST_WIDE_INT x20 = target_a - target_A; + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem, + build_int_cst (unsigned_type_node, x19)); + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem, + fold_build2 (PLUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, x20)), + arg); + return fold_convert_loc (loc, integer_type_node, tem); +} + +/* Simplify a call to the toupper builtin. ARG is the argument to the call; LOC is passed on to the fold_*_loc helpers. + + Return NULL_TREE if no simplification was possible (ARG is not of INTEGER_TYPE, or lang_hooks.to_target_charset yields 0 for 'A', 'z' or 'a'), otherwise return the + simplified form of the call as a tree, converted back to integer_type_node. */ + +static tree +fold_builtin_toupper (location_t loc, tree arg) +{ + if (!validate_arg (arg, INTEGER_TYPE)) + return NULL_TREE; + + /* Transform toupper(c) into a range test plus a conditional subtract, both done on the argument converted to unsigned. + + More specifically: + unsigned tem = arg - 'a'; + if (tem <= ('z' - 'a')) + arg -= 'a' - 'A'; + return arg; + NOTE(review): this treats target 'a'..'z' as one contiguous range with a constant offset to uppercase and ignores the runtime locale -- confirm both are acceptable. */ + unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A'); + unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z'); + unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a'); + if (target_A == 0 + || target_z == 0 + || target_a == 0) + return NULL_TREE; + + arg = fold_convert_loc (loc, unsigned_type_node, arg); + tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, target_a)); + /* ??? 
x19 ('z' - 'a', the upper bound of the range test) and x20 ('a' - 'A', the case offset) would better live in static storage so the to_target_charset probes need not be repeated; Think: + * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done}; + */ + unsigned HOST_WIDE_INT x19 = target_z - target_a; + unsigned HOST_WIDE_INT x20 = target_a - target_A; + tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem, + build_int_cst (unsigned_type_node, x19)); + tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem, + fold_build2 (MINUS_EXPR, unsigned_type_node, arg, + build_int_cst (unsigned_type_node, x20)), + arg); + return fold_convert_loc (loc, integer_type_node, tem); +} + + /* Fold the next_arg or va_start call EXP. Returns true if there was an error produced. False otherwise. This is done so that we don't output the error or warning twice or three times. */ -- 2.1.4