From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-lf1-x134.google.com (mail-lf1-x134.google.com [IPv6:2a00:1450:4864:20::134]) by sourceware.org (Postfix) with ESMTPS id 4151D385800B for ; Wed, 7 Feb 2024 12:44:03 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 4151D385800B Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=embecosm.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=embecosm.com ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 4151D385800B Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=2a00:1450:4864:20::134 ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1707309850; cv=none; b=KxaR6uWi9cyKYmajDYFei50cLLopTa1qkhBoC8EkG/w/V66fLron9QyPPrv9TD2d2JWqE7kyoG5e9FcPPmJhEtTSpw/Gey1+HVSZDGz6D9PNqJtjQPONdM5qz6GSTr5xOLlUTxssM9og2U8xWXMthYkUvd4BSLVTSCfFFbwvOcg= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1707309850; c=relaxed/simple; bh=q2E76l+iGVUSTBRlkwQvKj1Ozx8qxkBrTu7jua/LmqE=; h=DKIM-Signature:From:To:Subject:Date:Message-ID:MIME-Version; b=ijzWASUhxFeAb1NhALLn7yPQKklZiF5dNMHgKSSPYNf7+8SC8KE9Leld5HV5rxqy/dAl24gvNiSFuV0Lyqrvgjv83SYVEsVXBhw5SjXkdCeSXawVGW62qahVJCZwGRugy7OSWbc4MQU++VEx0C2RxAnlj5kAOjkB5pFaJFeqfsQ= ARC-Authentication-Results: i=1; server2.sourceware.org Received: by mail-lf1-x134.google.com with SMTP id 2adb3069b0e04-511616b73ddso982550e87.0 for ; Wed, 07 Feb 2024 04:44:03 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=embecosm.com; s=google; t=1707309842; x=1707914642; darn=gcc.gnu.org; h=content-transfer-encoding:mime-version:reply-to:references :in-reply-to:message-id:date:subject:cc:to:from:from:to:cc:subject :date:message-id:reply-to; bh=2KfXBPgU3wfbdhusy/i/FeS+LYq5pZy2m5PcIVGGcGY=; b=JTrD6BvavEPdvBQU9mgNuD0sAUy70HmTyRX41sM23+kGSOW7PprzbsJ1nkWv9iY1PR quA+KvKSqIwWL9khjWkRUwBAFkcOTUtSsbRy3MiZJVkjh7hkZm3GmgoFe+SvznMrwpoU jYqRY9m95f1TvvlQnKDaP1TavrybI1fd+rc753PbGfrAbTRSZ+D8Qk2p6wOkvNdT/NNv vjBf8mQIxYj+5zp9ZKKpi+u4lELowtnG/1PJvPfK0qlslFYVi3kNl810CasDZqbHhFdN 71hXwMM2PggSWISLlDYhZPh43xpsDxj9Y2ukbooGowIrnRBDkWr0XhHfFbqiDIe90FDO G4sg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1707309842; x=1707914642; h=content-transfer-encoding:mime-version:reply-to:references :in-reply-to:message-id:date:subject:cc:to:from:x-gm-message-state :from:to:cc:subject:date:message-id:reply-to; bh=2KfXBPgU3wfbdhusy/i/FeS+LYq5pZy2m5PcIVGGcGY=; b=lf4Wh/SAc6UcsHgoe53uWgeh8Aj1n5nDWVhaPrRFGLDzNu3cNUz34wtFA9Hagq4mT2 n0fRMI38EMqDTcIvzgS5Wv+0ZF6JQ3UGYf2IBZqL1C5NybyYI7rr4sNJ4DaTvKsfARKN a8MOurXCbLzRDTBEoP7JfaWa6zfZwaccyIpzfTNpmQTMp9AclcgQN3366pd9KUkGqYje ZLQRwLDF75fwN+UyTejpR8dKwBd+s9XnKj8ajJTZNfsiMFAuZ0NyFULYfQA+2t0KYdIl SeJR5PDLmHB/DbxWFmPoVa8UJPl8zuoGsv4ZIsclcJZthkrZ6IiPOM6LmZk4slZIJ1eD 3RUQ== X-Gm-Message-State: AOJu0YxXnC5MgO6lrMfx6rX9FC0h6ybLjHZCpPeO9rI/zdkI2AHq/p1C YOpDYpChWhEba4XauMX449dU0jGty0ek7RYjSmlHqwUlrccKQTGg7uZ+Qz5qyA== X-Google-Smtp-Source: AGHT+IEYSmHv2Cp32Wja2P7YsSiyy3iiHM0Rh31iotP8SiSZGbOw3QRKk+tATiOcEyBdcrAN5cH+Eg== X-Received: by 2002:a05:6512:3494:b0:511:64c2:8c0a with SMTP id v20-20020a056512349400b0051164c28c0amr1367415lfr.20.1707309841414; Wed, 07 Feb 2024 04:44:01 -0800 (PST) X-Forwarded-Encrypted: i=1; AJvYcCV3cNvry0r89N91PVSeegKt5gC12pigNQ4alFLsA7ORR2xngxu4gbwSWeQc0dSi+RNDrvRIEPkbCCBsHkDvgLIeZaawQcw= Received: from platypus.sou.embecosm-corp.com ([212.69.42.53]) by smtp.gmail.com with ESMTPSA id x19-20020a05600c2a5300b0040fbdd6f69bsm5044188wme.33.2024.02.07.04.44.00 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 07 Feb 2024 04:44:00 -0800 (PST) From: arthur.cohen@embecosm.com To: gcc-patches@gcc.gnu.org Cc: gcc-rust@gcc.gnu.org, Raiki Tamura Subject: [COMMITTED 02/25] gccrs: Implement quick-check for Unicode Date: Wed, 7 Feb 2024 12:43:48 +0100 Message-ID: <20240207114419.1100894-3-arthur.cohen@embecosm.com> X-Mailer: git-send-email 2.42.1 In-Reply-To: <20240207114419.1100894-2-arthur.cohen@embecosm.com> References: <20240207114419.1100894-2-arthur.cohen@embecosm.com> Reply-To: arthur.cohen@embecosm.com MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Spam-Status: No, score=-13.3 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,GIT_PATCH_0,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: From: Raiki Tamura gcc/rust/ChangeLog: * rust-lang.cc (run_rust_tests): Add test. * rust-system.h: Add . * util/make-rust-unicode.py: Output NFC_Quick_Check table. * util/rust-codepoint.h (struct Codepoint): Add is_supplementary method. * util/rust-unicode-data.h: Generated. * util/rust-unicode.cc (binary_search_sorted_array): Removed. (lookup_cc): Remove namespace. (is_alphabetic): Use std::binary_search (nfc_quick_check): New function. (nfc_normalize): Use nfc_quick_check. (is_nfc_qc_maybe): New function. (is_nfc_qc_no): New function. (rust_nfc_qc_test): New test. * util/rust-unicode.h (is_nfc_qc_no): New function. (is_nfc_qc_maybe): New function. (enum class): New enum class. (nfc_quick_check): New function. (rust_nfc_qc_test): New test. Signed-off-by: Raiki Tamura --- gcc/rust/rust-lang.cc | 1 + gcc/rust/rust-system.h | 1 + gcc/rust/util/make-rust-unicode.py | 39 +++++-- gcc/rust/util/rust-codepoint.h | 1 + gcc/rust/util/rust-unicode-data.h | 158 ++++++++++++++++++++++++++++- gcc/rust/util/rust-unicode.cc | 143 +++++++++++++------------- gcc/rust/util/rust-unicode.h | 19 ++++ 7 files changed, 280 insertions(+), 82 deletions(-) diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc index 8b76ba28ea2..4c2ef108bce 100644 --- a/gcc/rust/rust-lang.cc +++ b/gcc/rust/rust-lang.cc @@ -438,6 +438,7 @@ run_rust_tests () { // Call tests for the rust frontend here rust_input_source_test (); + rust_nfc_qc_test (); rust_utf8_normalize_test (); rust_punycode_encode_test (); rust_cfg_parser_test (); diff --git a/gcc/rust/rust-system.h b/gcc/rust/rust-system.h index 88d6c1095e6..d8a42181700 100644 --- a/gcc/rust/rust-system.h +++ b/gcc/rust/rust-system.h @@ -44,6 +44,7 @@ #include #include #include +#include // Rust frontend requires C++11 minimum, so will have unordered_map and set #include diff --git a/gcc/rust/util/make-rust-unicode.py b/gcc/rust/util/make-rust-unicode.py index 5303440fd25..f6f04ebdf5b 100644 --- a/gcc/rust/util/make-rust-unicode.py +++ b/gcc/rust/util/make-rust-unicode.py @@ -250,6 +250,30 @@ def write_numeric() -> None: print("}};") +def write_nfc_qc(): + print( + "const std::array, {}> NFC_QC_NO_RANGES = {{{{".format( + len(nfc_qc_no_ranges) + ) + ) + print(" // clang-format off") + for r in nfc_qc_no_ranges: + print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1])) + print(" // clang-format on") + print("}};") + + print( + "const std::array, {}> NFC_QC_MAYBE_RANGES = {{{{".format( + len(nfc_qc_maybe_ranges) + ) + ) + print(" // clang-format off") + for r in nfc_qc_maybe_ranges: + print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1])) + print(" // clang-format on") + print("}};") + + def main() -> None: if len(sys.argv) != 4: print("too few arguments", file=sys.stderr) @@ -265,13 +289,12 @@ def main() -> None: print(COPYRIGHT) print() - print('#include "rust-system.h"') - print() - print("namespace Rust {") - print() + print('#include "rust-system.h"\n') + print("namespace Rust {\n") print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges))) - print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints))) - print() + print( + "const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints)) + ) write_decomposition() print() @@ -283,8 +306,8 @@ def main() -> None: print() write_numeric() print() - - # TODO: write NFC_QC table + write_nfc_qc() + print() print("} // namespace Rust") diff --git a/gcc/rust/util/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h index a75e99e7c0c..2c5fece9d3e 100644 --- a/gcc/rust/util/rust-codepoint.h +++ b/gcc/rust/util/rust-codepoint.h @@ -40,6 +40,7 @@ struct Codepoint static Codepoint eof () { return Codepoint (UINT32_MAX); } bool is_eof () const { return value == UINT32_MAX; } bool is_ascii () const { return value <= MAX_ASCII_CODEPOINT; } + bool is_supplementary_character () const { return value > 0xFFFF; } // Returns a C++ string containing string value of codepoint. std::string as_string (); diff --git a/gcc/rust/util/rust-unicode-data.h b/gcc/rust/util/rust-unicode-data.h index c42460f4541..e07752a4f63 100644 --- a/gcc/rust/util/rust-unicode-data.h +++ b/gcc/rust/util/rust-unicode-data.h @@ -20,7 +20,7 @@ namespace Rust { -const uint32_t NUM_ALPHABETIC_RANGES = 1117; +const uint32_t NUM_ALPHABETIC_RANGES = 1141; const uint32_t NUM_NUMERIC_CODEPOINTS = 1831; const std::map> DECOMPOSITION_MAP = { @@ -4167,6 +4167,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x0bd7, 0x0bd8}, {0x0c00, 0x0c01}, {0x0c01, 0x0c04}, + {0x0c04, 0x0c05}, {0x0c05, 0x0c0d}, {0x0c0e, 0x0c11}, {0x0c12, 0x0c29}, @@ -4202,6 +4203,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x0ce0, 0x0ce2}, {0x0ce2, 0x0ce4}, {0x0cf1, 0x0cf3}, + {0x0cf3, 0x0cf4}, {0x0d00, 0x0d02}, {0x0d02, 0x0d04}, {0x0d04, 0x0d0d}, @@ -4257,7 +4259,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x0f49, 0x0f6d}, {0x0f71, 0x0f7f}, {0x0f7f, 0x0f80}, - {0x0f80, 0x0f82}, + {0x0f80, 0x0f84}, {0x0f88, 0x0f8d}, {0x0f8d, 0x0f98}, {0x0f99, 0x0fbd}, @@ -4758,6 +4760,7 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x11071, 0x11073}, {0x11073, 0x11075}, {0x11075, 0x11076}, + {0x11080, 0x11082}, {0x11082, 0x11083}, {0x11083, 0x110b0}, {0x110b0, 0x110b3}, @@ -4794,6 +4797,8 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x11234, 0x11235}, {0x11237, 0x11238}, {0x1123e, 0x1123f}, + {0x1123f, 0x11241}, + {0x11241, 0x11242}, {0x11280, 0x11287}, {0x11288, 0x11289}, {0x1128a, 0x1128e}, @@ -4948,12 +4953,22 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x11ee0, 0x11ef3}, {0x11ef3, 0x11ef5}, {0x11ef5, 0x11ef7}, + {0x11f00, 0x11f02}, + {0x11f02, 0x11f03}, + {0x11f03, 0x11f04}, + {0x11f04, 0x11f11}, + {0x11f12, 0x11f34}, + {0x11f34, 0x11f36}, + {0x11f36, 0x11f3b}, + {0x11f3e, 0x11f40}, + {0x11f40, 0x11f41}, {0x11fb0, 0x11fb1}, {0x12000, 0x1239a}, {0x12400, 0x1246f}, {0x12480, 0x12544}, {0x12f90, 0x12ff1}, - {0x13000, 0x1342f}, + {0x13000, 0x13430}, + {0x13441, 0x13447}, {0x14400, 0x14647}, {0x16800, 0x16a39}, {0x16a40, 0x16a5f}, @@ -4980,7 +4995,9 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x1aff5, 0x1affc}, {0x1affd, 0x1afff}, {0x1b000, 0x1b123}, + {0x1b132, 0x1b133}, {0x1b150, 0x1b153}, + {0x1b155, 0x1b156}, {0x1b164, 0x1b168}, {0x1b170, 0x1b2fc}, {0x1bc00, 0x1bc6b}, @@ -5021,16 +5038,21 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x1df00, 0x1df0a}, {0x1df0a, 0x1df0b}, {0x1df0b, 0x1df1f}, + {0x1df25, 0x1df2b}, {0x1e000, 0x1e007}, {0x1e008, 0x1e019}, {0x1e01b, 0x1e022}, {0x1e023, 0x1e025}, {0x1e026, 0x1e02b}, + {0x1e030, 0x1e06e}, + {0x1e08f, 0x1e090}, {0x1e100, 0x1e12d}, {0x1e137, 0x1e13e}, {0x1e14e, 0x1e14f}, {0x1e290, 0x1e2ae}, {0x1e2c0, 0x1e2ec}, + {0x1e4d0, 0x1e4eb}, + {0x1e4eb, 0x1e4ec}, {0x1e7e0, 0x1e7e7}, {0x1e7e8, 0x1e7ec}, {0x1e7ed, 0x1e7ef}, @@ -5076,12 +5098,14 @@ const std::array, NUM_ALPHABETIC_RANGES> {0x1f150, 0x1f16a}, {0x1f170, 0x1f18a}, {0x20000, 0x2a6e0}, - {0x2a700, 0x2b739}, + {0x2a700, 0x2b73a}, {0x2b740, 0x2b81e}, {0x2b820, 0x2cea2}, {0x2ceb0, 0x2ebe1}, + {0x2ebf0, 0x2ee5e}, {0x2f800, 0x2fa1e}, {0x30000, 0x3134b}, + {0x31350, 0x323b0}, // clang-format on }}; @@ -5205,4 +5229,130 @@ const std::array NUMERIC_CODEPOINTS = {{ // clang-format on }}; +const std::array, 74> NFC_QC_NO_RANGES = {{ + // clang-format off + {0x0340, 0x0342}, + {0x0343, 0x0345}, + {0x0374, 0x0375}, + {0x037e, 0x037f}, + {0x0387, 0x0388}, + {0x0958, 0x0960}, + {0x09dc, 0x09de}, + {0x09df, 0x09e0}, + {0x0a33, 0x0a34}, + {0x0a36, 0x0a37}, + {0x0a59, 0x0a5c}, + {0x0a5e, 0x0a5f}, + {0x0b5c, 0x0b5e}, + {0x0f43, 0x0f44}, + {0x0f4d, 0x0f4e}, + {0x0f52, 0x0f53}, + {0x0f57, 0x0f58}, + {0x0f5c, 0x0f5d}, + {0x0f69, 0x0f6a}, + {0x0f73, 0x0f74}, + {0x0f75, 0x0f77}, + {0x0f78, 0x0f79}, + {0x0f81, 0x0f82}, + {0x0f93, 0x0f94}, + {0x0f9d, 0x0f9e}, + {0x0fa2, 0x0fa3}, + {0x0fa7, 0x0fa8}, + {0x0fac, 0x0fad}, + {0x0fb9, 0x0fba}, + {0x1f71, 0x1f72}, + {0x1f73, 0x1f74}, + {0x1f75, 0x1f76}, + {0x1f77, 0x1f78}, + {0x1f79, 0x1f7a}, + {0x1f7b, 0x1f7c}, + {0x1f7d, 0x1f7e}, + {0x1fbb, 0x1fbc}, + {0x1fbe, 0x1fbf}, + {0x1fc9, 0x1fca}, + {0x1fcb, 0x1fcc}, + {0x1fd3, 0x1fd4}, + {0x1fdb, 0x1fdc}, + {0x1fe3, 0x1fe4}, + {0x1feb, 0x1fec}, + {0x1fee, 0x1ff0}, + {0x1ff9, 0x1ffa}, + {0x1ffb, 0x1ffc}, + {0x1ffd, 0x1ffe}, + {0x2000, 0x2002}, + {0x2126, 0x2127}, + {0x212a, 0x212c}, + {0x2329, 0x232a}, + {0x232a, 0x232b}, + {0x2adc, 0x2add}, + {0xf900, 0xfa0e}, + {0xfa10, 0xfa11}, + {0xfa12, 0xfa13}, + {0xfa15, 0xfa1f}, + {0xfa20, 0xfa21}, + {0xfa22, 0xfa23}, + {0xfa25, 0xfa27}, + {0xfa2a, 0xfa6e}, + {0xfa70, 0xfada}, + {0xfb1d, 0xfb1e}, + {0xfb1f, 0xfb20}, + {0xfb2a, 0xfb37}, + {0xfb38, 0xfb3d}, + {0xfb3e, 0xfb3f}, + {0xfb40, 0xfb42}, + {0xfb43, 0xfb45}, + {0xfb46, 0xfb4f}, + {0x1d15e, 0x1d165}, + {0x1d1bb, 0x1d1c1}, + {0x2f800, 0x2fa1e}, + // clang-format on +}}; +const std::array, 43> NFC_QC_MAYBE_RANGES = {{ + // clang-format off + {0x0300, 0x0305}, + {0x0306, 0x030d}, + {0x030f, 0x0310}, + {0x0311, 0x0312}, + {0x0313, 0x0315}, + {0x031b, 0x031c}, + {0x0323, 0x0329}, + {0x032d, 0x032f}, + {0x0330, 0x0332}, + {0x0338, 0x0339}, + {0x0342, 0x0343}, + {0x0345, 0x0346}, + {0x0653, 0x0656}, + {0x093c, 0x093d}, + {0x09be, 0x09bf}, + {0x09d7, 0x09d8}, + {0x0b3e, 0x0b3f}, + {0x0b56, 0x0b57}, + {0x0b57, 0x0b58}, + {0x0bbe, 0x0bbf}, + {0x0bd7, 0x0bd8}, + {0x0c56, 0x0c57}, + {0x0cc2, 0x0cc3}, + {0x0cd5, 0x0cd7}, + {0x0d3e, 0x0d3f}, + {0x0d57, 0x0d58}, + {0x0dca, 0x0dcb}, + {0x0dcf, 0x0dd0}, + {0x0ddf, 0x0de0}, + {0x102e, 0x102f}, + {0x1161, 0x1176}, + {0x11a8, 0x11c3}, + {0x1b35, 0x1b36}, + {0x3099, 0x309b}, + {0x110ba, 0x110bb}, + {0x11127, 0x11128}, + {0x1133e, 0x1133f}, + {0x11357, 0x11358}, + {0x114b0, 0x114b1}, + {0x114ba, 0x114bb}, + {0x114bd, 0x114be}, + {0x115af, 0x115b0}, + {0x11930, 0x11931}, + // clang-format on +}}; + } // namespace Rust diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index 999ecb042ca..6bd2db550a1 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ -39,75 +39,27 @@ const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28; const uint32_t N_COUNT = V_COUNT * T_COUNT; const uint32_t S_COUNT = L_COUNT * N_COUNT; +// Check if the codepoint is in any of the ranges (half-open intervals [a,b)). template -int64_t +bool binary_search_ranges ( - // FIXME: use binray search function from const std::array, SIZE> &ranges, uint32_t target_cp) { - if (SIZE == 0) - return -1; - - uint32_t low, high, mid; - uint32_t start, end; - low = 0; - high = SIZE - 1; - mid = (low + high) / 2; - while (high - low > 1) - { - start = ranges[mid].first; - end = ranges[mid].second; - if (start <= target_cp && target_cp < end) - { - return mid; - } - else if (end <= target_cp) - low = mid + 1; - else - high = mid - 1; - mid = (low + high) / 2; - } - - if (ranges[mid].first <= target_cp && target_cp < ranges[mid].second) - return mid; - else - return -1; -} - -template -int64_t -binary_search_sorted_array (const std::array &array, - uint32_t target) -{ - // FIXME: use binray search function from - if (SIZE == 0) - return -1; - - uint32_t low, high, mid; - low = 0; - high = SIZE; - mid = (low + high) / 2; - while (high - low > 1) - { - if (array[mid] <= target) - low = mid; - else - high = mid; - mid = (low + high) / 2; - } - - if (array[mid] == target) - return mid; + auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp, + [] (const std::pair &a, + uint32_t b) { return a.second <= b; }); + if (it == ranges.end ()) + return false; else - return -1; + return it->first <= target_cp && target_cp < it->second; } int lookup_cc (codepoint_t c) { - auto it = Rust::CCC_TABLE.find (c.value); - if (it != Rust::CCC_TABLE.end ()) + auto it = CCC_TABLE.find (c.value); + if (it != CCC_TABLE.end ()) return it->second; else // Starter. Returns zero. @@ -289,10 +241,44 @@ recomp (string_t s) return buf; } +// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms +QuickCheckResult +nfc_quick_check (const string_t &s) +{ + int last_canonical_class = 0; + QuickCheckResult res = QuickCheckResult::YES; + + for (unsigned long i = 0; i < s.size (); i++) + { + codepoint_t c = s[i]; + + if (c.is_supplementary_character ()) + i++; + + int canonical_class = lookup_cc (c); + if (last_canonical_class > canonical_class && canonical_class != 0) + return QuickCheckResult::NO; + + if (is_nfc_qc_no (c.value)) + return QuickCheckResult::NO; + + if (is_nfc_qc_maybe (c.value)) + res = QuickCheckResult::MAYBE; + + last_canonical_class = canonical_class; + } + return res; +} + string_t -nfc_normalize (string_t s) +nfc_normalize (const string_t &s) { - // TODO: Quick Check + if (nfc_quick_check (s) == QuickCheckResult::YES) + return s; + + // TODO: optimize normalization. + // i.e. only normalize a limited area around MAYBE character, instead of + // performing complete normlization of the entire string // decompose string_t d = decomp_cano (s); @@ -312,21 +298,26 @@ Utf8String::nfc_normalize () const bool is_alphabetic (uint32_t codepoint) { - int64_t res = binary_search_ranges (ALPHABETIC_RANGES, codepoint); - if (res < 0) - return false; - else - return true; + return binary_search_ranges (ALPHABETIC_RANGES, codepoint); } bool is_numeric (uint32_t codepoint) { - int64_t res = binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint); - if (res < 0) - return false; - else - return true; + return std::binary_search (NUMERIC_CODEPOINTS.begin (), + NUMERIC_CODEPOINTS.end (), codepoint); +} + +bool +is_nfc_qc_maybe (uint32_t codepoint) +{ + return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint); +} + +bool +is_nfc_qc_no (uint32_t codepoint) +{ + return binary_search_ranges (NFC_QC_NO_RANGES, codepoint); } bool @@ -344,6 +335,18 @@ is_ascii_only (const std::string &str) namespace selftest { +void +rust_nfc_qc_test () +{ + ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}), + Rust::QuickCheckResult::YES); + ASSERT_EQ (Rust::nfc_quick_check ( + {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}), + Rust::QuickCheckResult::MAYBE); + ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}), + Rust::QuickCheckResult::NO); +} + void assert_normalize (const std::vector origin, const std::vector expected) diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h index 2538436797f..aa7bd8a5632 100644 --- a/gcc/rust/util/rust-unicode.h +++ b/gcc/rust/util/rust-unicode.h @@ -68,12 +68,31 @@ is_ascii_only (const std::string &str); bool is_numeric (uint32_t codepoint); +bool +is_nfc_qc_no (uint32_t codepoint); + +bool +is_nfc_qc_maybe (uint32_t codepoint); + +enum class QuickCheckResult +{ + YES, + NO, + MAYBE +}; + +QuickCheckResult +nfc_quick_check (const std::vector &s); + } // namespace Rust #if CHECKING_P namespace selftest { +void +rust_nfc_qc_test (); + void rust_utf8_normalize_test (); -- 2.42.1