public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-7904] gccrs: Add function `Rust::encode_punycode`
@ 2024-01-16 18:06 Arthur Cohen
0 siblings, 0 replies; only message in thread
From: Arthur Cohen @ 2024-01-16 18:06 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:619f1874b0ce0edc68b2748153b4134b2db306fd
commit r14-7904-g619f1874b0ce0edc68b2748153b4134b2db306fd
Author: Raiki Tamura <tamaron1203@gmail.com>
Date: Sun Jul 30 19:54:36 2023 +0900
gccrs: Add function `Rust::encode_punycode`
gcc/rust/ChangeLog:
* Make-lang.in: Add rust-punycode.o.
* rust-lang.cc (run_rust_tests): Add selftest.
* util/rust-punycode.cc: New file.
* util/rust-punycode.h: New file.
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diff:
---
gcc/rust/Make-lang.in | 1 +
gcc/rust/rust-lang.cc | 2 +
gcc/rust/util/rust-punycode.cc | 180 +++++++++++++++++++++++++++++++++++++++++
gcc/rust/util/rust-punycode.h | 46 +++++++++++
4 files changed, 229 insertions(+)
diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in
index 1df82f1598b..ad41a56523f 100644
--- a/gcc/rust/Make-lang.in
+++ b/gcc/rust/Make-lang.in
@@ -186,6 +186,7 @@ GRS_OBJS = \
rust/rust-feature-gate.o \
rust/rust-dir-owner.o \
rust/rust-unicode.o \
+ rust/rust-punycode.o \
$(END)
# removed object files from here
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index 157d83f506f..6d40cc0697d 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -38,6 +38,7 @@
#include "rust-lex.h"
#include "optional.h"
#include "rust-unicode.h"
+#include "rust-punycode.h"
#include <mpfr.h>
// note: header files must be in this order or else forward declarations don't
@@ -453,6 +454,7 @@ run_rust_tests ()
// Call tests for the rust frontend here
rust_input_source_test ();
rust_utf8_normalize_test ();
+ rust_punycode_encode_test ();
rust_cfg_parser_test ();
rust_privacy_ctx_test ();
rust_crate_name_validation_test ();
diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc
new file mode 100644
index 00000000000..a35d54aa6f5
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.cc
@@ -0,0 +1,180 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// This file provides functions for punycode conversion
+// See https://datatracker.ietf.org/doc/html/rfc3492
+
+#include "rust-system.h"
+#include "rust-unicode.h"
+#include "optional.h"
+#include "selftest.h"
+
+namespace Rust {
+
+// https://tools.ietf.org/html/rfc3492#section-4.
+constexpr uint32_t BASE = 36; // radix of the variable-length integer digits
+constexpr uint32_t TMIN = 1; // lower clamp of the per-digit threshold
+constexpr uint32_t TMAX = 26; // upper clamp of the per-digit threshold
+constexpr uint32_t SKEW = 38; // used by adapt_bias when scaling the new bias
+constexpr uint32_t DAMP = 700; // divisor applied on the first bias adaptation
+constexpr uint32_t INITIAL_BIAS = 72; // bias before any adaptation
+constexpr uint32_t INITIAL_N = 128; // first code point value the encoder scans
+constexpr char DELIMITER = '-'; // appended after the basic code points, if any
+
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; // largest value copied verbatim
+
+// Return the basic code points of SRC (those not above
+// MAX_ASCII_CODEPOINT), concatenated in their original order.
+std::string
+extract_basic_string (const std::vector<Codepoint> &src)
+{
+  std::string ascii_part;
+  for (Codepoint cp : src)
+    {
+      // Non-basic code points are handled later by the encoder proper.
+      if (cp.value > MAX_ASCII_CODEPOINT)
+        continue;
+      ascii_part += cp.as_string ();
+    }
+  return ascii_part;
+}
+
+// Compute the bias to use for the next delta.
+//
+// DELTA is the delta that was just encoded, N_POINTS the number of code
+// points handled so far (including the one just encoded) and IS_FIRST tells
+// whether this was the first delta, which is damped by DAMP instead of
+// being halved.
+uint32_t
+adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
+{
+  const uint32_t divisor = is_first ? DAMP : 2;
+  delta = delta / divisor;
+  delta = delta + delta / n_points;
+
+  // Keep dividing until delta is at or below the limit, counting one BASE
+  // per division.
+  const uint32_t limit = (BASE - TMIN) * TMAX / 2;
+  uint32_t k;
+  for (k = 0; delta > limit; k += BASE)
+    delta /= BASE - TMIN;
+
+  return k + (BASE - TMIN + 1) * delta / (delta + SKEW);
+}
+
+// Compute LHS - RHS clamped to the range [MIN, MAX].  The comparisons are
+// arranged so that the subtraction is only evaluated when LHS > RHS.
+uint32_t
+clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
+             const uint32_t max)
+{
+  if (lhs <= min + rhs)
+    return min;
+  if (lhs >= max + rhs)
+    return max;
+  return lhs - rhs;
+}
+
+// Return the smallest code point value in L that is not below THRESHOLD,
+// or UINT32_MAX if no element qualifies.
+uint32_t
+min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
+{
+  uint32_t smallest = UINT32_MAX;
+  for (const auto &cp : l)
+    {
+      if (cp.value < threshold)
+        continue;
+      if (cp.value < smallest)
+        smallest = cp.value;
+    }
+  return smallest;
+}
+
+// Map the digit value D to its ASCII character: values below 26 become
+// 'a'..'z' and the following ten values become '0'..'9'.
+char
+encode_digit (const uint32_t d)
+{
+  constexpr uint32_t letter_offset = 97; // ASCII code of 'a'
+  constexpr uint32_t number_offset = 22; // ASCII code of '0', minus 26
+
+  if (d < 26)
+    return d + letter_offset;
+  return d + number_offset;
+}
+
+// Encode INPUT as punycode.
+//
+// The basic code points are copied to the output first, followed by
+// DELIMITER when there is at least one of them.  Every remaining code point
+// is then emitted, in increasing order of value, as a variable-length
+// integer built from encode_digit characters.
+//
+// Returns tl::nullopt whenever the 32-bit delta counter would overflow, as
+// required by https://datatracker.ietf.org/doc/html/rfc3492#section-6.4
+tl::optional<std::string>
+encode_punycode (const Utf8String &input)
+{
+  std::vector<Codepoint> input_chars = input.get_chars ();
+
+  uint32_t n = INITIAL_N;
+  uint32_t delta = 0;
+  uint32_t bias = INITIAL_BIAS;
+
+  std::string output = extract_basic_string (input_chars);
+  // h counts the code points handled so far, b the basic ones among them.
+  uint32_t h = output.size ();
+  const uint32_t b = h;
+  if (b > 0)
+    output += DELIMITER;
+
+  while (h < input_chars.size ())
+    {
+      // Smallest code point value that has not been handled yet.
+      const uint32_t m = min_gt_or_eq (input_chars, n);
+
+      // Moving n up to m adds (m - n) * (h + 1) to delta; refuse to wrap.
+      if (m - n > ((UINT32_MAX - delta) / (h + 1)))
+        return tl::nullopt;
+
+      delta += (m - n) * (h + 1);
+      n = m;
+
+      for (const auto c : input_chars)
+        {
+          if (c.value < n)
+            {
+              // A wrapped delta would silently produce a wrong encoding.
+              if (delta == UINT32_MAX)
+                return tl::nullopt;
+              delta++;
+            }
+          else if (c.value == n)
+            {
+              uint32_t q = delta;
+              // encode as a variable length integer
+              for (uint32_t k = 1;; k++)
+                {
+                  const uint32_t kb = k * BASE;
+                  // Threshold for this digit position, within [TMIN, TMAX].
+                  const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
+                  if (q < t)
+                    break;
+
+                  output += encode_digit (t + (q - t) % (BASE - t));
+                  q = (q - t) / (BASE - t);
+                }
+              output += encode_digit (q);
+
+              bias = adapt_bias (delta, h + 1, h == b);
+              delta = 0;
+              h++;
+            }
+        }
+
+      // The increment below is only consumed if another pass follows, so
+      // that is the only case in which wrapping it would be an error.
+      if (h < input_chars.size () && delta == UINT32_MAX)
+        return tl::nullopt;
+      delta++;
+      n++;
+    }
+
+  return {output};
+}
+
+} // namespace Rust
+
+namespace selftest {
+
+void
+encode_assert (const std::string &input, const std::string &expected)
+{
+ Rust::Utf8String input_utf8
+ = Rust::Utf8String::make_utf8_string (input).value ();
+ std::string actual = Rust::encode_punycode (input_utf8).value ();
+ ASSERT_EQ (actual, expected);
+}
+
+void
+rust_punycode_encode_test ()
+{
+ encode_assert ("abc", "abc-");
+ encode_assert ("12345", "12345-");
+ encode_assert ("香港", "j6w193g");
+
+ // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
+ encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
+ encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
+ encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
+ encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
+}
+
+} // namespace selftest
diff --git a/gcc/rust/util/rust-punycode.h b/gcc/rust/util/rust-punycode.h
new file mode 100644
index 00000000000..ffb139a5ff3
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_PUNYCODE_H
+#define RUST_PUNYCODE_H
+
+#include "rust-unicode.h"
+#include "optional.h"
+
+namespace Rust {
+
+/* Encode SRC as punycode. Returns the encoded string if encoding is
+ * successful. Returns nullopt if the encoder's 32-bit arithmetic would
+ * overflow. Note that a returned string contains only ASCII characters and
+ * does not start with `xn--`. */
+tl::optional<std::string>
+encode_punycode (const Utf8String &src);
+
+} // namespace Rust
+
+#if CHECKING_P
+
+namespace selftest {
+
+void
+rust_punycode_encode_test (); // selftests for Rust::encode_punycode
+
+} // namespace selftest
+
+#endif // CHECKING_P
+
+#endif
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2024-01-16 18:06 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-16 18:06 [gcc r14-7904] gccrs: Add function `Rust::encode_punycode` Arthur Cohen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).