public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-7904] gccrs: Add function `Rust::encode_punycode`
@ 2024-01-16 18:06 Arthur Cohen
  0 siblings, 0 replies; only message in thread
From: Arthur Cohen @ 2024-01-16 18:06 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:619f1874b0ce0edc68b2748153b4134b2db306fd

commit r14-7904-g619f1874b0ce0edc68b2748153b4134b2db306fd
Author: Raiki Tamura <tamaron1203@gmail.com>
Date:   Sun Jul 30 19:54:36 2023 +0900

    gccrs: Add function `Rust::encode_punycode`
    
    gcc/rust/ChangeLog:
    
            * Make-lang.in: Add rust-punycode.o.
            * rust-lang.cc (run_rust_tests): Add selftest.
            * util/rust-punycode.cc: New file.
            * util/rust-punycode.h: New file.
    
    Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>

Diff:
---
 gcc/rust/Make-lang.in          |   1 +
 gcc/rust/rust-lang.cc          |   2 +
 gcc/rust/util/rust-punycode.cc | 180 +++++++++++++++++++++++++++++++++++++++++
 gcc/rust/util/rust-punycode.h  |  46 +++++++++++
 4 files changed, 229 insertions(+)

diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in
index 1df82f1598b..ad41a56523f 100644
--- a/gcc/rust/Make-lang.in
+++ b/gcc/rust/Make-lang.in
@@ -186,6 +186,7 @@ GRS_OBJS = \
     rust/rust-feature-gate.o \
     rust/rust-dir-owner.o \
     rust/rust-unicode.o \
+    rust/rust-punycode.o \
     $(END)
 # removed object files from here
 
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index 157d83f506f..6d40cc0697d 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -38,6 +38,7 @@
 #include "rust-lex.h"
 #include "optional.h"
 #include "rust-unicode.h"
+#include "rust-punycode.h"
 
 #include <mpfr.h>
 // note: header files must be in this order or else forward declarations don't
@@ -453,6 +454,7 @@ run_rust_tests ()
   // Call tests for the rust frontend here
   rust_input_source_test ();
   rust_utf8_normalize_test ();
+  rust_punycode_encode_test ();
   rust_cfg_parser_test ();
   rust_privacy_ctx_test ();
   rust_crate_name_validation_test ();
diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc
new file mode 100644
index 00000000000..a35d54aa6f5
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.cc
@@ -0,0 +1,180 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// This file provides functions for punycode conversion
+// See https://datatracker.ietf.org/doc/html/rfc3492
+
+#include "rust-system.h"
+#include "rust-unicode.h"
+#include "optional.h"
+#include "selftest.h"
+
+namespace Rust {
+
+// Bootstring parameters used for Punycode.  The parameters are described in
+// https://tools.ietf.org/html/rfc3492#section-4. and the values below are the
+// ones RFC 3492 lists for Punycode in its section 5.
+constexpr uint32_t BASE = 36;
+constexpr uint32_t TMIN = 1;
+constexpr uint32_t TMAX = 26;
+constexpr uint32_t SKEW = 38;
+constexpr uint32_t DAMP = 700;
+constexpr uint32_t INITIAL_BIAS = 72;
+// Initial value of `n` in encode_punycode (0x80, just above the ASCII range).
+constexpr uint32_t INITIAL_N = 128;
+// Appended after the copied basic code points when there is at least one.
+constexpr char DELIMITER = '-';
+
+// Largest code point that extract_basic_string treats as basic.
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
+
+// Build a string from the code points of SRC whose value does not exceed
+// MAX_ASCII_CODEPOINT, keeping them in their original order.  All other code
+// points are skipped.
+std::string
+extract_basic_string (const std::vector<Codepoint> &src)
+{
+  std::string ascii_only;
+  for (auto cp : src)
+    {
+      if (cp.value > MAX_ASCII_CODEPOINT)
+	continue;
+
+      ascii_only += cp.as_string ();
+    }
+  return ascii_only;
+}
+
+// Bias adaptation function of RFC 3492 (section 6.1).
+//
+// DELTA is the delta that was just encoded, N_POINTS the number of code
+// points handled so far (must be non-zero, it is used as a divisor) and
+// IS_FIRST tells whether this is the first adaptation for the string, in
+// which case DELTA is scaled down by DAMP instead of by 2.
+// Returns the bias to use for the next delta.
+uint32_t
+adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
+{
+  const uint32_t damping = is_first ? DAMP : 2;
+  delta = delta / damping;
+  delta = delta + delta / n_points;
+
+  const uint32_t digit_span = BASE - TMIN;
+  const uint32_t limit = digit_span * TMAX / 2;
+
+  // Each division by DIGIT_SPAN accounts for one more digit in the encoding.
+  uint32_t k;
+  for (k = 0; delta > limit; k += BASE)
+    delta /= digit_span;
+
+  return k + (digit_span + 1) * delta / (delta + SKEW);
+}
+
+// Compute LHS - RHS clamped to [MIN, MAX] without ever subtracting below
+// zero: the result is MIN when LHS <= MIN + RHS, MAX when LHS >= MAX + RHS
+// and LHS - RHS otherwise.
+uint32_t
+clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
+	     const uint32_t max)
+{
+  const bool at_or_below_floor = lhs <= min + rhs;
+  if (at_or_below_floor)
+    return min;
+
+  const bool at_or_above_ceiling = lhs >= max + rhs;
+  return at_or_above_ceiling ? max : lhs - rhs;
+}
+
+// Return the smallest code point value in L that is greater than or equal to
+// THRESHOLD, or UINT32_MAX when L holds no such code point.
+uint32_t
+min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
+{
+  uint32_t smallest = UINT32_MAX;
+  for (const auto &cp : l)
+    {
+      if (cp.value < threshold)
+	continue;
+
+      if (cp.value < smallest)
+	smallest = cp.value;
+    }
+  return smallest;
+}
+
+// Map the digit value D to its basic code point: 0..25 become 'a'..'z' and
+// 26..35 become '0'..'9'.  Callers are expected to pass D < BASE.
+char
+encode_digit (const uint32_t d)
+{
+  if (d < 26)
+    return static_cast<char> ('a' + d);
+
+  return static_cast<char> ('0' + (d - 26));
+}
+
+// Encode INPUT as Punycode, following the encoding procedure of RFC 3492
+// (section 6.3).
+//
+// The basic (ASCII) code points of INPUT are copied to the output first,
+// followed by DELIMITER when at least one was copied.  The remaining code
+// points are then handled in increasing code point order, each one being
+// emitted as a generalized variable-length integer.
+//
+// Returns the encoded string (no `xn--` prefix is added), or tl::nullopt when
+// the delta computation would overflow a uint32_t.
+tl::optional<std::string>
+encode_punycode (const Utf8String &input)
+{
+  std::vector<Codepoint> input_chars = input.get_chars ();
+
+  uint32_t n = INITIAL_N;
+  uint32_t delta = 0;
+  uint32_t bias = INITIAL_BIAS;
+
+  std::string output = extract_basic_string (input_chars);
+  // H is the number of code points handled so far, B the number of basic
+  // code points that were copied verbatim.
+  uint32_t h = output.size ();
+  const uint32_t b = h;
+  if (b > 0)
+    output += DELIMITER;
+
+  while (h < input_chars.size ())
+    {
+      // Smallest code point value that has not been encoded yet.
+      const uint32_t m = min_gt_or_eq (input_chars, n);
+
+      // Fail instead of letting the delta update below wrap around.
+      if (m - n > ((UINT32_MAX - delta) / (h + 1)))
+	return tl::nullopt;
+
+      delta += (m - n) * (h + 1);
+      n = m;
+
+      for (const auto c : input_chars)
+	{
+	  if (c.value < n)
+	    {
+	      // RFC 3492 requires this increment to fail on overflow as well;
+	      // a wrapped delta would silently yield a wrong encoding.
+	      if (delta == UINT32_MAX)
+		return tl::nullopt;
+	      delta++;
+	    }
+	  else if (c.value == n)
+	    {
+	      uint32_t q = delta;
+	      // encode as a variable length integer
+	      for (uint32_t k = 1;; k++)
+		{
+		  const uint32_t kb = k * BASE;
+		  // Threshold for this digit position, kept in [TMIN, TMAX].
+		  const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
+		  if (q < t)
+		    break;
+
+		  output += encode_digit (t + (q - t) % (BASE - t));
+		  q = (q - t) / (BASE - t);
+		}
+	      // Q is now below the threshold: it is the final digit.
+	      output += encode_digit (q);
+
+	      // Only the first non-basic code point uses the DAMP scaling.
+	      bias = adapt_bias (delta, h + 1, h == b);
+	      delta = 0;
+	      h++;
+	    }
+	}
+      delta++;
+      n++;
+    }
+
+  return {output};
+}
+
+} // namespace Rust
+
+// The selftests are only declared in rust-punycode.h when CHECKING_P is set,
+// so their definitions are compiled under the same condition.
+#if CHECKING_P
+
+namespace selftest {
+
+// Encode INPUT with Rust::encode_punycode and assert that the result is
+// EXPECTED.  INPUT must be valid UTF-8 and must be encodable, as both
+// optionals are unwrapped unconditionally.
+void
+encode_assert (const std::string &input, const std::string &expected)
+{
+  Rust::Utf8String input_utf8
+    = Rust::Utf8String::make_utf8_string (input).value ();
+  std::string actual = Rust::encode_punycode (input_utf8).value ();
+  ASSERT_EQ (actual, expected);
+}
+
+// Selftest entry point: checks Rust::encode_punycode against ASCII-only
+// inputs and against sample strings from RFC 3492.
+void
+rust_punycode_encode_test ()
+{
+  encode_assert ("abc", "abc-");
+  encode_assert ("12345", "12345-");
+  encode_assert ("香港", "j6w193g");
+
+  // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
+  encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
+  encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
+  encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
+  encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
+}
+
+} // namespace selftest
+
+#endif // CHECKING_P
diff --git a/gcc/rust/util/rust-punycode.h b/gcc/rust/util/rust-punycode.h
new file mode 100644
index 00000000000..ffb139a5ff3
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_PUNYCODE_H
+#define RUST_PUNYCODE_H
+
+#include "rust-unicode.h"
+#include "optional.h"
+
+namespace Rust {
+
+/* Encode a string as punycode (see RFC 3492). Returns a string if encoding is
+ * successful. Returns nullopt otherwise, i.e. when the encoder detects that
+ * its internal delta computation would overflow. Note that a returned string
+ * contains only ASCII characters and does not start with `xn--`. */
+tl::optional<std::string>
+encode_punycode (const Utf8String &src);
+
+} // namespace Rust
+
+#if CHECKING_P
+
+namespace selftest {
+
+/* Selftest for Rust::encode_punycode, defined in rust-punycode.cc and called
+ * from run_rust_tests in rust-lang.cc. */
+void
+rust_punycode_encode_test ();
+
+} // namespace selftest
+
+#endif // CHECKING_P
+
+#endif

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-01-16 18:06 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-16 18:06 [gcc r14-7904] gccrs: Add function `Rust::encode_punycode` Arthur Cohen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).