Skip to content

Implement punycode encoding #2533

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions gcc/rust/Make-lang.in
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ GRS_OBJS = \
rust/rust-feature-gate.o \
rust/rust-dir-owner.o \
rust/rust-unicode.o \
rust/rust-punycode.o \
$(END)
# removed object files from here

Expand Down
2 changes: 2 additions & 0 deletions gcc/rust/rust-lang.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "rust-lex.h"
#include "optional.h"
#include "rust-unicode.h"
#include "rust-punycode.h"

#include <mpfr.h>
// note: header files must be in this order or else forward declarations don't
Expand Down Expand Up @@ -456,6 +457,7 @@ run_rust_tests ()
// Call tests for the rust frontend here
rust_input_source_test ();
rust_utf8_normalize_test ();
rust_punycode_encode_test ();
rust_cfg_parser_test ();
rust_privacy_ctx_test ();
rust_crate_name_validation_test ();
Expand Down
180 changes: 180 additions & 0 deletions gcc/rust/util/rust-punycode.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

// This file provides functions for punycode conversion
// See https://datatracker.ietf.org/doc/html/rfc3492

#include "rust-system.h"
#include "rust-unicode.h"
#include "optional.h"
#include "selftest.h"

namespace Rust {

// Bootstring parameter values used by Punycode.
// See https://datatracker.ietf.org/doc/html/rfc3492#section-5

// Number of digit values available to the variable-length integer encoding.
constexpr uint32_t BASE{36};
// Lower and upper clamp applied to the per-digit threshold.
constexpr uint32_t TMIN{1};
constexpr uint32_t TMAX{26};
// Constants used by the bias adaptation step.
constexpr uint32_t SKEW{38};
constexpr uint32_t DAMP{700};
// Starting values of the encoder state.
constexpr uint32_t INITIAL_BIAS{72};
constexpr uint32_t INITIAL_N{128};
// Separator appended after the copied basic (ASCII) code points.
constexpr char DELIMITER{'-'};

// Largest code point that is copied to the output unencoded.
constexpr uint32_t MAX_ASCII_CODEPOINT{0x7F};

// Collect, in order, every code point of SRC whose value does not exceed
// MAX_ASCII_CODEPOINT and return them concatenated as a string.
std::string
extract_basic_string (const std::vector<Codepoint> &src)
{
  std::string ascii_part;
  // Iterate by value: as_string () is called on a mutable copy, exactly as
  // the code point is not required to expose a const-qualified accessor.
  for (auto cp : src)
    {
      if (cp.value > MAX_ASCII_CODEPOINT)
        continue;
      ascii_part += cp.as_string ();
    }
  return ascii_part;
}

// Bias adaptation function of RFC 3492 (section 6.1).
// DELTA is the delta that was just encoded, N_POINTS the number of code
// points handled so far (including the current one) and IS_FIRST tells
// whether this is the very first adaptation, which is damped by DAMP
// instead of being halved.  Returns the new bias.
uint32_t
adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
{
  const uint32_t divisor = is_first ? DAMP : 2;
  uint32_t scaled = delta / divisor;
  scaled += scaled / n_points;

  constexpr uint32_t digit_span = BASE - TMIN;
  constexpr uint32_t limit = digit_span * TMAX / 2;

  // Each division by digit_span accounts for one more digit, i.e. BASE.
  uint32_t offset = 0;
  for (; scaled > limit; offset += BASE)
    scaled /= digit_span;

  return offset + (digit_span + 1) * scaled / (scaled + SKEW);
}

// Compute LHS - RHS clamped to the range [MIN, MAX].
// The comparisons are done on sums so that the unsigned subtraction is
// only evaluated when its result lies strictly between MIN and MAX.
uint32_t
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
             const uint32_t max)
{
  if (lhs <= min + rhs)
    return min;

  if (lhs >= max + rhs)
    return max;

  return lhs - rhs;
}

// Return the smallest code point value in L that is greater than or equal
// to THRESHOLD.  UINT32_MAX is returned when no smaller candidate exists.
uint32_t
min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
{
  uint32_t smallest = UINT32_MAX;
  for (const auto &cp : l)
    {
      if (cp.value < threshold)
        continue;
      if (cp.value < smallest)
        smallest = cp.value;
    }
  return smallest;
}

// Map a digit value to its Punycode character: values below 26 map onto
// 'a'..'z' and the following values continue from '0' upwards.
char
encode_digit (const uint32_t d)
{
  if (d < 26)
    return static_cast<char> (d + 'a');

  // d + 22 == (d - 26) + '0'
  return static_cast<char> (d + 22);
}

// Encode INPUT as Punycode following the encoding procedure of RFC 3492
// (section 6.3).
//
// The ASCII code points of INPUT are copied to the output first, followed
// by DELIMITER when there is at least one of them.  The remaining code
// points are then emitted, in increasing code point order, as
// variable-length integers describing where each one has to be inserted.
//
// Returns the encoded string, or tl::nullopt when the 32-bit delta counter
// would overflow.
tl::optional<std::string>
encode_punycode (const Utf8String &input)
{
  std::vector<Codepoint> input_chars = input.get_chars ();

  uint32_t n = INITIAL_N;
  uint32_t delta = 0;
  uint32_t bias = INITIAL_BIAS;

  std::string output = extract_basic_string (input_chars);
  // h: number of code points handled so far, b: number of basic ones.
  uint32_t h = output.size ();
  const uint32_t b = h;
  if (b > 0)
    output += DELIMITER;

  while (h < input_chars.size ())
    {
      // Next code point value to encode: the smallest one not yet handled.
      const uint32_t m = min_gt_or_eq (input_chars, n);

      // Fail if delta + (m - n) * (h + 1) does not fit in 32 bits.
      if (m - n > ((UINT32_MAX - delta) / (h + 1)))
        return tl::nullopt;

      delta += (m - n) * (h + 1);
      n = m;

      for (const auto c : input_chars)
        {
          if (c.value < n)
            {
              // RFC 3492: "increment delta, fail on overflow".  Wrapping
              // around would silently produce a wrong encoding.
              if (delta == UINT32_MAX)
                return tl::nullopt;
              delta++;
            }
          else if (c.value == n)
            {
              uint32_t q = delta;
              // encode as a variable length integer
              for (uint32_t k = 1;; k++)
                {
                  const uint32_t kb = k * BASE;
                  // Threshold for this digit position, clamped to
                  // [TMIN, TMAX].
                  const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
                  if (q < t)
                    break;

                  output += encode_digit (t + (q - t) % (BASE - t));
                  q = (q - t) / (BASE - t);
                }
              // Final (most significant) digit.
              output += encode_digit (q);

              bias = adapt_bias (delta, h + 1, h == b);
              delta = 0;
              h++;
            }
        }
      delta++;
      n++;
    }

  return {output};
}

} // namespace Rust

namespace selftest {

// Encode INPUT (a UTF-8 string) as punycode and check that the result is
// equal to EXPECTED.  Both the UTF-8 conversion and the encoding are
// assumed to succeed.
void
encode_assert (const std::string &input, const std::string &expected)
{
  const auto utf8 = Rust::Utf8String::make_utf8_string (input);
  const auto encoded = Rust::encode_punycode (utf8.value ());
  const std::string actual = encoded.value ();
  ASSERT_EQ (actual, expected);
}

// Self-test entry point for the punycode encoder: runs encode_assert over
// a table of input strings and their expected encodings.
void
rust_punycode_encode_test ()
{
  struct test_case
  {
    const char *input;
    const char *expected;
  };

  static const test_case cases[] = {
    {"abc", "abc-"},
    {"12345", "12345-"},
    {"香港", "j6w193g"},

    // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
    {"ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn"},
    {"他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye"},
    {"他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb"},
    {"Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a"},
  };

  for (const auto &tc : cases)
    encode_assert (tc.input, tc.expected);
}

} // namespace selftest
46 changes: 46 additions & 0 deletions gcc/rust/util/rust-punycode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

#ifndef RUST_PUNYCODE_H
#define RUST_PUNYCODE_H

#include "rust-unicode.h"
#include "optional.h"

namespace Rust {

/* Encode a string as punycode (see RFC 3492).

   SRC is the UTF-8 string to encode.  Its ASCII characters are copied to
   the front of the result, followed by a `-` delimiter when there is at
   least one of them; the remaining characters are appended in encoded
   form.

   Returns a string if encoding is successful.  Returns nullopt otherwise,
   i.e. when the encoder's internal 32-bit counter would overflow.  Note
   that a returned string contains only ASCII characters and does not start
   with `xn--`.  */
tl::optional<std::string>
encode_punycode (const Utf8String &src);

} // namespace Rust

#if CHECKING_P

namespace selftest {

/* Self-tests for Rust::encode_punycode.  Invoked from run_rust_tests
   together with the other Rust front end self-tests.  */
void
rust_punycode_encode_test ();

} // namespace selftest

#endif // CHECKING_P

#endif