|
| 1 | +package IDNA::Punycode; |
| 2 | + |
| 3 | +use strict; |
| 4 | +our $VERSION = 0.03; |
| 5 | + |
| 6 | +require Exporter; |
| 7 | +our @ISA = qw(Exporter); |
| 8 | +our @EXPORT = qw(encode_punycode decode_punycode idn_prefix); |
| 9 | + |
| 10 | +use integer; |
| 11 | + |
| 12 | +our $DEBUG = 0; |
| 13 | +our $PREFIX = 'xn--'; |
| 14 | + |
| 15 | +use constant BASE => 36; |
| 16 | +use constant TMIN => 1; |
| 17 | +use constant TMAX => 26; |
| 18 | +use constant SKEW => 38; |
| 19 | +use constant DAMP => 700; |
| 20 | +use constant INITIAL_BIAS => 72; |
| 21 | +use constant INITIAL_N => 128; |
| 22 | + |
| 23 | +my $Delimiter = chr 0x2D; |
| 24 | +my $BasicRE = qr/[\x00-\x7f]/; |
| 25 | + |
| 26 | +sub _croak { require Carp; Carp::croak(@_); } |
| 27 | + |
| 28 | +sub idn_prefix { |
| 29 | + $PREFIX = shift; |
| 30 | +} |
| 31 | + |
| 32 | +sub digit_value { |
| 33 | + my $code = shift; |
| 34 | + return ord($code) - ord("A") if $code =~ /[A-Z]/; |
| 35 | + return ord($code) - ord("a") if $code =~ /[a-z]/; |
| 36 | + return ord($code) - ord("0") + 26 if $code =~ /[0-9]/; |
| 37 | + return; |
| 38 | +} |
| 39 | + |
| 40 | +sub code_point { |
| 41 | + my $digit = shift; |
| 42 | + return $digit + ord('a') if 0 <= $digit && $digit <= 25; |
| 43 | + return $digit + ord('0') - 26 if 26 <= $digit && $digit <= 36; |
| 44 | + die 'NOT COME HERE'; |
| 45 | +} |
| 46 | + |
| 47 | +sub adapt { |
| 48 | + my($delta, $numpoints, $firsttime) = @_; |
| 49 | + $delta = $firsttime ? $delta / DAMP : $delta / 2; |
| 50 | + $delta += $delta / $numpoints; |
| 51 | + my $k = 0; |
| 52 | + while ($delta > ((BASE - TMIN) * TMAX) / 2) { |
| 53 | + $delta /= BASE - TMIN; |
| 54 | + $k += BASE; |
| 55 | + } |
| 56 | + return $k + (((BASE - TMIN + 1) * $delta) / ($delta + SKEW)); |
| 57 | +} |
| 58 | + |
| 59 | +sub decode_punycode { |
| 60 | + my $code = shift; |
| 61 | + |
| 62 | + my $n = INITIAL_N; |
| 63 | + my $i = 0; |
| 64 | + my $bias = INITIAL_BIAS; |
| 65 | + my @output; |
| 66 | + |
| 67 | + if ($PREFIX) { |
| 68 | + if ($code !~ /^$PREFIX/) { |
| 69 | + return $code; |
| 70 | + } |
| 71 | + $code =~ s/^$PREFIX//; |
| 72 | + } |
| 73 | + |
| 74 | + if ($code =~ s/(.*)$Delimiter//o) { |
| 75 | + push @output, map ord, split //, $1; |
| 76 | + return _croak('non-basic code point') unless $1 =~ /^$BasicRE*$/o; |
| 77 | + } |
| 78 | + |
| 79 | + while ($code) { |
| 80 | + my $oldi = $i; |
| 81 | + my $w = 1; |
| 82 | + LOOP: |
| 83 | + for (my $k = BASE; 1; $k += BASE) { |
| 84 | + my $cp = substr($code, 0, 1, ''); |
| 85 | + my $digit = digit_value($cp); |
| 86 | + defined $digit or return _croak("invalid punycode input"); |
| 87 | + $i += $digit * $w; |
| 88 | + my $t = ($k <= $bias) ? TMIN |
| 89 | + : ($k >= $bias + TMAX) ? TMAX : $k - $bias; |
| 90 | + last LOOP if $digit < $t; |
| 91 | + $w *= (BASE - $t); |
| 92 | + } |
| 93 | + $bias = adapt($i - $oldi, @output + 1, $oldi == 0); |
| 94 | + warn "bias becomes $bias" if $DEBUG; |
| 95 | + $n += $i / (@output + 1); |
| 96 | + $i = $i % (@output + 1); |
| 97 | + splice(@output, $i, 0, $n); |
| 98 | + warn join " ", map sprintf('%04x', $_), @output if $DEBUG; |
| 99 | + $i++; |
| 100 | + } |
| 101 | + return join '', map chr, @output; |
| 102 | +} |
| 103 | + |
| 104 | +sub encode_punycode { |
| 105 | + my $input = shift; |
| 106 | + # my @input = split //, $input; # doesn't work in 5.6.x! |
| 107 | + my @input = map substr($input, $_, 1), 0..length($input)-1; |
| 108 | + |
| 109 | + my $n = INITIAL_N; |
| 110 | + my $delta = 0; |
| 111 | + my $bias = INITIAL_BIAS; |
| 112 | + |
| 113 | + my @output; |
| 114 | + my @basic = grep /$BasicRE/, @input; |
| 115 | + my $h = my $b = @basic; |
| 116 | + #push @output, @basic, $Delimiter if $b > 0; |
| 117 | + push @output, @basic if $b > 0; |
| 118 | + warn "basic codepoints: (@output)" if $DEBUG; |
| 119 | + |
| 120 | + if ($h < @input) { |
| 121 | + $PREFIX && unshift(@output, $PREFIX); |
| 122 | + push(@output, $Delimiter); |
| 123 | + } else { |
| 124 | + return join '', @output; |
| 125 | + } |
| 126 | + |
| 127 | + while ($h < @input) { |
| 128 | + my $m = min(grep { $_ >= $n } map ord, @input); |
| 129 | + warn sprintf "next code point to insert is %04x", $m if $DEBUG; |
| 130 | + $delta += ($m - $n) * ($h + 1); |
| 131 | + $n = $m; |
| 132 | + for my $i (@input) { |
| 133 | + my $c = ord($i); |
| 134 | + $delta++ if $c < $n; |
| 135 | + if ($c == $n) { |
| 136 | + my $q = $delta; |
| 137 | + LOOP: |
| 138 | + for (my $k = BASE; 1; $k += BASE) { |
| 139 | + my $t = ($k <= $bias) ? TMIN : |
| 140 | + ($k >= $bias + TMAX) ? TMAX : $k - $bias; |
| 141 | + last LOOP if $q < $t; |
| 142 | + my $cp = code_point($t + (($q - $t) % (BASE - $t))); |
| 143 | + push @output, chr($cp); |
| 144 | + $q = ($q - $t) / (BASE - $t); |
| 145 | + } |
| 146 | + push @output, chr(code_point($q)); |
| 147 | + $bias = adapt($delta, $h + 1, $h == $b); |
| 148 | + warn "bias becomes $bias" if $DEBUG; |
| 149 | + $delta = 0; |
| 150 | + $h++; |
| 151 | + } |
| 152 | + } |
| 153 | + $delta++; |
| 154 | + $n++; |
| 155 | + } |
| 156 | + return join '', @output; |
| 157 | +} |
| 158 | + |
| 159 | +sub min { |
| 160 | + my $min = shift; |
| 161 | + for (@_) { $min = $_ if $_ <= $min } |
| 162 | + return $min; |
| 163 | +} |
| 164 | + |
| 165 | +1; |
| 166 | +__END__ |
| 167 | +
|
| 168 | +=head1 NAME |
| 169 | +
|
| 170 | +IDNA::Punycode - encodes Unicode string in Punycode |
| 171 | +
|
| 172 | +=head1 SYNOPSIS |
| 173 | +
|
| 174 | + use IDNA::Punycode; |
| 175 | + idn_prefix('xn--'); |
| 176 | + $punycode = encode_punycode($unicode); |
| 177 | + $unicode = decode_punycode($punycode); |
| 178 | +
|
| 179 | +=head1 DESCRIPTION |
| 180 | +
|
| 181 | +IDNA::Punycode is a module to encode / decode Unicode strings into |
| 182 | +Punycode, an efficient encoding of Unicode for use with IDNA. |
| 183 | +
|
| 184 | +This module requires Perl 5.6.0 or over to handle UTF8 flagged Unicode |
| 185 | +strings. |
| 186 | +
|
| 187 | +=head1 FUNCTIONS |
| 188 | +
|
| 189 | +This module exports following functions by default. |
| 190 | +
|
| 191 | +=over 4 |
| 192 | +
|
| 193 | +=item encode_punycode |
| 194 | +
|
| 195 | + $punycode = encode_punycode($unicode); |
| 196 | +
|
| 197 | +takes Unicode string (UTF8-flagged variable) and returns Punycode |
| 198 | +encoding for it. |
| 199 | +
|
| 200 | +=item decode_punycode |
| 201 | +
|
| 202 | + $unicode = decode_punycode($punycode) |
| 203 | +
|
| 204 | +takes Punycode encoding and returns original Unicode string. |
| 205 | +
|
| 206 | +=item idn_prefix |
| 207 | +
|
| 208 | + idn_prefix($prefix); |
| 209 | +
|
| 210 | +causes encode_punycode() to add $prefix to ACE-string after conversion. |
| 211 | +As a side-effect decode_punycode() will only consider strings |
| 212 | +beginning with $prefix as punycode representations. |
| 213 | +
|
| 214 | +According to RFC 3490 the ACE prefix "xn--" had been chosen as the |
| 215 | +standard. Thus, "xn--" is also the default ACE prefix. For compatibility |
| 216 | +I'm leaving idn_prefix() in the module. Use C<idn_prefix(undef)> to |
| 217 | +get the old behaviour. |
| 218 | +
|
| 219 | +=back |
| 220 | +
|
| 221 | +These functions throws exceptionsn on failure. You can catch 'em via |
| 222 | +C<eval>. |
| 223 | +
|
| 224 | +=head1 AUTHORS |
| 225 | +
|
| 226 | +Tatsuhiko Miyagawa E<lt>[email protected]E<gt> is the original |
| 227 | +author and wrote almost all the code. |
| 228 | +
|
| 229 | +Robert Urban E<lt>[email protected]E<gt> added C<idn_prefix()>. |
| 230 | +
|
| 231 | +This library is free software; you can redistribute it and/or modify |
| 232 | +it under the same terms as Perl itself. |
| 233 | +
|
| 234 | +=head1 SEE ALSO |
| 235 | +
|
| 236 | +http://www.ietf.org/internet-drafts/draft-ietf-idn-punycode-01.txt |
| 237 | +
|
| 238 | +L<Encode::Punycode> |
| 239 | +
|
| 240 | +=cut |
0 commit comments