Skip to content

Commit 7c27f90

Browse files
author
bors-servo
authored
Auto merge of #453 - Marwes:perf, r=SimonSapin
Improve performance of the uts46 crate

I may have complicated the table structure more than is desired, so I'd be happy to remove it if you want. It does, however, shrink the total size of the tables by roughly 6000 * 8 - 1500 * 2 bytes, i.e. about 42 kB (and gives a slight performance boost). The calculation is `removed_ranges * range_size - added_indexes * index_size`.

Total performance change when parsing a small and simple URL:

```
name   old ns/iter     new ns/iter     diff ns/iter  diff %   speedup
short  4,819 (5 MB/s)  3,516 (7 MB/s)  -1,303        -27.04%  x 1.37
```

<!-- Reviewable:start -->
---
This change is [<img src="https://reviewable.io/review_button.svg" height="34" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/rust-url/453)
<!-- Reviewable:end -->
2 parents 04d5ec1 + 5dfd1ec commit 7c27f90

File tree

8 files changed

+10895
-7633
lines changed

8 files changed

+10895
-7633
lines changed

.travis.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ jobs:
88
- cargo update
99
# getopts is only used in tests. Its versions 0.2.16+ don’t build on 1.17.0
1010
- cargo update -p getopts --precise 0.2.15
11+
12+
- cargo update -p unicode-normalization --precise 0.1.5
13+
1114
# data-url uses pub(crate) which is unstable in 1.17
1215
script: cargo test --all-features -p url -p idna -p percent-encoding -p url_serde
1316

Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ rustc-test = "0.3"
3535
rustc-serialize = "0.3"
3636
serde_json = ">=0.6.1, <0.9"
3737

38+
bencher = "0.1"
39+
3840
[features]
3941
query_encoding = ["encoding"]
4042
heap_size = ["heapsize"]
@@ -48,5 +50,9 @@ percent-encoding = { version = "1.0.0", path = "./percent_encoding" }
4850
rustc-serialize = {version = "0.3", optional = true}
4951
serde = {version = ">=0.6.1, <0.9", optional = true}
5052

53+
[[bench]]
54+
name = "parse_url"
55+
harness = false
56+
5157
[package.metadata.docs.rs]
5258
features = ["query_encoding"]

benches/parse_url.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
4+
extern crate url;
5+
6+
use bencher::{black_box, Bencher};
7+
8+
use url::Url;
9+
10+
fn short(bench: &mut Bencher) {
11+
let url = "https://example.com/bench";
12+
13+
bench.bytes = url.len() as u64;
14+
bench.iter(|| black_box(url).parse::<Url>().unwrap());
15+
}
16+
17+
benchmark_group!(benches, short);
18+
benchmark_main!(benches);

idna/src/make_uts46_mapping_table.py

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# You can get the latest idna table from
1111
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
1212

13+
from __future__ import print_function
1314
import collections
1415
import itertools
1516

@@ -23,8 +24,6 @@
2324
// except according to those terms.
2425
2526
// Generated by make_idna_table.py
26-
27-
static TABLE: &'static [Range] = &[
2827
''')
2928

3029
txt = open("IdnaMappingTable.txt")
@@ -84,6 +83,7 @@ def rust_slice(s):
8483

8584
def mergeable_key(r):
8685
mapping = r[2]
86+
8787
# These types have associated data, so we should not merge them.
8888
if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
8989
return r
@@ -123,12 +123,65 @@ def mergeable_key(r):
123123
unicode_str = group[0][3]
124124
optimized_ranges.append((first, last, mapping, unicode_str))
125125

126-
for (first, last, mapping, unicode_str) in optimized_ranges:
127-
if unicode_str is not None:
128-
mapping += rust_slice(strtab_slice(unicode_str))
129-
print(" Range { from: '%s', to: '%s', mapping: %s }," % (escape_char(char(first)),
130-
escape_char(char(last)),
131-
mapping))
126+
def is_single_char_range(r):
    """Return True when the range tuple `r` covers exactly one code point.

    `r` is a 4-tuple whose first two items are the first and last code point
    of the range; the remaining two items are ignored here.
    """
    (first, last, _, _) = r
    return first == last


# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value.
def merge_single_char_ranges(ranges):
    """Group consecutive single-character ranges into blocks.

    Yields lists of range tuples, in input order. Each yielded list is either
    a maximal run of adjacent single-character ranges, or exactly one
    multi-character range. Nothing is yielded for empty input, so every
    yielded block is guaranteed to be non-empty.
    """
    current = []
    for r in ranges:
        # A block may only grow while both its last member and the incoming
        # range are single-character ranges; otherwise close it and start anew.
        extends_run = (current
                       and is_single_char_range(current[-1])
                       and is_single_char_range(r))
        if current and not extends_run:
            yield current
            current = []
        current.append(r)
    # Flush the trailing block, but never emit an empty one.
    if current:
        yield current
149+
150+
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))
151+
152+
153+
print("static TABLE: &'static [Range] = &[")
154+
155+
for ranges in optimized_ranges:
156+
first = ranges[0][0]
157+
last = ranges[-1][1]
158+
print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
159+
escape_char(char(last))))
160+
161+
print("];\n")
162+
163+
print("static INDEX_TABLE: &'static [u16] = &[")
164+
165+
SINGLE_MARKER = 1 << 15
166+
167+
offset = 0
168+
for ranges in optimized_ranges:
169+
assert offset < SINGLE_MARKER
170+
171+
block_len = len(ranges)
172+
single = SINGLE_MARKER if block_len == 1 else 0
173+
print(" %s," % (offset | single))
174+
offset += block_len
175+
176+
print("];\n")
177+
178+
print("static MAPPING_TABLE: &'static [Mapping] = &[")
179+
180+
for ranges in optimized_ranges:
181+
for (first, last, mapping, unicode_str) in ranges:
182+
if unicode_str is not None:
183+
mapping += rust_slice(strtab_slice(unicode_str))
184+
print(" %s," % mapping)
132185

133186
print("];\n")
134187

idna/src/punycode.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
1616
use std::u32;
1717
use std::char;
18+
#[allow(unused_imports, deprecated)]
1819
use std::ascii::AsciiExt;
1920

2021
// Bootstring parameters for Punycode

idna/src/uts46.rs

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
1212
use self::Mapping::*;
1313
use punycode;
14+
#[allow(unused_imports, deprecated)]
1415
use std::ascii::AsciiExt;
1516
use std::cmp::Ordering::{Equal, Less, Greater};
1617
use unicode_bidi::{BidiClass, bidi_class};
@@ -55,7 +56,6 @@ enum Mapping {
5556
struct Range {
5657
from: char,
5758
to: char,
58-
mapping: Mapping,
5959
}
6060

6161
fn find_char(codepoint: char) -> &'static Mapping {
@@ -68,7 +68,19 @@ fn find_char(codepoint: char) -> &'static Mapping {
6868
Equal
6969
}
7070
});
71-
r.ok().map(|i| &TABLE[i].mapping).unwrap()
71+
r.ok().map(|i| {
72+
const SINGLE_MARKER: u16 = 1 << 15;
73+
74+
let x = INDEX_TABLE[i];
75+
let single = (x & SINGLE_MARKER) != 0;
76+
let offset = !SINGLE_MARKER & x;
77+
78+
if single {
79+
&MAPPING_TABLE[offset as usize]
80+
} else {
81+
&MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
82+
}
83+
}).unwrap()
7284
}
7385

7486
fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
@@ -221,17 +233,21 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
221233
}
222234

223235
/// http://www.unicode.org/reports/tr46/#Validity_Criteria
236+
fn validate_full(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
237+
// V1: Must be in NFC form.
238+
if label.nfc().ne(label.chars()) {
239+
errors.push(Error::ValidityCriteria);
240+
} else {
241+
validate(label, is_bidi_domain, flags, errors);
242+
}
243+
}
244+
224245
fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
225246
let first_char = label.chars().next();
226247
if first_char == None {
227248
// Empty string, pass
228249
}
229250

230-
// V1: Must be in NFC form.
231-
else if label.nfc().ne(label.chars()) {
232-
errors.push(Error::ValidityCriteria);
233-
}
234-
235251
// V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
236252
//
237253
// NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
@@ -279,11 +295,12 @@ fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Er
279295

280296
/// http://www.unicode.org/reports/tr46/#Processing
281297
fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
282-
let mut mapped = String::new();
298+
let mut mapped = String::with_capacity(domain.len());
283299
for c in domain.chars() {
284300
map_char(c, flags, &mut mapped, errors)
285301
}
286-
let normalized: String = mapped.nfc().collect();
302+
let mut normalized = String::with_capacity(mapped.len());
303+
normalized.extend(mapped.nfc());
287304

288305
// Find out if it's a Bidi Domain Name
289306
//
@@ -322,12 +339,13 @@ fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
322339
match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
323340
Some(decoded_label) => {
324341
let flags = Flags { transitional_processing: false, ..flags };
325-
validate(&decoded_label, is_bidi_domain, flags, errors);
342+
validate_full(&decoded_label, is_bidi_domain, flags, errors);
326343
validated.push_str(&decoded_label)
327344
}
328345
None => errors.push(Error::PunycodeError)
329346
}
330347
} else {
348+
// `normalized` is already `NFC` so we can skip that check
331349
validate(label, is_bidi_domain, flags, errors);
332350
validated.push_str(label)
333351
}

0 commit comments

Comments
 (0)