Skip to content

Commit 1ecea60

Browse files
committed
address all review comments
1 parent 9e2f0e4 commit 1ecea60

File tree

4 files changed

+76
-77
lines changed

4 files changed

+76
-77
lines changed

.cargo/config.toml

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Consider adding "--codegen=link-args=-Wl,--compress-debug-sections=zlib"
2+
3+
[target.x86_64-unknown-linux-gnu]
4+
# SSE3 is requred by simd-varint.
5+
# POPCNT makes `count_ones` (which we use in geofilter and bitrank) more efficient.
6+
rustflags = ["-C", "target-feature=+ssse3,+avx2,+popcnt"]
7+
8+
[target.x86_64-apple-darwin]
9+
# SSE3 is requred by simd-varint.
10+
# POPCNT makes `count_ones` (which we use in geofilter and bitrank) more efficient.
11+
rustflags = ["-C", "target-feature=+ssse3,+avx2,+popcnt"]
12+
13+
[target.aarch64-apple-darwin]
14+
rustflags = ["-C", "target-feature=+neon"]

crates/string-offsets/src/bitrank.rs

+33-37
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ struct Block {
4040

4141
impl Block {
4242
/// Set a bit without updating `self.sub_blocks`.
43-
///
44-
/// This panics if the bit was already set, because that indicates that the original positions
45-
/// list is invalid/had duplicates.
4643
fn set(&mut self, index: usize) {
4744
debug_assert!(index < BITS_PER_BLOCK);
4845
let chunk_idx = index / BITS_PER_SUB_BLOCK;
@@ -52,11 +49,7 @@ impl Block {
5249
self.bits[chunk_idx] |= mask;
5350
}
5451

55-
/// The **total rank** of the block relative local index, and the index of the one
56-
/// bit that establishes that rank (aka "select") **if** it occurs within that same
57-
/// chunk, otherwise ['None']. The assumption is that if you would have to look back
58-
/// through previous chunks it would actually be cheaper to do a lookup in the original
59-
/// data structure that the bit vector was created from.
52+
/// The **total rank** of the block relative local index.
6053
fn rank(&self, local_idx: usize) -> usize {
6154
let mut rank = self.rank as usize;
6255
let sub_block = local_idx / BITS_PER_SUB_BLOCK;
@@ -65,11 +58,7 @@ impl Block {
6558
let remainder = local_idx % BITS_PER_SUB_BLOCK;
6659

6760
let last_chunk = local_idx / BITS_PER_SUB_BLOCK;
68-
let masked = if remainder == 0 {
69-
0
70-
} else {
71-
self.bits[last_chunk] << (BITS_PER_SUB_BLOCK - remainder)
72-
};
61+
let masked = self.bits[last_chunk] & !(SubblockBits::MAX << remainder);
7362
rank + masked.count_ones() as usize
7463
}
7564

@@ -176,42 +165,52 @@ mod tests {
176165

177166
/// Creates a `BitRank` containing the integers in `iter` (which should be strictly
178167
/// increasing).
179-
pub fn bitrank<I: IntoIterator<Item = usize>>(capacity: usize, iter: I) -> BitRank {
180-
let mut builder = BitRankBuilder::with_capacity(capacity);
181-
for position in iter {
182-
builder.push(position);
168+
pub fn bitrank<I>(iter: I) -> BitRank
169+
where
170+
I: IntoIterator<Item = usize>,
171+
I::IntoIter: DoubleEndedIterator,
172+
{
173+
let mut iter = iter.into_iter().rev();
174+
if let Some(last) = iter.next() {
175+
let mut builder = BitRankBuilder::with_capacity(last + 1);
176+
builder.push(last);
177+
for position in iter {
178+
builder.push(position);
179+
}
180+
builder.finish()
181+
} else {
182+
BitRank { blocks: vec![] }
183183
}
184-
builder.finish()
185184
}
186185

187186
#[test]
188187
fn test_rank_zero() {
189-
let br = bitrank(1, [0]);
188+
let br = bitrank([0]);
190189
assert_eq!(br.rank(0), 0);
191190
assert_eq!(br.rank(1), 1);
192191
}
193192

194193
#[test]
195194
fn test_empty() {
196-
let br = bitrank(0, []);
195+
let br = bitrank([]);
197196
assert!(br.blocks.is_empty());
198197
}
199198

200199
#[test]
201200
fn test_index_out_of_bounds() {
202-
let br = bitrank(BITS_PER_BLOCK, [BITS_PER_BLOCK - 1]);
201+
let br = bitrank([BITS_PER_BLOCK - 1]);
203202
assert_eq!(br.rank(BITS_PER_BLOCK), 1);
204203
}
205204

206205
#[test]
207206
#[should_panic]
208207
fn test_duplicate_position() {
209-
bitrank(91, [64, 66, 68, 68, 90]);
208+
bitrank([64, 66, 68, 68, 90]);
210209
}
211210

212211
#[test]
213212
fn test_rank_exclusive() {
214-
let br = bitrank(133, 0..132);
213+
let br = bitrank(0..132);
215214
assert_eq!(br.blocks.len(), 1);
216215
assert_eq!(br.rank(64), 64);
217216
assert_eq!(br.rank(132), 132);
@@ -221,37 +220,37 @@ mod tests {
221220
fn test_rank() {
222221
let mut positions: Vec<usize> = (0..132).collect();
223222
positions.append(&mut vec![138usize, 140, 146]);
224-
let br = bitrank(146, positions);
223+
let br = bitrank(positions);
225224
assert_eq!(br.rank(135), 132);
226225

227-
let br2 = bitrank(BITS_PER_BLOCK, 0..BITS_PER_BLOCK - 5);
226+
let br2 = bitrank(0..BITS_PER_BLOCK - 5);
228227
assert_eq!(br2.rank(169), 169);
229228

230-
let br3 = bitrank(BITS_PER_BLOCK + 6, 0..BITS_PER_BLOCK + 5);
229+
let br3 = bitrank(0..BITS_PER_BLOCK + 5);
231230
assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK);
232231
}
233232

234233
#[test]
235234
fn test_rank_idx() {
236235
let mut positions: Vec<usize> = (0..132).collect();
237236
positions.append(&mut vec![138usize, 140, 146]);
238-
let br = bitrank(147, positions);
237+
let br = bitrank(positions);
239238
assert_eq!(br.rank(135), 132);
240239

241240
let bits2: Vec<usize> = (0..BITS_PER_BLOCK - 5).collect();
242-
let br2 = bitrank(BITS_PER_BLOCK, bits2);
241+
let br2 = bitrank(bits2);
243242
assert_eq!(br2.rank(169), 169);
244243

245244
let bits3: Vec<usize> = (0..BITS_PER_BLOCK + 5).collect();
246-
let br3 = bitrank(BITS_PER_BLOCK + 6, bits3);
245+
let br3 = bitrank(bits3);
247246
assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK);
248247

249248
let bits4: Vec<usize> = vec![1, 1000, 7777, BITS_PER_BLOCK + 1];
250-
let br4 = bitrank(BITS_PER_BLOCK + 1, bits4);
249+
let br4 = bitrank(bits4);
251250
assert_eq!(br4.rank(8000), 3);
252251

253252
let bits5: Vec<usize> = vec![1, 1000, 7777, BITS_PER_BLOCK + 1];
254-
let br5 = bitrank(BITS_PER_BLOCK + 1, bits5);
253+
let br5 = bitrank(bits5);
255254
assert_eq!(br5.rank(BITS_PER_BLOCK), 3);
256255
}
257256

@@ -267,7 +266,7 @@ mod tests {
267266
// This isn't strictly necessary, given that the bit would just be toggled again, but it
268267
// ensures that we are meeting the contract.
269268
random_bits.dedup();
270-
let br = bitrank(1_000_000, random_bits.iter().copied());
269+
let br = bitrank(random_bits.iter().copied());
271270
let mut rank = 0;
272271
for i in 0..random_bits.capacity() {
273272
assert_eq!(br.rank(i), rank);
@@ -282,7 +281,7 @@ mod tests {
282281
#[test]
283282
fn test_rank_out_of_bounds() {
284283
for i in 1..30 {
285-
let br = bitrank(BITS_PER_BLOCK * i, [BITS_PER_BLOCK * i - 1]);
284+
let br = bitrank([BITS_PER_BLOCK * i - 1]);
286285
assert_eq!(br.max_rank(), 1);
287286
assert_eq!(br.rank(BITS_PER_BLOCK * i - 1), 0);
288287
for j in 0..10 {
@@ -293,10 +292,7 @@ mod tests {
293292

294293
#[test]
295294
fn test_large_gap() {
296-
let br = bitrank(
297-
BITS_PER_BLOCK * 16,
298-
(3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17),
299-
);
295+
let br = bitrank((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17));
300296
for i in 1..15 {
301297
assert_eq!(br.rank(BITS_PER_BLOCK * i), 1);
302298
}

crates/string-offsets/src/lib.rs

+11-9
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
//! # Example
44
//!
55
//! ```
6-
//! use string_offsets::{StringOffsets, AllConfig};
6+
//! use string_offsets::StringOffsets;
77
//!
88
//! let s = "☀️hello\n🗺️world\n";
9-
//! let offsets = StringOffsets::<AllConfig>::new(s);
9+
//! let offsets: StringOffsets = StringOffsets::new(s);
1010
//!
1111
//! // Find offsets where lines begin and end.
1212
//! assert_eq!(offsets.line_to_utf8s(0), 0..12); // note: 0-based line numbers
@@ -93,7 +93,9 @@ pub use config::{AllConfig, OnlyLines};
9393
/// Most operations run in O(1) time. A few require O(log n) time. The memory consumed by this
9494
/// data structure is typically less than the memory occupied by the actual content. In the best
9595
/// case, it requires ~45% of the content space.
96-
pub struct StringOffsets<C: ConfigType> {
96+
/// One can reduce memory requirements further by only requesting the necessary features via the
97+
/// configuration type.
98+
pub struct StringOffsets<C: ConfigType = AllConfig> {
9799
/// Vector storing, for every line, the byte position at which the line starts.
98100
line_begins: Vec<u32>,
99101

@@ -488,7 +490,7 @@ mod tests {
488490
let content = r#"a short line.
489491
followed by another one.
490492
no terminating newline!"#;
491-
let lines = StringOffsets::<AllConfig>::new(content);
493+
let lines: StringOffsets = StringOffsets::new(content);
492494
assert_eq!(lines.line_to_utf8s(0), 0..14);
493495
assert_eq!(&content[0..14], "a short line.\n");
494496
assert_eq!(lines.line_to_utf8s(1), 14..39);
@@ -534,7 +536,7 @@ no terminating newline!"#;
534536
fn test_convert_ascii() {
535537
let content = r#"line0
536538
line1"#;
537-
let lines = StringOffsets::<AllConfig>::new(content);
539+
let lines: StringOffsets = StringOffsets::new(content);
538540
assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0));
539541
assert_eq!(lines.utf8_to_char_pos(1), pos(0, 1));
540542
assert_eq!(lines.utf8_to_char_pos(6), pos(1, 0));
@@ -547,7 +549,7 @@ line1"#;
547549
let content = r#"❤️ line0
548550
line1
549551
✅ line2"#;
550-
let lines = StringOffsets::<AllConfig>::new(content);
552+
let lines: StringOffsets = StringOffsets::new(content);
551553
assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ takes 6 bytes to represent in utf8 (2 code points)
552554
assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0));
553555
assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0));
@@ -578,7 +580,7 @@ line1
578580
fn test_small() {
579581
// Á - 2 bytes utf8
580582
let content = r#"❤️ line0 ❤️Á 👋"#;
581-
let lines = StringOffsets::<AllConfig>::new(content);
583+
let lines: StringOffsets = StringOffsets::new(content);
582584
let mut utf16_index = 0;
583585
let mut char_index = 0;
584586
for (byte_index, char) in content.char_indices() {
@@ -598,7 +600,7 @@ line1
598600
// ^~~~ utf8: 1 char, 1 byte, utf16: 1 code unit
599601
// ^~~~~ utf8: 1 char, 2 bytes, utf16: 1 code unit
600602
// ^~~~~~ utf8: 2 chars, 3 byte ea., utf16: 2 code units
601-
let lines = StringOffsets::<AllConfig>::new(content);
603+
let lines: StringOffsets = StringOffsets::new(content);
602604

603605
// UTF-16 positions
604606
assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️
@@ -644,7 +646,7 @@ line1
644646
#[test]
645647
fn test_critical_input_len() {
646648
let content = [b'a'; 16384];
647-
let lines = StringOffsets::<AllConfig>::from_bytes(&content);
649+
let lines: StringOffsets = StringOffsets::from_bytes(&content);
648650
assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0));
649651
}
650652
}

crates/string-offsets/src/wasm.rs

+18-31
Original file line numberDiff line numberDiff line change
@@ -2,89 +2,76 @@ use wasm_bindgen::prelude::*;
22

33
use crate::{AllConfig, Pos, StringOffsets as StringOffsetsImpl};
44

5-
#[cfg_attr(feature = "wasm", wasm_bindgen)]
5+
#[wasm_bindgen]
66
pub struct StringOffsets(StringOffsetsImpl<AllConfig>);
77

8-
#[cfg_attr(feature = "wasm", wasm_bindgen)]
8+
#[wasm_bindgen]
9+
#[allow(non_snake_case)]
910
impl StringOffsets {
10-
#[cfg_attr(feature = "wasm", wasm_bindgen(constructor))]
11+
#[wasm_bindgen(constructor)]
1112
pub fn new(content: &str) -> Self {
1213
Self(StringOffsetsImpl::new(content))
1314
}
1415

1516
#[allow(unused_variables)]
16-
#[cfg_attr(feature = "wasm", wasm_bindgen(static_method_of = StringOffsets))]
17+
#[wasm_bindgen(static_method_of = StringOffsets)]
1718
pub fn from_bytes(content: &[u8]) -> Self {
1819
Self(StringOffsetsImpl::from_bytes(content))
1920
}
2021

21-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lines))]
2222
pub fn lines(&self) -> usize {
2323
self.0.lines()
2424
}
2525

26-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lineToUtf8Begin))]
27-
pub fn line_to_utf8_begin(&self, line_number: usize) -> usize {
26+
pub fn lineToUtf8Begin(&self, line_number: usize) -> usize {
2827
self.0.line_to_utf8_begin(line_number)
2928
}
3029

31-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lineToUtf8End))]
32-
pub fn line_to_utf8_end(&self, line_number: usize) -> usize {
30+
pub fn lineToUtf8End(&self, line_number: usize) -> usize {
3331
self.0.line_to_utf8_end(line_number)
3432
}
3533

36-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = utf8ToLine))]
37-
pub fn utf8_to_line(&self, byte_number: usize) -> usize {
34+
pub fn utf8ToLine(&self, byte_number: usize) -> usize {
3835
self.0.utf8_to_line(byte_number)
3936
}
4037

41-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lineChars))]
42-
pub fn line_chars(&self, line_number: usize) -> usize {
38+
pub fn lineChars(&self, line_number: usize) -> usize {
4339
self.0.line_chars(line_number)
4440
}
4541

46-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lineToCharBegin))]
47-
pub fn line_to_char_begin(&self, line_number: usize) -> usize {
42+
pub fn lineToCharBegin(&self, line_number: usize) -> usize {
4843
self.0.line_to_char_begin(line_number)
4944
}
5045

51-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lineToCharEnd))]
52-
pub fn line_to_char_end(&self, line_number: usize) -> usize {
46+
pub fn lineToCharEnd(&self, line_number: usize) -> usize {
5347
self.0.line_to_char_end(line_number)
5448
}
5549

56-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = utf8ToCharPos))]
57-
pub fn utf8_to_char_pos(&self, byte_number: usize) -> Pos {
50+
pub fn utf8ToCharPos(&self, byte_number: usize) -> Pos {
5851
self.0.utf8_to_char_pos(byte_number)
5952
}
6053

61-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = utf8ToChar))]
62-
pub fn utf8_to_char(&self, byte_number: usize) -> usize {
54+
pub fn utf8ToChar(&self, byte_number: usize) -> usize {
6355
self.0.utf8_to_char(byte_number)
6456
}
6557

66-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = charToUtf8))]
67-
pub fn char_to_utf8(&self, char_number: usize) -> usize {
58+
pub fn charToUtf8(&self, char_number: usize) -> usize {
6859
self.0.char_to_utf8(char_number)
6960
}
7061

71-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = utf8ToUtf16))]
72-
pub fn utf8_to_utf16(&self, byte_number: usize) -> usize {
62+
pub fn utf8ToUtf16(&self, byte_number: usize) -> usize {
7363
self.0.utf8_to_utf16(byte_number)
7464
}
7565

76-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lineToUtf16Begin))]
77-
pub fn line_to_utf16_begin(&self, line_number: usize) -> usize {
66+
pub fn lineToUtf16Begin(&self, line_number: usize) -> usize {
7867
self.0.line_to_utf16_begin(line_number)
7968
}
8069

81-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = lineToUtf16End))]
82-
pub fn line_to_utf16_end(&self, line_number: usize) -> usize {
70+
pub fn lineToUtf16End(&self, line_number: usize) -> usize {
8371
self.0.line_to_utf16_end(line_number)
8472
}
8573

86-
#[cfg_attr(feature = "wasm", wasm_bindgen(js_name = utf8ToUtf16Pos))]
87-
pub fn utf8_to_utf16_pos(&self, byte_number: usize) -> Pos {
74+
pub fn utf8ToUtf16Pos(&self, byte_number: usize) -> Pos {
8875
self.0.utf8_to_utf16_pos(byte_number)
8976
}
9077
}

0 commit comments

Comments
 (0)