diff --git a/build.zig b/build.zig index 4161a6396e68..3a7468243f49 100644 --- a/build.zig +++ b/build.zig @@ -122,6 +122,8 @@ pub fn build(b: *Builder) !void { "compress-gettysburg.txt", "compress-pi.txt", "rfc1951.txt", + // exclude files from lib/std/compress/xz/testdata + ".xz", // exclude files from lib/std/tz/ ".tzif", // others diff --git a/lib/std/compress.zig b/lib/std/compress.zig index 3c52002cfc86..334d7bfcb8f9 100644 --- a/lib/std/compress.zig +++ b/lib/std/compress.zig @@ -3,6 +3,7 @@ const std = @import("std.zig"); pub const deflate = @import("compress/deflate.zig"); pub const gzip = @import("compress/gzip.zig"); pub const zlib = @import("compress/zlib.zig"); +pub const xz = @import("compress/xz.zig"); pub fn HashedReader( comptime ReaderType: anytype, @@ -38,4 +39,5 @@ test { _ = deflate; _ = gzip; _ = zlib; + _ = xz; } diff --git a/lib/std/compress/gzip.zig b/lib/std/compress/gzip.zig index 4d1f8e28f498..7e9fea6814fa 100644 --- a/lib/std/compress/gzip.zig +++ b/lib/std/compress/gzip.zig @@ -1,7 +1,7 @@ // // Decompressor for GZIP data streams (RFC1952) -const std = @import("std"); +const std = @import("../std.zig"); const io = std.io; const fs = std.fs; const testing = std.testing; @@ -17,10 +17,7 @@ const FCOMMENT = 1 << 4; const max_string_len = 1024; -/// TODO: the fully qualified namespace to this declaration is -/// std.compress.gzip.GzipStream which has a redundant "gzip" in the name. -/// Instead, it should be `std.compress.gzip.Stream`. 
-pub fn GzipStream(comptime ReaderType: type) type {
+pub fn Decompress(comptime ReaderType: type) type {
     return struct {
         const Self = @This();
 
@@ -154,14 +151,14 @@ pub fn GzipStream(comptime ReaderType: type) type {
     };
 }
 
-pub fn gzipStream(allocator: mem.Allocator, reader: anytype) !GzipStream(@TypeOf(reader)) {
-    return GzipStream(@TypeOf(reader)).init(allocator, reader);
+pub fn decompress(allocator: mem.Allocator, reader: anytype) !Decompress(@TypeOf(reader)) {
+    return Decompress(@TypeOf(reader)).init(allocator, reader);
 }
 
 fn testReader(data: []const u8, comptime expected: []const u8) !void {
     var in_stream = io.fixedBufferStream(data);
 
-    var gzip_stream = try gzipStream(testing.allocator, in_stream.reader());
+    var gzip_stream = try decompress(testing.allocator, in_stream.reader());
     defer gzip_stream.deinit();
 
     // Read and decompress the whole file
diff --git a/lib/std/compress/xz.zig b/lib/std/compress/xz.zig
new file mode 100644
index 000000000000..1b2a543ad1b8
--- /dev/null
+++ b/lib/std/compress/xz.zig
@@ -0,0 +1,145 @@
+// Import std by relative path, as gzip.zig and the files under xz/ do.
+const std = @import("../std.zig");
+const block = @import("xz/block.zig");
+const Allocator = std.mem.Allocator;
+const Crc32 = std.hash.Crc32;
+
+/// Integrity check id announced in the stream flags. The block decoder
+/// handles `none`, `crc32`, `crc64` and `sha256`; the enum is non-exhaustive
+/// so that any other 4-bit id can still be represented.
+pub const Check = enum(u4) {
+    none = 0x00,
+    crc32 = 0x01,
+    crc64 = 0x04,
+    sha256 = 0x0A,
+    _,
+};
+
+/// Reads the two stream-flag bytes from `reader`: an 8-bit reserved field,
+/// the 4-bit check id (stored into `check`) and a 4-bit reserved field.
+/// Returns `error.CorruptInput` when either reserved field is nonzero.
+fn readStreamFlags(reader: anytype, check: *Check) !void {
+    var bit_reader = std.io.bitReader(.Little, reader);
+
+    const reserved1 = try bit_reader.readBitsNoEof(u8, 8);
+    if (reserved1 != 0)
+        return error.CorruptInput;
+
+    check.* = @intToEnum(Check, try bit_reader.readBitsNoEof(u4, 4));
+
+    const reserved2 = try bit_reader.readBitsNoEof(u4, 4);
+    if (reserved2 != 0)
+        return error.CorruptInput;
+}
+
+/// Creates an xz decompressor over `reader`. The stream header is parsed
+/// and verified immediately; call `deinit` on the result when done.
+pub fn decompress(allocator: Allocator, reader: anytype) !Decompress(@TypeOf(reader)) {
+    return Decompress(@TypeOf(reader)).init(allocator, reader);
+}
+
+pub fn Decompress(comptime ReaderType: type) type {
+    return struct {
+        const Self = @This();
+
+        pub const Error = ReaderType.Error ||
block.Decoder(ReaderType).Error; + pub const Reader = std.io.Reader(*Self, Error, read); + + allocator: Allocator, + block_decoder: block.Decoder(ReaderType), + in_reader: ReaderType, + + fn init(allocator: Allocator, source: ReaderType) !Self { + const magic = try source.readBytesNoEof(6); + if (!std.mem.eql(u8, &magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 })) + return error.BadHeader; + + var check: Check = undefined; + const hash_a = blk: { + var hasher = std.compress.hashedReader(source, Crc32.init()); + try readStreamFlags(hasher.reader(), &check); + break :blk hasher.hasher.final(); + }; + + const hash_b = try source.readIntLittle(u32); + if (hash_a != hash_b) + return error.WrongChecksum; + + return Self{ + .allocator = allocator, + .block_decoder = try block.decoder(allocator, source, check), + .in_reader = source, + }; + } + + pub fn deinit(self: *Self) void { + self.block_decoder.deinit(); + } + + pub fn reader(self: *Self) Reader { + return .{ .context = self }; + } + + pub fn read(self: *Self, buffer: []u8) Error!usize { + if (buffer.len == 0) + return 0; + + const r = try self.block_decoder.read(buffer); + if (r != 0) + return r; + + const index_size = blk: { + var hasher = std.compress.hashedReader(self.in_reader, Crc32.init()); + hasher.hasher.update(&[1]u8{0x00}); + + var counter = std.io.countingReader(hasher.reader()); + counter.bytes_read += 1; + + const counting_reader = counter.reader(); + + const record_count = try std.leb.readULEB128(u64, counting_reader); + if (record_count != self.block_decoder.block_count) + return error.CorruptInput; + + var i: usize = 0; + while (i < record_count) : (i += 1) { + // TODO: validate records + _ = try std.leb.readULEB128(u64, counting_reader); + _ = try std.leb.readULEB128(u64, counting_reader); + } + + while (counter.bytes_read % 4 != 0) { + if (try counting_reader.readByte() != 0) + return error.CorruptInput; + } + + const hash_a = hasher.hasher.final(); + const hash_b = try 
counting_reader.readIntLittle(u32);
+                if (hash_a != hash_b)
+                    return error.WrongChecksum;
+
+                break :blk counter.bytes_read;
+            };
+
+            // Stream Footer: stored CRC32 first, then the fields it covers
+            // (backward size and stream flags), then the "YZ" magic.
+            const hash_a = try self.in_reader.readIntLittle(u32);
+
+            const hash_b = blk: {
+                var hasher = std.compress.hashedReader(self.in_reader, Crc32.init());
+                const hashed_reader = hasher.reader();
+
+                // Widen before the arithmetic: done in u32, `+ 1` and `* 4`
+                // overflow on large stored values instead of reporting
+                // corrupt input. `index_size` is a u64 byte count.
+                const backward_size = (@as(u64, try hashed_reader.readIntLittle(u32)) + 1) * 4;
+                if (backward_size != index_size)
+                    return error.CorruptInput;
+
+                // NOTE(review): the footer's check id is parsed (and hashed)
+                // but never compared with the one from the stream header.
+                var check: Check = undefined;
+                try readStreamFlags(hashed_reader, &check);
+
+                break :blk hasher.hasher.final();
+            };
+
+            if (hash_a != hash_b)
+                return error.WrongChecksum;
+
+            const magic = try self.in_reader.readBytesNoEof(2);
+            if (!std.mem.eql(u8, &magic, &.{ 'Y', 'Z' }))
+                return error.CorruptInput;
+
+            return 0;
+        }
+    };
+}
+
+test {
+    _ = @import("xz/test.zig");
+}
diff --git a/lib/std/compress/xz/block.zig b/lib/std/compress/xz/block.zig
new file mode 100644
index 000000000000..1b909beaf4d7
--- /dev/null
+++ b/lib/std/compress/xz/block.zig
@@ -0,0 +1,317 @@
+const std = @import("../../std.zig");
+const lzma = @import("lzma.zig");
+const Allocator = std.mem.Allocator;
+const Crc32 = std.hash.Crc32;
+const Crc64 = std.hash.crc.Crc64Xz;
+const Sha256 = std.crypto.hash.sha2.Sha256;
+const xz = std.compress.xz;
+
+// Errors produced while decoding blocks. `EndOfStreamWithNoError` is an
+// internal marker: `Decoder.read` turns it into a 0-byte read.
+const DecodeError = error{
+    CorruptInput,
+    EndOfStream,
+    EndOfStreamWithNoError,
+    WrongChecksum,
+    Unsupported,
+    Overflow,
+};
+
+/// Creates a block decoder reading from `reader`; `check` selects the
+/// integrity check verified after each block.
+pub fn decoder(allocator: Allocator, reader: anytype, check: xz.Check) !Decoder(@TypeOf(reader)) {
+    return Decoder(@TypeOf(reader)).init(allocator, reader, check);
+}
+
+pub fn Decoder(comptime ReaderType: type) type {
+    return struct {
+        const Self = @This();
+        pub const Error =
+            ReaderType.Error ||
+            DecodeError ||
+            Allocator.Error;
+        pub const Reader = std.io.Reader(*Self, Error, read);
+
+        allocator: Allocator,
+        inner_reader: ReaderType,
+        check: xz.Check,
+        err: ?Error,
+        accum: lzma.LzAccumBuffer,
+        lzma_state: lzma.DecoderState,
+        block_count:
usize, + + fn init(allocator: Allocator, in_reader: ReaderType, check: xz.Check) !Self { + return Self{ + .allocator = allocator, + .inner_reader = in_reader, + .check = check, + .err = null, + .accum = .{}, + .lzma_state = try lzma.DecoderState.init(allocator), + .block_count = 0, + }; + } + + pub fn deinit(self: *Self) void { + self.accum.deinit(self.allocator); + self.lzma_state.deinit(self.allocator); + } + + pub fn reader(self: *Self) Reader { + return .{ .context = self }; + } + + pub fn read(self: *Self, output: []u8) Error!usize { + while (true) { + if (self.accum.to_read.items.len > 0) { + const n = self.accum.read(output); + if (self.accum.to_read.items.len == 0 and self.err != null) { + if (self.err.? == DecodeError.EndOfStreamWithNoError) { + return n; + } + return self.err.?; + } + return n; + } + if (self.err != null) { + if (self.err.? == DecodeError.EndOfStreamWithNoError) { + return 0; + } + return self.err.?; + } + self.readBlock() catch |e| { + self.err = e; + if (self.accum.to_read.items.len == 0) { + try self.accum.reset(self.allocator); + } + }; + } + } + + fn readBlock(self: *Self) Error!void { + const unpacked_pos = self.accum.to_read.items.len; + + var block_counter = std.io.countingReader(self.inner_reader); + const block_reader = block_counter.reader(); + + var packed_size: ?u64 = null; + var unpacked_size: ?u64 = null; + + // Block Header + { + var header_hasher = std.compress.hashedReader(block_reader, Crc32.init()); + const header_reader = header_hasher.reader(); + + const header_size = try header_reader.readByte() * 4; + if (header_size == 0) + return error.EndOfStreamWithNoError; + + const Flags = packed struct(u8) { + last_filter_index: u2, + reserved: u4, + has_packed_size: bool, + has_unpacked_size: bool, + }; + + const flags = @bitCast(Flags, try header_reader.readByte()); + const filter_count = @as(u3, flags.last_filter_index) + 1; + if (filter_count > 1) + return error.Unsupported; + + if (flags.has_packed_size) + packed_size = 
try std.leb.readULEB128(u64, header_reader); + + if (flags.has_unpacked_size) + unpacked_size = try std.leb.readULEB128(u64, header_reader); + + const FilterId = enum(u64) { + lzma2 = 0x21, + _, + }; + + const filter_id = @intToEnum( + FilterId, + try std.leb.readULEB128(u64, header_reader), + ); + + if (@enumToInt(filter_id) >= 0x4000_0000_0000_0000) + return error.CorruptInput; + + if (filter_id != .lzma2) + return error.Unsupported; + + const properties_size = try std.leb.readULEB128(u64, header_reader); + if (properties_size != 1) + return error.CorruptInput; + + // TODO: use filter properties + _ = try header_reader.readByte(); + + while (block_counter.bytes_read != header_size) { + if (try header_reader.readByte() != 0) + return error.CorruptInput; + } + + const hash_a = header_hasher.hasher.final(); + const hash_b = try header_reader.readIntLittle(u32); + if (hash_a != hash_b) + return error.WrongChecksum; + } + + // Compressed Data + var packed_counter = std.io.countingReader(block_reader); + const packed_reader = packed_counter.reader(); + while (try self.readLzma2Chunk(packed_reader)) {} + + if (packed_size) |s| { + if (s != packed_counter.bytes_read) + return error.CorruptInput; + } + + const unpacked_bytes = self.accum.to_read.items[unpacked_pos..]; + if (unpacked_size) |s| { + if (s != unpacked_bytes.len) + return error.CorruptInput; + } + + // Block Padding + while (block_counter.bytes_read % 4 != 0) { + if (try block_reader.readByte() != 0) + return error.CorruptInput; + } + + switch (self.check) { + .none => {}, + .crc32 => { + const hash_a = Crc32.hash(unpacked_bytes); + const hash_b = try self.inner_reader.readIntLittle(u32); + if (hash_a != hash_b) + return error.WrongChecksum; + }, + .crc64 => { + const hash_a = Crc64.hash(unpacked_bytes); + const hash_b = try self.inner_reader.readIntLittle(u64); + if (hash_a != hash_b) + return error.WrongChecksum; + }, + .sha256 => { + var hash_a: [Sha256.digest_length]u8 = undefined; + 
Sha256.hash(unpacked_bytes, &hash_a, .{});
+
+                    var hash_b: [Sha256.digest_length]u8 = undefined;
+                    try self.inner_reader.readNoEof(&hash_b);
+
+                    if (!std.mem.eql(u8, &hash_a, &hash_b))
+                        return error.WrongChecksum;
+                },
+                else => return error.Unsupported,
+            }
+
+            self.block_count += 1;
+        }
+
+        /// Reads one LZMA2 chunk from `packed_reader` and appends its
+        /// decoded bytes to `accum`. Returns `false` after the end marker
+        /// (control byte 0) and `true` after any data chunk. Control bytes
+        /// 3..0x7F are rejected with `error.CorruptInput`.
+        fn readLzma2Chunk(self: *Self, packed_reader: anytype) Error!bool {
+            const status = try packed_reader.readByte();
+            switch (status) {
+                0 => {
+                    try self.accum.reset(self.allocator);
+                    return false;
+                },
+                // Uncompressed chunk; control byte 1 additionally flushes
+                // the accumulated output first.
+                1, 2 => {
+                    if (status == 1)
+                        try self.accum.reset(self.allocator);
+
+                    // Stored size is "size - 1". Widen to u17 before adding
+                    // one so a stored 0xFFFF does not overflow u16.
+                    const size = @as(u17, try packed_reader.readIntBig(u16)) + 1;
+                    try self.accum.ensureUnusedCapacity(self.allocator, size);
+
+                    var i: usize = 0;
+                    while (i < size) : (i += 1)
+                        self.accum.appendAssumeCapacity(try packed_reader.readByte());
+
+                    return true;
+                },
+                else => {
+                    if (status & 0x80 == 0)
+                        return error.CorruptInput;
+
+                    const Reset = struct {
+                        dict: bool,
+                        state: bool,
+                        props: bool,
+                    };
+
+                    // Bits 5-6 of the control byte select what is reset
+                    // before this chunk is decoded.
+                    const reset = switch ((status >> 5) & 0x3) {
+                        0 => Reset{
+                            .dict = false,
+                            .state = false,
+                            .props = false,
+                        },
+                        1 => Reset{
+                            .dict = false,
+                            .state = true,
+                            .props = false,
+                        },
+                        2 => Reset{
+                            .dict = false,
+                            .state = true,
+                            .props = true,
+                        },
+                        3 => Reset{
+                            .dict = true,
+                            .state = true,
+                            .props = true,
+                        },
+                        else => unreachable,
+                    };
+
+                    // Low 5 control bits are the high bits of the stored
+                    // "unpacked size - 1"; the next two bytes are the rest.
+                    const unpacked_size = blk: {
+                        var tmp: u64 = status & 0x1F;
+                        tmp <<= 16;
+                        tmp |= try packed_reader.readIntBig(u16);
+                        break :blk tmp + 1;
+                    };
+
+                    const packed_size = blk: {
+                        const tmp: u17 = try packed_reader.readIntBig(u16);
+                        break :blk tmp + 1;
+                    };
+
+                    if (reset.dict)
+                        try self.accum.reset(self.allocator);
+
+                    if (reset.state) {
+                        var new_props = self.lzma_state.lzma_props;
+
+                        if (reset.props) {
+                            // Properties byte packs (pb * 5 + lp) * 9 + lc.
+                            var props = try packed_reader.readByte();
+                            if (props >= 225)
+                                return error.CorruptInput;
+
+                            const lc = @intCast(u4, props % 9);
+                            props /= 9;
+                            const lp = @intCast(u3, props % 5);
+                            props /= 5;
+                            const pb = @intCast(u3, props);
+
+                            if (lc + lp > 4) +
return error.CorruptInput; + + new_props = .{ .lc = lc, .lp = lp, .pb = pb }; + } + + try self.lzma_state.reset_state(self.allocator, new_props); + } + + self.lzma_state.unpacked_size = unpacked_size + self.accum.len(); + + const buffer = try self.allocator.alloc(u8, packed_size); + defer self.allocator.free(buffer); + + for (buffer) |*b| + b.* = try packed_reader.readByte(); + + var rangecoder = try lzma.RangeDecoder.init(buffer); + try self.lzma_state.process(self.allocator, &self.accum, &rangecoder); + + return true; + }, + } + } + }; +} diff --git a/lib/std/compress/xz/lzma.zig b/lib/std/compress/xz/lzma.zig new file mode 100644 index 000000000000..9fe941e2b1e8 --- /dev/null +++ b/lib/std/compress/xz/lzma.zig @@ -0,0 +1,658 @@ +// Ported from https://github.com/gendx/lzma-rs + +const std = @import("../../std.zig"); +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; +const ArrayListUnmanaged = std.ArrayListUnmanaged; + +const LzmaProperties = struct { + lc: u4, + lp: u3, + pb: u3, + + fn validate(self: LzmaProperties) void { + assert(self.lc <= 8); + assert(self.lp <= 4); + assert(self.pb <= 4); + } +}; + +pub const DecoderState = struct { + lzma_props: LzmaProperties, + unpacked_size: ?u64, + literal_probs: Vec2D(u16), + pos_slot_decoder: [4]BitTree, + align_decoder: BitTree, + pos_decoders: [115]u16, + is_match: [192]u16, + is_rep: [12]u16, + is_rep_g0: [12]u16, + is_rep_g1: [12]u16, + is_rep_g2: [12]u16, + is_rep_0long: [192]u16, + state: usize, + rep: [4]usize, + len_decoder: LenDecoder, + rep_len_decoder: LenDecoder, + + pub fn init(allocator: Allocator) !DecoderState { + return .{ + .lzma_props = LzmaProperties{ .lc = 0, .lp = 0, .pb = 0 }, + .unpacked_size = null, + .literal_probs = try Vec2D(u16).init(allocator, 0x400, 1, 0x300), + .pos_slot_decoder = .{ + try BitTree.init(allocator, 6), + try BitTree.init(allocator, 6), + try BitTree.init(allocator, 6), + try BitTree.init(allocator, 6), + }, + .align_decoder = try 
BitTree.init(allocator, 4),
+            .pos_decoders = .{0x400} ** 115,
+            .is_match = .{0x400} ** 192,
+            .is_rep = .{0x400} ** 12,
+            .is_rep_g0 = .{0x400} ** 12,
+            .is_rep_g1 = .{0x400} ** 12,
+            .is_rep_g2 = .{0x400} ** 12,
+            .is_rep_0long = .{0x400} ** 192,
+            .state = 0,
+            .rep = .{0} ** 4,
+            .len_decoder = try LenDecoder.init(allocator),
+            .rep_len_decoder = try LenDecoder.init(allocator),
+        };
+    }
+
+    /// Frees every heap-backed probability table owned by the state.
+    pub fn deinit(self: *DecoderState, allocator: Allocator) void {
+        self.literal_probs.deinit(allocator);
+        for (self.pos_slot_decoder) |*t| t.deinit(allocator);
+        self.align_decoder.deinit(allocator);
+        self.len_decoder.deinit(allocator);
+        self.rep_len_decoder.deinit(allocator);
+    }
+
+    /// Resets all probability models, the state machine and the rep
+    /// distances, and installs `new_props`. The literal table is only
+    /// reallocated when `lc + lp` changes. `new_props` is checked with
+    /// asserts, so untrusted values must be validated by the caller first.
+    /// On allocation failure the state is left unchanged and still valid.
+    pub fn reset_state(self: *DecoderState, allocator: Allocator, new_props: LzmaProperties) !void {
+        new_props.validate();
+        if (self.lzma_props.lc + self.lzma_props.lp == new_props.lc + new_props.lp) {
+            self.literal_probs.fill(0x400);
+        } else {
+            // Allocate the replacement before releasing the old table: if
+            // the allocation fails, `literal_probs` must still point at live
+            // memory, otherwise a later `deinit` would free it twice.
+            const new_probs = try Vec2D(u16).init(allocator, 0x400, @as(usize, 1) << (new_props.lc + new_props.lp), 0x300);
+            self.literal_probs.deinit(allocator);
+            self.literal_probs = new_probs;
+        }
+
+        self.lzma_props = new_props;
+        for (self.pos_slot_decoder) |*t| t.reset();
+        self.align_decoder.reset();
+        self.pos_decoders = .{0x400} ** 115;
+        self.is_match = .{0x400} ** 192;
+        self.is_rep = .{0x400} ** 12;
+        self.is_rep_g0 = .{0x400} ** 12;
+        self.is_rep_g1 = .{0x400} ** 12;
+        self.is_rep_g2 = .{0x400} ** 12;
+        self.is_rep_0long = .{0x400} ** 192;
+        self.state = 0;
+        self.rep = .{0} ** 4;
+        self.len_decoder.reset();
+        self.rep_len_decoder.reset();
+    }
+
+    /// Decodes the next literal or match from `rangecoder`. Probability
+    /// models, decoder state and `output` are only modified when `update`
+    /// is true.
+    fn processNextInner(
+        self: *DecoderState,
+        allocator: Allocator,
+        output: *LzAccumBuffer,
+        rangecoder: *RangeDecoder,
+        update: bool,
+    ) !ProcessingStatus {
+        // Low `pb` bits of the current output length.
+        const pos_state = output.len() & ((@as(usize, 1) << self.lzma_props.pb) - 1);
+
+        if (!try rangecoder.decodeBit(
+            &self.is_match[(self.state << 4) + pos_state],
+            update,
+        )) {
+            const byte: u8 = try self.decodeLiteral(output, rangecoder, update);
+
+            if (update) { +
try output.appendLiteral(allocator, byte); + + self.state = if (self.state < 4) + 0 + else if (self.state < 10) + self.state - 3 + else + self.state - 6; + } + return .continue_; + } + + var len: usize = undefined; + if (try rangecoder.decodeBit(&self.is_rep[self.state], update)) { + if (!try rangecoder.decodeBit(&self.is_rep_g0[self.state], update)) { + if (!try rangecoder.decodeBit( + &self.is_rep_0long[(self.state << 4) + pos_state], + update, + )) { + if (update) { + self.state = if (self.state < 7) 9 else 11; + const dist = self.rep[0] + 1; + try output.appendLz(allocator, 1, dist); + } + return .continue_; + } + } else { + const idx: usize = if (!try rangecoder.decodeBit(&self.is_rep_g1[self.state], update)) + 1 + else if (!try rangecoder.decodeBit(&self.is_rep_g2[self.state], update)) + 2 + else + 3; + if (update) { + const dist = self.rep[idx]; + var i = idx; + while (i > 0) : (i -= 1) { + self.rep[i] = self.rep[i - 1]; + } + self.rep[0] = dist; + } + } + + len = try self.rep_len_decoder.decode(rangecoder, pos_state, update); + + if (update) { + self.state = if (self.state < 7) 8 else 11; + } + } else { + if (update) { + self.rep[3] = self.rep[2]; + self.rep[2] = self.rep[1]; + self.rep[1] = self.rep[0]; + } + + len = try self.len_decoder.decode(rangecoder, pos_state, update); + + if (update) { + self.state = if (self.state < 7) 7 else 10; + } + + const rep_0 = try self.decodeDistance(rangecoder, len, update); + + if (update) { + self.rep[0] = rep_0; + if (self.rep[0] == 0xFFFF_FFFF) { + if (rangecoder.isFinished()) { + return .finished; + } + return error.CorruptInput; + } + } + } + + if (update) { + len += 2; + + const dist = self.rep[0] + 1; + try output.appendLz(allocator, len, dist); + } + + return .continue_; + } + + fn processNext( + self: *DecoderState, + allocator: Allocator, + output: *LzAccumBuffer, + rangecoder: *RangeDecoder, + ) !ProcessingStatus { + return self.processNextInner(allocator, output, rangecoder, true); + } + + pub fn process( + 
self: *DecoderState, + allocator: Allocator, + output: *LzAccumBuffer, + rangecoder: *RangeDecoder, + ) !void { + while (true) { + if (self.unpacked_size) |unpacked_size| { + if (output.len() >= unpacked_size) { + break; + } + } else if (rangecoder.isFinished()) { + break; + } + + if (try self.processNext(allocator, output, rangecoder) == .finished) { + break; + } + } + + if (self.unpacked_size) |len| { + if (len != output.len()) { + return error.CorruptInput; + } + } + } + + fn decodeLiteral( + self: *DecoderState, + output: *LzAccumBuffer, + rangecoder: *RangeDecoder, + update: bool, + ) !u8 { + const def_prev_byte = 0; + const prev_byte = @as(usize, output.lastOr(def_prev_byte)); + + var result: usize = 1; + const lit_state = ((output.len() & ((@as(usize, 1) << self.lzma_props.lp) - 1)) << self.lzma_props.lc) + + (prev_byte >> (8 - self.lzma_props.lc)); + const probs = try self.literal_probs.get(lit_state); + + if (self.state >= 7) { + var match_byte = @as(usize, try output.lastN(self.rep[0] + 1)); + + while (result < 0x100) { + const match_bit = (match_byte >> 7) & 1; + match_byte <<= 1; + const bit = @boolToInt(try rangecoder.decodeBit( + &probs[((@as(usize, 1) + match_bit) << 8) + result], + update, + )); + result = (result << 1) ^ bit; + if (match_bit != bit) { + break; + } + } + } + + while (result < 0x100) { + result = (result << 1) ^ @boolToInt(try rangecoder.decodeBit(&probs[result], update)); + } + + return @truncate(u8, result - 0x100); + } + + fn decodeDistance( + self: *DecoderState, + rangecoder: *RangeDecoder, + length: usize, + update: bool, + ) !usize { + const len_state = if (length > 3) 3 else length; + + const pos_slot = @as(usize, try self.pos_slot_decoder[len_state].parse(rangecoder, update)); + if (pos_slot < 4) + return pos_slot; + + const num_direct_bits = @intCast(u5, (pos_slot >> 1) - 1); + var result = (2 ^ (pos_slot & 1)) << num_direct_bits; + + if (pos_slot < 14) { + result += try rangecoder.parseReverseBitTree( + num_direct_bits, + 
&self.pos_decoders, + result - pos_slot, + update, + ); + } else { + result += @as(usize, try rangecoder.get(num_direct_bits - 4)) << 4; + result += try self.align_decoder.parseReverse(rangecoder, update); + } + + return result; + } +}; + +const ProcessingStatus = enum { + continue_, + finished, +}; + +pub const LzAccumBuffer = struct { + to_read: ArrayListUnmanaged(u8) = .{}, + buf: ArrayListUnmanaged(u8) = .{}, + + pub fn deinit(self: *LzAccumBuffer, allocator: Allocator) void { + self.to_read.deinit(allocator); + self.buf.deinit(allocator); + } + + pub fn read(self: *LzAccumBuffer, output: []u8) usize { + const input = self.to_read.items; + const n = std.math.min(input.len, output.len); + std.mem.copy(u8, output[0..n], input[0..n]); + std.mem.copy(u8, input, input[n..]); + self.to_read.shrinkRetainingCapacity(input.len - n); + return n; + } + + pub fn ensureUnusedCapacity( + self: *LzAccumBuffer, + allocator: Allocator, + additional_count: usize, + ) !void { + try self.buf.ensureUnusedCapacity(allocator, additional_count); + } + + pub fn appendAssumeCapacity(self: *LzAccumBuffer, byte: u8) void { + self.buf.appendAssumeCapacity(byte); + } + + pub fn reset(self: *LzAccumBuffer, allocator: Allocator) !void { + try self.to_read.appendSlice(allocator, self.buf.items); + self.buf.clearRetainingCapacity(); + } + + pub fn len(self: *const LzAccumBuffer) usize { + return self.buf.items.len; + } + + pub fn lastOr(self: *const LzAccumBuffer, lit: u8) u8 { + const buf_len = self.buf.items.len; + return if (buf_len == 0) + lit + else + self.buf.items[buf_len - 1]; + } + + pub fn lastN(self: *const LzAccumBuffer, dist: usize) !u8 { + const buf_len = self.buf.items.len; + if (dist > buf_len) { + return error.CorruptInput; + } + + return self.buf.items[buf_len - dist]; + } + + pub fn appendLiteral(self: *LzAccumBuffer, allocator: Allocator, lit: u8) !void { + try self.buf.append(allocator, lit); + } + + pub fn appendLz(self: *LzAccumBuffer, allocator: Allocator, length: usize, 
dist: usize) !void { + const buf_len = self.buf.items.len; + if (dist > buf_len) { + return error.CorruptInput; + } + + var offset = buf_len - dist; + var i: usize = 0; + while (i < length) : (i += 1) { + const x = self.buf.items[offset]; + try self.buf.append(allocator, x); + offset += 1; + } + } +}; + +pub const RangeDecoder = struct { + stream: std.io.FixedBufferStream([]const u8), + range: u32, + code: u32, + + pub fn init(buffer: []const u8) !RangeDecoder { + var dec = RangeDecoder{ + .stream = std.io.fixedBufferStream(buffer), + .range = 0xFFFF_FFFF, + .code = 0, + }; + const reader = dec.stream.reader(); + _ = try reader.readByte(); + dec.code = try reader.readIntBig(u32); + return dec; + } + + pub fn fromParts( + buffer: []const u8, + range: u32, + code: u32, + ) RangeDecoder { + return .{ + .stream = std.io.fixedBufferStream(buffer), + .range = range, + .code = code, + }; + } + + pub fn set(self: *RangeDecoder, range: u32, code: u32) void { + self.range = range; + self.code = code; + } + + pub fn readInto(self: *RangeDecoder, dest: []u8) !usize { + return self.stream.read(dest); + } + + pub inline fn isFinished(self: *const RangeDecoder) bool { + return self.code == 0 and self.isEof(); + } + + pub inline fn isEof(self: *const RangeDecoder) bool { + return self.stream.pos == self.stream.buffer.len; + } + + inline fn normalize(self: *RangeDecoder) !void { + if (self.range < 0x0100_0000) { + self.range <<= 8; + self.code = (self.code << 8) ^ @as(u32, try self.stream.reader().readByte()); + } + } + + inline fn getBit(self: *RangeDecoder) !bool { + self.range >>= 1; + + const bit = self.code >= self.range; + if (bit) + self.code -= self.range; + + try self.normalize(); + return bit; + } + + fn get(self: *RangeDecoder, count: usize) !u32 { + var result: u32 = 0; + var i: usize = 0; + while (i < count) : (i += 1) + result = (result << 1) ^ @boolToInt(try self.getBit()); + return result; + } + + pub inline fn decodeBit(self: *RangeDecoder, prob: *u16, update: 
bool) !bool { + const bound = (self.range >> 11) * prob.*; + + if (self.code < bound) { + if (update) + prob.* += (0x800 - prob.*) >> 5; + self.range = bound; + + try self.normalize(); + return false; + } else { + if (update) + prob.* -= prob.* >> 5; + self.code -= bound; + self.range -= bound; + + try self.normalize(); + return true; + } + } + + fn parseBitTree( + self: *RangeDecoder, + num_bits: u5, + probs: []u16, + update: bool, + ) !u32 { + var tmp: u32 = 1; + var i: u5 = 0; + while (i < num_bits) : (i += 1) { + const bit = try self.decodeBit(&probs[tmp], update); + tmp = (tmp << 1) ^ @boolToInt(bit); + } + return tmp - (@as(u32, 1) << num_bits); + } + + pub fn parseReverseBitTree( + self: *RangeDecoder, + num_bits: u5, + probs: []u16, + offset: usize, + update: bool, + ) !u32 { + var result: u32 = 0; + var tmp: usize = 1; + var i: u5 = 0; + while (i < num_bits) : (i += 1) { + const bit = @boolToInt(try self.decodeBit(&probs[offset + tmp], update)); + tmp = (tmp << 1) ^ bit; + result ^= @as(u32, bit) << i; + } + return result; + } +}; + +fn Vec2D(comptime T: type) type { + return struct { + data: []T, + cols: usize, + + const Self = @This(); + + pub fn init(allocator: Allocator, data: T, rows: usize, cols: usize) !Self { + const len = try std.math.mul(usize, rows, cols); + var vec2d = Self{ + .data = try allocator.alloc(T, len), + .cols = cols, + }; + vec2d.fill(data); + return vec2d; + } + + pub fn deinit(self: *Self, allocator: Allocator) void { + allocator.free(self.data); + } + + pub fn fill(self: *Self, value: T) void { + std.mem.set(T, self.data, value); + } + + pub fn get(self: *Self, row: usize) ![]T { + const start_row = try std.math.mul(usize, row, self.cols); + return self.data[start_row .. 
start_row + self.cols]; + } + }; +} + +const BitTree = struct { + num_bits: u5, + probs: ArrayListUnmanaged(u16), + + pub fn init(allocator: Allocator, num_bits: u5) !BitTree { + var probs_len = @as(usize, 1) << num_bits; + var probs = try ArrayListUnmanaged(u16).initCapacity(allocator, probs_len); + while (probs_len > 0) : (probs_len -= 1) + probs.appendAssumeCapacity(0x400); + return .{ .num_bits = num_bits, .probs = probs }; + } + + pub fn deinit(self: *BitTree, allocator: Allocator) void { + self.probs.deinit(allocator); + } + + pub fn parse( + self: *BitTree, + rangecoder: *RangeDecoder, + update: bool, + ) !u32 { + return rangecoder.parseBitTree(self.num_bits, self.probs.items, update); + } + + pub fn parseReverse( + self: *BitTree, + rangecoder: *RangeDecoder, + update: bool, + ) !u32 { + return rangecoder.parseReverseBitTree(self.num_bits, self.probs.items, 0, update); + } + + pub fn reset(self: *BitTree) void { + std.mem.set(u16, self.probs.items, 0x400); + } +}; + +const LenDecoder = struct { + choice: u16, + choice2: u16, + low_coder: [16]BitTree, + mid_coder: [16]BitTree, + high_coder: BitTree, + + pub fn init(allocator: Allocator) !LenDecoder { + return .{ + .choice = 0x400, + .choice2 = 0x400, + .low_coder = .{ + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + }, + .mid_coder = .{ + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try 
BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + try BitTree.init(allocator, 3), + }, + .high_coder = try BitTree.init(allocator, 8), + }; + } + + pub fn deinit(self: *LenDecoder, allocator: Allocator) void { + for (self.low_coder) |*t| t.deinit(allocator); + for (self.mid_coder) |*t| t.deinit(allocator); + self.high_coder.deinit(allocator); + } + + pub fn decode( + self: *LenDecoder, + rangecoder: *RangeDecoder, + pos_state: usize, + update: bool, + ) !usize { + if (!try rangecoder.decodeBit(&self.choice, update)) { + return @as(usize, try self.low_coder[pos_state].parse(rangecoder, update)); + } else if (!try rangecoder.decodeBit(&self.choice2, update)) { + return @as(usize, try self.mid_coder[pos_state].parse(rangecoder, update)) + 8; + } else { + return @as(usize, try self.high_coder.parse(rangecoder, update)) + 16; + } + } + + pub fn reset(self: *LenDecoder) void { + self.choice = 0x400; + self.choice2 = 0x400; + for (self.low_coder) |*t| t.reset(); + for (self.mid_coder) |*t| t.reset(); + self.high_coder.reset(); + } +}; diff --git a/lib/std/compress/xz/test.zig b/lib/std/compress/xz/test.zig new file mode 100644 index 000000000000..848f518c7883 --- /dev/null +++ b/lib/std/compress/xz/test.zig @@ -0,0 +1,80 @@ +const std = @import("../../std.zig"); +const testing = std.testing; +const xz = std.compress.xz; + +fn decompress(data: []const u8) ![]u8 { + var in_stream = std.io.fixedBufferStream(data); + + var xz_stream = try xz.decompress(testing.allocator, in_stream.reader()); + defer xz_stream.deinit(); + + return xz_stream.reader().readAllAlloc(testing.allocator, std.math.maxInt(usize)); +} + +fn testReader(data: []const u8, comptime expected: []const u8) !void { + const buf = 
try decompress(data); + defer testing.allocator.free(buf); + + try testing.expectEqualSlices(u8, expected, buf); +} + +test "compressed data" { + try testReader(@embedFile("testdata/good-0-empty.xz"), ""); + + inline for ([_][]const u8{ + "good-1-check-none.xz", + "good-1-check-crc32.xz", + "good-1-check-crc64.xz", + "good-1-check-sha256.xz", + "good-2-lzma2.xz", + "good-1-block_header-1.xz", + "good-1-block_header-2.xz", + "good-1-block_header-3.xz", + }) |filename| { + try testReader(@embedFile("testdata/" ++ filename), + \\Hello + \\World! + \\ + ); + } + + inline for ([_][]const u8{ + "good-1-lzma2-1.xz", + "good-1-lzma2-2.xz", + "good-1-lzma2-3.xz", + "good-1-lzma2-4.xz", + }) |filename| { + try testReader(@embedFile("testdata/" ++ filename), + \\Lorem ipsum dolor sit amet, consectetur adipisicing + \\elit, sed do eiusmod tempor incididunt ut + \\labore et dolore magna aliqua. Ut enim + \\ad minim veniam, quis nostrud exercitation ullamco + \\laboris nisi ut aliquip ex ea commodo + \\consequat. Duis aute irure dolor in reprehenderit + \\in voluptate velit esse cillum dolore eu + \\fugiat nulla pariatur. Excepteur sint occaecat cupidatat + \\non proident, sunt in culpa qui officia + \\deserunt mollit anim id est laborum. 
+ \\ + ); + } + + try testReader(@embedFile("testdata/good-1-lzma2-5.xz"), ""); +} + +test "unsupported" { + inline for ([_][]const u8{ + "good-1-delta-lzma2.tiff.xz", + "good-1-x86-lzma2.xz", + "good-1-sparc-lzma2.xz", + "good-1-arm64-lzma2-1.xz", + "good-1-arm64-lzma2-2.xz", + "good-1-3delta-lzma2.xz", + "good-1-empty-bcj-lzma2.xz", + }) |filename| { + try testing.expectError( + error.Unsupported, + decompress(@embedFile("testdata/" ++ filename)), + ); + } +} diff --git a/lib/std/compress/xz/testdata/good-0-empty.xz b/lib/std/compress/xz/testdata/good-0-empty.xz new file mode 100644 index 000000000000..83b95e05bc81 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-0-empty.xz differ diff --git a/lib/std/compress/xz/testdata/good-0cat-empty.xz b/lib/std/compress/xz/testdata/good-0cat-empty.xz new file mode 100644 index 000000000000..e6fc31461db4 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-0cat-empty.xz differ diff --git a/lib/std/compress/xz/testdata/good-0catpad-empty.xz b/lib/std/compress/xz/testdata/good-0catpad-empty.xz new file mode 100644 index 000000000000..4f86b7d152d4 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-0catpad-empty.xz differ diff --git a/lib/std/compress/xz/testdata/good-0pad-empty.xz b/lib/std/compress/xz/testdata/good-0pad-empty.xz new file mode 100644 index 000000000000..c51e3a69c0de Binary files /dev/null and b/lib/std/compress/xz/testdata/good-0pad-empty.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-3delta-lzma2.xz b/lib/std/compress/xz/testdata/good-1-3delta-lzma2.xz new file mode 100644 index 000000000000..a0be1d007a20 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-3delta-lzma2.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-arm64-lzma2-1.xz b/lib/std/compress/xz/testdata/good-1-arm64-lzma2-1.xz new file mode 100644 index 000000000000..78169f14e55b Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-arm64-lzma2-1.xz differ diff --git 
a/lib/std/compress/xz/testdata/good-1-arm64-lzma2-2.xz b/lib/std/compress/xz/testdata/good-1-arm64-lzma2-2.xz new file mode 100644 index 000000000000..e0302fe99e99 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-arm64-lzma2-2.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-block_header-1.xz b/lib/std/compress/xz/testdata/good-1-block_header-1.xz new file mode 100644 index 000000000000..fea5ad2f9980 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-block_header-1.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-block_header-2.xz b/lib/std/compress/xz/testdata/good-1-block_header-2.xz new file mode 100644 index 000000000000..6b5dcb347b24 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-block_header-2.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-block_header-3.xz b/lib/std/compress/xz/testdata/good-1-block_header-3.xz new file mode 100644 index 000000000000..156531206342 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-block_header-3.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-check-crc32.xz b/lib/std/compress/xz/testdata/good-1-check-crc32.xz new file mode 100644 index 000000000000..6c89593d447a Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-check-crc32.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-check-crc64.xz b/lib/std/compress/xz/testdata/good-1-check-crc64.xz new file mode 100644 index 000000000000..5a9915d2f9fc Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-check-crc64.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-check-none.xz b/lib/std/compress/xz/testdata/good-1-check-none.xz new file mode 100644 index 000000000000..1e85faf3d06e Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-check-none.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-check-sha256.xz b/lib/std/compress/xz/testdata/good-1-check-sha256.xz new file mode 100644 index 000000000000..fdc556b602c4 
Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-check-sha256.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-delta-lzma2.tiff.xz b/lib/std/compress/xz/testdata/good-1-delta-lzma2.tiff.xz new file mode 100644 index 000000000000..1f033bc52b5d Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-delta-lzma2.tiff.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-empty-bcj-lzma2.xz b/lib/std/compress/xz/testdata/good-1-empty-bcj-lzma2.xz new file mode 100644 index 000000000000..94016d8b66e1 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-empty-bcj-lzma2.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-lzma2-1.xz b/lib/std/compress/xz/testdata/good-1-lzma2-1.xz new file mode 100644 index 000000000000..d8d6489c8721 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-lzma2-1.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-lzma2-2.xz b/lib/std/compress/xz/testdata/good-1-lzma2-2.xz new file mode 100644 index 000000000000..7e8cdf1b7d0d Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-lzma2-2.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-lzma2-3.xz b/lib/std/compress/xz/testdata/good-1-lzma2-3.xz new file mode 100644 index 000000000000..c4c72be65614 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-lzma2-3.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-lzma2-4.xz b/lib/std/compress/xz/testdata/good-1-lzma2-4.xz new file mode 100644 index 000000000000..e0d623a001ce Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-lzma2-4.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-lzma2-5.xz b/lib/std/compress/xz/testdata/good-1-lzma2-5.xz new file mode 100644 index 000000000000..339d1c30c17f Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-lzma2-5.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-sparc-lzma2.xz b/lib/std/compress/xz/testdata/good-1-sparc-lzma2.xz new file mode 100644 
index 000000000000..4532bc61c1a3 Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-sparc-lzma2.xz differ diff --git a/lib/std/compress/xz/testdata/good-1-x86-lzma2.xz b/lib/std/compress/xz/testdata/good-1-x86-lzma2.xz new file mode 100644 index 000000000000..8053917bad7d Binary files /dev/null and b/lib/std/compress/xz/testdata/good-1-x86-lzma2.xz differ diff --git a/lib/std/compress/xz/testdata/good-2-lzma2.xz b/lib/std/compress/xz/testdata/good-2-lzma2.xz new file mode 100644 index 000000000000..bed5085c1f6c Binary files /dev/null and b/lib/std/compress/xz/testdata/good-2-lzma2.xz differ diff --git a/src/Package.zig b/src/Package.zig index f8823cc74e1e..ebe84b8444d3 100644 --- a/src/Package.zig +++ b/src/Package.zig @@ -370,14 +370,11 @@ fn fetchAndUnpack( if (mem.endsWith(u8, uri.path, ".tar.gz")) { // I observed the gzip stream to read 1 byte at a time, so I am using a // buffered reader on the front of it. - var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, req.reader()); - - var gzip_stream = try std.compress.gzip.gzipStream(gpa, br.reader()); - defer gzip_stream.deinit(); - - try std.tar.pipeToFileSystem(tmp_directory.handle, gzip_stream.reader(), .{ - .strip_components = 1, - }); + try unpackTarball(gpa, &req, tmp_directory.handle, std.compress.gzip); + } else if (mem.endsWith(u8, uri.path, ".tar.xz")) { + // I have not checked what buffer sizes the xz decompression implementation uses + // by default, so the same logic applies for buffering the reader as for gzip. 
+ try unpackTarball(gpa, &req, tmp_directory.handle, std.compress.xz); } else { return reportError( ini, @@ -430,6 +427,22 @@ fn fetchAndUnpack( return createWithDir(gpa, fqn, global_cache_directory, pkg_dir_sub_path, build_zig_basename); } +fn unpackTarball( + gpa: Allocator, + req: *std.http.Client.Request, + out_dir: fs.Dir, + comptime compression: type, +) !void { + var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, req.reader()); + + var decompress = try compression.decompress(gpa, br.reader()); + defer decompress.deinit(); + + try std.tar.pipeToFileSystem(out_dir, decompress.reader(), .{ + .strip_components = 1, + }); +} + fn reportError( ini: std.Ini, comp_directory: Compilation.Directory,