Skip to content

Commit 0cd6bc0

Browse files
committed
new file: Compression/High-level/bbwr_file_compression.pl
new file: Compression/lzsst2_file_compression.pl
1 parent 305eab3 commit 0cd6bc0

15 files changed

+861
-24
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
#!/usr/bin/perl
2+
3+
# Author: Trizen
4+
# Date: 04 July 2024
5+
# https://github.com/trizen
6+
7+
# Compress/decompress files using Binary Burrows-Wheeler Transform (BWT) + Binary Variable Run-Length Encoding.
8+
9+
# References:
10+
# Data Compression (Summer 2023) - Lecture 13 - BZip2
11+
# https://youtube.com/watch?v=cvoZbBZ3M2A
12+
#
13+
# Data Compression (Summer 2023) - Lecture 5 - Basic Techniques
14+
# https://youtube.com/watch?v=TdFWb8mL5Gk
15+
16+
use 5.036;
17+
use Getopt::Std qw(getopts);
18+
use File::Basename qw(basename);
19+
use Compression::Util qw(:all);
20+
21+
use constant {
22+
PKGNAME => 'BBWR',
23+
VERSION => '0.01',
24+
FORMAT => 'bbwr',
25+
26+
CHUNK_SIZE => 1 << 13, # larger values == better compression
27+
};
28+
29+
# Container signature
30+
use constant SIGNATURE => uc(FORMAT) . chr(1);
31+
32+
sub usage {
33+
my ($code) = @_;
34+
print <<"EOH";
35+
usage: $0 [options] [input file] [output file]
36+
37+
options:
38+
-e : extract
39+
-i <filename> : input filename
40+
-o <filename> : output filename
41+
-r : rewrite output
42+
43+
-v : version number
44+
-h : this message
45+
46+
examples:
47+
$0 document.txt
48+
$0 document.txt archive.${\FORMAT}
49+
$0 archive.${\FORMAT} document.txt
50+
$0 -e -i archive.${\FORMAT} -o document.txt
51+
52+
EOH
53+
54+
exit($code // 0);
55+
}
56+
57+
sub version {
58+
printf("%s %s\n", PKGNAME, VERSION);
59+
exit;
60+
}
61+
62+
sub valid_archive {
63+
my ($fh) = @_;
64+
65+
if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
66+
$sig eq SIGNATURE || return;
67+
}
68+
69+
return 1;
70+
}
71+
72+
sub main {
73+
my %opt;
74+
getopts('ei:o:vhr', \%opt);
75+
76+
$opt{h} && usage(0);
77+
$opt{v} && version();
78+
79+
my ($input, $output) = @ARGV;
80+
$input //= $opt{i} // usage(2);
81+
$output //= $opt{o};
82+
83+
my $ext = qr{\.${\FORMAT}\z}io;
84+
if ($opt{e} || $input =~ $ext) {
85+
86+
if (not defined $output) {
87+
($output = basename($input)) =~ s{$ext}{}
88+
|| die "$0: no output file specified!\n";
89+
}
90+
91+
if (not $opt{r} and -e $output) {
92+
print "'$output' already exists! -- Replace? [y/N] ";
93+
<STDIN> =~ /^y/i || exit 17;
94+
}
95+
96+
decompress_file($input, $output)
97+
|| die "$0: error: decompression failed!\n";
98+
}
99+
elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
100+
$output //= basename($input) . '.' . FORMAT;
101+
compress_file($input, $output)
102+
|| die "$0: error: compression failed!\n";
103+
}
104+
else {
105+
warn "$0: don't know what to do...\n";
106+
usage(1);
107+
}
108+
}
109+
110+
sub compression ($chunk, $out_fh) {
111+
112+
my $bits = unpack('B*', $chunk);
113+
my $vrle1 = binary_vrl_encode($bits);
114+
115+
if (length($vrle1) < length($bits)) {
116+
printf "Doing early VLR, saving %s bits\n", length($bits) - length($vrle1);
117+
print $out_fh chr(1);
118+
}
119+
else {
120+
print $out_fh chr(0);
121+
$vrle1 = $bits;
122+
}
123+
124+
my ($bwt, $idx) = bwt_encode($vrle1);
125+
my $vrle2 = binary_vrl_encode($bwt);
126+
127+
say "BWT index: $idx";
128+
129+
print $out_fh pack('N', $idx);
130+
print $out_fh pack('N', length($vrle2));
131+
print $out_fh pack('B*', $vrle2);
132+
}
133+
134+
sub decompression ($fh, $out_fh) {
135+
136+
my $compressed_byte = ord(getc($fh) // die "error");
137+
138+
my $idx = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
139+
my $bits_len = unpack('N', join('', map { getc($fh) // die "error" } 1 .. 4));
140+
141+
say "BWT index = $idx";
142+
143+
my $bwt = binary_vrl_decode(read_bits($fh, $bits_len));
144+
my $data = bwt_decode($bwt, $idx);
145+
146+
if ($compressed_byte == 1) {
147+
$data = binary_vrl_decode($data);
148+
}
149+
150+
print $out_fh pack('B*', $data);
151+
}
152+
153+
# Compress file
154+
sub compress_file ($input, $output) {
155+
156+
open my $fh, '<:raw', $input
157+
or die "Can't open file <<$input>> for reading: $!";
158+
159+
my $header = SIGNATURE;
160+
161+
# Open the output file for writing
162+
open my $out_fh, '>:raw', $output
163+
or die "Can't open file <<$output>> for write: $!";
164+
165+
# Print the header
166+
print $out_fh $header;
167+
168+
# Compress data
169+
while (read($fh, (my $chunk), CHUNK_SIZE)) {
170+
compression($chunk, $out_fh);
171+
}
172+
173+
# Close the file
174+
close $out_fh;
175+
}
176+
177+
# Decompress file
178+
sub decompress_file ($input, $output) {
179+
180+
# Open and validate the input file
181+
open my $fh, '<:raw', $input
182+
or die "Can't open file <<$input>> for reading: $!";
183+
184+
valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
185+
186+
# Open the output file
187+
open my $out_fh, '>:raw', $output
188+
or die "Can't open file <<$output>> for writing: $!";
189+
190+
while (!eof($fh)) {
191+
decompression($fh, $out_fh);
192+
}
193+
194+
# Close the file
195+
close $fh;
196+
close $out_fh;
197+
}
198+
199+
main();
200+
exit(0);

Compression/High-level/lzb_file_compression.pl

+2-2
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525

2626
local $Compression::Util::LZ_MIN_LEN = 4; # minimum match length
2727
local $Compression::Util::LZ_MAX_LEN = ~0; # maximum match length
28-
local $Compression::Util::LZ_MAX_DIST = (1 << 16) - 1; # maximum match length
29-
local $Compression::Util::LZ_MAX_CHAIN_LEN = 16; # higher value = better compression
28+
local $Compression::Util::LZ_MAX_DIST = (1 << 16) - 1; # maximum match distance
29+
local $Compression::Util::LZ_MAX_CHAIN_LEN = 32; # higher value = better compression
3030

3131
# Container signature
3232
use constant SIGNATURE => uc(FORMAT) . chr(1);

Compression/High-level/lzbf_file_compression.pl

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
local $Compression::Util::LZ_MIN_LEN = 5; # minimum match length
2727
local $Compression::Util::LZ_MAX_LEN = ~0; # maximum match length
28-
local $Compression::Util::LZ_MAX_DIST = (1 << 16) - 1; # maximum distance
28+
local $Compression::Util::LZ_MAX_DIST = (1 << 16) - 1; # maximum match distance
2929

3030
# Container signature
3131
use constant SIGNATURE => uc(FORMAT) . chr(1);

Compression/High-level/lzbw3_file_compression.pl

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
VERSION => '0.01',
1717
FORMAT => 'lzbw3',
1818

19-
CHUNK_SIZE => 1 << 17, # higher value = better compression
19+
CHUNK_SIZE => 1 << 18, # higher value = better compression
2020
};
2121

2222
# Container signature

0 commit comments

Comments
 (0)