
Commit 048ad6c

Author: Philipp Heckel (committed)
Support smaller chunk sizes
1 parent dce4056 commit 048ad6c

12 files changed: +122 -89 lines

chunker.go (+3 -3)

@@ -11,7 +11,7 @@ const (
 	// Max size for any chunk produced by any of the chunkers. Note that lowering this
 	// probably has terrible consequences for existing manifests, because the buffers all use this
 	// value. Splitting this value and the buffer size would be a way to solve this.
-	chunkSizeMaxBytes = 32 * 1024 * 1024
+	DefaultChunkSizeMaxBytes = 32 * 1024 * 1024
 )

 type Chunker interface {

@@ -24,10 +24,10 @@ type chunk struct {
 	checksum []byte
 }

-func NewChunk() *chunk {
+func NewChunk(maxSize int64) *chunk {
 	return &chunk{
 		size:     0,
-		data:     make([]byte, chunkSizeMaxBytes),
+		data:     make([]byte, maxSize),
 		checksum: nil,
 	}
 }
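The old package constant survives as an exported default, and the buffer size becomes a constructor argument. A minimal, package-internal usage sketch (the 4 MiB figure is purely illustrative):

	// Inside the fsdup package (sketch): the caller now picks the chunk buffer size.
	small := NewChunk(4 * 1024 * 1024)           // 4 MiB buffer, illustrative value
	legacy := NewChunk(DefaultChunkSizeMaxBytes) // previous behaviour, 32 MiB
	_, _ = small, legacy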

chunker_fixed.go (+25 -21)

@@ -6,39 +6,43 @@ import (
 )

 type fixedChunker struct {
-	reader      io.ReaderAt
-	store       ChunkStore
-	start       int64
-	sizeInBytes int64
-	skip        *manifest
+	reader       io.ReaderAt
+	store        ChunkStore
+	start        int64
+	sizeInBytes  int64
+	chunkMaxSize int64
+	skip         *manifest
 }

-func NewFixedChunker(reader io.ReaderAt, index ChunkStore, offset int64, size int64) *fixedChunker {
-	skip := NewManifest()
-	return NewFixedChunkerWithSkip(reader, index, offset, size, skip)
+func NewFixedChunker(reader io.ReaderAt, index ChunkStore, offset int64, size int64, chunkMaxSize int64) *fixedChunker {
+	skip := NewManifest(chunkMaxSize)
+	return NewFixedChunkerWithSkip(reader, index, offset, size, chunkMaxSize, skip)
 }

-func NewFixedChunkerWithSkip(reader io.ReaderAt, store ChunkStore, offset int64, size int64, skip *manifest) *fixedChunker {
+func NewFixedChunkerWithSkip(reader io.ReaderAt, store ChunkStore, offset int64, size int64,
+	chunkMaxSize int64, skip *manifest) *fixedChunker {
+
 	return &fixedChunker{
-		reader:      reader,
-		store:       store,
-		start:       offset,
-		sizeInBytes: size,
-		skip:        skip,
+		reader:       reader,
+		store:        store,
+		start:        offset,
+		sizeInBytes:  size,
+		chunkMaxSize: chunkMaxSize,
+		skip:         skip,
 	}
 }

 func (d *fixedChunker) Dedup() (*manifest, error) {
-	out := NewManifest()
+	out := NewManifest(d.chunkMaxSize)

 	sliceOffsets := d.skip.Offsets()

 	currentOffset := int64(0)
 	breakpointIndex := 0
 	breakpoint := int64(0)

-	chunk := NewChunk()
-	buffer := make([]byte, chunkSizeMaxBytes)
+	chunk := NewChunk(d.chunkMaxSize)
+	buffer := make([]byte, d.chunkMaxSize)

 	statusf("Creating gap chunks ...")
 	chunkBytes := int64(0)

@@ -54,15 +58,15 @@ func (d *fixedChunker) Dedup() (*manifest, error) {
 		breakpoint = sliceOffsets[breakpointIndex]
 		bytesToBreakpoint := breakpoint - currentOffset

-		if bytesToBreakpoint > chunkSizeMaxBytes {
+		if bytesToBreakpoint > d.chunkMaxSize {
 			// We can fill an entire chunk, because there are enough bytes to the next breakpoint

-			chunkEndOffset := minInt64(currentOffset + chunkSizeMaxBytes, d.sizeInBytes)
+			chunkEndOffset := minInt64(currentOffset + d.chunkMaxSize, d.sizeInBytes)

 			bytesRead, err := d.reader.ReadAt(buffer, d.start + currentOffset)
 			if err != nil {
 				return nil, err
-			} else if bytesRead != chunkSizeMaxBytes {
+			} else if int64(bytesRead) != d.chunkMaxSize {
 				return nil, fmt.Errorf("cannot read all bytes from disk, %d read\n", bytesRead)
 			}

@@ -146,7 +150,7 @@ func (d *fixedChunker) Dedup() (*manifest, error) {
 				breakpointIndex++
 			}
 		} else {
-			chunkEndOffset := minInt64(currentOffset + chunkSizeMaxBytes, d.sizeInBytes)
+			chunkEndOffset := minInt64(currentOffset + d.chunkMaxSize, d.sizeInBytes)
 			chunkSize := chunkEndOffset - currentOffset

 			bytesRead, err := d.reader.ReadAt(buffer[:chunkSize], d.start + currentOffset)
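The gap chunker's core decision is unchanged; only the constant it compares against became d.chunkMaxSize. A standalone sketch of that boundary arithmetic (helper and field names mirror the diff, but this is an illustration, not the package code):

	package main

	import "fmt"

	// minInt64 mirrors the helper the fixed chunker uses.
	func minInt64(a, b int64) int64 {
		if a < b {
			return a
		}
		return b
	}

	// nextChunkEnd reproduces the chunk boundary computation from Dedup(): a chunk
	// may extend up to chunkMaxSize bytes, but never past the end of the region.
	func nextChunkEnd(currentOffset, chunkMaxSize, sizeInBytes int64) int64 {
		return minInt64(currentOffset+chunkMaxSize, sizeInBytes)
	}

	func main() {
		// Illustrative values: 4 MiB chunks over a 10 MiB region.
		fmt.Println(nextChunkEnd(0, 4<<20, 10<<20))     // 4194304: a full chunk fits
		fmt.Println(nextChunkEnd(8<<20, 4<<20, 10<<20)) // 10485760: clamped to the region size
	}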

chunker_gpt.go (+22 -18)

@@ -21,26 +21,30 @@ const (
 )

 type gptDiskChunker struct {
-	reader   io.ReaderAt
-	store    ChunkStore
-	start    int64
-	size     int64
-	exact    bool
-	noFile   bool
-	minSize  int64
-	manifest *manifest
+	reader       io.ReaderAt
+	store        ChunkStore
+	start        int64
+	size         int64
+	exact        bool
+	noFile       bool
+	minSize      int64
+	chunkMaxSize int64
+	manifest     *manifest
 }

-func NewGptDiskChunker(reader io.ReaderAt, store ChunkStore, offset int64, size int64, exact bool, noFile bool, minSize int64) *gptDiskChunker {
+func NewGptDiskChunker(reader io.ReaderAt, store ChunkStore, offset int64, size int64, exact bool, noFile bool,
+	minSize int64, chunkMaxSize int64) *gptDiskChunker {
+
 	return &gptDiskChunker{
 		reader:       reader,
-		store:    store,
-		start:    offset,
-		size:     size,
-		exact:    exact,
-		noFile:   noFile,
-		minSize:  minSize,
-		manifest: NewManifest(),
+		store:        store,
+		start:        offset,
+		size:         size,
+		exact:        exact,
+		noFile:       noFile,
+		minSize:      minSize,
+		chunkMaxSize: chunkMaxSize,
+		manifest:     NewManifest(chunkMaxSize),
 	}
 }

@@ -97,7 +101,7 @@ func (d *gptDiskChunker) dedupNtfsPartitions() error {

 		if partitionType == typeNtfs {
 			debugf("NTFS partition found at offset %d\n", partitionOffset)
-			ntfs := NewNtfsChunker(d.reader, d.store, partitionOffset, d.exact, d.noFile, d.minSize)
+			ntfs := NewNtfsChunker(d.reader, d.store, partitionOffset, d.exact, d.noFile, d.minSize, d.chunkMaxSize)
 			manifest, err := ntfs.Dedup()
 			if err != nil {
 				return err

@@ -111,7 +115,7 @@ func (d *gptDiskChunker) dedupNtfsPartitions() error {
 }

 func (d *gptDiskChunker) dedupRest() error {
-	chunker := NewFixedChunkerWithSkip(d.reader, d.store, d.start, d.size, d.manifest)
+	chunker := NewFixedChunkerWithSkip(d.reader, d.store, d.start, d.size, d.chunkMaxSize, d.manifest)

 	gapManifest, err := chunker.Dedup()
 	if err != nil {
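As the two hunks above show, the GPT chunker does not use chunkMaxSize itself; it only forwards it. A hedged, package-internal sketch of the call chain (signatures as in this commit, variable names illustrative):

	// Each NTFS partition gets its own NTFS chunker with the same chunkMaxSize,
	// and the leftover gaps go to the fixed chunker, which also receives the
	// accumulated manifest as its skip list.
	gpt := NewGptDiskChunker(reader, store, offset, size, exact, noFile, minSize, chunkMaxSize)
	manifest, err := gpt.Dedup() // internally: NewNtfsChunker(..., d.chunkMaxSize) and
	                             // NewFixedChunkerWithSkip(..., d.chunkMaxSize, d.manifest)
	_, _ = manifest, err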

chunker_mbr.go (+23 -19)

@@ -18,26 +18,30 @@ const (
 )

 type mbrDiskChunker struct {
-	reader   io.ReaderAt
-	store    ChunkStore
-	start    int64
-	size     int64
-	exact    bool
-	noFile   bool
-	minSize  int64
-	manifest *manifest
+	reader       io.ReaderAt
+	store        ChunkStore
+	start        int64
+	size         int64
+	exact        bool
+	noFile       bool
+	minSize      int64
+	chunkMaxSize int64
+	manifest     *manifest
 }

-func NewMbrDiskChunker(reader io.ReaderAt, store ChunkStore, offset int64, size int64, exact bool, noFile bool, minSize int64) *mbrDiskChunker {
+func NewMbrDiskChunker(reader io.ReaderAt, store ChunkStore, offset int64, size int64, exact bool, noFile bool,
+	minSize int64, chunkMaxSize int64) *mbrDiskChunker {
+
 	return &mbrDiskChunker{
-		reader:   reader,
-		store:    store,
-		start:    offset,
-		size:     size,
-		exact:    exact,
-		noFile:   noFile,
-		minSize:  minSize,
-		manifest: NewManifest(),
+		reader:       reader,
+		store:        store,
+		start:        offset,
+		size:         size,
+		exact:        exact,
+		noFile:       noFile,
+		minSize:      minSize,
+		chunkMaxSize: chunkMaxSize,
+		manifest:     NewManifest(chunkMaxSize),
 	}
 }

@@ -81,7 +85,7 @@ func (d *mbrDiskChunker) dedupNtfsPartitions() error {
 		}

 		if partitionType == typeNtfs {
-			ntfs := NewNtfsChunker(d.reader, d.store, partitionOffset, d.exact, d.noFile, d.minSize)
+			ntfs := NewNtfsChunker(d.reader, d.store, partitionOffset, d.exact, d.noFile, d.minSize, d.chunkMaxSize)
 			manifest, err := ntfs.Dedup()
 			if err != nil {
 				return err

@@ -95,7 +99,7 @@ func (d *mbrDiskChunker) dedupNtfsPartitions() error {
 }

 func (d *mbrDiskChunker) dedupRest() error {
-	chunker := NewFixedChunkerWithSkip(d.reader, d.store, d.start, d.size, d.manifest)
+	chunker := NewFixedChunkerWithSkip(d.reader, d.store, d.start, d.size, d.chunkMaxSize, d.manifest)

 	gapManifest, err := chunker.Dedup()
 	if err != nil {

chunker_ntfs.go (+18 -13)

@@ -37,12 +37,14 @@ type ntfsChunker struct {
 	exact        bool
 	noFile       bool
 	minSize      int64
+	chunkMaxSize int64
+
 	totalSectors      int64
 	sectorSize        int64
 	sectorsPerCluster int64
 	clusterSize       int64
 	store             ChunkStore
-	buffer            []byte // cannot be larger than chunkSizeMaxBytes, the logic relies on it!
+	buffer            []byte // cannot be larger than DefaultChunkSizeMaxBytes, the logic relies on it!
 	chunk             *chunk

 	// Output manifest

@@ -131,17 +133,20 @@ const (

 var ErrUnexpectedMagic = errors.New("unexpected magic")

-func NewNtfsChunker(reader io.ReaderAt, store ChunkStore, offset int64, exact bool, noFile bool, minSize int64) *ntfsChunker {
+func NewNtfsChunker(reader io.ReaderAt, store ChunkStore, offset int64, exact bool, noFile bool,
+	minSize int64, chunkMaxSize int64) *ntfsChunker {
+
 	return &ntfsChunker{
-		reader:  reader,
-		store:   store,
-		start:   offset,
-		exact:   exact,
-		noFile:  noFile,
-		minSize: minSize,
-		chunk:   NewChunk(),
-		buffer:  make([]byte, chunkSizeMaxBytes),
-		out:     NewManifest(),
+		reader:       reader,
+		store:        store,
+		start:        offset,
+		exact:        exact,
+		noFile:       noFile,
+		minSize:      minSize,
+		chunkMaxSize: chunkMaxSize,
+		chunk:        NewChunk(chunkMaxSize),
+		buffer:       make([]byte, chunkMaxSize),
+		out:          NewManifest(chunkMaxSize),
 	}
 }

@@ -456,7 +461,7 @@ func (d *ntfsChunker) dedupFile(entry *entry) (int64, error) {

 			// Add run to chunk(s)
 			debugf("- Bytes read = %d, current chunk size = %d, chunk max = %d\n",
-				runBytesRead, d.chunk.Size(), chunkSizeMaxBytes)
+				runBytesRead, d.chunk.Size(), d.chunkMaxSize)

 			slices[runOffset] = &chunkSlice{
 				checksum: nil, // fill this when chunk is finalized!

@@ -647,7 +652,7 @@ func (d *ntfsChunker) dedupUnused(mft *entry) error {
 }

 func (d *ntfsChunker) dedupGaps() error {
-	chunker := NewFixedChunkerWithSkip(d.reader, d.store, d.start, d.sizeInBytes, d.out)
+	chunker := NewFixedChunkerWithSkip(d.reader, d.store, d.start, d.sizeInBytes, d.chunkMaxSize, d.out)

 	gapManifest, err := chunker.Dedup()
 	if err != nil {
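The NTFS chunker now allocates its read buffer, its working chunk, and its output manifest from the same chunkMaxSize, which is the sizing invariant the struct comment refers to. The practical effect of a smaller maximum is simply more, smaller chunks for the same data; a standalone illustration (not code from the fsdup package):

	package main

	import "fmt"

	// chunkCount returns how many chunks of at most chunkMaxSize bytes are
	// needed to cover size bytes.
	func chunkCount(size, chunkMaxSize int64) int64 {
		if size <= 0 {
			return 0
		}
		return (size + chunkMaxSize - 1) / chunkMaxSize
	}

	func main() {
		const run = 100 << 20                // a 100 MiB file run (illustrative)
		fmt.Println(chunkCount(run, 32<<20)) // 4 chunks with the 32 MiB default
		fmt.Println(chunkCount(run, 4<<20))  // 25 chunks with a 4 MiB maximum
	}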

cmd/fsdup/main.go (+7 -1)

@@ -106,6 +106,7 @@ func indexCommand(args []string) {
 	exactFlag := flags.Bool("exact", false, "Ignore the NTFS bitmap, i.e. include unused blocks")
 	noFileFlag := flags.Bool("nofile", false, "Don't do NTFS FILE deduping, just do gaps and unused space")
 	minSizeFlag := flags.String("minsize", fmt.Sprintf("%d", fsdup.DefaultDedupFileSizeMinBytes), "Minimum file size to consider for deduping")
+	maxChunkSizeFlag := flags.String("maxchunksize", fmt.Sprintf("%d", fsdup.DefaultChunkSizeMaxBytes), "Maximum size per chunk")

 	flags.Parse(args)

@@ -125,6 +126,11 @@ func indexCommand(args []string) {
 		exit(2, "Invalid min size value: " + err.Error())
 	}

+	chunkMaxSize, err := convertToBytes(*maxChunkSizeFlag)
+	if err != nil {
+		exit(2, "Invalid max chunk size value: " + err.Error())
+	}
+
 	file := flags.Arg(0)
 	manifest := flags.Arg(1)

@@ -139,7 +145,7 @@ func indexCommand(args []string) {
 	}

 	// Go index!
-	if err := fsdup.Index(file, store, manifest, offset, exact, noFile, minSize); err != nil {
+	if err := fsdup.Index(file, store, manifest, offset, exact, noFile, minSize, chunkMaxSize); err != nil {
 		exit(2, "Cannot index file: " + string(err.Error()))
 	}
 }

export.go (+1 -1)

@@ -21,7 +21,7 @@ func Export(manifestFile string, store ChunkStore, outputFile string) error {
 		return err
 	}

-	buffer := make([]byte, chunkSizeMaxBytes)
+	buffer := make([]byte, manifest.chunkMaxSize)
 	offset := int64(0)

 	for _, breakpoint := range manifest.Offsets() {
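Export (and Import, below) now size their copy buffer from the manifest itself rather than from the old package constant, so the allocation matches whatever chunk maximum the manifest was indexed with. A one-line, package-internal sketch (the chunkMaxSize field on the manifest type is implied by this hunk):

	// A manifest indexed with a 4 MiB maximum now causes a 4 MiB allocation
	// here instead of the former fixed 32 MiB one.
	buffer := make([]byte, manifest.chunkMaxSize)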

import.go (+1 -1)

@@ -33,7 +33,7 @@ func Import(manifestFile string, store ChunkStore, inputFile string) error {

 	imported := int64(0)
 	skipped := int64(0)
-	buffer := make([]byte, chunkSizeMaxBytes)
+	buffer := make([]byte, manifest.chunkMaxSize)

 	for _, checksumStr := range manifest.ChecksumsByDiskOffset(chunkSlices) {
 		slices := chunkSlices[checksumStr]

index.go (+6 -5)

@@ -22,7 +22,8 @@ const (
 	probeTypeBufferLength = 1024
 )

-func Index(inputFile string, store ChunkStore, manifestFile string, offset int64, exact bool, noFile bool, minSize int64) error {
+func Index(inputFile string, store ChunkStore, manifestFile string, offset int64, exact bool,
+	noFile bool, minSize int64, chunkMaxSize int64) error {
 	file, err := os.Open(inputFile)
 	if err != nil {
 		return err

@@ -45,13 +46,13 @@ func Index(inputFile string, store ChunkStore, manifestFile string, offset int64

 	switch fileType {
 	case typeNtfs:
-		chunker = NewNtfsChunker(file, store, offset, exact, noFile, minSize)
+		chunker = NewNtfsChunker(file, store, offset, exact, noFile, minSize, chunkMaxSize)
 	case typeMbrDisk:
-		chunker = NewMbrDiskChunker(file, store, offset, size, exact, noFile, minSize)
+		chunker = NewMbrDiskChunker(file, store, offset, size, exact, noFile, minSize, chunkMaxSize)
 	case typeGptDisk:
-		chunker = NewGptDiskChunker(file, store, offset, size, exact, noFile, minSize)
+		chunker = NewGptDiskChunker(file, store, offset, size, exact, noFile, minSize, chunkMaxSize)
 	default:
-		chunker = NewFixedChunker(file, store, offset, size)
+		chunker = NewFixedChunker(file, store, offset, size, chunkMaxSize)
 	}

 	manifest, err := chunker.Dedup()
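For library callers, the new chunk maximum is the last argument to Index. A hedged usage sketch (the import path is assumed from the repository owner and may need adjusting; the nil store is a stand-in for a real ChunkStore implementation):

	package main

	import (
		"log"

		"github.com/binwiederhier/fsdup" // import path assumed, adjust to the actual module path
	)

	func main() {
		var store fsdup.ChunkStore // stand-in; use a real ChunkStore implementation here

		// Index /dev/sdb into sdb.manifest with a 4 MiB chunk maximum instead
		// of the 32 MiB default (fsdup.DefaultChunkSizeMaxBytes).
		err := fsdup.Index("/dev/sdb", store, "sdb.manifest", 0, false, false,
			fsdup.DefaultDedupFileSizeMinBytes, 4*1024*1024)
		if err != nil {
			log.Fatalf("cannot index: %s", err)
		}
	}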
