Skip to content

Commit 5ebb6bb

Browse files
author
Philipp Heckel
committed
Rename, add slice histogram
1 parent e58c9d7 commit 5ebb6bb

File tree

5 files changed

+70
-67
lines changed

5 files changed

+70
-67
lines changed

Diff for: chunker_fixed.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func (d *fixedChunker) Dedup() (*manifest, error) {
7676
debugf("offset %d - %d, NEW chunk %x, size %d\n",
7777
currentOffset, chunkEndOffset, chunk.Checksum(), chunk.Size())
7878

79-
out.Add(currentOffset, &chunkPart{
79+
out.Add(currentOffset, &chunkSlice{
8080
checksum: chunk.Checksum(),
8181
from: 0,
8282
to: chunk.Size(),
@@ -110,7 +110,7 @@ func (d *fixedChunker) Dedup() (*manifest, error) {
110110
return nil, err
111111
}
112112

113-
out.Add(currentOffset, &chunkPart{
113+
out.Add(currentOffset, &chunkSlice{
114114
checksum: chunk.Checksum(),
115115
from: 0,
116116
to: chunk.Size(),
@@ -160,7 +160,7 @@ func (d *fixedChunker) Dedup() (*manifest, error) {
160160
debugf("offset %d - %d, NEW3 chunk %x, size %d\n",
161161
currentOffset, chunkEndOffset, chunk.Checksum(), chunk.Size())
162162

163-
out.Add(currentOffset, &chunkPart{
163+
out.Add(currentOffset, &chunkSlice{
164164
checksum: chunk.Checksum(),
165165
from: 0,
166166
to: chunk.Size(),

Diff for: chunker_ntfs.go

+15-15
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ func (d *ntfsChunker) readRuns(entry []byte, offset int64) []run {
416416
func (d *ntfsChunker) dedupFile(entry *entry) error {
417417
remainingToEndOfFile := entry.dataSize
418418

419-
parts := make(map[int64]*chunkPart, 0)
419+
slices := make(map[int64]*chunkSlice, 0)
420420
d.chunk.Reset()
421421

422422
for _, run := range entry.runs {
@@ -449,7 +449,7 @@ func (d *ntfsChunker) dedupFile(entry *entry) error {
449449
debugf("- Bytes read = %d, current chunk size = %d, chunk max = %d\n",
450450
runBytesRead, d.chunk.Size(), chunkSizeMaxBytes)
451451

452-
parts[runOffset] = &chunkPart{
452+
slices[runOffset] = &chunkSlice{
453453
checksum: nil, // fill this when chunk is finalized!
454454
from: d.chunk.Size(),
455455
to: d.chunk.Size() + int64(runBytesRead),
@@ -464,15 +464,15 @@ func (d *ntfsChunker) dedupFile(entry *entry) error {
464464
if d.chunk.Full() {
465465
debugf("- Chunk full. Emitting chunk %x, size = %d\n", d.chunk.Checksum(), d.chunk.Size())
466466

467-
// Add parts to disk map
468-
for partOffset, part := range parts {
469-
part.checksum = d.chunk.Checksum()
467+
// Add slices to disk map
468+
for sliceOffset, slice := range slices {
469+
slice.checksum = d.chunk.Checksum()
470470
debugf("- Adding disk section %d - %d, mapping to chunk %x, offset %d - %d\n",
471-
partOffset, partOffset + part.to - part.from, part.checksum, part.from, part.to)
472-
d.out.Add(partOffset, part)
471+
sliceOffset, sliceOffset+ slice.to - slice.from, slice.checksum, slice.from, slice.to)
472+
d.out.Add(sliceOffset, slice)
473473
}
474474

475-
parts = make(map[int64]*chunkPart, 0) // clear!
475+
slices = make(map[int64]*chunkSlice, 0) // clear!
476476

477477
// Write chunk
478478
if err := d.store.Write(d.chunk.Checksum(), d.chunk.Data()); err != nil {
@@ -494,7 +494,7 @@ func (d *ntfsChunker) dedupFile(entry *entry) error {
494494
debugf("- File end is not cluster aligned, emitting sparse section %d - %d\n",
495495
runOffset, runOffset + remainingToEndOfCluster)
496496

497-
d.out.Add(runOffset, &chunkPart{
497+
d.out.Add(runOffset, &chunkSlice{
498498
checksum: nil,
499499
from: 0,
500500
to: remainingToEndOfCluster,
@@ -507,12 +507,12 @@ func (d *ntfsChunker) dedupFile(entry *entry) error {
507507

508508
// Finish last chunk
509509
if d.chunk.Size() > 0 {
510-
// Add parts to disk map
511-
for partOffset, part := range parts {
512-
part.checksum = d.chunk.Checksum()
510+
// Add slices to disk map
511+
for sliceOffset, slice := range slices {
512+
slice.checksum = d.chunk.Checksum()
513513
debugf("- Adding disk section %d - %d, mapping to chunk %x, offset %d - %d\n",
514-
partOffset, partOffset + part.to - part.from, part.checksum, part.from, part.to)
515-
d.out.Add(partOffset, part)
514+
sliceOffset, sliceOffset+ slice.to - slice.from, slice.checksum, slice.from, slice.to)
515+
d.out.Add(sliceOffset, slice)
516516
}
517517

518518
debugf("- End of file. Emitting last chunk %x, size = %d\n", d.chunk.Checksum(), d.chunk.Size())
@@ -603,7 +603,7 @@ func (d *ntfsChunker) dedupUnused(mft *entry) error {
603603
debugf("- Detected large sparse section %d - %d (%d bytes)\n",
604604
sparseSectionStartOffset, sparseSectionEndOffset, sparseSectionLength)
605605

606-
d.out.Add(sparseSectionStartOffset, &chunkPart{
606+
d.out.Add(sparseSectionStartOffset, &chunkSlice{
607607
checksum: nil,
608608
from: sparseSectionStartOffset,
609609
to: sparseSectionEndOffset,

Diff for: manifest.go

+19-19
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ const (
1717
)
1818

1919
type manifest struct {
20-
diskMap map[int64]*chunkPart
20+
diskMap map[int64]*chunkSlice
2121
size int64
2222
}
2323

24-
type chunkPart struct {
24+
type chunkSlice struct {
2525
checksum []byte
2626
from int64
2727
to int64
@@ -31,7 +31,7 @@ type chunkPart struct {
3131
func NewManifest() *manifest {
3232
return &manifest{
3333
size: 0,
34-
diskMap: make(map[int64]*chunkPart, 0),
34+
diskMap: make(map[int64]*chunkSlice, 0),
3535
}
3636
}
3737

@@ -50,7 +50,7 @@ func NewManifestFromFile(file string) (*manifest, error) {
5050
offset := int64(0)
5151

5252
for _, slice := range pbmanifest.Slices {
53-
manifest.Add(offset, &chunkPart{
53+
manifest.Add(offset, &chunkSlice{
5454
checksum: slice.Checksum,
5555
from: slice.Offset,
5656
to: slice.Offset + slice.Length,
@@ -99,11 +99,11 @@ func (m *manifest) Chunks() map[string]*chunk {
9999
return chunkMap
100100
}
101101

102-
func (m *manifest) Add(offset int64, part *chunkPart) {
102+
func (m *manifest) Add(offset int64, part *chunkSlice) {
103103
m.diskMap[offset] = part
104104
}
105105

106-
func (m *manifest) Get(offset int64) *chunkPart {
106+
func (m *manifest) Get(offset int64) *chunkSlice {
107107
return m.diskMap[offset]
108108
}
109109

@@ -125,8 +125,8 @@ func (m *manifest) Merge(other *manifest) {
125125
}
126126

127127
func (m *manifest) MergeAtOffset(offset int64, other *manifest) {
128-
for partOffset, part := range other.diskMap {
129-
m.diskMap[offset+partOffset] = part
128+
for sliceOffset, part := range other.diskMap {
129+
m.diskMap[offset+sliceOffset] = part
130130
}
131131
}
132132

@@ -138,12 +138,12 @@ func (m *manifest) WriteToFile(file string) error {
138138
}
139139

140140
for i, offset := range m.Offsets() {
141-
part := m.diskMap[offset]
141+
slice := m.diskMap[offset]
142142
pbmanifest.Slices[i] = &pb.Slice{
143-
Checksum: part.checksum,
144-
Offset: part.from,
145-
Length: part.to - part.from,
146-
Kind: int32(part.kind),
143+
Checksum: slice.checksum,
144+
Offset: slice.from,
145+
Length: slice.to - slice.from,
146+
Kind: int32(slice.kind),
147147
}
148148
}
149149

@@ -162,21 +162,21 @@ func (m *manifest) WriteToFile(file string) error {
162162

163163
func (m *manifest) Print() {
164164
for i, offset := range m.Offsets() {
165-
part := m.diskMap[offset]
165+
slice := m.diskMap[offset]
166166

167-
if part.checksum == nil {
167+
if slice.checksum == nil {
168168
fmt.Printf("idx %010d diskoff %013d - %013d len %-13d sparse -\n",
169-
i, offset, offset + part.to - part.from, part.to - part.from)
169+
i, offset, offset + slice.to - slice.from, slice.to - slice.from)
170170
} else {
171171
kind := "unknown"
172-
if part.kind == kindGap {
172+
if slice.kind == kindGap {
173173
kind = "gap"
174-
} else if part.kind == kindFile {
174+
} else if slice.kind == kindFile {
175175
kind = "file"
176176
}
177177

178178
fmt.Printf("idx %010d diskoff %013d - %013d len %-13d %-10s chunk %64x chunkoff %10d - %10d\n",
179-
i, offset, offset + part.to - part.from, part.to - part.from, kind, part.checksum, part.from, part.to)
179+
i, offset, offset + slice.to - slice.from, slice.to - slice.from, kind, slice.checksum, slice.from, slice.to)
180180
}
181181
}
182182
}

Diff for: map.go

+19-19
Original file line numberDiff line numberDiff line change
@@ -136,16 +136,16 @@ func (d *manifestImage) syncSlices(from int64, to int64) error {
136136
toIndex := int64(-1)
137137

138138
for i := int64(0); i < int64(len(d.offsets)); i++ {
139-
part := d.manifest.Get(d.offsets[i])
140-
partStart := d.offsets[i]
141-
partEnd := partStart + part.to - part.from
139+
slice := d.manifest.Get(d.offsets[i])
140+
sliceStart := d.offsets[i]
141+
sliceEnd := sliceStart + slice.to - slice.from
142142

143-
if partStart <= from && from < partEnd {
143+
if sliceStart <= from && from < sliceEnd {
144144
fromIndex = i
145-
fromOffset = partStart
145+
fromOffset = sliceStart
146146
}
147147

148-
if partStart <= to && to <= partEnd { // FIXME: to <= ??
148+
if sliceStart <= to && to <= sliceEnd { // FIXME: to <= ??
149149
toIndex = i
150150
break
151151
}
@@ -158,52 +158,52 @@ func (d *manifestImage) syncSlices(from int64, to int64) error {
158158

159159
offset := fromOffset
160160
for i := fromIndex; i <= toIndex; i++ {
161-
part := d.manifest.Get(d.offsets[i])
162-
if err := d.syncSlice(offset, part); err != nil {
161+
slice := d.manifest.Get(d.offsets[i])
162+
if err := d.syncSlice(offset, slice); err != nil {
163163
return err
164164
}
165165

166-
offset += part.to - part.from
166+
offset += slice.to - slice.from
167167
}
168168

169169
return nil
170170
}
171171

172-
func (d *manifestImage) syncSlice(offset int64, part *chunkPart) error {
172+
func (d *manifestImage) syncSlice(offset int64, slice *chunkSlice) error {
173173
if _, ok := d.written[offset]; ok {
174174
return nil
175175
}
176176

177-
if part.checksum == nil {
177+
if slice.checksum == nil {
178178
d.written[offset] = true
179179
return nil
180180
}
181181

182-
length := part.to - part.from
182+
length := slice.to - slice.from
183183
debugf("Syncing diskoff %d - %d (len %d) -> checksum %x, %d to %d\n",
184-
offset, offset + length, length, part.checksum, part.from, part.to)
184+
offset, offset + length, length, slice.checksum, slice.from, slice.to)
185185

186186
buffer := make([]byte, chunkSizeMaxBytes) // FIXME: Make this a buffer pool
187-
read, err := d.cache.ReadAt(part.checksum, buffer[:length], part.from)
187+
read, err := d.cache.ReadAt(slice.checksum, buffer[:length], slice.from)
188188
if err != nil {
189-
debugf("Chunk %x not in cache. Retrieving full chunk ...\n", part.checksum)
189+
debugf("Chunk %x not in cache. Retrieving full chunk ...\n", slice.checksum)
190190

191191
// Read entire chunk, store to cache
192-
chunk := d.chunks[fmt.Sprintf("%x", part.checksum)]
192+
chunk := d.chunks[fmt.Sprintf("%x", slice.checksum)]
193193

194194
// FIXME: This will fill up the local cache with all chunks and never delete it
195-
read, err = d.store.ReadAt(part.checksum, buffer[:chunk.size], 0)
195+
read, err = d.store.ReadAt(slice.checksum, buffer[:chunk.size], 0)
196196
if err != nil {
197197
return err
198198
} else if int64(read) != chunk.size {
199199
return errors.New(fmt.Sprintf("cannot read entire chunk, read only %d bytes", read))
200200
}
201201

202-
if err := d.cache.Write(part.checksum, buffer[:chunk.size]); err != nil {
202+
if err := d.cache.Write(slice.checksum, buffer[:chunk.size]); err != nil {
203203
return err
204204
}
205205

206-
buffer = buffer[part.from:part.to]
206+
buffer = buffer[slice.from:slice.to]
207207
} else if int64(read) != length {
208208
return errors.New(fmt.Sprintf("cannot read entire slice, read only %d bytes", read))
209209
}

Diff for: stat.go

+14-11
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,11 @@ import (
77
)
88

99
type chunkStat struct {
10-
checksum []byte
11-
size int64
12-
slices int64
13-
kind kind
10+
checksum []byte
11+
size int64
12+
sliceCount int64
13+
sliceSizes int64
14+
kind kind
1415
}
1516

1617
func Stat(manifestFiles []string, verbose bool) error {
@@ -64,14 +65,16 @@ func Stat(manifestFiles []string, verbose bool) error {
6465

6566
if _, ok := chunkMap[checksumStr]; !ok {
6667
chunkMap[checksumStr] = &chunkStat{
67-
checksum: slice.checksum,
68-
size: slice.to,
69-
slices: 1,
70-
kind: slice.kind, // This is inaccurate, because only the first appearance of the chunk is counted!
68+
checksum: slice.checksum,
69+
size: slice.to,
70+
sliceCount: 1,
71+
sliceSizes: sliceSize,
72+
kind: slice.kind, // This is inaccurate, because only the first appearance of the chunk is counted!
7173
}
7274
} else {
7375
chunkMap[checksumStr].size = maxInt64(chunkMap[checksumStr].size, slice.to)
74-
chunkMap[checksumStr].slices++
76+
chunkMap[checksumStr].sliceCount++
77+
chunkMap[checksumStr].sliceSizes += sliceSize
7578
}
7679
}
7780
}
@@ -105,7 +108,7 @@ func Stat(manifestFiles []string, verbose bool) error {
105108

106109
// Find chunk histogram
107110
sort.Slice(chunkStats, func(i, j int) bool {
108-
return chunkStats[i].slices > chunkStats[j].slices
111+
return chunkStats[i].sliceSizes > chunkStats[j].sliceSizes
109112
})
110113

111114
manifestCount := int64(len(manifestFiles))
@@ -136,7 +139,7 @@ func Stat(manifestFiles []string, verbose bool) error {
136139
if verbose {
137140
fmt.Printf("Slice histogram (top 10):\n")
138141
for i, stat := range chunkStats {
139-
fmt.Printf("- Chunk %x appeared in %d slice(s)\n", stat.checksum, stat.slices)
142+
fmt.Printf("- Chunk %x: %s in %d slice(s)\n", stat.checksum, convertToHumanReadable(stat.sliceSizes), stat.sliceCount)
140143
if i == 10 {
141144
break
142145
}

0 commit comments

Comments
 (0)