-
Notifications
You must be signed in to change notification settings - Fork 5k
CountMInSketch - Implementation + testing + readme #990
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
danibachar
wants to merge
2
commits into
kodecocodes:master
Choose a base branch
from
danibachar:feature/count-min-sketch
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 1 commit
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
111 changes: 111 additions & 0 deletions
111
CountMinSketch/CountMinSketch.playground/Contents.swift
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
//: # CountMin Sketch | ||
import Foundation | ||
|
||
/// A seeded hash function for `Hashable` values; one instance acts as one independent hash function.
///
/// Swift's `Hasher` uses a single seed for the whole process, so two bare `Hasher()` values
/// always agree on the same input. Each `Hashing` instance therefore feeds its own seed to the
/// hasher before the value, which makes separate instances behave as different hash functions.
/// No per-value cache is kept: within one process the result is already deterministic, and a
/// cache would grow with the number of distinct values hashed.
private final class Hashing<T> where T: Hashable {
    /// Salt mixed into the hasher ahead of every value.
    private let seed: Int

    /// Creates a hash function.
    /// - Parameter seed: Salt that distinguishes this function from other instances.
    ///   Defaults to a random value so that `Hashing()` instances differ from each other.
    init(seed: Int = Int.random(in: Int.min...Int.max)) {
        self.seed = seed
    }

    /// Hashes `value` with this instance's seed.
    /// - Parameter value: The value to hash.
    /// - Returns: A non-negative hash that is stable for this instance during the current process.
    func hash(_ value: T) -> Int {
        var hasher = Hasher()
        hasher.combine(seed)
        hasher.combine(value)
        // Clear the sign bit rather than calling `abs`, which traps when the hash is `Int.min`.
        return hasher.finalize() & Int.max
    }
}
|
||
/// A frequency counter for hashable items based on the Count-min Sketch strategy.
///
/// It serves a purpose similar to Python's `collections.Counter`, but the counter table has a
/// fixed size chosen at creation time. Insertion and lookup touch one counter per row, at the
/// cost of a possible overestimation of the counts (estimates are never below the true count).
public final class CountMinSketch<T> where T: Hashable {
    /// One hash function per row of `matrix`.
    private let hashers: [Hashing<T>]
    /// Counter table: `rows` hash tables holding `cols` counters each.
    private var matrix: [[UInt64]]
    private let rows: Int
    private let cols: Int

    /// The total amount added to the sketch over all `add` calls.
    private(set) var count: UInt64 = 0

    /// Creates a sketch with an explicit table size.
    /// - Parameters:
    ///   - rows: The number of hash tables (one hash function each); more rows lower the
    ///     probability of a bad estimate. Must be positive.
    ///   - cols: The number of counters in each hash table; more columns shrink the
    ///     overestimation. Must be positive.
    init(rows: Int, cols: Int) {
        // Fail early with a clear message instead of trapping later on `% 0` or an empty table.
        precondition(rows > 0, "CountMinSketch requires at least one row")
        precondition(cols > 0, "CountMinSketch requires at least one column")
        self.rows = rows
        self.cols = cols
        hashers = (0..<rows).map { _ in Hashing<T>() }
        matrix = [[UInt64]](repeating: [UInt64](repeating: 0, count: cols), count: rows)
    }

    /// Creates a sketch sized from the desired error bounds.
    ///
    /// The table uses `ceil(log2(1 / delta))` rows and `ceil(2 / epsilon)` columns, i.e.
    /// O(1 / epsilon * log(1 / delta)) space. The intent is that a point query overestimates
    /// the true count by more than `epsilon * F1` (F1 being the total count of the stream)
    /// with probability at most `delta`.
    /// - Parameters:
    ///   - delta: The accepted probability of an error larger than `epsilon * F1`. Must be positive.
    ///   - epsilon: The accepted error relative to the total count. Must be positive.
    convenience init(delta: CGFloat, epsilon: CGFloat) {
        precondition(delta > 0, "delta must be positive")
        precondition(epsilon > 0, "epsilon must be positive")
        let depth = Int(log2(1 / Double(delta)).rounded(.up))
        let width = Int((2 / Double(epsilon)).rounded(.up))
        // `delta >= 1` yields a non-positive depth; a sketch still needs one row to answer queries.
        self.init(rows: max(1, depth), cols: width)
    }

    /// Adds an element to the sketch.
    ///
    /// By default the element's count grows by one; `value` allows adding in batches.
    /// - Parameters:
    ///   - element: The element to add.
    ///   - value: The amount by which to increase the element's count.
    func add(element: T, value: UInt64 = 1) {
        count += value
        for (row, hasher) in hashers.enumerated() {
            matrix[row][hasher.hash(element) % cols] += value
        }
    }

    /// Estimates how many times an element was added.
    /// - Parameter element: The element to estimate.
    /// - Returns: The smallest counter the element maps to across all rows.
    func query(element: T) -> UInt64 {
        let estimates = zip(hashers, matrix).map { hasher, counters in
            counters[hasher.hash(element) % cols]
        }
        // The initializer guarantees at least one row, so the fallback is never used in practice.
        return estimates.min() ?? 0
    }
}
|
||
|
||
//: EXAMPLES | ||
//: Let's create a sketch | ||
|
||
// Sample stream of integers; several values repeat on purpose.
let stream: [Int] = [
    1, 2, 3, 4, 5, 5, 1, 23, 43, 23, 4534, 345, 234, 2, 3423, 234, 23, 42, 453, 45, 345, 23, 2, 343, 45, 345, 34
]

// A sketch with 10 rows and 10 columns.
let sketch = CountMinSketch<Int>(rows: 10, cols: 10)

// Feed the whole stream into the sketch, one element at a time.
stream.forEach { sketch.add(element: $0) }

// Every element was added with the default amount of one.
assert(sketch.count == stream.count)

print("We have \(sketch.count) elements in the stream")

print("The frequency of 1 is \(sketch.query(element: 1))")
4 changes: 4 additions & 0 deletions
4
CountMinSketch/CountMinSketch.playground/contents.xcplayground
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?> | ||
<playground version='5.0' target-platform='osx' display-mode='raw'> | ||
<timeline fileName='timeline.xctimeline'/> | ||
</playground> |
7 changes: 7 additions & 0 deletions
7
CountMinSketch/CountMinSketch.playground/playground.xcworkspace/contents.xcworkspacedata
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import Foundation | ||
|
||
/// A seeded hash function for `Hashable` values; one instance acts as one independent hash function.
///
/// Swift's `Hasher` uses a single seed for the whole process, so two bare `Hasher()` values
/// always agree on the same input. Each `Hashing` instance therefore feeds its own seed to the
/// hasher before the value, which makes separate instances behave as different hash functions.
/// No per-value cache is kept: within one process the result is already deterministic, and a
/// cache would grow with the number of distinct values hashed.
private final class Hashing<T> where T: Hashable {
    /// Salt mixed into the hasher ahead of every value.
    private let seed: Int

    /// Creates a hash function.
    /// - Parameter seed: Salt that distinguishes this function from other instances.
    ///   Defaults to a random value so that `Hashing()` instances differ from each other.
    init(seed: Int = Int.random(in: Int.min...Int.max)) {
        self.seed = seed
    }

    /// Hashes `value` with this instance's seed.
    /// - Parameter value: The value to hash.
    /// - Returns: A non-negative hash that is stable for this instance during the current process.
    func hash(_ value: T) -> Int {
        var hasher = Hasher()
        hasher.combine(seed)
        hasher.combine(value)
        // Clear the sign bit rather than calling `abs`, which traps when the hash is `Int.min`.
        return hasher.finalize() & Int.max
    }
}
|
||
/// A frequency counter for hashable items based on the Count-min Sketch strategy.
///
/// It serves a purpose similar to Python's `collections.Counter`, but the counter table has a
/// fixed size chosen at creation time. Insertion and lookup touch one counter per row, at the
/// cost of a possible overestimation of the counts (estimates are never below the true count).
public final class CountMinSketch<T> where T: Hashable {
    /// One hash function per row of `matrix`.
    private let hashers: [Hashing<T>]
    /// Counter table: `rows` hash tables holding `cols` counters each.
    private var matrix: [[UInt64]]
    private let rows: Int
    private let cols: Int

    /// The total amount added to the sketch over all `add` calls.
    private(set) var count: UInt64 = 0

    /// Creates a sketch with an explicit table size.
    /// - Parameters:
    ///   - rows: The number of hash tables (one hash function each); more rows lower the
    ///     probability of a bad estimate. Must be positive.
    ///   - cols: The number of counters in each hash table; more columns shrink the
    ///     overestimation. Must be positive.
    init(rows: Int, cols: Int) {
        // Fail early with a clear message instead of trapping later on `% 0` or an empty table.
        precondition(rows > 0, "CountMinSketch requires at least one row")
        precondition(cols > 0, "CountMinSketch requires at least one column")
        self.rows = rows
        self.cols = cols
        hashers = (0..<rows).map { _ in Hashing<T>() }
        matrix = [[UInt64]](repeating: [UInt64](repeating: 0, count: cols), count: rows)
    }

    /// Creates a sketch sized from the desired error bounds.
    ///
    /// The table uses `ceil(log2(1 / delta))` rows and `ceil(2 / epsilon)` columns, i.e.
    /// O(1 / epsilon * log(1 / delta)) space. The intent is that a point query overestimates
    /// the true count by more than `epsilon * F1` (F1 being the total count of the stream)
    /// with probability at most `delta`.
    /// - Parameters:
    ///   - delta: The accepted probability of an error larger than `epsilon * F1`. Must be positive.
    ///   - epsilon: The accepted error relative to the total count. Must be positive.
    convenience init(delta: CGFloat, epsilon: CGFloat) {
        precondition(delta > 0, "delta must be positive")
        precondition(epsilon > 0, "epsilon must be positive")
        let depth = Int(log2(1 / Double(delta)).rounded(.up))
        let width = Int((2 / Double(epsilon)).rounded(.up))
        // `delta >= 1` yields a non-positive depth; a sketch still needs one row to answer queries.
        self.init(rows: max(1, depth), cols: width)
    }

    /// Adds an element to the sketch.
    ///
    /// By default the element's count grows by one; `value` allows adding in batches.
    /// - Parameters:
    ///   - element: The element to add.
    ///   - value: The amount by which to increase the element's count.
    func add(element: T, value: UInt64 = 1) {
        count += value
        for (row, hasher) in hashers.enumerated() {
            matrix[row][hasher.hash(element) % cols] += value
        }
    }

    /// Estimates how many times an element was added.
    /// - Parameter element: The element to estimate.
    /// - Returns: The smallest counter the element maps to across all rows.
    func query(element: T) -> UInt64 {
        let estimates = zip(hashers, matrix).map { hasher, counters in
            counters[hasher.hash(element) % cols]
        }
        // The initializer guarantees at least one row, so the fallback is never used in practice.
        return estimates.min() ?? 0
    }
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# CountMin Sketch | ||
|
||
#### Explanation of the model (taken from Wikipedia) | ||
In computing, the count–min sketch (CM sketch) is a probabilistic data structure that serves as a frequency table of events in a stream of data. It uses hash functions to map events to frequencies, but unlike a hash table uses only sub-linear space, at the expense of overcounting some events due to collisions. The count–min sketch was invented in 2003 by Graham Cormode and S. Muthu Muthukrishnan and described by them in a [2005 paper](https://www.sciencedirect.com/science/article/abs/pii/S0196677403001913?via%3Dihub). | ||
|
||
The goal of the basic version of the count–min sketch is to consume a stream of events, one at a time, and count the frequency of the different types of events in the stream. At any time, the sketch can be queried for the frequency of a particular event type i from a universe of event types {U}, and will return an estimate of this frequency that is within a certain distance of the true frequency, with a certain probability. | ||
|
||
The actual sketch data structure is a two-dimensional array of w columns and d rows. The parameters w and d are fixed when the sketch is created, and determine the time and space needs and the probability of error when the sketch is queried for a frequency or inner product. Associated with each of the d rows is a separate hash function; the hash functions must be pairwise independent. The parameters w and d can be chosen by setting w = ⌈2/ε⌉ and d = ⌈ln 1/δ⌉, where the error in answering a query is within an additive factor of ε with probability 1 − δ | ||
When a new event of type i arrives we update as follows: for each row j of the table, apply the corresponding hash function to obtain a column index k = hj(i). Then increment the value in row j, column k by one. | ||
|
||
 | ||
 | ||
|
||
|
||
#### Implementation details | ||
1. Memory consumption - We hold a matrix whose size follows the probabilistic characteristics the user asks for; specifically, we will have cols = ⌈2/ε⌉ and rows = ⌈log₂ 1/δ⌉ | ||
2. `add` function - Assuming that applying a hash function and an arithmetic addition each take O(1), adding an element takes O(⌈log₂ 1/δ⌉) = O(number of rows) | ||
3. `query` - Same as adding - O(⌈log₂ 1/δ⌉) = O(number of rows) | ||
|
||
|
||
#### How is this different from a regular counter | ||
This model allows us to use sublinear space to estimate the frequency of elements in a stream. | ||
While a regular counter has to maintain a mapping from each element to its frequency, this model relies on probability to achieve a smaller memory footprint. | ||
This makes the data structure particularly beneficial for huge data streams, where it is not feasible to hold an exact counter for each element because the stream is potentially endless. | ||
|
||
|
||
*Written by Daniel Bachar* |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.