-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_processing.py
53 lines (38 loc) · 1.33 KB
/
data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
from models import KMEANS
def filter_columns(data):
    """Keep the rows for security 1 and drop the columns that are not needed.

    Rows are kept where ``security_id == 1``. The following columns are then
    removed: ``row_id``, ``security_id``, ``initiator``, ``time1``..``time50``,
    ``transtype1``..``transtype50``, ``bid61``..``bid100`` and
    ``ask61``..``ask100``. Every other column is returned unchanged.

    Args:
        data: DataFrame containing all of the columns listed above.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: if any of the columns listed above is missing.
    """
    unwanted = ['row_id', 'security_id', 'initiator']
    for i in range(1, 51):
        unwanted += [f'time{i}', f'transtype{i}']
    for i in range(61, 101):
        unwanted += [f'bid{i}', f'ask{i}']

    selected = data[data['security_id'] == 1]
    # Drop everything in one non-inplace call: the filtered frame is never
    # mutated in place, and the frame is rebuilt once instead of once per
    # column.
    return selected.drop(columns=unwanted)
def scale_data(data):
    """Standardize every column to zero mean and unit standard deviation.

    Each column has its mean subtracted and is divided by its standard
    deviation as computed by ``DataFrame.std()``.

    Columns whose standard deviation is zero (constant column) or undefined
    (for example a single-row frame) are divided by 1 instead, so they come
    out as 0 rather than NaN.

    Args:
        data: DataFrame of numeric columns.

    Returns:
        A new DataFrame with the same shape, index and columns as ``data``.
    """
    centered = data - data.mean()
    spread = data.std()
    # `spread > 0` is False for both 0 and NaN, so both fall back to 1.0 and
    # the division below can no longer produce 0/0 = NaN for such columns.
    spread = spread.where(spread > 0, 1.0)
    return centered / spread
def split_data(data):
    """Split a frame into feature matrix X and target matrix y.

    The target columns are ``bid51``..``bid60`` and ``ask51``..``ask60``,
    ordered ``bid51, ask51, bid52, ask52, ...``. X is ``data`` without those
    columns; y holds only those columns, in that order.

    Returns:
        Tuple ``(X, y)`` of DataFrames.
    """
    targets = [
        label
        for idx in range(51, 61)
        for label in (f'bid{idx}', f'ask{idx}')
    ]
    features = data.drop(columns=targets)
    labels = data[targets]
    return features, labels
# process a raw dataframe into model matrices
def get_matrices(df):
    """Run the pipeline: filter_columns -> scale_data -> split_data.

    Returns:
        The ``(X, y)`` pair produced by ``split_data`` on the filtered,
        scaled frame.
    """
    features, targets = split_data(scale_data(filter_columns(df)))
    return features, targets
# fit a 5-cluster k-means model on y_vector
def apply_kmeans_labels(df, y_vector):
    """Fit ``KMEANS(n_clusters=5)`` on ``y_vector`` and return the fit result.

    Args:
        df: accepted but not used by this function.
        y_vector: data passed directly to ``KMEANS.fit``.

    Returns:
        Whatever ``KMEANS.fit`` returns.

    NOTE(review): this was described as returning an augmented df, but the
    code returns the result of ``fit`` and never touches ``df`` - confirm the
    intended behaviour against the ``models.KMEANS`` API.
    """
    return KMEANS(n_clusters=5).fit(y_vector)