import os
import pandas as pd
import numpy as np
import sklearn.datasets
try:
from urllib.request import urlretrieve
except ImportError:
from urllib import urlretrieve

github_data_url = "https://github.com/slundberg/shap/raw/master/data/"


def imagenet50(display=False, resolution=224): # pylint: disable=unused-argument
    """ This is a set of 50 images representative of ImageNet images.

    This dataset was collected by randomly finding a working ImageNet link and then pasting the
    original ImageNet image into Google image search restricted to images licensed for reuse. A
    similar image (now with rights to reuse) was downloaded as a rough replacement for the original
    ImageNet image. The point is to have a random sample of ImageNet for use as a background
    distribution for explaining models trained on ImageNet data.

    Note that because the images are only rough replacements the labels might no longer be correct.
    """
prefix = github_data_url + "imagenet50_"
X = np.load(cache(prefix + "%sx%s.npy" % (resolution, resolution))).astype(np.float32)
y = np.loadtxt(cache(prefix + "labels.csv"))
return X, y
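
# Usage sketch (illustrative, not part of the original module): the returned
# array holds the 50 images at the requested resolution, intended as a
# background distribution for image explainers.
#
#     X, y = imagenet50()
#     print(X.shape, y.shape)   # 50 images plus their (approximate) labels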


def boston(display=False): # pylint: disable=unused-argument
    """ Return the Boston housing data in a nice package.

    Note that `load_boston` is deprecated in recent scikit-learn releases (and
    removed in 1.2+), so this loader only works with older versions.
    """
    d = sklearn.datasets.load_boston()
df = pd.DataFrame(data=d.data, columns=d.feature_names) # pylint: disable=E1101
return df, d.target # pylint: disable=E1101


def linnerud(display=False): # pylint: disable=unused-argument
    """ Return the Linnerud data in a nice package (multi-target regression). """
d = sklearn.datasets.load_linnerud()
X = pd.DataFrame(d.data, columns=d.feature_names) # pylint: disable=E1101
y = pd.DataFrame(d.target, columns=d.target_names) # pylint: disable=E1101
return X, y # pylint: disable=E1101


def imdb(display=False): # pylint: disable=unused-argument
    """ Return the classic IMDB sentiment analysis training data in a nice package.

    Full data is at: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    Paper to cite when using the data is: http://www.aclweb.org/anthology/P11-1015
    """
with open(cache(github_data_url + "imdb_train.txt")) as f:
data = f.readlines()
y = np.ones(25000, dtype=bool)
y[:12500] = 0
return data, y
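
# Usage sketch (illustrative; the TfidfVectorizer below is an assumption for
# demonstration, not something this module depends on):
#
#     corpus, y = imdb()
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     X = TfidfVectorizer(min_df=10).fit_transform(corpus)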


def communitiesandcrime(display=False): # pylint: disable=unused-argument
    """ Predict the total number of non-violent crimes per 100K population.

    This dataset is from the classic UCI Machine Learning repository:
    https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized
    """
raw_data = pd.read_csv(
cache(github_data_url + "CommViolPredUnnormalizedData.txt"),
na_values="?"
)
# find the indices where the total violent crimes are known
valid_inds = np.where(np.invert(np.isnan(raw_data.iloc[:,-2])))[0]
y = np.array(raw_data.iloc[valid_inds,-2], dtype=float)
# extract the predictive features and remove columns with missing values
X = raw_data.iloc[valid_inds,5:-18]
valid_cols = np.where(np.isnan(X.values).sum(0) == 0)[0]
X = X.iloc[:,valid_cols]
return X, y


def diabetes(display=False): # pylint: disable=unused-argument
""" Return the diabetes data in a nice package. """
d = sklearn.datasets.load_diabetes()
df = pd.DataFrame(data=d.data, columns=d.feature_names) # pylint: disable=E1101
return df, d.target # pylint: disable=E1101


def iris(display=False):
""" Return the classic iris data in a nice package. """
d = sklearn.datasets.load_iris()
df = pd.DataFrame(data=d.data, columns=d.feature_names) # pylint: disable=E1101
if display:
return df, [d.target_names[v] for v in d.target] # pylint: disable=E1101
return df, d.target # pylint: disable=E1101
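
# Usage sketch (illustrative): display=True swaps the integer class codes for
# the species names.
#
#     X, y = iris()                       # y holds integer class codes
#     X, y_names = iris(display=True)     # y_names holds strings like "setosa"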


def adult(display=False):
""" Return the Adult census data in a nice package. """
dtypes = [
("Age", "float32"), ("Workclass", "category"), ("fnlwgt", "float32"),
("Education", "category"), ("Education-Num", "float32"), ("Marital Status", "category"),
("Occupation", "category"), ("Relationship", "category"), ("Race", "category"),
("Sex", "category"), ("Capital Gain", "float32"), ("Capital Loss", "float32"),
("Hours per week", "float32"), ("Country", "category"), ("Target", "category")
]
raw_data = pd.read_csv(
cache(github_data_url + "adult.data"),
names=[d[0] for d in dtypes],
na_values="?",
dtype=dict(dtypes)
)
data = raw_data.drop(["Education"], axis=1) # redundant with Education-Num
filt_dtypes = list(filter(lambda x: not (x[0] in ["Target", "Education"]), dtypes))
data["Target"] = data["Target"] == " >50K"
rcode = {
"Not-in-family": 0,
"Unmarried": 1,
"Other-relative": 2,
"Own-child": 3,
"Husband": 4,
"Wife": 5
}
for k, dtype in filt_dtypes:
if dtype == "category":
if k == "Relationship":
data[k] = np.array([rcode[v.strip()] for v in data[k]])
else:
data[k] = data[k].cat.codes
if display:
return raw_data.drop(["Education", "Target", "fnlwgt"], axis=1), data["Target"].values
return data.drop(["Target", "fnlwgt"], axis=1), data["Target"].values
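
# Usage sketch (illustrative; the scikit-learn model below is an assumption for
# demonstration, not part of this module). The display=True variant keeps the
# human-readable categorical values for plotting:
#
#     X, y = adult()
#     X_display, y_display = adult(display=True)
#     from sklearn.ensemble import RandomForestClassifier
#     model = RandomForestClassifier(n_estimators=100).fit(X, y)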


def nhanesi(display=False):
    """ A nicely packaged version of NHANES I data with survival times as labels.
    """
X = pd.read_csv(cache(github_data_url + "NHANESI_X.csv"), index_col=0)
y = pd.read_csv(cache(github_data_url + "NHANESI_y.csv"), index_col=0)["y"]
if display:
X_display = X.copy()
#X_display["sex_isFemale"] = ["Female" if v else "Male" for v in X["sex_isFemale"]]
return X_display, np.array(y)
return X, np.array(y)


def corrgroups60(display=False): # pylint: disable=unused-argument
    """ Correlated Groups 60

    A simulated dataset with tight correlations among distinct groups of features.
    """
    # fix a constant seed while preserving the caller's global random state
    old_state = np.random.get_state()
    np.random.seed(0)
# generate dataset with known correlation
N = 1000
M = 60
    # set one coefficient from each group of 3 to 1
beta = np.zeros(M)
beta[0:30:3] = 1
# build a correlation matrix with groups of 3 tightly correlated features
C = np.eye(M)
for i in range(0,30,3):
C[i,i+1] = C[i+1,i] = 0.99
C[i,i+2] = C[i+2,i] = 0.99
C[i+1,i+2] = C[i+2,i+1] = 0.99
f = lambda X: np.matmul(X, beta)
    # whiten the sample so its empirical correlation is exactly the identity,
    # then (below) color it with the Cholesky factor of C so the sample
    # correlation matches C exactly rather than just in expectation
    X_start = np.random.randn(N, M)
    X_centered = X_start - X_start.mean(0)
    Sigma = np.matmul(X_centered.T, X_centered) / X_centered.shape[0]
    W = np.linalg.cholesky(np.linalg.inv(Sigma)).T
    X_white = np.matmul(X_centered, W.T)
    assert np.linalg.norm(np.corrcoef(X_white.T) - np.eye(M)) < 1e-6 # ensure the whitening decorrelated the data
# create the final data
X_final = np.matmul(X_white, np.linalg.cholesky(C).T)
X = X_final
y = f(X) + np.random.randn(N) * 1e-2
    # restore the caller's previous numpy global random state
    np.random.set_state(old_state)
return pd.DataFrame(X), y
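
# Quick check (illustrative sketch): by construction each block of three
# consecutive features among the first 30 columns has pairwise sample
# correlation of 0.99 (up to floating point error).
#
#     X, y = corrgroups60()
#     print(np.corrcoef(X.values[:, :3].T))   # off-diagonals close to 0.99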


def independentlinear60(display=False): # pylint: disable=unused-argument
    """ A simulated dataset with independent (uncorrelated) features and a linear relationship to the label.
    """
    # fix a constant seed while preserving the caller's global random state
    old_state = np.random.get_state()
    np.random.seed(0)
    # generate a dataset with independent features
N = 1000
M = 60
    # set every third coefficient among the first 30 to 1 (matching corrgroups60)
beta = np.zeros(M)
beta[0:30:3] = 1
f = lambda X: np.matmul(X, beta)
# Make sure the sample correlation is a perfect match
X_start = np.random.randn(N, M)
X = X_start - X_start.mean(0)
y = f(X) + np.random.randn(N) * 1e-2
    # restore the caller's previous numpy global random state
    np.random.set_state(old_state)
return pd.DataFrame(X), y


def a1a():
    """ A sparse dataset in scipy CSR matrix format, returned with its labels.
    """
return sklearn.datasets.load_svmlight_file(cache(github_data_url + 'a1a.svmlight'))
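
# Usage sketch (illustrative): load_svmlight_file returns the feature matrix in
# sparse CSR form together with the label vector.
#
#     X, y = a1a()
#     print(X.format, X.shape)   # 'csr' and the sparse matrix dimensions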


def rank():
    """ Ranking datasets from the LightGBM repository.
    """
rank_data_url = 'https://raw.githubusercontent.com/Microsoft/LightGBM/master/examples/lambdarank/'
x_train, y_train = sklearn.datasets.load_svmlight_file(cache(rank_data_url + 'rank.train'))
x_test, y_test = sklearn.datasets.load_svmlight_file(cache(rank_data_url + 'rank.test'))
q_train = np.loadtxt(cache(rank_data_url + 'rank.train.query'))
q_test = np.loadtxt(cache(rank_data_url + 'rank.test.query'))
return x_train, y_train, x_test, y_test, q_train, q_test
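
# Usage sketch (illustrative; lightgbm itself is an assumption here, not a
# dependency of this module). The .query files hold per-query group sizes,
# which is the format LGBMRanker expects for its group arguments:
#
#     import lightgbm as lgb
#     x_train, y_train, x_test, y_test, q_train, q_test = rank()
#     model = lgb.LGBMRanker(n_estimators=100)
#     model.fit(x_train, y_train, group=q_train,
#               eval_set=[(x_test, y_test)], eval_group=[q_test])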


def cache(url, file_name=None):
""" Loads a file from the URL and caches it locally.
"""
if file_name is None:
file_name = os.path.basename(url)
data_dir = os.path.join(os.path.dirname(__file__), "cached_data")
if not os.path.isdir(data_dir):
os.mkdir(data_dir)
file_path = os.path.join(data_dir, file_name)
if not os.path.isfile(file_path):
urlretrieve(url, file_path)
return file_path
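
# Usage sketch (illustrative): cache() can also be called directly to download
# and reuse any of the hosted files; repeated calls return the local path
# without downloading again.
#
#     path = cache(github_data_url + "imdb_train.txt")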