Skip to content

Commit 6b2b743

Browse files
committed
word vec"
1 parent 4cb015b commit 6b2b743

8 files changed

+76773
-66
lines changed

class_15/.ipynb_checkpoints/colorization-checkpoint.ipynb

+383-5
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"metadata": {
7+
"collapsed": true
8+
},
9+
"outputs": [],
10+
"source": [
11+
"import numpy as np\n",
12+
"from matplotlib import pyplot as plt\n",
13+
"%matplotlib inline"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": 3,
19+
"metadata": {
20+
"collapsed": true
21+
},
22+
"outputs": [],
23+
"source": [
24+
"f = open('./data.txt')\n",
25+
"d = f.read()\n",
26+
"f.close()"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": 4,
32+
"metadata": {
33+
"collapsed": false
34+
},
35+
"outputs": [],
36+
"source": [
37+
"data = d[1260:]\n",
38+
"data = data.lower().decode('utf-8')\n",
39+
"import re"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": 5,
45+
"metadata": {
46+
"collapsed": false
47+
},
48+
"outputs": [],
49+
"source": [
50+
"p = re.sub('[^A-Za-z]+', ' ', data)\n",
51+
"ds = p.split()"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 6,
57+
"metadata": {
58+
"collapsed": false
59+
},
60+
"outputs": [],
61+
"source": [
62+
"u = np.unique(ds, return_counts=True)"
63+
]
64+
},
65+
{
66+
"cell_type": "code",
67+
"execution_count": 7,
68+
"metadata": {
69+
"collapsed": false,
70+
"scrolled": false
71+
},
72+
"outputs": [],
73+
"source": [
74+
"bow = {}\n",
75+
"rev_bow = {}\n",
76+
"i = 0\n",
77+
"for ix in range(len(u[0])):\n",
78+
" if u[1][ix] > 2:\n",
79+
" bow[i] = u[0][ix]\n",
80+
" rev_bow[u[0][ix]] = i\n",
81+
" i += 1"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 8,
87+
"metadata": {
88+
"collapsed": false
89+
},
90+
"outputs": [
91+
{
92+
"data": {
93+
"text/plain": [
94+
"1781"
95+
]
96+
},
97+
"execution_count": 8,
98+
"metadata": {},
99+
"output_type": "execute_result"
100+
}
101+
],
102+
"source": [
103+
"len(bow)"
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": 9,
109+
"metadata": {
110+
"collapsed": true
111+
},
112+
"outputs": [],
113+
"source": [
114+
"def get_one_hot_vector(word):\n",
115+
" vec = np.zeros((len(bow),))\n",
116+
" vec[rev_bow[word]] = 1.0\n",
117+
" \n",
118+
" return vec\n",
119+
"\n",
120+
"def get_word_from_vec(vec):\n",
121+
" ind = np.argmax(vec)\n",
122+
" \n",
123+
" return bow[ind]"
124+
]
125+
},
126+
{
127+
"cell_type": "code",
128+
"execution_count": 10,
129+
"metadata": {
130+
"collapsed": false,
131+
"scrolled": false
132+
},
133+
"outputs": [
134+
{
135+
"name": "stdout",
136+
"output_type": "stream",
137+
"text": [
138+
"tree\n"
139+
]
140+
}
141+
],
142+
"source": [
143+
"a = get_one_hot_vector('tree')\n",
144+
"a_ = get_word_from_vec(a)\n",
145+
"\n",
146+
"print a_"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 11,
152+
"metadata": {
153+
"collapsed": false,
154+
"scrolled": false
155+
},
156+
"outputs": [],
157+
"source": [
158+
"all_data = p.split()\n",
159+
"len(all_data)\n",
160+
"\n",
161+
"dataset = []#np.zeros((len(all_data), len(bow)))\n",
162+
"# print dataset.shape"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": 12,
168+
"metadata": {
169+
"collapsed": false
170+
},
171+
"outputs": [],
172+
"source": [
173+
"for w in range(len(all_data)):\n",
174+
" try:\n",
175+
" dataset.append(get_one_hot_vector(all_data[w]))\n",
176+
" except:\n",
177+
" pass"
178+
]
179+
},
180+
{
181+
"cell_type": "code",
182+
"execution_count": 13,
183+
"metadata": {
184+
"collapsed": false
185+
},
186+
"outputs": [
187+
{
188+
"name": "stdout",
189+
"output_type": "stream",
190+
"text": [
191+
"(35456, 1781)\n"
192+
]
193+
}
194+
],
195+
"source": [
196+
"dataset = np.asarray(dataset)\n",
197+
"print dataset.shape"
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": 14,
203+
"metadata": {
204+
"collapsed": true
205+
},
206+
"outputs": [],
207+
"source": [
208+
"np.save('all_word_data', dataset)"
209+
]
210+
},
211+
{
212+
"cell_type": "code",
213+
"execution_count": 16,
214+
"metadata": {
215+
"collapsed": true
216+
},
217+
"outputs": [],
218+
"source": [
219+
"import pickle as pk\n",
220+
"\n",
221+
"fb = open('bow.pkl', 'w')\n",
222+
"fr = open('rev_bow.pkl', 'w')"
223+
]
224+
},
225+
{
226+
"cell_type": "code",
227+
"execution_count": 18,
228+
"metadata": {
229+
"collapsed": true
230+
},
231+
"outputs": [],
232+
"source": [
233+
"pk.dump(bow, fb)\n",
234+
"pk.dump(rev_bow, fr)"
235+
]
236+
},
237+
{
238+
"cell_type": "code",
239+
"execution_count": 19,
240+
"metadata": {
241+
"collapsed": true
242+
},
243+
"outputs": [],
244+
"source": [
245+
"fb.close()\n",
246+
"fr.close()"
247+
]
248+
},
249+
{
250+
"cell_type": "code",
251+
"execution_count": null,
252+
"metadata": {
253+
"collapsed": true
254+
},
255+
"outputs": [],
256+
"source": []
257+
}
258+
],
259+
"metadata": {
260+
"kernelspec": {
261+
"display_name": "Python 2",
262+
"language": "python",
263+
"name": "python2"
264+
},
265+
"language_info": {
266+
"codemirror_mode": {
267+
"name": "ipython",
268+
"version": 2
269+
},
270+
"file_extension": ".py",
271+
"mimetype": "text/x-python",
272+
"name": "python",
273+
"nbconvert_exporter": "python",
274+
"pygments_lexer": "ipython2",
275+
"version": "2.7.12"
276+
}
277+
},
278+
"nbformat": 4,
279+
"nbformat_minor": 2
280+
}

class_15/Untitled.ipynb

+30-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,34 @@
11
{
2-
"cells": [],
3-
"metadata": {},
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": true
8+
},
9+
"outputs": [],
10+
"source": []
11+
}
12+
],
13+
"metadata": {
14+
"kernelspec": {
15+
"display_name": "Python 2",
16+
"language": "python",
17+
"name": "python2"
18+
},
19+
"language_info": {
20+
"codemirror_mode": {
21+
"name": "ipython",
22+
"version": 2
23+
},
24+
"file_extension": ".py",
25+
"mimetype": "text/x-python",
26+
"name": "python",
27+
"nbconvert_exporter": "python",
28+
"pygments_lexer": "ipython2",
29+
"version": "2.7.12"
30+
}
31+
},
432
"nbformat": 4,
533
"nbformat_minor": 2
634
}

0 commit comments

Comments
 (0)