
Commit ed04296

Prepare testing and experiments.
1 parent 6feccb7 commit ed04296

13 files changed: +56 −109 lines

.gitignore

Lines changed: 1 addition & 2 deletions
@@ -64,5 +64,4 @@ __pycache__
 .idea/*
 
 embedding/*.txt
-models/*
-results/*
+models/*

embedding/glove.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 GLOVE_DIR = 'embedding/glove.6B.100d.txt'
 
 
-def get_pretrained_glove(num_words, text_vocab):
+def pre_trained_glove(num_words, text_vocab):
     embeddings_index = {}
     try:
         f = open(GLOVE_DIR, 'r+', encoding="utf-8")
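
The loader's body is mostly outside this hunk. For context, a minimal sketch of what a function with this signature typically does: parse the GloVe text file into a word-to-vector dict, then fill a (num_words, dim) matrix indexed by the vocab. The itos attribute (index to string) is an assumption borrowed from vocabs.chars.itos in inputs.py below; dim=100 matches glove.6B.100d.txt.

import numpy as np

def pre_trained_glove_sketch(num_words, text_vocab, dim=100):
    # Parse the GloVe text file: one token followed by its vector per line.
    embeddings_index = {}
    with open('embedding/glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Rows stay all-zero for vocab words missing from GloVe.
    embedding_matrix = np.zeros((num_words, dim))
    for i, word in enumerate(text_vocab.itos[:num_words]):
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix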
evaluation.py

Lines changed: 21 additions & 18 deletions
@@ -1,17 +1,18 @@
 import itertools
+import sys
 
 import matplotlib.pyplot as plt
 import numpy as np
-from keras.models import load_model
 from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
 
-from constants import MAX_LEN
 from utils.classification_report import classification_report
 from utils.plot_confusion_matrix_util import plot_confusion_matrix
 
 
-def test_model(model_path, test, test_input, labels_vocab):
-    model = load_model(model_path + 'ner_model')
+def evaluate(model, test, test_input, labels_vocab, save_path):
+    test_eval = model.evaluate(test_input, np.array(test.y))
+    print('Test loss:', test_eval[0])
+    print('Test accuracy:', test_eval[1])
 
     predicted_values = np.argmax(model.predict(test_input), axis=-1)
     true_values = np.argmax(test.y, -1)
@@ -20,33 +21,35 @@ def test_model(model_path, test, test_input, labels_vocab):
     true_values = list(itertools.chain(*true_values))
     predicted_values = list(itertools.chain(*predicted_values))
 
+    orig_stdout = sys.stdout
+    f = open(save_path + 'results.txt', 'w')
+    sys.stdout = f
+
     print("Macro Precision/Recall/F1 score:")
     print(precision_recall_fscore_support(true_values, predicted_values, average='macro'))
+    print(60 * "-")
 
     print("Micro Precision/Recall/F1 score:")
     print(precision_recall_fscore_support(true_values, predicted_values, average='micro'))
+    print(60 * "-")
 
-    print("Weighted Precision/Recall/F1 score:")
-    print(precision_recall_fscore_support(true_values, predicted_values, average='weighted'))
-
-    # Remove padding label
     keys = list(labels_vocab.stoi.keys())
     values = list(labels_vocab.stoi.values())
-    # values.remove(labels_vocab.stoi[NO_ENTITY_TOKEN])
-    # keys.remove(NO_ENTITY_TOKEN)
 
-    # Classification report
-    report = classification_report(true_values, predicted_values, labels=values, target_names=keys, digits=4, average='macro')
-    print(report)
+    # Classification reports
+    macro_report = classification_report(true_values, predicted_values, labels=values, target_names=keys, digits=4, average='macro')
+    print(macro_report)
+    print(60 * "-")
+
+    micro_report = classification_report(true_values, predicted_values, labels=values, target_names=keys, digits=4, average='micro')
+    print(micro_report)
 
-    # plot_classification_report(report)
-    # plt.savefig('results/classification_report.png', dpi=200, format='png', bbox_inches='tight')
-    # plt.close()
+    sys.stdout = orig_stdout
+    f.close()
 
     # Confusion Matrix
     cnf_matrix = confusion_matrix(true_values, predicted_values)
     np.set_printoptions(precision=2)
-    # TODO fix classes
     plot_confusion_matrix(cnf_matrix, classes=list(labels_vocab.stoi.keys()), normalize=True, title='Normalized confusion matrix')
-    plt.savefig('results/confusion_matrix.png', dpi=200, format='png', bbox_inches='tight')
+    plt.savefig(save_path + '/images/confusion_matrix.png', dpi=200, format='png', bbox_inches='tight')
     plt.close()
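
A note for readers adapting the sys.stdout swap above: contextlib.redirect_stdout achieves the same capture and restores stdout even if a metric call raises. A minimal sketch of the equivalent idiom, not what the commit itself uses (save_path value taken from model_args.txt below):

import os
from contextlib import redirect_stdout

save_path = 'models/test/'
os.makedirs(save_path, exist_ok=True)
with open(save_path + 'results.txt', 'w') as f, redirect_stdout(f):
    # Everything printed inside this block lands in results.txt, not the console.
    print("Macro Precision/Recall/F1 score:")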

experiments.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
 import json
 import os
-from datetime import datetime
 from collections import namedtuple
+
 from train import train
 
 
@@ -18,6 +18,7 @@ def main():
         data = fd.read()
     args = json2obj(data)
     print(args.rnn_type)
+    os.makedirs(args.save_path + 'images', exist_ok=True)
 
     train(args)
 
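json2obj itself is defined outside this diff; a common implementation matching the namedtuple import above turns each JSON object into a namedtuple, so config keys become attributes like args.rnn_type and args.save_path. A sketch under that assumption:

import json
from collections import namedtuple

def json2obj(data):
    # Each JSON object becomes a namedtuple; keys turn into attributes.
    return json.loads(data, object_hook=lambda d: namedtuple('Args', d.keys())(*d.values()))

args = json2obj('{"max_epochs": 2, "save_path": "models/test/"}')
print(args.save_path)  # models/test/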
inputs.py

Lines changed: 8 additions & 8 deletions
@@ -1,18 +1,18 @@
 from keras import Input
 from keras.layers import Embedding, Dropout, LSTM, TimeDistributed, SpatialDropout1D, concatenate
-from embedding.glove import get_pretrained_glove
+from embedding.glove import pre_trained_glove
 from constants import MAX_LEN, MAX_LEN_CHAR
 
 
 def inputs_factory(args, vocabs):
     inputs = []
     input_layers = []
 
-    for key, func in inputs_map.items():
-        if key in args.inputs:
-            input, input_layer = func(args, vocabs)
-            inputs.append(input)
-            input_layers.append(input_layer)
+    # args.inputs elements must be separated by '-'; the order is important here
+    for i in args.inputs.split('-'):
+        input, input_layer = inputs_map.get(i)(args, vocabs)
+        inputs.append(input)
+        input_layers.append(input_layer)
 
     # Concatenate inputs (if there are multiple)
     if len(inputs) > 1:
@@ -31,7 +31,7 @@ def words_input(args, vocabs):
     if args.embeddings_type == 'glove':
         txt_embed = Embedding(input_dim=num_words, output_dim=MAX_LEN, input_length=None,
                               name='txt_embedding', trainable=args.embeddings_trainable,
-                              weights=([get_pretrained_glove(num_words, vocabs.words)]))(txt_input)
+                              weights=([pre_trained_glove(num_words, vocabs.words)]))(txt_input)
     else:
         txt_embed = Embedding(input_dim=num_words, output_dim=MAX_LEN, input_length=None,
                               name='txt_embedding', trainable=args.embeddings_trainable)(txt_input)
@@ -51,7 +51,7 @@ def pos_input(args, vocabs):
 
 def chars_input(args, vocabs):
     char_in = Input(shape=(None, MAX_LEN_CHAR,), name="char_input")
-    emb_char = TimeDistributed(Embedding(input_dim=len(vocabs.chars.itos), output_dim=MAX_LEN_CHAR, input_length=None))\
+    emb_char = TimeDistributed(Embedding(input_dim=len(vocabs.chars.itos), output_dim=MAX_LEN_CHAR, input_length=None)) \
         (char_in)
     char_enc = TimeDistributed(LSTM(units=20, return_sequences=False, recurrent_dropout=0.5))(emb_char)
    return char_in, char_enc
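
The rewritten loop replaces substring filtering with explicit dispatch: the spec string is split on '-', and that order now determines the order of the model's input tensors. Note that an unknown key makes inputs_map.get(i) return None, which raises a TypeError when called; there is no explicit validation. A self-contained sketch of the pattern, with stand-in builders in place of words_input, pos_input and chars_input:

def build_words():
    return 'word_tensor'

def build_pos():
    return 'pos_tensor'

def build_chars():
    return 'char_tensor'

inputs_map = {'words': build_words, 'pos': build_pos, 'chars': build_chars}

spec = 'words-chars'  # same format as the "inputs" field in model_args.txt
tensors = [inputs_map[key]() for key in spec.split('-')]
print(tensors)  # ['word_tensor', 'char_tensor'], order follows the spec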

model.py

Lines changed: 5 additions & 10 deletions
@@ -80,27 +80,22 @@ def train(self, epochs, embedding=None):
 
         model.compile(optimizer="rmsprop", loss='categorical_crossentropy', metrics=['accuracy'])
 
-        plot_model(model, to_file=self.save_path + 'ner_model_image.png')
+        plot_model(model, to_file=self.save_path + 'model_structure.png')
         print(model.summary())
 
-        dir = create_dir()
-
-        tensorboard_callback = TensorBoard(log_dir=dir, histogram_freq=0, write_graph=True, write_images=True)
-
         history = model.fit(
             [self.X_train, self.train_pos, self.train_characters],
             np.array(self.Y_train), batch_size=32, epochs=epochs,
-            validation_data=(
-                [self.X_validation, self.valid_pos, self.valid_characters],
-                np.array(self.Y_validation)),
-            callbacks=[tensorboard_callback], verbose=1)
+            validation_data=([self.X_validation, self.valid_pos, self.valid_characters], np.array(self.Y_validation)), verbose=1)
 
-        model.save(self.save_path + 'ner_model')
+        model.save(self.save_path + 'model_ner')
 
         test_eval = model.evaluate(
             [self.X_test, self.test_pos, self.test_characters],
             np.array(self.Y_test))
+
         print('Test loss:', test_eval[0])
         print('Test accuracy:', test_eval[1])
 
         return model, history
+
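The artifact rename from 'ner_model' to 'model_ner' has to stay in sync with predict.py below. A toy round-trip showing the contract, assuming old-style Keras with h5py available (save() writes HDF5 regardless of extension); the layer shapes here are placeholders:

import os
from keras.models import Sequential, load_model
from keras.layers import Dense

os.makedirs('models/test/', exist_ok=True)
model = Sequential([Dense(2, input_shape=(4,), activation='softmax')])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.save('models/test/model_ner')             # as in model.py / train.py
restored = load_model('models/test/model_ner')  # as in predict.py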
model_args.txt

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 {
-  "max_epochs": 5,
+  "max_epochs": 2,
   "batch_size": 32,
-  "save_path": "models/",
+  "save_path": "models/test/",
   "inputs": "words",
   "embeddings_trainable": false,
   "embeddings_type": "glove",

predict.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ def main():
     args = parse_args()
 
     vocabs = load_object(args.path + 'vocabs')
-    model = load_model(args.path + 'ner_model')
+    model = load_model(args.path + 'model_ner')
     nlp = spacy.load('en')
 
     while True:

train.py

Lines changed: 9 additions & 34 deletions
@@ -1,11 +1,10 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import argparse
-import re
 import os
 from keras.callbacks import TensorBoard
 from dataset.api import load_dataset
-from test_model import test_model
+from evaluation import evaluate
 from keras.utils.vis_utils import plot_model
 from datetime import datetime
 from inputs import inputs_factory
@@ -40,34 +39,15 @@ def parse_args():
     return args
 
 
-def create_dir():
-    runs = ([x[0] for x in os.walk("results/logs")])
-    runs = [x for x in runs if "run" in x]
-    runs = list(map(int, re.findall(r'\d+', "".join(runs))))
-    runs.sort()
-    if len(runs) == 0:
-        return "results/logs/run1"
-
-    dir_idx = runs[-1] + 1
-
-    dir = "results/logs/run" + str(dir_idx)
-
-    if not os.path.exists(dir):
-        os.makedirs(dir)
-        return dir
-    else:
-        raise FileExistsError('Clear logs dir.')
-
-
-def plot_train_and_save(history):
+def plot_train_and_save(history, path):
     # Plot accuracy
     plt.plot(history.history['acc'])
     plt.plot(history.history['val_acc'])
     plt.title('Model Accuracy')
     plt.ylabel('Accuracy')
     plt.xlabel('Epoch')
     plt.legend(['train', 'validation'], loc='lower right')
-    plt.savefig('results/model_accuracy.png', dpi=200, format='png', bbox_inches='tight')
+    plt.savefig(path + '/images/model_accuracy.png', dpi=200, format='png', bbox_inches='tight')
     plt.close()
 
     # Plot loss
@@ -77,7 +57,7 @@ def plot_train_and_save(history):
     plt.ylabel('Loss')
     plt.xlabel('Epoch')
     plt.legend(['train', 'validation'], loc='upper right')
-    plt.savefig('results/model_loss.png', dpi=200, format='png', bbox_inches='tight')
+    plt.savefig(path + '/images/model_loss.png', dpi=200, format='png', bbox_inches='tight')
     plt.close()
 
 
@@ -114,12 +94,10 @@ def train(args):
 
     # prepare model
     model.compile(optimizer="rmsprop", loss='categorical_crossentropy', metrics=['accuracy'])
-    plot_model(model, to_file=args.save_path + 'ner_model_image.png')
+    plot_model(model, to_file=args.save_path + 'images/model_structure.png')
     print(model.summary())
 
-    dir = create_dir()
-
-    tensorboard_callback = TensorBoard(log_dir=dir, histogram_freq=0, write_graph=True, write_images=True)
+    tensorboard_callback = TensorBoard(log_dir=args.save_path, histogram_freq=0, write_graph=True, write_images=True)
 
     # get inputs based on args.inputs argument
     train, val, test = filter_inputs(args, datasets)
@@ -130,14 +108,11 @@ def train(args):
                         validation_data=(val, np.array(datasets.val.y)),
                         callbacks=[tensorboard_callback], verbose=1)
 
-    model.save(args.save_path + 'ner_model')
+    model.save(args.save_path + 'model_ner')
 
-    test_eval = model.evaluate(test, np.array(datasets.test.y))
-    print('Test loss:', test_eval[0])
-    print('Test accuracy:', test_eval[1])
+    evaluate(model, datasets.test, test, vocabs.labels, args.save_path)
 
-    test_model(args.save_path, datasets.test, test, vocabs.labels)
-    plot_train_and_save(history)
+    plot_train_and_save(history, args.save_path)
 
 
 if __name__ == '__main__':
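
With create_dir gone, TensorBoard logs land directly in the run's save_path, so a single directory now holds every artifact of a run. The expected layout after training with the model_args.txt above (file names taken from this diff; the events file name is standard TensorBoard output):

models/test/
    model_ner                  saved Keras model (model.save)
    results.txt                metrics written by evaluate()
    events.out.tfevents.*      TensorBoard logs (log_dir = save_path)
    images/
        model_structure.png    plot_model output
        model_accuracy.png     plot_train_and_save
        model_loss.png         plot_train_and_save
        confusion_matrix.png   evaluate()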

train.pyc

5.81 KB
Binary file not shown.

utils/plot_classification_report_util.py

Lines changed: 6 additions & 4 deletions
@@ -34,7 +34,7 @@ def cm2inch(*tupl):
     return tuple(i / inch for i in tupl)
 
 
-def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels, figure_width=40, figure_height=20,
+def heatmap(path, AUC, title, xlabel, ylabel, xticklabels, yticklabels, figure_width=40, figure_height=20,
             correct_orientation=False, cmap='RdBu'):
     '''
     Inspired by:
@@ -89,9 +89,11 @@ def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels, figure_width=4
     # fig.set_size_inches(cm2inch(40, 20))
     # fig.set_size_inches(cm2inch(40*4, 20*4))
     fig.set_size_inches(cm2inch(figure_width, figure_height))
+    plt.savefig(path + 'model_loss.png', dpi=200, format='png', bbox_inches='tight')
+    plt.close()
 
 
-def plot_classification_report(classification_report, title='Classification report ', cmap='RdBu'):
+def plot_classification_report(classification_report, path, title='Classification report ', cmap='RdBu'):
     '''
     Plot scikit-learn classification report.
     Extension based on https://stackoverflow.com/a/31689645/395857
@@ -110,7 +112,7 @@ def plot_classification_report(classification_report, title='Classification repo
         if len(t) < 2:
             continue
 
-        if t[0] == 'avg':
+        if t[0] == 'avg' or t[0] == 'macro avg' or t[0] == 'micro avg':
             t[0:3] = [''.join(t[0:3]).upper()]
 
         classes.append(t[0])
@@ -130,5 +132,5 @@ def plot_classification_report(classification_report, title='Classification repo
     figure_width = 25
     figure_height = len(class_names) + 7
     correct_orientation = False
-    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, figure_width, figure_height,
+    heatmap(path, np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, figure_width, figure_height,
             correct_orientation, cmap=cmap)

utils/plot_confusion_matrix_util.py

Lines changed: 0 additions & 3 deletions
@@ -13,9 +13,6 @@ def plot_confusion_matrix(cm, classes,
     """
     if normalize:
         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
-        print("Normalized confusion matrix")
-    else:
-        print('Confusion matrix, without normalization')
 
     # print(cm)
 
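For reference, the row-normalization kept above divides each row (true class) by its total, so cell (i, j) becomes the fraction of class-i samples predicted as class j. A small self-contained check:

import numpy as np

cm = np.array([[50, 10],
               [5, 35]])
# Row sums [60, 40] broadcast down each row via the new axis.
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm_norm)
# [[0.83 0.17]
#  [0.12 0.88]]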
utils/sentence_getter.py

Lines changed: 0 additions & 25 deletions
This file was deleted.
