Commit f029709
Merged with predict branch.
2 parents: 6e6e4b6 + 41f4f08

20 files changed (+165 -72 lines)
9 files renamed without changes.

root/dataset/api.py renamed to dataset/api.py (+10 -9)
@@ -1,19 +1,18 @@
-import itertools
 from collections import namedtuple
 
 import numpy as np
 from keras.utils import to_categorical
-from keras_preprocessing.sequence import pad_sequences
-from root.constants import NO_ENTITY_TOKEN, MAX_LEN, PAD, MAX_LEN_CHAR
+from keras_preprocessing.sequence import pad_sequences
+from constants import NO_ENTITY_TOKEN, MAX_LEN, PAD, MAX_LEN_CHAR
 from .data_processor import numericalize
 from .vocab import TextVocab, LabelVocab, PosVocab, CharacterVocab
 
 
 def load_dataset():
     # load examples
-    train_examples = load_examples('../dataset/raw/train.txt')
-    val_examples = load_examples('../dataset/raw/valid.txt')
-    test_examples = load_examples('../dataset/raw/test.txt')
+    train_examples = load_examples('data/raw/train.txt')
+    val_examples = load_examples('data/raw/valid.txt')
+    test_examples = load_examples('data/raw/test.txt')
 
     # build vocabularies
     text_vocab = TextVocab.build(list(map(lambda e: e.sentence, train_examples)))
@@ -32,7 +31,7 @@ def load_examples(file_path):
     """
     Loads sentences from file in CoNLL 2003 format.
 
-    :param file_path: Path to file with CoNLL data.
+    :param file_path: Path to file with CoNLL dataset.
     :return: list(Example)
     """
     examples = []
@@ -57,8 +56,10 @@ def load_examples(file_path):
 
             sentence.append(parts[0])
 
-            if parts[1] in ['$', '"', '(', ')', "''", '.', ':', ',']:
-                pos.append('NN')
+            if parts[1] == '(':
+                pos.append('-LRB-')
+            elif parts[1] == ')':
+                pos.append('-RRB-')
             else:
                 pos.append(parts[1])
 
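A note on the POS fix above: the old code collapsed every punctuation tag ('$', '"', '(', ')', etc.) to 'NN'; the new code leaves tags untouched except for raw brackets, which it maps to the Penn Treebank forms. That matters because predict.py (added below) tags user input with spaCy, which emits '-LRB-'/'-RRB-' for parentheses, so the training-time tags must follow the same convention or bracket tokens fall back to the unknown index in PosVocab. A minimal sketch of the same mapping; the helper and dict names here are mine, not the repository's:

# Hypothetical helper equivalent to the if/elif chain in load_examples.
_BRACKET_TAGS = {'(': '-LRB-', ')': '-RRB-'}  # Penn Treebank bracket tags

def normalize_pos(tag):
    # leave every tag as-is except raw brackets
    return _BRACKET_TAGS.get(tag, tag)

assert normalize_pos('(') == '-LRB-'
assert normalize_pos('NN') == 'NN'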

root/dataset/data_processor.py renamed to dataset/data_processor.py (-1)
@@ -1,4 +1,3 @@
-from keras import preprocessing
 from keras_preprocessing.sequence import pad_sequences
 
 
root/dataset/vocab.py renamed to dataset/vocab.py (+8 -4)
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict, Counter
-from root.constants import PAD, UNK
+from constants import PAD, UNK
 
 
 class Vocab(ABC):
@@ -34,12 +34,16 @@ def build(sentences, max_size=None):
 
         vocab = TextVocab()
         vocab._itos = [PAD, UNK] + list(map(lambda t: t[0], words_and_freqs))
-        vocab._stoi = defaultdict(lambda: 1)  # index of UNK token
+        vocab._stoi = defaultdict(_unk_token_idx)  # index of UNK token
         vocab.stoi.update({k: v for v, k in enumerate(vocab.itos)})
 
         return vocab
 
 
+def _unk_token_idx():
+    return 1
+
+
 class LabelVocab(Vocab):
     @staticmethod
     def build(sentences):
@@ -63,7 +67,7 @@ def build(sentences):
 
         vocab = PosVocab()
         vocab._itos = [PAD] + list(unique_pos)
-        vocab._stoi = defaultdict(lambda: 1)
+        vocab._stoi = defaultdict(_unk_token_idx)
         vocab.stoi.update({k: v for v, k in enumerate(vocab.itos)})
 
         return vocab
@@ -76,7 +80,7 @@ def build(words):
         chars = list(map(lambda c: c, " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|"))
 
         vocab._itos = [PAD, UNK] + chars
-        vocab._stoi = defaultdict(lambda: 2)
+        vocab._stoi = defaultdict(_unk_token_idx)
         vocab.stoi.update({k: v for v, k in enumerate(vocab.itos)})
 
         return vocab
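The switch from defaultdict(lambda: 1) to defaultdict(_unk_token_idx) is presumably driven by the new pickling in utils/serialization.py (below): a lambda cannot be pickled, while a named module-level function can, so vocabularies built this way survive save_object/load_object. As a side effect it also corrects CharacterVocab, whose old default of 2 pointed at the space character instead of UNK at index 1. A quick self-contained sketch of the difference:

import pickle
from collections import defaultdict

def _unk_token_idx():
    return 1  # index of UNK in itos = [PAD, UNK, ...]

picklable = defaultdict(_unk_token_idx)
pickle.dumps(picklable)  # works: the factory is referenced by name

unpicklable = defaultdict(lambda: 1)
try:
    pickle.dumps(unpicklable)
except (pickle.PicklingError, AttributeError) as err:
    print('lambda-backed defaultdict cannot be pickled:', err)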

embedding/glove.py (+1 -1)
@@ -1,7 +1,7 @@
 import sys
 import numpy as np
 
-GLOVE_DIR = '../embedding/glove.6B.100d.txt'
+GLOVE_DIR = 'embedding/glove.6B.100d.txt'
 
 
 def get_pretrained_glove(num_words, text_vocab):
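The dropped '../' prefix follows the same pattern as the other path changes in this commit: with the root/ package flattened away, paths now resolve relative to the repository root, so the scripts are presumably meant to be launched from the top-level directory (e.g. python train.py) rather than from inside a subpackage.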

root/model.py renamed to model.py (+8 -10)
@@ -7,12 +7,12 @@
 from keras.utils.vis_utils import plot_model
 from keras.callbacks import TensorBoard
 
-from root.constants import MAX_LEN, MAX_LEN_CHAR
+from constants import MAX_LEN, MAX_LEN_CHAR
 
 
 class NeuralNetwork(object):
 
-    def __init__(self, num_words, num_entities, num_pos, num_chars, train, test, validation):
+    def __init__(self, save_path, num_words, num_entities, num_pos, num_chars, train, test, validation):
         self.num_words = num_words
         self.num_entities = num_entities
         self.num_pos = num_pos
@@ -26,6 +26,7 @@ def __init__(self, num_words, num_entities, num_pos, num_chars, train, test, val
         self.train_pos = train.pos
         self.test_pos = test.pos
         self.valid_pos = validation.pos
+        self.save_path = save_path
 
         self.train_characters = train.characters
         self.test_characters = test.characters
@@ -56,19 +57,16 @@ def train(self, epochs, embedding=None):
         # Deep Layers
         model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)
         model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
-        model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
 
         # Output
         out = TimeDistributed(Dense(self.num_entities, activation="softmax"))(model)
         model = Model(inputs=[txt_input, pos_input, char_in], outputs=[out])
 
         model.compile(optimizer="rmsprop", loss='categorical_crossentropy', metrics=['accuracy'])
 
-        plot_model(model, to_file='../models/ner_model_image.png')
+        plot_model(model, to_file=self.save_path + 'ner_model_image.png')
         print(model.summary())
 
-        model.compile(optimizer="rmsprop", metrics=['accuracy'], loss='categorical_crossentropy')
-
         dir = create_dir()
 
         tensorboard_callback = TensorBoard(log_dir=dir, histogram_freq=0, write_graph=True, write_images=True)
@@ -82,7 +80,7 @@ def train(self, epochs, embedding=None):
                               np.array(self.Y_validation)),
             callbacks=[tensorboard_callback], verbose=1)
 
-        model.save("../models/ner_model")
+        model.save(self.save_path + 'ner_model')
 
         test_eval = model.evaluate(
             [self.X_test, self.test_pos, np.array(self.test_characters).reshape((len(self.test_characters), MAX_LEN, MAX_LEN_CHAR))],
@@ -94,16 +92,16 @@ def train(self, epochs, embedding=None):
 
 
 def create_dir():
-    runs = ([x[0] for x in os.walk("../results/logs")])
+    runs = ([x[0] for x in os.walk("results/logs")])
     runs = [x for x in runs if "run" in x]
     runs = list(map(int, re.findall(r'\d+', "".join(runs))))
     runs.sort()
     if len(runs) == 0:
-        return "../results/logs/run1"
+        return "results/logs/run1"
 
     dir_idx = runs[-1] + 1
 
-    dir = "../results/logs/run" + str(dir_idx)
+    dir = "results/logs/run" + str(dir_idx)
 
     if not os.path.exists(dir):
         os.makedirs(dir)
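Besides rerouting paths through save_path, this diff removes the third stacked BiLSTM layer and the duplicate model.compile call (the model was compiled twice with identical settings; only the compile in effect before fit matters, so the second call was redundant). Callers now thread a per-run directory through the constructor; a sketch of the wiring, mirroring train.py below, with 'models/example-run/' as a hypothetical directory:

from dataset.api import load_dataset
from model import NeuralNetwork

text_vocab, labels_vocab, pos_vocab, char_vocab, train, val, test = load_dataset()
nn = NeuralNetwork('models/example-run/',  # hypothetical run directory
                   len(text_vocab.itos), len(labels_vocab.itos),
                   len(pos_vocab.itos), len(char_vocab.itos),
                   train, test, val)
model, history = nn.train(epochs=1)  # writes ner_model and ner_model_image.png under the run directory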

predict.py (new file, +63)
@@ -0,0 +1,63 @@
+import os
+import argparse
+import spacy
+import numpy as np
+from keras.models import load_model
+from dataset.data_processor import numericalize
+from utils.serialization import load_object
+from constants import NO_ENTITY_TOKEN, MAX_LEN_CHAR
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Script for using NER model')
+    parser.add_argument('-p', '--path', help='Path to model and vocabulary directory.')
+
+    args = parser.parse_args()
+    # add path separator (/) at the end if needed
+    args.path = args.path if args.path[-1] == os.path.sep else args.path + os.path.sep
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    text_vocab = load_object(args.path + 'text_vocab')
+    pos_vocab = load_object(args.path + 'pos_vocab')
+    char_vocab = load_object(args.path + 'char_vocab')
+    labels_vocab = load_object(args.path + 'labels_vocab')
+    model = load_model(args.path + 'ner_model')
+    nlp = spacy.load('en')
+
+    while True:
+        user_input = input('Input sentence: ').strip()
+        if not user_input:
+            continue
+        if user_input == 'end':
+            break
+
+        # tokenize user input
+        doc = nlp(user_input)
+        text = [token.text for token in doc]
+        pos = [token.tag_ for token in doc]
+        chars = numericalize(char_vocab, [[c for c in token.text] for token in doc], NO_ENTITY_TOKEN, maxlen=MAX_LEN_CHAR)
+        chars = np.array(chars)[np.newaxis, :, :]
+
+        print(chars)
+
+        # get model output
+        # pad token is irrelevant here because we are numericalizing just one sentence (it won't be padded)
+        text = np.array(numericalize(text_vocab, [text], NO_ENTITY_TOKEN))
+        pos = np.array(numericalize(pos_vocab, [pos], NO_ENTITY_TOKEN))
+
+        out = model.predict([text, pos, chars]).squeeze()
+        predicted_labels = [labels_vocab.itos[label_idx] for label_idx in np.argmax(out, axis=1).tolist()]
+
+        # print result
+        for token, label in zip([token.text for token in doc], predicted_labels):
+            print("%s %s" % (token, label))
+        print()
+
+
+if __name__ == '__main__':
+    main()
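The new script is interactive: it loads the four pickled vocabularies and the trained model from the directory passed via -p, then reads sentences in a loop until the literal input end. An illustrative session, with a made-up model directory and example labels in the CoNLL 2003 tagging scheme (the intermediate print(chars) debug dump is omitted):

$ python predict.py -p models/2019-01-01-12:00
Input sentence: John lives in London
John B-PER
lives O
in O
London B-LOC
Input sentence: end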

root/main.py (-41)
This file was deleted.

root/test_model.py renamed to test_model.py (+5 -5)
@@ -5,13 +5,13 @@
 from keras.models import load_model
 from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
 
-from root.constants import MAX_LEN
+from constants import MAX_LEN
 from utils.classification_report import classification_report
 from utils.plot_confusion_matrix_util import plot_confusion_matrix
 
 
-def test_model(test, text_vocab, labels_vocab):
-    model = load_model('../models/ner_model')
+def test_model(model_path, test, text_vocab, labels_vocab):
+    model = load_model(model_path + 'ner_model')
 
     predicted_values = np.argmax(model.predict([test.X, test.pos, np.array(test.characters).reshape((len(test.characters), MAX_LEN, 10))]),
                                  axis=-1)
@@ -41,13 +41,13 @@ def test_model(test, text_vocab, labels_vocab):
     print(report)
 
     # plot_classification_report(report)
-    # plt.savefig('../results/classification_report.png', dpi=200, format='png', bbox_inches='tight')
+    # plt.savefig('results/classification_report.png', dpi=200, format='png', bbox_inches='tight')
     # plt.close()
 
     # Confusion Matrix
     cnf_matrix = confusion_matrix(true_values, predicted_values)
     np.set_printoptions(precision=2)
     # TODO fix classes
     plot_confusion_matrix(cnf_matrix, classes=list(labels_vocab.stoi.keys()), normalize=True, title='Normalized confusion matrix')
-    plt.savefig('../results/confusion_matrix.png', dpi=200, format='png', bbox_inches='tight')
+    plt.savefig('results/confusion_matrix.png', dpi=200, format='png', bbox_inches='tight')
     plt.close()
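The knock-on effect of the model.py change: test_model no longer hard-codes '../models/ner_model', so callers must pass the same run directory that training used (train.py below calls test_model(save_path, ...)). Note also that the reshape here still hard-codes 10 for the character dimension where model.py uses MAX_LEN_CHAR; presumably the two are meant to agree.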

train.py (new file, +50)
@@ -0,0 +1,50 @@
+import matplotlib.pyplot as plt
+
+from embedding.glove import get_pretrained_glove
+from dataset.api import load_dataset
+from model import NeuralNetwork
+from test_model import test_model
+from datetime import datetime
+from utils.serialization import save_object
+
+text_vocab, labels_vocab, pos_vocab, character_vocab, train, val, test = load_dataset()
+
+num_words = len(text_vocab.itos)
+num_entities = len(labels_vocab.itos)
+num_pos = len(pos_vocab.itos)
+num_chars = len(character_vocab.itos)
+
+# save vocabulary
+save_path = 'models/' + datetime.now().strftime("%Y-%m-%d-%H:%M") + '/'
+save_object(text_vocab, save_path + 'text_vocab')
+save_object(pos_vocab, save_path + 'pos_vocab')
+save_object(character_vocab, save_path + 'char_vocab')
+save_object(labels_vocab, save_path + 'labels_vocab')
+
+nn = NeuralNetwork(save_path, num_words, num_entities, num_pos, num_chars, train, test, val)
+
+model, history = nn.train(epochs=3, embedding=get_pretrained_glove(num_words, text_vocab))
+
+print(history.history.keys())
+
+test_model(save_path, test, text_vocab, labels_vocab)
+
+# Plot accuracy
+plt.plot(history.history['acc'])
+plt.plot(history.history['val_acc'])
+plt.title('Model Accuracy')
+plt.ylabel('Accuracy')
+plt.xlabel('Epoch')
+plt.legend(['train', 'validation'], loc='lower right')
+plt.savefig('results/model_accuracy.png', dpi=200, format='png', bbox_inches='tight')
+plt.close()
+
+# Plot loss
+plt.plot(history.history['loss'])
+plt.plot(history.history['val_loss'])
+plt.title('Model loss')
+plt.ylabel('Loss')
+plt.xlabel('Epoch')
+plt.legend(['train', 'validation'], loc='upper right')
+plt.savefig('results/model_loss.png', dpi=200, format='png', bbox_inches='tight')
+plt.close()
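After a run, the timestamped directory created here should contain everything predict.py needs, roughly (timestamp illustrative):

models/2019-01-01-12:00/
    text_vocab
    pos_vocab
    char_vocab
    labels_vocab
    ner_model
    ner_model_image.png

One caveat: the "%Y-%m-%d-%H:%M" format puts a colon in the directory name, which works on Linux and macOS but is not a valid path character on Windows.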

utils/classification_report.py (+1 -1)
@@ -24,7 +24,7 @@ def classification_report(y_true, y_pred, labels=None, target_names=None,
     digits : int
         Number of digits for formatting output floating point values
     average : string, ['weighted' (default), 'binary', 'micro', 'macro']
-        Determines the type of averaging performed on the data, after reporting the individual results per class:
+        Determines the type of averaging performed on the dataset, after reporting the individual results per class:
         ``'binary'``:
             Only report results for the class specified by ``pos_label``.
             This is applicable only if targets (``y_{true,pred}``) are binary.

utils/serialization.py (new file, +19)
@@ -0,0 +1,19 @@
+import os
+import pickle
+
+
+def ensure_dir_exists(path):
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+
+def save_object(obj, path):
+    ensure_dir_exists(os.path.dirname(path))
+    with open(path, 'wb') as fd:
+        pickle.dump(obj, fd)
+
+
+def load_object(path):
+    with open(path, 'rb') as fd:
+        obj = pickle.load(fd)
+    return obj
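A short usage sketch of the new helpers (the path is illustrative): save_object creates missing parent directories via ensure_dir_exists, which is what lets train.py write straight into a not-yet-existing timestamped folder.

from utils.serialization import save_object, load_object

vocab = {'the': 2, 'cat': 3}  # any picklable object
save_object(vocab, 'models/example-run/text_vocab')  # parent dirs created on demand
assert load_object('models/example-run/text_vocab') == vocab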
