Commit 41f4f08 (1 parent: c323e08)

Fixed predict script. Some refactoring.

5 files changed: +20 −9 lines

dataset/api.py

+4 −2

@@ -55,8 +55,10 @@ def load_examples(file_path):
 
             sentence.append(parts[0])
 
-            if parts[1] in ['$', '"', '(', ')', "''", '.', ':', ',']:
-                pos.append('NN')
+            if parts[1] == '(':
+                pos.append('-LRB-')
+            elif parts[1] == ')':
+                pos.append('-RRB-')
             else:
                 pos.append(parts[1])
 
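The change narrows the old catch-all (which collapsed every punctuation tag, brackets included, to 'NN') down to remapping just the parentheses, presumably so the tags read from the training file line up with what spaCy's Penn-Treebank-style tagger emits at predict time. A quick way to see those tags (assuming the English model that predict.py loads is installed):

import spacy

nlp = spacy.load('en')   # same shortcut predict.py uses
for token in nlp("He said (quietly) hello."):
    print(token.text, token.tag_)   # '(' tags as -LRB-, ')' as -RRB-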

dataset/data_processor.py

−1

@@ -1,4 +1,3 @@
-from keras import preprocessing
 from keras_preprocessing.sequence import pad_sequences
 
 
dataset/vocab.py

+2 −2

@@ -67,7 +67,7 @@ def build(sentences):
 
         vocab = PosVocab()
         vocab._itos = [PAD] + list(unique_pos)
-        vocab._stoi = defaultdict(lambda: 1)
+        vocab._stoi = defaultdict(_unk_token_idx)
         vocab.stoi.update({k: v for v, k in enumerate(vocab.itos)})
 
         return vocab

@@ -80,7 +80,7 @@ def build(words):
 
         chars = list(map(lambda c: c, " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|"))
 
         vocab._itos = [PAD, UNK] + chars
-        vocab._stoi = defaultdict(lambda: 2)
+        vocab._stoi = defaultdict(_unk_token_idx)
         vocab.stoi.update({k: v for v, k in enumerate(vocab.itos)})
 
         return vocab
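Swapping the inline lambdas for a named _unk_token_idx looks like a pickling fix: these vocabs get saved with save_object (see train.py below), and Python's pickle cannot serialize a defaultdict whose default factory is a lambda, while a module-level function pickles fine by name. A minimal reproduction, with _unk_token_idx standing in for the function this commit references:

import pickle
from collections import defaultdict

def _unk_token_idx():
    # module-level functions pickle by qualified name; lambdas have none
    return 1

try:
    pickle.dumps(defaultdict(lambda: 1))
except Exception as e:
    print("lambda factory fails:", e)     # PicklingError

stoi = pickle.loads(pickle.dumps(defaultdict(_unk_token_idx)))
print(stoi["never-seen-token"])           # -> 1, the UNK index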

predict.py

+12 −4

@@ -23,6 +23,8 @@ def main():
     args = parse_args()
 
     text_vocab = load_object(args.path + 'text_vocab')
+    pos_vocab = load_object(args.path + 'pos_vocab')
+    char_vocab = load_object(args.path + 'char_vocab')
     labels_vocab = load_object(args.path + 'labels_vocab')
     model = load_model(args.path + 'ner_model')
     nlp = spacy.load('en')

@@ -36,15 +38,21 @@ def main():
 
         # tokenize user input
         doc = nlp(user_input)
-        user_input_tokenized = [token.text for token in doc]
+        text = [token.text for token in doc]
+        pos = [token.tag_ for token in doc]
+        chars = numericalize(char_vocab, [[c for c in token.text] for token in doc], NO_ENTITY_TOKEN, maxlen=10)
+        chars = np.array(chars)[np.newaxis, :, :]
 
         # get model output
-        model_input = np.array(numericalize(text_vocab, [user_input_tokenized], NO_ENTITY_TOKEN))  # pad token is irrelevant here beacuse we are numericalizing just one sentence (it won't be padded)
-        out = model.predict(model_input).squeeze()
+        # pad token is irrelevant here because we are numericalizing just one sentence (it won't be padded)
+        text = np.array(numericalize(text_vocab, [text], NO_ENTITY_TOKEN))
+        pos = np.array(numericalize(pos_vocab, [pos], NO_ENTITY_TOKEN))
+
+        out = model.predict([text, pos, chars]).squeeze()
         predicted_labels = [labels_vocab.itos[label_idx] for label_idx in np.argmax(out, axis=1).tolist()]
 
         # print result
-        for token, label in zip(user_input_tokenized, predicted_labels):
+        for token, label in zip([token.text for token in doc], predicted_labels):
             print("%s %s" % (token, label))
         print()
 
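The fix in a nutshell: the refactored model now expects three aligned inputs per sentence (word ids, POS-tag ids, character ids), so the script numericalizes all three with their own vocabularies before calling predict. A toy sketch of the character tensor's shape, with made-up indices (the repo's own numericalize and char_vocab do the real work):

import numpy as np

tokens = ["Obama", "visited", "Paris"]
maxlen = 10                                             # same cap as the diff
char_stoi = {c: i + 2 for i, c in enumerate("abcdefghijklmnopqrstuvwxyz")}

chars = np.zeros((len(tokens), maxlen), dtype="int32")  # 0 = PAD
for t, token in enumerate(tokens):
    for c, ch in enumerate(token[:maxlen]):
        chars[t, c] = char_stoi.get(ch.lower(), 1)      # 1 = UNK

chars = chars[np.newaxis, :, :]   # add the batch axis, as predict.py does
print(chars.shape)                # (1, 3, 10); text and pos are (1, 3)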

train.py

+2

@@ -17,6 +17,8 @@
 # save vocabulary
 save_path = 'models/' + datetime.now().strftime("%Y-%m-%d-%H:%M") + '/'
 save_object(text_vocab, save_path + 'text_vocab')
+save_object(pos_vocab, save_path + 'pos_vocab')
+save_object(character_vocab, save_path + 'char_vocab')
 save_object(labels_vocab, save_path + 'labels_vocab')
 
 nn = NeuralNetwork(save_path, num_words, num_entities, num_pos, num_chars, train, test, val)
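save_object and load_object are not part of this diff; a plausible reading, consistent with the defaultdict fix in vocab.py above, is that they are thin pickle wrappers along these lines (a hypothetical sketch, not the repo's actual code):

import os
import pickle

def save_object(obj, path):
    # hypothetical helper: persist any picklable object to disk
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_object(path):
    # hypothetical counterpart used by predict.py
    with open(path, 'rb') as f:
        return pickle.load(f)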
