@@ -41,29 +41,24 @@ class PositiveProbWarn {
41
41
WarningAction action_;
42
42
};
43
43
44
- template <class Weights > StringPiece Read1Gram (util::FilePiece &f, Weights &weights , PositiveProbWarn &warn) {
44
+ template <class Voc , class Weights > void Read1Gram (util::FilePiece &f, Voc &vocab, Weights *unigrams , PositiveProbWarn &warn) {
45
45
try {
46
- weights. prob = f.ReadFloat ();
47
- if (weights. prob > 0.0 ) {
48
- warn.Warn (weights. prob );
49
- weights. prob = 0.0 ;
46
+ float prob = f.ReadFloat ();
47
+ if (prob > 0.0 ) {
48
+ warn.Warn (prob);
49
+ prob = 0.0 ;
50
50
}
51
51
UTIL_THROW_IF (f.get () != ' \t ' , FormatLoadException, " Expected tab after probability" );
52
- StringPiece ret (f.ReadDelimited (kARPASpaces ));
53
- ReadBackoff (f, weights);
54
- return ret;
52
+ WordIndex word = vocab.Insert (f.ReadDelimited (kARPASpaces ));
53
+ Weights &w = unigrams[word];
54
+ w.prob = prob;
55
+ ReadBackoff (f, w);
55
56
} catch (util::Exception &e) {
56
57
e << " in the 1-gram at byte " << f.Offset ();
57
58
throw ;
58
59
}
59
60
}
60
61
61
- template <class Voc , class Weights > void Read1Gram (util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
62
- Weights temp;
63
- WordIndex word = vocab.Insert (Read1Gram (f, temp, warn));
64
- unigrams[word] = temp;
65
- }
66
-
67
62
template <class Voc , class Weights > void Read1Grams (util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
68
63
ReadNGramHeader (f, 1 );
69
64
for (std::size_t i = 0 ; i < count; ++i) {
@@ -81,7 +76,12 @@ template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePie
81
76
weights.prob = 0.0 ;
82
77
}
83
78
for (unsigned char i = 0 ; i < n; ++i, ++indices_out) {
84
- *indices_out = vocab.Index (f.ReadDelimited (kARPASpaces ));
79
+ StringPiece word (f.ReadDelimited (kARPASpaces ));
80
+ WordIndex index = vocab.Index (word);
81
+ *indices_out = index ;
82
+ // Check for words mapped to <unk> that are not the string <unk>.
83
+ UTIL_THROW_IF (index == 0 /* mapped to <unk> */ && (word != StringPiece (" <unk>" , 5 )) && (word != StringPiece (" <UNK>" , 5 )),
84
+ FormatLoadException, " Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears" );
85
85
}
86
86
ReadBackoff (f, weights);
87
87
} catch (util::Exception &e) {
0 commit comments