Commit 02ab8f5

Bugfix / Stephan Peitz and more paranoid error checking
1 parent 1c45d78 commit 02ab8f5

5 files changed: +28 -22 lines

lm/model_test.cc (+1 -1)

@@ -176,7 +176,7 @@ template <class M> void MinimalState(const M &model) {
   AppendTest("to", 1, -1.687872, false);
   AppendTest("look", 2, -0.2922095, true);
   BOOST_CHECK_EQUAL(2, state.length);
-  AppendTest("good", 3, -7, true);
+  AppendTest("a", 3, -7, true);
 }

 template <class M> void ExtendLeftTest(const M &model) {

lm/read_arpa.hh (+15 -15)

@@ -41,29 +41,24 @@ class PositiveProbWarn {
   WarningAction action_;
 };

-template <class Weights> StringPiece Read1Gram(util::FilePiece &f, Weights &weights, PositiveProbWarn &warn) {
+template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
   try {
-    weights.prob = f.ReadFloat();
-    if (weights.prob > 0.0) {
-      warn.Warn(weights.prob);
-      weights.prob = 0.0;
+    float prob = f.ReadFloat();
+    if (prob > 0.0) {
+      warn.Warn(prob);
+      prob = 0.0;
     }
     UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
-    StringPiece ret(f.ReadDelimited(kARPASpaces));
-    ReadBackoff(f, weights);
-    return ret;
+    WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces));
+    Weights &w = unigrams[word];
+    w.prob = prob;
+    ReadBackoff(f, w);
   } catch(util::Exception &e) {
     e << " in the 1-gram at byte " << f.Offset();
     throw;
   }
 }

-template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
-  Weights temp;
-  WordIndex word = vocab.Insert(Read1Gram(f, temp, warn));
-  unigrams[word] = temp;
-}
-
 template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
   ReadNGramHeader(f, 1);
   for (std::size_t i = 0; i < count; ++i) {

@@ -81,7 +76,12 @@ template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePie
       weights.prob = 0.0;
     }
     for (unsigned char i = 0; i < n; ++i, ++indices_out) {
-      *indices_out = vocab.Index(f.ReadDelimited(kARPASpaces));
+      StringPiece word(f.ReadDelimited(kARPASpaces));
+      WordIndex index = vocab.Index(word);
+      *indices_out = index;
+      // Check for words mapped to <unk> that are not the string <unk>.
+      UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)),
+                    FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
     }
     ReadBackoff(f, weights);
   } catch(util::Exception &e) {

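The "more paranoid error checking" half of this commit is the new UTIL_THROW_IF in ReadNGram: a token in a higher-order n-gram that the vocabulary resolves to id 0 (<unk>) is now accepted only if it literally spells <unk> or <UNK>; anything else means the ARPA file uses a word its unigram section never listed. Below is a minimal standalone sketch of that check, with a std::map standing in for KenLM's vocabulary class; Vocab, Index, and CheckedIndex are illustrative names, not part of the library.

// Illustrative sketch only: mimics the index == 0 check added to ReadNGram,
// with a std::map standing in for the real vocabulary class.
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

typedef unsigned int WordIndex;              // id 0 is reserved for <unk>
typedef std::map<std::string, WordIndex> Vocab;

WordIndex Index(const Vocab &vocab, const std::string &word) {
  Vocab::const_iterator it = vocab.find(word);
  return it == vocab.end() ? 0 : it->second; // unknown words map to <unk>
}

WordIndex CheckedIndex(const Vocab &vocab, const std::string &word) {
  WordIndex index = Index(vocab, word);
  // Same condition as the new UTIL_THROW_IF: id 0 is only acceptable when the
  // token literally spells <unk> (or <UNK>).
  if (index == 0 && word != "<unk>" && word != "<UNK>")
    throw std::runtime_error("Word " + word + " was not seen in the unigrams "
        "(which are supposed to list the entire vocabulary) but appears");
  return index;
}

int main() {
  Vocab vocab;
  vocab["<unk>"] = 0;
  vocab["to"] = 1;
  vocab["look"] = 2;
  vocab["a"] = 3;
  std::cout << CheckedIndex(vocab, "a") << '\n';   // fine: prints 3
  try {
    CheckedIndex(vocab, "good");                   // throws: not in the unigrams
  } catch (const std::runtime_error &e) {
    std::cout << e.what() << '\n';
  }
}

This stricter loading is presumably also why the test ARPA files below replace the trigram "-7 to look good" with "-7 to look a", with lm/model_test.cc above updated to match: a word missing from the unigram list is now rejected instead of silently mapping to <unk>.
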
lm/test.arpa (+1 -1)

@@ -105,7 +105,7 @@ ngram 5=4
 -0.04835128 looking on a -0.4771212
 -3 also would consider -7
 -6 <unk> however <unk> -12
--7 to look good
+-7 to look a

 \4-grams:
 -0.009249173 looking on a little -0.4771212

lm/test_nounk.arpa (+1 -1)

@@ -101,7 +101,7 @@ ngram 5=4
 -0.1892331 little more loin
 -0.04835128 looking on a -0.4771212
 -3 also would consider -7
--7 to look good
+-7 to look a

 \4-grams:
 -0.009249173 looking on a little -0.4771212

lm/trie_sort.cc (+10 -4)

@@ -107,14 +107,20 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre
 }

 struct ThrowCombine {
-  void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const {
-    UTIL_THROW(FormatLoadException, "Duplicate n-gram detected.");
+  void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const {
+    const WordIndex *base = reinterpret_cast<const WordIndex*>(first);
+    FormatLoadException e;
+    e << "Duplicate n-gram detected with vocab ids";
+    for (const WordIndex *i = base; i != base + order; ++i) {
+      e << ' ' << *i;
+    }
+    throw e;
   }
 };

 // Useful for context files that just contain records with no value.
 struct FirstCombine {
-  void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const {
+  void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const {
     util::WriteOrThrow(out, first, entry_size);
   }
 };

@@ -134,7 +140,7 @@ template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_f
       util::WriteOrThrow(out_file.get(), second.Data(), entry_size);
       ++second;
     } else {
-      combine(entry_size, first.Data(), second.Data(), out_file.get());
+      combine(entry_size, order, first.Data(), second.Data(), out_file.get());
       ++first; ++second;
     }
   }

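The only reason the combine functors grow an order parameter is so ThrowCombine can say which record is duplicated; FirstCombine ignores it, which keeps the single call site in MergeSortedFiles uniform. A self-contained sketch of how that error text is assembled, using std::ostringstream in place of FormatLoadException's stream interface; DuplicateMessage and the sample vocab ids are made up for illustration, and the sketch assumes each record begins with order WordIndex keys, as the cast in ThrowCombine does.

// Illustrative sketch only: builds the richer duplicate-n-gram message that
// the new ThrowCombine throws, assuming the record starts with `order` ids.
#include <iostream>
#include <sstream>
#include <string>

typedef unsigned int WordIndex;

std::string DuplicateMessage(const void *record, unsigned char order) {
  const WordIndex *base = reinterpret_cast<const WordIndex*>(record);
  std::ostringstream msg;
  msg << "Duplicate n-gram detected with vocab ids";
  for (const WordIndex *i = base; i != base + order; ++i) {
    msg << ' ' << *i;
  }
  return msg.str();
}

int main() {
  WordIndex trigram[3] = {5, 42, 7};   // made-up vocab ids for a duplicate record
  std::cout << DuplicateMessage(trigram, 3) << '\n';
  // Prints: Duplicate n-gram detected with vocab ids 5 42 7
}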