-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlexer.js
175 lines (153 loc) · 5.5 KB
/
lexer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/*!
Copyright (C) 2011 Chad Weider
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the
use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software in a
product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
/**
 * Creates a lexer from an ordered list of rules.
 *
 * Each rule is an array `[expression, type, action]`:
 *   - expression: a RegExp (only its `source` is kept, so flags such as `i`
 *     are dropped) or a string, which is matched literally.
 *   - type: a non-empty string naming the token, or `null` to discard the
 *     tokens produced by this rule.
 *   - action: optional function, invoked as `action.call(lexer, token)` for
 *     every match of the rule; it may mutate the token.
 *
 * Rules are tried in order; earlier rules win.
 *
 * @param {Array} rules Non-empty list of rules.
 * @throws {Error} If `rules` is missing/empty or a rule is malformed.
 */
var Lexer = function (rules) {
  this._tokenRegExp = undefined;

  // Validated private copies of the rules, so later changes made by the
  // caller to its own arrays or RegExp objects have no effect on the lexer.
  this._rules = [];

  if (!rules || !rules.length) {
    throw new Error("Rules must be of type Array.");
  }

  for (var i = 0, ii = rules.length; i < ii; i++) {
    var rule = rules[i];
    if (!rule || !rule.length || rule.length > 3) {
      throw new Error("Invalid rule at index " + i + ".");
    }

    var expression = rule[0];
    var isRegExp = expression instanceof RegExp;
    // Accept primitive strings as well as String objects.
    var isString = typeof expression === 'string'
      || expression instanceof String;
    if (!isRegExp && !isString) {
      throw new Error("Expression must be an instance of RegExp or String"
        + " for rule at index " + i + ".");
    }

    if (isRegExp) {
      // Keep only the pattern text; this also strips all modifiers.
      expression = expression.source;
    } else {
      // Escape every metacharacter so the string is matched literally.
      // `$&` is the matched character itself.
      expression = String(expression)
        .replace(/[-*+?.,^$|#\[\]{}()\\]/g, '\\$&');
    }

    var type = rule[1];
    if ((typeof type !== 'string' || type.length === 0) && type !== null) {
      throw new Error("Expected String or null instead found "
        + JSON.stringify(String(type))
        + " for type of rule at index " + i + ".");
    }

    var action = rule[2];
    if (action !== undefined && typeof action !== 'function') {
      throw new Error("Constructor is defined, but is not a function for "
        + "rule at index " + i + ".");
    }

    this._rules[i] = [expression, type, action];
  }
};

Lexer.prototype = new function () {
  // Group 1 is set only for a "(" that opens a capturing group. Escapes,
  // character classes, (?: (?= (?! (?<= and (?<! are consumed without it.
  var CAPTURE_EXP = /\\.|\[(?:\\.|[^\]])*\]|(\((?!\?(?:[!:=]|<[!=])))|./g;
  // Group 1 holds the digits of a "\<digits>" escape found outside of a
  // character class; everything else is matched without it.
  var BACK_REF_EXP = /\\\D|\[(?:\\.|[^\]])*\]|\\(\d+)|./g;
  // Longest legacy octal escape body (value <= 0377) at the start of a string.
  var OCTAL_PREFIX_EXP = /^(?:[0-3][0-7]{0,2}|[4-7][0-7]?)/;

  // Counts the capturing groups in a regular expression source string.
  function countCaptures(expression) {
    return expression.replace(CAPTURE_EXP, function (match, open) {
      return open ? '.' : '';
    }).length;
  }

  // Rewrites "\<digits>" escapes so the expression still means the same thing
  // once `offset` capturing groups (plus its own wrapper group) precede it.
  function shiftBackReferences(expression, captures, offset) {
    return expression.replace(BACK_REF_EXP, function (match, digits) {
      if (!digits) {
        return match;
      }
      var n = parseInt(digits, 10);
      if (n > 0 && n <= captures) {
        // A genuine backreference: renumber it for the combined expression.
        return '\\' + (n + offset + 1);
      }
      // Not a reference to one of this rule's groups, so it was a legacy
      // octal escape. Emit a fixed-width \xHH escape so that it cannot be
      // read as a backreference into another rule's groups.
      var octal = OCTAL_PREFIX_EXP.exec(digits);
      if (!octal) {
        return digits; // "\8" / "\9": identity escapes for the digits.
      }
      var hex = parseInt(octal[0], 8).toString(16);
      return '\\x' + (hex.length < 2 ? '0' + hex : hex)
        + digits.slice(octal[0].length);
    });
  }

  /**
   * Builds (once) and returns the single global RegExp that alternates over
   * all rules, each wrapped in its own capturing group. As a side effect,
   * stores in `rule[3]` how many groups the rule occupies (its own captures
   * plus the wrapper).
   *
   * @returns {RegExp} The cached combined expression.
   */
  this._compile = function () {
    if (!this._tokenRegExp) {
      var tokenExpressions = [];
      var rules = this._rules;
      var captureCount = 0;
      for (var i = 0, ii = rules.length; i < ii; i++) {
        var rule = rules[i];
        var captures = countCaptures(rule[0]);
        rule[3] = captures + 1;
        tokenExpressions.push(
          shiftBackReferences(rule[0], captures, captureCount));
        captureCount += captures + 1;
      }
      this._tokenRegExp =
        new RegExp('(' + tokenExpressions.join(')|(') + ')', 'g');
    }
    return this._tokenRegExp;
  };

  /**
   * Splits `text` into tokens.
   *
   * Every token is `{type, value, match, offset}` where `value` is the
   * matched text, `match` is the rule's own match array (whole match followed
   * by its captures) and `offset` is the index in `text` just past the token.
   * Tokens whose type is falsy (e.g. `null`) after the rule's action has run
   * are dropped from the result.
   *
   * @param {string} text The input to tokenize.
   * @returns {Array} The tokens, in input order.
   * @throws {Error} If `text` is not a string, if some character is not
   *     matched by any rule, or if a rule matches the empty string.
   */
  this.lex = function (text) {
    if (typeof text !== 'string') {
      throw new Error("Attempt to lex an Object that is not a String.");
    }

    var tokens = [];
    var rules = this._rules;
    var tokenRegExp = this._compile();
    var index = 0;
    var tokenMatch;

    // The compiled expression is shared and stateful (global flag); a
    // previous call that threw may have left `lastIndex` mid-string.
    tokenRegExp.lastIndex = 0;
    while ((tokenMatch = tokenRegExp.exec(text))) {
      // Throw if character is skipped.
      if (tokenMatch.index !== index) {
        throw new Error("Unexpected character found "
          + JSON.stringify(String(text.charAt(index))) + " at index "
          + index + ".");
      }

      // Linear search for the rule whose wrapper group participated. Compare
      // against undefined: an empty match is "" and must still be found.
      var group = 1;
      var r = 0;
      while (tokenMatch[group] === undefined) {
        group += rules[r][3];
        r++;
      }
      var rule = rules[r];

      // Throw an exception rather than enter an infinite loop.
      if (tokenMatch[0].length === 0) {
        throw new Error(
          "Rule at index " + r + " matched the empty string.");
      }

      index += tokenMatch[0].length;
      var token = {
        type: rule[1],
        value: tokenMatch[0],
        match: tokenMatch.slice(group, group + rule[3]),
        offset: index
      };
      if (rule[2]) {
        rule[2].call(this, token);
      }
      // If the type is null then token will be thrown away.
      if (token.type) {
        tokens.push(token);
      }

      // Keep scanning from our own position even if the action used the
      // shared expression (for example by lexing with this lexer again).
      tokenRegExp.lastIndex = index;
    }

    // Throw if all input isn't consumed.
    if (text.length !== index) {
      throw new Error("Unexpected character found "
        + JSON.stringify(String(text.charAt(index))) + " at index "
        + index + ".");
    }
    return tokens;
  };
};
// Expose the Lexer constructor as a property of the CommonJS `exports` object.
exports.Lexer = Lexer;