1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
|
/* Tokenizer for JavaScript code */
var tokenizeJavaScript = (function() {
// Advance the stream until the given character (not preceded by a
// backslash) is encountered, or the end of the line is reached.
function nextUntilUnescaped(source, end) {
var escaped = false;
var next;
while (!source.endOfLine()) {
var next = source.next();
if (next == end && !escaped)
return false;
escaped = !escaped && next == "\\";
}
return escaped;
}
// A map of JavaScript's keywords. The a/b/c keyword distinction is
// very rough, but it gives the parser enough information to parse
// correct code correctly (we don't care that much how we parse
// incorrect code). The style information included in these objects
// is used by the highlighter to pick the correct CSS style for a
// token.
var keywords = function(){
function result(type, style){
return {type: type, style: "js-" + style};
}
// keywords that take a parenthised expression, and then a
// statement (if)
var keywordA = result("keyword a", "keyword");
// keywords that take just a statement (else)
var keywordB = result("keyword b", "keyword");
// keywords that optionally take an expression, and form a
// statement (return)
var keywordC = result("keyword c", "keyword");
var operator = result("operator", "keyword");
var atom = result("atom", "atom");
return {
"if": keywordA, "while": keywordA, "with": keywordA,
"else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB,
"return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "delete": keywordC, "throw": keywordC,
"in": operator, "typeof": operator, "instanceof": operator,
"var": result("var", "keyword"), "function": result("function", "keyword"), "catch": result("catch", "keyword"),
"for": result("for", "keyword"), "switch": result("switch", "keyword"),
"case": result("case", "keyword"), "default": result("default", "keyword"),
"true": atom, "false": atom, "null": atom, "undefined": atom, "NaN": atom, "Infinity": atom
};
}();
// Some helper regexps
var isOperatorChar = /[+\-*&%\/=<>!?|]/;
var isHexDigit = /[0-9A-Fa-f]/;
var isWordChar = /[\w\$_]/;
// Wrapper around jsToken that helps maintain parser state (whether
// we are inside of a multi-line comment and whether the next token
// could be a regular expression).
function jsTokenState(inside, regexp) {
return function(source, setState) {
var newInside = inside;
var type = jsToken(inside, regexp, source, function(c) {newInside = c;});
var newRegexp = type.type == "operator" || type.type == "keyword c" || type.type.match(/^[\[{}\(,;:]$/);
if (newRegexp != regexp || newInside != inside)
setState(jsTokenState(newInside, newRegexp));
return type;
};
}
// The token reader, inteded to be used by the tokenizer from
// tokenize.js (through jsTokenState). Advances the source stream
// over a token, and returns an object containing the type and style
// of that token.
function jsToken(inside, regexp, source, setInside) {
function readHexNumber(){
source.next(); // skip the 'x'
source.nextWhileMatches(isHexDigit);
return {type: "number", style: "js-atom"};
}
function readNumber() {
source.nextWhileMatches(/[0-9]/);
if (source.equals(".")){
source.next();
source.nextWhileMatches(/[0-9]/);
}
if (source.equals("e") || source.equals("E")){
source.next();
if (source.equals("-"))
source.next();
source.nextWhileMatches(/[0-9]/);
}
return {type: "number", style: "js-atom"};
}
// Read a word, look it up in keywords. If not found, it is a
// variable, otherwise it is a keyword of the type found.
function readWord() {
source.nextWhileMatches(isWordChar);
var word = source.get();
var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word];
return known ? {type: known.type, style: known.style, content: word} :
{type: "variable", style: "js-variable", content: word};
}
function readRegexp() {
nextUntilUnescaped(source, "/");
source.nextWhileMatches(/[gi]/);
return {type: "regexp", style: "js-string"};
}
// Mutli-line comments are tricky. We want to return the newlines
// embedded in them as regular newline tokens, and then continue
// returning a comment token for every line of the comment. So
// some state has to be saved (inside) to indicate whether we are
// inside a /* */ sequence.
function readMultilineComment(start){
var newInside = "/*";
var maybeEnd = (start == "*");
while (true) {
if (source.endOfLine())
break;
var next = source.next();
if (next == "/" && maybeEnd){
newInside = null;
break;
}
maybeEnd = (next == "*");
}
setInside(newInside);
return {type: "comment", style: "js-comment"};
}
function readOperator() {
source.nextWhileMatches(isOperatorChar);
return {type: "operator", style: "js-operator"};
}
function readString(quote) {
var endBackSlash = nextUntilUnescaped(source, quote);
setInside(endBackSlash ? quote : null);
return {type: "string", style: "js-string"};
}
// Fetch the next token. Dispatches on first character in the
// stream, or first two characters when the first is a slash.
if (inside == "\"" || inside == "'")
return readString(inside);
var ch = source.next();
if (inside == "/*")
return readMultilineComment(ch);
else if (ch == "\"" || ch == "'")
return readString(ch);
// with punctuation, the type of the token is the symbol itself
else if (/[\[\]{}\(\),;\:\.]/.test(ch))
return {type: ch, style: "js-punctuation"};
else if (ch == "0" && (source.equals("x") || source.equals("X")))
return readHexNumber();
else if (/[0-9]/.test(ch))
return readNumber();
else if (ch == "/"){
if (source.equals("*"))
{ source.next(); return readMultilineComment(ch); }
else if (source.equals("/"))
{ nextUntilUnescaped(source, null); return {type: "comment", style: "js-comment"};}
else if (regexp)
return readRegexp();
else
return readOperator();
}
else if (isOperatorChar.test(ch))
return readOperator();
else
return readWord();
}
// The external interface to the tokenizer.
return function(source, startState) {
return tokenizer(source, startState || jsTokenState(false, true));
};
})();
|