Skip to content

Commit 8ef28cd

Browse files
author
Ivan Nikulin
committed
Lexer implementation started.
0 parents  commit 8ef28cd

3 files changed

Lines changed: 162 additions & 0 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.idea

lib/err.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
//Error code the lexer's data state records when it meets a U+0000 NULL character
exports.UNEXPECTED_NULL_CHARACTER = 'UNEXPECTED_NULL_CHARACTER';

lib/lexer.js

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
var err = require('./err');
2+
3+
//Const
4+
//Sentinel handed to state processors in place of a character once the input is exhausted
var EOF = null;
5+
6+
//States
7+
//Tokenizer state names. Each value is the string used both as the content of
//Lexer#state and as the key under which the state's processor is registered.
var DATA_STATE = 'DATA_STATE',
    CHARACTER_REFERENCE_IN_DATA_STATE = 'CHARACTER_REFERENCE_IN_DATA_STATE',
    RCDATA_STATE = 'RCDATA_STATE',
    CHARACTER_REFERENCE_IN_RCDATA_STATE = 'CHARACTER_REFERENCE_IN_RCDATA_STATE',
    RAWTEXT_STATE = 'RAWTEXT_STATE',
    SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE',
    PLAINTEXT_STATE = 'PLAINTEXT_STATE',
    TAG_OPEN_STATE = 'TAG_OPEN_STATE',
    END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE',
    TAG_NAME_STATE = 'TAG_NAME_STATE',
    RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE',
    RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE',
    RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE',
    RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE',
    RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE',
    RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE',
    SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE',
    SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE',
    SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE',
    SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE',
    SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE',
    SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE',
    SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE',
    SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE',
    SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE',
    SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE',
    SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE',
    SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE',
    BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE',
    ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE',
    AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE',
    BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE',
    ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE',
    ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE',
    ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE',
    CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUES_STATE = 'CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUES_STATE',
    AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE',
    SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE',
    BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE',
    MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE',
    COMMENT_START_STATE = 'COMMENT_START_STATE',
    COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE',
    COMMENT_STATE = 'COMMENT_STATE',
    COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE',
    COMMENT_END_STATE = 'COMMENT_END_STATE',
    COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE',
    DOCTYPE_STATE = 'DOCTYPE_STATE',
    BEFORE_DOCTYPE_NAME_STATE = 'BEFORE_DOCTYPE_NAME_STATE',
    DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE',
    AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE',
    AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 'AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE',
    BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE',
    DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE',
    DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE',
    AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE',
    BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE',
    AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 'AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE',
    BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE',
    DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE',
    DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE',
    AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE',
    BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE',
    CDATA_SECTION_STATE = 'CDATA_SECTION_STATE';
75+
76+
//Lexer constructor. Stores the input and resets the cursor, the state machine
//and the output buffers.
//  html - the markup to tokenize; it is read by index and through .length
//         (presumably a string - confirm against callers)
var Lexer = exports.Lexer = function (html) {
    //Input data
    this.html = html;

    //Positioning
    this.pos = 0;
    this.line = 1;
    this.col = 1;
    //NOTE: column values saved at each line break, one entry per finished line
    this.lineLengths = [];

    //Tokenization
    this.state = DATA_STATE;
    //NOTE: tokens produced but not yet handed out by getToken()
    this.tokenQueue = [];
    //NOTE: error records of the form {code, line, col}
    this.errs = [];
};
91+
92+
//Token types
93+
//NOTE: values of the `type` field on the token objects queued by the lexer
Lexer.CHARACTER_TOKEN = 'CHARACTER_TOKEN';
Lexer.EOF_TOKEN = 'EOF_TOKEN';
95+
96+
//Proto
97+
//Feeds input characters to the processor of the current state until at least
//one token has been queued, then removes and returns the oldest queued token.
//While a character is being processed, line/col already point just past it;
//_reconsume() relies on that to step back.
Lexer.prototype.getToken = function () {
    var ch,
        nextCh;

    //NOTE: keep running state processors until at least one token is in the queue
    while (!this.tokenQueue.length) {
        //NOTE: recomputed on every pass - if the input runs out while the loop is
        //still spinning, the processor must see EOF, not the last real character again
        ch = this.pos < this.html.length ? this.html[this.pos] : EOF;
        nextCh = this.html[this.pos + 1];

        //NOTE: treat CR+LF as single line break. The break is counted on the LF, so
        //a CR directly followed by LF is just one more character of the current line;
        //this keeps every non-break character worth exactly one column, which is what
        //the unconditional col-- in _reconsume() undoes.
        if (ch === '\n' || (ch === '\r' && nextCh !== '\n') || ch === '\v' || ch === '\f') {
            this.lineLengths.push(this.col);
            this.line++;
            this.col = 1;
        }
        else {
            //NOTE: advanced for EOF as well, mirroring the unconditional pos++ below
            this.col++;
        }

        _[this.state].call(this, ch);

        this.pos++;
    }

    return this.tokenQueue.shift();
};
121+
122+
//Gives the current character back to the input so that it is read again on the
//next pass of getToken(): steps pos and col back by one and, when that crosses
//a line break, returns to the previous line.
Lexer.prototype._reconsume = function () {
    this.pos--;
    this.col--;

    //NOTE: col is reset to 1 right after a line break, so reaching 0 here means
    //the character being given back was that break
    if (!this.col) {
        this.line--;

        //NOTE: lineLengths holds one entry per finished line, so the value saved
        //for the line we return to is the LAST entry (index line - 1, not line).
        //It is removed because getToken() pushes it again when the break is re-read.
        this.col = this.lineLengths.pop();
    }
};
131+
132+
//Records an error at the lexer's current line/col.
//  code - error code to store (one of the values exported by ./err)
Lexer.prototype._err = function (code) {
    this.errs.push({code: code, line: this.line, col: this.col});
};
135+
136+
//Queues a character token.
//  ch - the character the token carries in its `ch` field
Lexer.prototype._emitCharacterToken = function (ch) {
    this.tokenQueue.push({type: Lexer.CHARACTER_TOKEN, ch: ch});
};
139+
140+
//Queues an end-of-file token; it carries nothing besides its type.
Lexer.prototype._emitEOFToken = function () {
    this.tokenQueue.push({type: Lexer.EOF_TOKEN});
};
143+
144+
//State processors
145+
//Dispatch table: state name -> function that processes one input character in
//that state. getToken() looks the processor up by this.state and calls it with
//the lexer as `this`.
var _ = {};
146+
147+
//8.2.4.1 Data state
148+
//Processor for the data state; invoked with the lexer as `this`.
//  ch - current input character, or EOF once the input is exhausted
_[DATA_STATE] = function (ch) {
    if (ch === '&') {
        this.state = CHARACTER_REFERENCE_IN_DATA_STATE;
    }
    else if (ch === '<') {
        this.state = TAG_OPEN_STATE;
    }
    else if (ch === '\u0000') {
        //NOTE: a NULL character is reported as an error but still emitted as is
        this._err(err.UNEXPECTED_NULL_CHARACTER);
        this._emitCharacterToken(ch);
    }
    else if (ch === EOF) {
        this._emitEOFToken();
    }
    else {
        this._emitCharacterToken(ch);
    }
};

0 commit comments

Comments
 (0)