Skip to content

Commit 6ae3b68

Browse files
ayazhafizmhevery
authored andcommitted
feat(compiler): Parse and recover on incomplete opening HTML tags (#38681)
Let's say we have a code like ```html <div<span>123</span> ``` Currently this gets parsed into a tree with the element tag `div<span`. This has at least two downsides: - An incorrect diagnostic that `</span>` doesn't close an element is emitted. - A consumer of the parse tree using it for editor services is unable to provide correct completions for the opening `<span>` tag. This patch attempts to fix both issues by instead parsing the code into the same tree that would be parsed for `<div></div><span>123</span>`. In particular, we do this by optimistically scanning an open tag as usual, but if we do not notice a terminating '>', we mark the tag as "incomplete". A parser then emits an error for the incomplete tag and adds a synthetic (recovered) element node to the tree with the incomplete open tag's name. What's the downside of this? For one, a breaking change. <ol> <li> The first breaking change is that `<` symbols that are ambiguously text or opening tags will be parsed as opening tags instead of text in element bodies. Take the code ```html <p>a<b</p> ``` Clearly we cannot have the best of both worlds, and this patch chooses to swap the parsing strategy to support the new feature. Of course, `<` can still be inserted as text via the `&lt;` entity. </li> </ol> Part of #38596 PR Close #38681
1 parent 49f27e3 commit 6ae3b68

File tree

4 files changed

+153
-32
lines changed

4 files changed

+153
-32
lines changed

packages/compiler/src/ml_parser/lexer.ts

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ export enum TokenType {
1717
TAG_OPEN_END,
1818
TAG_OPEN_END_VOID,
1919
TAG_CLOSE,
20+
INCOMPLETE_TAG_OPEN,
2021
TEXT,
2122
ESCAPABLE_RAW_TEXT,
2223
RAW_TEXT,
@@ -511,8 +512,6 @@ class _Tokenizer {
511512
let tagName: string;
512513
let prefix: string;
513514
let openTagToken: Token|undefined;
514-
let tokensBeforeTagOpen = this.tokens.length;
515-
const innerStart = this._cursor.clone();
516515
try {
517516
if (!chars.isAsciiLetter(this._cursor.peek())) {
518517
throw this._createError(
@@ -523,7 +522,8 @@ class _Tokenizer {
523522
prefix = openTagToken.parts[0];
524523
tagName = openTagToken.parts[1];
525524
this._attemptCharCodeUntilFn(isNotWhitespace);
526-
while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT) {
525+
while (this._cursor.peek() !== chars.$SLASH && this._cursor.peek() !== chars.$GT &&
526+
this._cursor.peek() !== chars.$LT) {
527527
this._consumeAttributeName();
528528
this._attemptCharCodeUntilFn(isNotWhitespace);
529529
if (this._attemptCharCode(chars.$EQ)) {
@@ -535,14 +535,15 @@ class _Tokenizer {
535535
this._consumeTagOpenEnd();
536536
} catch (e) {
537537
if (e instanceof _ControlFlowError) {
538-
// When the start tag is invalid (including invalid "attributes"), assume we want a "<"
539-
this._cursor = innerStart;
540538
if (openTagToken) {
541-
this.tokens.length = tokensBeforeTagOpen;
539+
// We errored before we could close the opening tag, so it is incomplete.
540+
openTagToken.type = TokenType.INCOMPLETE_TAG_OPEN;
541+
} else {
542+
// When the start tag is invalid, assume we want a "<" as text.
543+
// Back to back text tokens are merged at the end.
544+
this._beginToken(TokenType.TEXT, start);
545+
this._endToken(['<']);
542546
}
543-
// Back to back text tokens are merged at the end
544-
this._beginToken(TokenType.TEXT, start);
545-
this._endToken(['<']);
546547
return;
547548
}
548549

@@ -772,8 +773,8 @@ function isNotWhitespace(code: number): boolean {
772773
}
773774

774775
function isNameEnd(code: number): boolean {
775-
return chars.isWhitespace(code) || code === chars.$GT || code === chars.$SLASH ||
776-
code === chars.$SQ || code === chars.$DQ || code === chars.$EQ;
776+
return chars.isWhitespace(code) || code === chars.$GT || code === chars.$LT ||
777+
code === chars.$SLASH || code === chars.$SQ || code === chars.$DQ || code === chars.$EQ;
777778
}
778779

779780
function isPrefixEnd(code: number): boolean {

packages/compiler/src/ml_parser/parser.ts

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ class _TreeBuilder {
5656

5757
build(): void {
5858
while (this._peek.type !== lex.TokenType.EOF) {
59-
if (this._peek.type === lex.TokenType.TAG_OPEN_START) {
59+
if (this._peek.type === lex.TokenType.TAG_OPEN_START ||
60+
this._peek.type === lex.TokenType.INCOMPLETE_TAG_OPEN) {
6061
this._consumeStartTag(this._advance());
6162
} else if (this._peek.type === lex.TokenType.TAG_CLOSE) {
6263
this._consumeEndTag(this._advance());
@@ -233,8 +234,7 @@ class _TreeBuilder {
233234
}
234235

235236
private _consumeStartTag(startTagToken: lex.Token) {
236-
const prefix = startTagToken.parts[0];
237-
const name = startTagToken.parts[1];
237+
const [prefix, name] = startTagToken.parts;
238238
const attrs: html.Attribute[] = [];
239239
while (this._peek.type === lex.TokenType.ATTR_NAME) {
240240
attrs.push(this._consumeAttr(this._advance()));
@@ -266,6 +266,12 @@ class _TreeBuilder {
266266
// Elements that are self-closed have their `endSourceSpan` set to the full span, as the
267267
// element start tag also represents the end tag.
268268
this._popElement(fullName, span);
269+
} else if (startTagToken.type === lex.TokenType.INCOMPLETE_TAG_OPEN) {
270+
// We already know the opening tag is not complete, so it is unlikely it has a corresponding
271+
// close tag. Let's optimistically parse it as a full element and emit an error.
272+
this._popElement(fullName, null);
273+
this.errors.push(
274+
TreeError.create(fullName, span, `Opening tag "${fullName}" not terminated.`));
269275
}
270276
}
271277

@@ -295,15 +301,21 @@ class _TreeBuilder {
295301
}
296302
}
297303

298-
private _popElement(fullName: string, endSourceSpan: ParseSourceSpan): boolean {
304+
/**
305+
* Closes the nearest element with the tag name `fullName` in the parse tree.
306+
* `endSourceSpan` is the span of the closing tag, or null if the element does
307+
* not have a closing tag (for example, this happens when an incomplete
308+
* opening tag is recovered).
309+
*/
310+
private _popElement(fullName: string, endSourceSpan: ParseSourceSpan|null): boolean {
299311
for (let stackIndex = this._elementStack.length - 1; stackIndex >= 0; stackIndex--) {
300312
const el = this._elementStack[stackIndex];
301313
if (el.name == fullName) {
302314
// Record the parse span with the element that is being closed. Any elements that are
303315
// removed from the element stack at this point are closed implicitly, so they won't get
304316
// an end source span (as there is no explicit closing element).
305317
el.endSourceSpan = endSourceSpan;
306-
el.sourceSpan.end = endSourceSpan.end || el.sourceSpan.end;
318+
el.sourceSpan.end = endSourceSpan !== null ? endSourceSpan.end : el.sourceSpan.end;
307319

308320
this._elementStack.splice(stackIndex, this._elementStack.length - stackIndex);
309321
return true;

packages/compiler/test/ml_parser/html_parser_spec.ts

Lines changed: 62 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import {HtmlParser, ParseTreeResult, TreeError} from '../../src/ml_parser/html_p
1111
import {TokenType} from '../../src/ml_parser/lexer';
1212
import {ParseError} from '../../src/parse_util';
1313

14-
import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spec_utils';
14+
import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn, humanizeNodes} from './ast_spec_utils';
1515

1616
{
1717
describe('HtmlParser', () => {
@@ -622,7 +622,7 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spe
622622
`{a, select, b {foo} % { bar {% bar}}`, 'TestComp', {tokenizeExpansionForms: true});
623623
expect(humanizeErrors(p.errors)).toEqual([
624624
[
625-
6,
625+
TokenType.RAW_TEXT,
626626
'Unexpected character "EOF" (Do you have an unescaped "{" in your template? Use "{{ \'{\' }}") to escape it.)',
627627
'0:36'
628628
],
@@ -840,14 +840,66 @@ import {humanizeDom, humanizeDomSourceSpans, humanizeLineColumn} from './ast_spe
840840
]]);
841841
});
842842

843-
it('should report subsequent open tags without proper close tag', () => {
844-
const errors = parser.parse('<div</div>', 'TestComp').errors;
845-
expect(errors.length).toEqual(1);
846-
expect(humanizeErrors(errors)).toEqual([[
847-
'div',
848-
'Unexpected closing tag "div". It may happen when the tag has already been closed by another tag. For more info see https://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags',
849-
'0:4'
850-
]]);
843+
describe('incomplete element tag', () => {
844+
it('should parse and report incomplete tags after the tag name', () => {
845+
const {errors, rootNodes} = parser.parse('<div<span><div </span>', 'TestComp');
846+
847+
expect(humanizeNodes(rootNodes, true)).toEqual([
848+
[html.Element, 'div', 0, '<div', '<div', null],
849+
[html.Element, 'span', 0, '<span><div </span>', '<span>', '</span>'],
850+
[html.Element, 'div', 1, '<div ', '<div ', null],
851+
]);
852+
853+
expect(humanizeErrors(errors)).toEqual([
854+
['div', 'Opening tag "div" not terminated.', '0:0'],
855+
['div', 'Opening tag "div" not terminated.', '0:10'],
856+
]);
857+
});
858+
859+
it('should parse and report incomplete tags after attribute', () => {
860+
const {errors, rootNodes} =
861+
parser.parse('<div class="hi" sty<span></span>', 'TestComp');
862+
863+
expect(humanizeNodes(rootNodes, true)).toEqual([
864+
[html.Element, 'div', 0, '<div class="hi" sty', '<div class="hi" sty', null],
865+
[html.Attribute, 'class', 'hi', 'class="hi"'],
866+
[html.Attribute, 'sty', '', 'sty'],
867+
[html.Element, 'span', 0, '<span></span>', '<span>', '</span>'],
868+
]);
869+
870+
expect(humanizeErrors(errors)).toEqual([
871+
['div', 'Opening tag "div" not terminated.', '0:0'],
872+
]);
873+
});
874+
875+
it('should parse and report incomplete tags after quote', () => {
876+
const {errors, rootNodes} = parser.parse('<div "<span></span>', 'TestComp');
877+
878+
expect(humanizeNodes(rootNodes, true)).toEqual([
879+
[html.Element, 'div', 0, '<div ', '<div ', null],
880+
[html.Text, '"', 0, '"'],
881+
[html.Element, 'span', 0, '<span></span>', '<span>', '</span>'],
882+
]);
883+
884+
expect(humanizeErrors(errors)).toEqual([
885+
['div', 'Opening tag "div" not terminated.', '0:0'],
886+
]);
887+
});
888+
889+
it('should report subsequent open tags without proper close tag', () => {
890+
const errors = parser.parse('<div</div>', 'TestComp').errors;
891+
expect(errors.length).toEqual(2);
892+
expect(humanizeErrors(errors)).toEqual([
893+
['div', 'Opening tag "div" not terminated.', '0:0'],
894+
// TODO(ayazhafiz): the following error is unnecessary and can be pruned if we keep
895+
// track of the incomplete tag names.
896+
[
897+
'div',
898+
'Unexpected closing tag "div". It may happen when the tag has already been closed by another tag. For more info see https://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags',
899+
'0:4'
900+
]
901+
]);
902+
});
851903
});
852904

853905
it('should report closing tag for void elements', () => {

packages/compiler/test/ml_parser/lexer_spec.ts

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,45 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
232232
[lex.TokenType.EOF, ''],
233233
]);
234234
});
235+
236+
describe('tags', () => {
237+
it('after tag name', () => {
238+
expect(tokenizeAndHumanizeSourceSpans('<div<span><div</span>')).toEqual([
239+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
240+
[lex.TokenType.TAG_OPEN_START, '<span'],
241+
[lex.TokenType.TAG_OPEN_END, '>'],
242+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
243+
[lex.TokenType.TAG_CLOSE, '</span>'],
244+
[lex.TokenType.EOF, ''],
245+
]);
246+
});
247+
248+
it('in attribute', () => {
249+
expect(tokenizeAndHumanizeSourceSpans('<div class="hi" sty<span></span>')).toEqual([
250+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
251+
[lex.TokenType.ATTR_NAME, 'class'],
252+
[lex.TokenType.ATTR_QUOTE, '"'],
253+
[lex.TokenType.ATTR_VALUE, 'hi'],
254+
[lex.TokenType.ATTR_QUOTE, '"'],
255+
[lex.TokenType.ATTR_NAME, 'sty'],
256+
[lex.TokenType.TAG_OPEN_START, '<span'],
257+
[lex.TokenType.TAG_OPEN_END, '>'],
258+
[lex.TokenType.TAG_CLOSE, '</span>'],
259+
[lex.TokenType.EOF, ''],
260+
]);
261+
});
262+
263+
it('after quote', () => {
264+
expect(tokenizeAndHumanizeSourceSpans('<div "<span></span>')).toEqual([
265+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<div'],
266+
[lex.TokenType.TEXT, '"'],
267+
[lex.TokenType.TAG_OPEN_START, '<span'],
268+
[lex.TokenType.TAG_OPEN_END, '>'],
269+
[lex.TokenType.TAG_CLOSE, '</span>'],
270+
[lex.TokenType.EOF, ''],
271+
]);
272+
});
273+
});
235274
});
236275

237276
describe('attributes', () => {
@@ -554,7 +593,8 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
554593
expect(tokenizeAndHumanizeSourceSpans('<p>a<b</p>')).toEqual([
555594
[lex.TokenType.TAG_OPEN_START, '<p'],
556595
[lex.TokenType.TAG_OPEN_END, '>'],
557-
[lex.TokenType.TEXT, 'a<b'],
596+
[lex.TokenType.TEXT, 'a'],
597+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '<b'],
558598
[lex.TokenType.TAG_CLOSE, '</p>'],
559599
[lex.TokenType.EOF, ''],
560600
]);
@@ -579,25 +619,41 @@ import {ParseLocation, ParseSourceFile, ParseSourceSpan} from '../../src/parse_u
579619

580620
it('should parse start tags quotes in place of an attribute name as text', () => {
581621
expect(tokenizeAndHumanizeParts('<t ">')).toEqual([
582-
[lex.TokenType.TEXT, '<t ">'],
622+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
623+
[lex.TokenType.TEXT, '">'],
583624
[lex.TokenType.EOF],
584625
]);
585626

586627
expect(tokenizeAndHumanizeParts('<t \'>')).toEqual([
587-
[lex.TokenType.TEXT, '<t \'>'],
628+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
629+
[lex.TokenType.TEXT, '\'>'],
588630
[lex.TokenType.EOF],
589631
]);
590632
});
591633

592-
it('should parse start tags quotes in place of an attribute name (after a valid attribute) as text',
634+
it('should parse start tags quotes in place of an attribute name (after a valid attribute)',
593635
() => {
594636
expect(tokenizeAndHumanizeParts('<t a="b" ">')).toEqual([
595-
[lex.TokenType.TEXT, '<t a="b" ">'],
637+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
638+
[lex.TokenType.ATTR_NAME, '', 'a'],
639+
[lex.TokenType.ATTR_QUOTE, '"'],
640+
[lex.TokenType.ATTR_VALUE, 'b'],
641+
[lex.TokenType.ATTR_QUOTE, '"'],
642+
// TODO(ayazhafiz): the " symbol should be a synthetic attribute,
643+
// allowing us to complete the opening tag correctly.
644+
[lex.TokenType.TEXT, '">'],
596645
[lex.TokenType.EOF],
597646
]);
598647

599648
expect(tokenizeAndHumanizeParts('<t a=\'b\' \'>')).toEqual([
600-
[lex.TokenType.TEXT, '<t a=\'b\' \'>'],
649+
[lex.TokenType.INCOMPLETE_TAG_OPEN, '', 't'],
650+
[lex.TokenType.ATTR_NAME, '', 'a'],
651+
[lex.TokenType.ATTR_QUOTE, '\''],
652+
[lex.TokenType.ATTR_VALUE, 'b'],
653+
[lex.TokenType.ATTR_QUOTE, '\''],
654+
// TODO(ayazhafiz): the ' symbol should be a synthetic attribute,
655+
// allowing us to complete the opening tag correctly.
656+
[lex.TokenType.TEXT, '\'>'],
601657
[lex.TokenType.EOF],
602658
]);
603659
});

0 commit comments

Comments
 (0)