Skip to content

Commit 5625286

Browse files
committed
Merge branch '294' of github.com:ig3/node-html-parser into ig3-294
2 parents d9f5586 + 51528c4 commit 5625286

File tree

5 files changed

+1138
-843
lines changed

5 files changed

+1138
-843
lines changed

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@
8787
"standard-version": "^9.5.0",
8888
"travis-cov": "latest",
8989
"ts-node": "^10.9.1",
90-
"typescript": "latest"
90+
"typescript": "latest",
91+
"yarn": "^1.22.22"
9192
},
9293
"config": {
9394
"blanket": {

src/nodes/html.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,9 @@ const kElementsClosedByClosing = {
10171017
th: { tr: true, table: true, TR: true, TABLE: true },
10181018
TH: { tr: true, table: true, TR: true, TABLE: true },
10191019
} as Record<string, Record<string, boolean>>;
1020+
// Exceptions to implicit closing, keyed by the lower-cased tag name of the
// element that is currently open. Each inner map lists the lower-cased parent
// tag names whose closing tag must NOT implicitly close that open element.
// base_parse pops the open element when the closing tag matches the element
// directly beneath it on the stack and that tag is absent from the inner map.
// For `p`, the listed names (a, audio, del, ins, map, noscript, video) are the
// parents for which the HTML tag-omission rule does not let `</p>` be omitted.
const kElementsClosedByClosingExcept = {
1021+
p: { a: true, audio: true, del: true, ins: true, map: true, noscript: true, video: true },
1022+
} as Record<string, Record<string, boolean>>;
10201023

10211024
export interface Options {
10221025
lowerCaseTagName?: boolean;
@@ -1202,6 +1205,28 @@ export function base_parse(data: string, options = {} as Partial<Options>) {
12021205
continue;
12031206
}
12041207
}
1208+
const openTag =
1209+
currentParent.rawTagName ?
1210+
currentParent.rawTagName.toLowerCase() :
1211+
'';
1212+
if (kElementsClosedByClosingExcept[openTag]) {
1213+
const closingTag = tagName.toLowerCase();
1214+
if (stack.length > 1) {
1215+
const possibleContainer = stack[stack.length - 2];
1216+
if (
1217+
possibleContainer &&
1218+
possibleContainer.rawTagName &&
1219+
possibleContainer.rawTagName.toLowerCase() === closingTag &&
1220+
!kElementsClosedByClosingExcept[openTag][closingTag]
1221+
) {
1222+
// Update range end for closed tag
1223+
(<[number, number]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
1224+
stack.pop();
1225+
currentParent = arr_back(stack);
1226+
continue;
1227+
}
1228+
}
1229+
}
12051230
// Use aggressive strategy to handle unmatching markups.
12061231
break;
12071232
}

test/tests/issues/294.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
const { parse, valid } = require('@test/test-target');
2+
3+
// Regression test for issue #294: a <p> whose end tag is omitted must be
// closed implicitly when its parent's closing tag (</main>) is reached.
describe('issue 294 Closing tag is missing but valid HTML still not parsable', function () {
4+
it('Valid HTML missing closing p tag should parse', function () {
5+
// The <p> is never closed explicitly; </main> follows its text directly.
const content = '<body><main class=h-entry><p>hello</main></body>';
6+
// The validator must accept the omitted end tag.
valid(content).should.equal(true);
7+
const root = parse(content);
8+
// Serialised output is expected to contain the implied </p>.
root.outerHTML.should.equal('<body><main class=h-entry><p>hello</p></main></body>');
9+
// <main> (unquoted class attribute) must be matched exactly once by the class selector.
const list = root.querySelectorAll('.h-entry');
10+
list.length.should.equal(1);
11+
});
12+
});

test/tests/valid.js

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,25 @@ describe('parseWithValidation', function () {
2020
result.should.eql(false);
2121
})
2222

23-
it('hillcrestpartyrentals.html should return Object with valid: false. not closing <p> tag on line 476', function () {
23+
// #294: Closing tag is missing but valid HTML is still not parseable
24+
//
25+
// Tag omission in text/html:
26+
// A p element's end tag can be omitted if the p element is immediately
27+
// followed by an address, article, aside, blockquote, details, dialog,
28+
// div, dl, fieldset, figcaption, figure, footer, form, h1, h2, h3, h4,
29+
// h5, h6, header, hgroup, hr, main, menu, nav, ol, p, pre, search,
30+
// section, table, or ul element, or if there is no more content in the
31+
// parent element and the parent element is an HTML element that is not
32+
// an a, audio, del, ins, map, noscript, or video element, or an
33+
// autonomous custom element.
34+
//
35+
// Based on this, hillcrestpartyrentals.html is in fact valid HTML. All
36+
// the p elements missing close tags are contained within td elements
37+
// and, therefore, should be closed when there is no more content in the
38+
// parent td element (i.e. at the `</td>`).
39+
it('hillcrestpartyrentals.html should return Object with valid: true. not closing <p> tag on line 476', function () {
2440
const result = valid(fs.readFileSync(__dirname + '/../assets/html/hillcrestpartyrentals.html').toString());
25-
result.should.eql(false);
41+
result.should.eql(true);
2642
})
2743

2844
it('google.html should return Object with valid: true', function () {

0 commit comments

Comments
 (0)