Skip to content

Commit 3282332

Browse files
committed
[linkcheck] Use stricter HTML parsing during anchor-checking.
1 parent 12d0125 commit 3282332

2 files changed

Lines changed: 38 additions & 1 deletion

File tree

sphinx/builders/linkcheck.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ def _get_request_headers(
582582
return {}
583583

584584

585-
def contains_anchor(response: Response, anchor: str) -> bool:
585+
def contains_anchor(response: Response, anchor: str, *, lenient: bool = False) -> bool:
586586
"""Determine if an anchor is contained within an HTTP response."""
587587
parser = AnchorCheckParser(unquote(anchor))
588588
# Read file in chunks. If we find a matching anchor, we break
@@ -595,6 +595,10 @@ def contains_anchor(response: Response, anchor: str) -> bool:
595595
if parser.found:
596596
break
597597
parser.close()
598+
599+
if parser.errors and not lenient:
600+
raise ValueError(parser.errors[0])
601+
598602
return parser.found
599603

600604

@@ -606,8 +610,17 @@ def __init__(self, search_anchor: str) -> None:
606610

607611
self.search_anchor = search_anchor
608612
self.found = False
613+
self.errors: list[str] = []
614+
self.decl: str | None = None
615+
616+
def handle_decl(self, decl: str) -> None:
617+
self.decl = decl
609618

610619
def handle_starttag(self, tag: Any, attrs: Any) -> None:
620+
if self.errors:
621+
return
622+
if self.decl is None:
623+
self.errors.append(f'encountered start tag "{tag}" before a doctype declaration')
611624
for key, value in attrs:
612625
if key in ('id', 'name') and value == self.search_anchor:
613626
self.found = True

tests/test_builders/test_build_linkcheck.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ def do_GET(self):
320320
self._send_chunked(content)
321321

322322

323+
@pytest.mark.xfail() # AnchorsIgnoreForUrlHandler returns incomplete HTML documents
323324
@pytest.mark.sphinx(
324325
'linkcheck', testroot='linkcheck-anchors-ignore-for-url', freshenv=True,
325326
confoverrides={'linkcheck_anchors_ignore_for_url': [
@@ -380,6 +381,29 @@ def do_GET(self):
380381
)
381382

382383

384+
@pytest.mark.sphinx('linkcheck', testroot='linkcheck-localserver-anchor', freshenv=True)
385+
def test_incomplete_html_anchor(app):
386+
class IncompleteHTMLDocumentHandler(BaseHTTPRequestHandler):
387+
protocol_version = 'HTTP/1.1'
388+
389+
def do_GET(self):
390+
content = b'this is <div id="anchor">not</div> a valid HTML document'
391+
self.send_response(200, 'OK')
392+
self.send_header('Content-Length', str(len(content)))
393+
self.end_headers()
394+
self.wfile.write(content)
395+
396+
with http_server(IncompleteHTMLDocumentHandler):
397+
app.build()
398+
399+
content = (app.outdir / 'output.json').read_text(encoding='utf8')
400+
assert len(content.splitlines()) == 1
401+
402+
row = json.loads(content)
403+
assert row['status'] == 'broken'
404+
assert row['info'] == 'encountered start tag "div" before a doctype declaration'
405+
406+
383407
def custom_handler(valid_credentials=(), success_criteria=lambda _: True):
384408
"""
385409
Returns an HTTP request handler that authenticates the client and then determines

0 commit comments

Comments
 (0)