Commit de8e22e

improve title extractor
1 parent bf432d4 commit de8e22e

3 files changed, +26 -25 lines changed

archivebox/extractors/__init__.py (+3 -3)
@@ -42,15 +42,15 @@
 
 def get_default_archive_methods():
     return [
-        ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
         ('singlefile', should_save_singlefile, save_singlefile),
         ('pdf', should_save_pdf, save_pdf),
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
         ('wget', should_save_wget, save_wget),
-        ('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them
+        ('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as it depends on them
+        ('readability', should_save_readability, save_readability),
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),

@@ -182,7 +182,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
         except KeyboardInterrupt:
             log_archiving_paused(num_links, idx, link.timestamp)
             raise SystemExit(0)
-        except BaseException: # lgtm [py/catch-base-exception]
+        except BaseException:
            print()
            raise
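The reordering matters because archive methods run in the order returned by get_default_archive_methods(), so placing 'title' (and 'readability') after 'wget' and 'singlefile' lets them reuse HTML those extractors have already saved instead of fetching the page a second time. A minimal sketch of that dispatch pattern, assuming link and out_dir refer to an existing snapshot (illustrative only, not the exact archive_link loop):

    # Illustrative: each (name, should_save, save) tuple runs in list order,
    # so extractors later in the list can read files written by earlier ones.
    for name, should_save, save_func in get_default_archive_methods():
        if should_save(link, out_dir):
            save_func(link, out_dir=out_dir)
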
archivebox/extractors/readability.py (+1 -21)
@@ -22,28 +22,8 @@
     READABILITY_VERSION,
 )
 from ..logging_util import TimedProgress
+from .title import get_html
 
-@enforce_types
-def get_html(link: Link, path: Path) -> str:
-    """
-    Try to find wget, singlefile and then dom files.
-    If none is found, download the url again.
-    """
-    canonical = link.canonical_outputs()
-    abs_path = path.absolute()
-    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
-    document = None
-    for source in sources:
-        try:
-            with open(abs_path / source, "r", encoding="utf-8") as f:
-                document = f.read()
-                break
-        except (FileNotFoundError, TypeError):
-            continue
-    if document is None:
-        return download_url(link.url)
-    else:
-        return document
 
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

archivebox/extractors/title.py (+22 -1)
@@ -58,6 +58,27 @@ def handle_endtag(self, tag):
         if tag.lower() == "title":
             self.inside_title_tag = False
 
+@enforce_types
+def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+    """
+    Try to find wget, singlefile and then dom files.
+    If none is found, download the url again.
+    """
+    canonical = link.canonical_outputs()
+    abs_path = path.absolute()
+    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+    document = None
+    for source in sources:
+        try:
+            with open(abs_path / source, "r", encoding="utf-8") as f:
+                document = f.read()
+                break
+        except (FileNotFoundError, TypeError):
+            continue
+    if document is None:
+        return download_url(link.url, timeout=timeout)
+    else:
+        return document
 
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

@@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        html = download_url(link.url, timeout=timeout)
+        html = get_html(link, out_dir, timeout=timeout)
         try:
             # try using relatively strict html parser first
             parser = TitleParser()
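With get_html in place, save_title prefers HTML that earlier extractors already wrote into the snapshot directory and only falls back to a fresh download; passing timeout through keeps that fallback bounded by the same TIMEOUT setting save_title already uses. A hypothetical call, assuming link is a Link and out_dir points at its snapshot folder:

    from pathlib import Path
    from archivebox.extractors.title import get_html

    # Tries the singlefile, wget, and dom outputs in that order; if none can be
    # read, falls back to download_url(link.url, timeout=timeout).
    html = get_html(link, Path(out_dir), timeout=60)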
