Skip to content

Commit acd53c8

Browse files
committed
handle new wallabag export format with newlines mid-tag attributes
1 parent 808ae1a commit acd53c8

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

archivebox/parsers/wallabag_atom.py

+11 -5
Original file line number | Diff line number | Diff line change
@@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
3434

3535
trailing_removed = entry.split('</entry>', 1)[0]
3636
leading_removed = trailing_removed.strip()
37-
rows = leading_removed.split('\n')
37+
splits_fixed = leading_removed.replace('"\n href="', '" href="')
38+
rows = splits_fixed.split('\n')
3839

39-
def get_row(key):
40-
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
40+
def get_row(prefix):
41+
return [
42+
row.strip()
43+
for row in rows
44+
if row.strip().startswith('<{}'.format(prefix))
45+
][0]
4146

4247
title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip()
43-
url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
48+
url_inside_link = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
49+
url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>')
4450
ts_str = str_between(get_row('published'), '<published>', '</published>')
4551
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
4652
try:
@@ -49,7 +55,7 @@ def get_row(key):
4955
tags = None
5056

5157
yield Link(
52-
url=htmldecode(url),
58+
url=htmldecode(url_inside_attr or url_inside_link),
5359
timestamp=str(time.timestamp()),
5460
title=htmldecode(title) or None,
5561
tags=tags or '',

0 commit comments

Comments
 (0)