Skip to content

Commit 5a2c78e

Browse files
committed
add proper support for URL_WHITELIST instead of using negation regexes
1 parent e4974d3 commit 5a2c78e

File tree

2 files changed

+6
-4
lines changed

2 files changed

+6
-4
lines changed

archivebox/config.py

+2
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
7878
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
7979
'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
80+
'URL_WHITELIST': {'type': str, 'default': None},
8081
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
8182
},
8283

@@ -337,6 +338,7 @@ def get_real_name(key: str) -> str:
337338
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
338339
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
339340
'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
341+
'URL_WHITELIST_PTN': {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
340342
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
341343

342344
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},

archivebox/index/__init__.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
OUTPUT_DIR,
2424
TIMEOUT,
2525
URL_BLACKLIST_PTN,
26+
URL_WHITELIST_PTN,
2627
stderr,
2728
OUTPUT_PERMISSIONS
2829
)
@@ -141,10 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
141142
continue
142143
if scheme(link.url) not in ('http', 'https', 'ftp'):
143144
continue
144-
if URL_BLACKLIST_PTN and (URL_BLACKLIST_PTN.match(link.url) or URL_BLACKLIST_PTN.search(link.url)):
145-
# https://stackoverflow.com/questions/180986/what-is-the-difference-between-re-search-and-re-match
146-
# we want both behaviors in order to support multiple patterns in the regex,
147-
# and negation regexes like (?!someptnhere) to allow for whitelisting
145+
if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
146+
continue
147+
if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
148148
continue
149149

150150
yield link

0 commit comments

Comments (0)