Skip to content

Commit 43ffaa1

Browse files
Authored by: mekarpeles, pre-commit-ci[bot], and cdrini
(Closes #8343) Create update_stale_ocaid_references.py script (#10588)
* (Closes #8343) Create update_stale_ocaid_references.py script --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Drini Cami <[email protected]>
1 parent 2dcfff5 commit 43ffaa1

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed
+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#!/usr/bin/env python3
2+
"""
3+
PYTHONPATH=. python ./scripts/update_dark_ocaid_references.py /olsystem/etc/openlibrary.yml --since "2025-03"
4+
5+
# e.g. https://openlibrary.org/recentchanges/2025/03/16/bulk_update/146351306
6+
"""
7+
8+
9+
import _init_path # noqa: F401 Imported for its side effect of setting PYTHONPATH
10+
import requests
11+
import web
12+
13+
import infogami
14+
from infogami import config
15+
from openlibrary.accounts import RunAs
16+
from openlibrary.config import load_config
17+
from scripts.solr_builder.solr_builder.fn_to_cli import FnToCLI
18+
19+
20+
def get_dark_ol_editions(s3_keys, cursor=None, rows=1000, since=None, until=None):
21+
if not s3_keys or not all(k in s3_keys for k in ("s3_key", "s3_secret")):
22+
raise ValueError("Invalid S3 keys provided")
23+
scrape_api_url = "https://archive.org/services/search/v1/scrape"
24+
headers = {"authorization": "LOW {s3_key}:{s3_secret}".format(**s3_keys)}
25+
fields = ["identifier", "curatenote", "curatedate", "openlibrary_edition"]
26+
q = "openlibrary_edition:*"
27+
if since or until:
28+
q = ' AND '.join((q, f"curatedate:[{since or '*'} TO {until or '*'}]"))
29+
30+
query_params = {
31+
"q": q,
32+
"count": rows,
33+
"scope": "dark",
34+
"service": "metadata__dark",
35+
"fields": ",".join(fields),
36+
}
37+
if cursor:
38+
query_params['cursor'] = cursor
39+
40+
response = requests.get(scrape_api_url, headers=headers, params=query_params)
41+
response.raise_for_status()
42+
data = response.json()
43+
cursor = data.get("cursor")
44+
editions = {
45+
# Edition key `/books/OL123M` mapped to its ES dark item metadata
46+
f'/books/{doc["openlibrary_edition"]}': doc
47+
for doc in data.get("items", [])
48+
}
49+
return editions, cursor
50+
51+
52+
def disassociate_dark_ocaids(s3_keys, since=None, until=None, test=True):
    """Strip stale `ocaid` references from OL editions whose IA item was darked.

    Pages through the IA scrape API; for each matching OL edition whose
    `ocaid` still points at the darked item, the field is removed and the
    batch is saved as ImportBot — unless *test* is True, in which case
    nothing is written.

    :returns: ``(fetched, dirty, updated)`` counts — editions fetched from
        the scrape API, editions found with a stale ocaid, and editions
        actually saved.
    """
    page_cursor = ""
    fetched = dirty = updated = 0
    # get_dark_ol_editions returns cursor=None on the final page.
    while page_cursor is not None:
        dark_docs, page_cursor = get_dark_ol_editions(
            s3_keys, since=since, until=until, cursor=page_cursor
        )
        fetched += len(dark_docs)
        to_save = []
        for edition in web.ctx.site.get_many(list(dark_docs.keys())):
            record = edition.dict()
            # Only redact when the edition's ocaid still matches the darked
            # item's identifier; a differing ocaid means it was re-pointed.
            if record.get('ocaid') == dark_docs[edition.key]['identifier']:
                dirty += 1
                del record['ocaid']
                to_save.append(record)

        if to_save and not test:
            updated += len(to_save)
            with RunAs('ImportBot'):
                # save_many requires a request IP; fall back to localhost
                # when running outside a web request.
                web.ctx.ip = web.ctx.ip or '127.0.0.1'
                web.ctx.site.save_many(to_save, comment="Redacting ocaids")
    return fetched, dirty, updated
76+
77+
78+
def main(
    ol_config: str,
    since: str | None = None,
    until: str | None = None,
    test: bool = True,
):
    """CLI entry point: load the OL config, then redact stale ocaids.

    :param ol_config: path to an openlibrary.yml configuration file
    :param since: inclusive lower bound on IA `curatedate` (e.g. "2025-03")
    :param until: inclusive upper bound on IA `curatedate`
    :param test: when True (the default), report counts without saving
    """
    load_config(ol_config)
    infogami._setup()
    s3_keys = config.get('ia_ol_metadata_write_s3')  # XXX needs dark scope
    fetched, dirty, updated = disassociate_dark_ocaids(
        s3_keys, since=since, until=until, test=test
    )
    print(
        f"{fetched} editions fetched, "
        f"{dirty} editions dirty, "
        f"{updated} editions updated"
    )
95+
96+
97+
if __name__ == '__main__':
    # FnToCLI derives argparse options from main()'s signature, so the
    # CLI flags mirror its parameters (ol_config, --since, --until, --test).
    FnToCLI(main).run()

0 commit comments

Comments
 (0)