#!/usr/bin/env python3
"""
PYTHONPATH=. python ./scripts/update_dark_ocaid_references.py /olsystem/etc/openlibrary.yml --since "2025-03"

# e.g. https://openlibrary.org/recentchanges/2025/03/16/bulk_update/146351306
"""
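# This script removes stale `ocaid` references from Open Library editions
# whose Internet Archive items have been darked (withdrawn from public
# access), so OL stops linking to unavailable items.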
| 7 | + |
| 8 | + |
| 9 | +import _init_path # noqa: F401 Imported for its side effect of setting PYTHONPATH |
| 10 | +import requests |
| 11 | +import web |
| 12 | + |
| 13 | +import infogami |
| 14 | +from infogami import config |
| 15 | +from openlibrary.accounts import RunAs |
| 16 | +from openlibrary.config import load_config |
| 17 | +from scripts.solr_builder.solr_builder.fn_to_cli import FnToCLI |
| 18 | + |
| 19 | + |
def get_dark_ol_editions(s3_keys, cursor=None, rows=1000, since=None, until=None):
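    """
    Fetch one page of darked IA items that still reference an OL edition.

    Queries the Archive.org Scrape API with dark scope and returns a tuple
    of (editions, cursor): `editions` maps edition keys like `/books/OL123M`
    to the item's metadata doc; `cursor` is None after the final page.
    """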
    if not s3_keys or not all(k in s3_keys for k in ("s3_key", "s3_secret")):
        raise ValueError("Invalid S3 keys provided")
    scrape_api_url = "https://archive.org/services/search/v1/scrape"
    headers = {"authorization": "LOW {s3_key}:{s3_secret}".format(**s3_keys)}
    fields = ["identifier", "curatenote", "curatedate", "openlibrary_edition"]
    q = "openlibrary_edition:*"
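    # Optionally narrow by curation date with a Lucene range query;
    # '*' leaves either bound open.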
    if since or until:
        q = ' AND '.join((q, f"curatedate:[{since or '*'} TO {until or '*'}]"))

    query_params = {
        "q": q,
        "count": rows,
        "scope": "dark",
        "service": "metadata__dark",
        "fields": ",".join(fields),
    }
    if cursor:
        query_params['cursor'] = cursor

    response = requests.get(scrape_api_url, headers=headers, params=query_params)
    response.raise_for_status()
    data = response.json()
    cursor = data.get("cursor")
    editions = {
        # Edition key `/books/OL123M` mapped to its ES dark item metadata
        f'/books/{doc["openlibrary_edition"]}': doc
        for doc in data.get("items", [])
    }
    return editions, cursor


def disassociate_dark_ocaids(s3_keys, since=None, until=None, test=True):
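    """
    Remove the `ocaid` from every OL edition whose ocaid matches a darked item.

    Pages through all darked items in the requested date window. With the
    default test=True this is a dry run: editions are counted but not saved.
    """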
    cursor = ""
    editions_fetched, editions_dirty, editions_updated = (0, 0, 0)
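    # get_dark_ol_editions returns a None cursor once the Scrape API has no
    # more pages, which ends this loop.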
    while cursor is not None:
        es_editions, cursor = get_dark_ol_editions(
            s3_keys, since=since, until=until, cursor=cursor
        )
        editions_fetched += len(es_editions)
        ol_editions = web.ctx.site.get_many(list(es_editions.keys()))
        updated_eds = []
        for ed in ol_editions:
            es_id = es_editions[ed.key]['identifier']
            ed_dict = ed.dict()
            if ed_dict.get('ocaid') == es_id:
                editions_dirty += 1
                del ed_dict['ocaid']
                updated_eds.append(ed_dict)

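        # Unless this is a dry run, save the cleaned editions as ImportBot.
        # A fallback IP is set because this runs outside a web request,
        # where web.ctx.ip can be unset.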
        if updated_eds and not test:
            editions_updated += len(updated_eds)
            with RunAs('ImportBot'):
                web.ctx.ip = web.ctx.ip or '127.0.0.1'
                web.ctx.site.save_many(updated_eds, comment="Redacting ocaids")
    return editions_fetched, editions_dirty, editions_updated


def main(
    ol_config: str,
    since: str | None = None,
    until: str | None = None,
    test: bool = True,
):
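    """
    :param ol_config: Path to an openlibrary.yml config file
    :param since: Only items darked on or after this date, e.g. "2025-03"
    :param until: Only items darked on or before this date
    :param test: When True (the default), run dry and save nothing
    """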
    load_config(ol_config)
    infogami._setup()
    s3_keys = config.get('ia_ol_metadata_write_s3')  # XXX needs dark scope
    editions_fetched, editions_dirty, editions_updated = disassociate_dark_ocaids(
        s3_keys, since=since, until=until, test=test
    )
    print(
        f"{editions_fetched} editions fetched, "
        f"{editions_dirty} editions dirty, "
        f"{editions_updated} editions updated"
    )


if __name__ == '__main__':
    FnToCLI(main).run()