Showing posts with label python. Show all posts
Showing posts with label python. Show all posts

Thursday, July 24, 2025

Using Python To Access archive.today, July 2025

It seems like a lot of the previous software wrappers to interact with archive.today (and archive.is, archive.ph, etc) via the command-line are either outdated or broken. So, here's a Python script to automatically submit links from the command-line to archive.today and retrieve their archived URLs.

From testing, it seems like it's best to keep the delay around 8 to 10 seconds. If you go too fast, Cloudflare will begin to yell at you and start throwing 429 errors.

As long as you've received a "WIP" URL from archive.today, it should be archived shortly after, though it may not appear immediately.

Add your own random user-agent. :)

'''

% python3 archiveToday.py --help
usage: archiveToday.py [-h] --urls URLS [--delay DELAY] [--output OUTPUT]
Batch archive URLs with archive.today
options:
  -h, --help       show this help message and exit
  --urls URLS      Path to file containing URLs (one per line)
  --delay DELAY    Delay between submissions in seconds
  --output OUTPUT  CSV file to save results
  
'''


import requests
import time
import os
import argparse
import csv
import re
from bs4 import BeautifulSoup  

def archive_url(session, url, timeout=30):
    """Submit *url* to archive.ph and return ``(original_url, archived_url)``.

    ``archived_url`` is ``None`` on any failure. A 302 response means the
    page is already archived (snapshot URL in the ``Location`` header); a
    200 response carries a ``refresh`` header pointing at a ``/wip/`` URL
    that resolves to the final snapshot once archiving completes.

    ``timeout`` (seconds) bounds each HTTP request so one hung connection
    cannot stall the whole batch; default keeps callers backward-compatible.
    """
    try:
        print(f"Archiving: {url}")
        # allow_redirects=False so the 302 Location can be inspected directly.
        resp = session.get(
            "https://archive.ph/submit/",
            params={"url": url},
            allow_redirects=False,
            timeout=timeout,
        )

        # If already archived, follow 302 redirect
        if resp.status_code == 302:
            archived_url = resp.headers.get("Location")
            if not archived_url:
                # Guard: a 302 without a Location header would otherwise pass
                # None into re.match and raise TypeError.
                print("302 response missing Location header.")
                return url, None
            # Match both 4 and 5 character archive slugs
            match = re.match(r"(https://archive\.ph/\w{4,5})", archived_url)
            if match:
                archived_url = match.group(1)
            print(f"Already archived: {archived_url}")
            return url, archived_url

        # If needs archiving, follow refresh to /wip/
        if resp.status_code == 200:
            refresh_header = resp.headers.get("refresh", "")
            match = re.search(r'url=(https?://[^\s]+)', refresh_header)
            if not match:
                print("WIP URL not found in refresh header.")
                return url, None

            wip_url = match.group(1)
            print(f"Archiving in progress (WIP): {wip_url}")

            final_resp = session.get(wip_url, allow_redirects=True, timeout=timeout)
            if final_resp.status_code == 200:
                # Strip the /wip/ path segment to get the permanent snapshot URL.
                archived_url = final_resp.url.replace("/wip/", "/")
                print(f"Archived: {archived_url}")
                return url, archived_url
            else:
                print(f"Failed to retrieve from WIP URL. Status: {final_resp.status_code}")
                return url, None

        print(f"Unexpected status code: {resp.status_code}")
        return url, None

    except Exception as e:
        # Best-effort batch tool: report and move on rather than abort the run.
        print(f"Exception archiving {url}: {e}")
        return url, None

def read_urls_from_file(filename):
    """Read URLs from *filename*, one per line.

    Blank lines and lines whose stripped form starts with '#' (comments)
    are skipped; surrounding whitespace is stripped from each URL.
    Returns [] (after printing a warning) when the file does not exist.
    """
    if not os.path.exists(filename):
        # Bug fix: previously printed a literal placeholder instead of the path.
        print(f"File not found: {filename}")
        return []

    with open(filename, 'r') as f:
        urls = [
            line.strip()
            for line in f
            if line.strip() and not line.strip().startswith("#")
        ]
    return urls


def main():
    """CLI entry point: read a URL list, archive each one, save results to CSV."""
    parser = argparse.ArgumentParser(description="Batch archive URLs with archive.today")
    parser.add_argument("--urls", required=True, help="Path to file containing URLs (one per line)")
    parser.add_argument("--delay", type=int, default=30, help="Delay between submissions in seconds")
    parser.add_argument("--output", default="archived_results.csv", help="CSV file to save results")
    args = parser.parse_args()

    url_list = read_urls_from_file(args.urls)
    if not url_list:
        print("No URLs to archive.")
        return

    # One shared session so connection reuse and cookies persist across submissions.
    session = requests.Session()
    session.headers.update({"User-Agent": ""})

    total = len(url_list)
    with open(args.output, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Original URL", "Archived URL"])

        for position, target in enumerate(url_list, 1):
            print(f"\n[{position}/{total}]")
            original, archived = archive_url(session, target)
            writer.writerow([original, archived or ""])
            # Throttle between submissions; no wait needed after the last URL.
            if position < total:
                print(f"Waiting {args.delay} seconds before next...")
                time.sleep(args.delay)

    print(f"\nFinished. Results saved to {args.output}")

if __name__ == "__main__":
    main()

Wednesday, August 23, 2023

Portable Executable Format and Structured Exception Handling

The Portable Executable (PE) file format is the native file format for executable and binary files in the Microsoft Windows ecosystem.

Saturday, January 21, 2023

Mm .. Malware Analysis

TL;DR: Analysis of malspam potentially targeting an organization. C#/.NET binary using KoiVM, process hollowing, and abusing vulnerable procexp152.sys driver.

Using Python To Access archive.today, July 2025

It seems like a lot of the previous software wrappers to interact with archive.today (and archive.is, archive.ph, etc) via the command-line ...