-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathdownload_packages.py
More file actions
106 lines (90 loc) · 3.11 KB
/
download_packages.py
File metadata and controls
106 lines (90 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Download N most popular PyPI packages
import json
import os
import argparse
import requests
# Default location of the "top PyPI packages" JSON dump (30-day download
# statistics, going by its file name).  Used as the default value of the
# --top-packages command-line option.
TOP_PYPI_PACKAGES = (
    "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json"
)

# URL template of the per-project PyPI JSON document; "{}" is filled in with
# the project name.  Points at the canonical pypi.org host instead of the
# legacy pypi.python.org host name.
PYPI_INFO = "https://pypi.org/pypi/{}/json"
def dl_data(url, timeout=30):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait on the server before giving up.  Passed
            straight to ``requests.get``; without it a stalled connection
            would block the script forever.

    Returns:
        The response body as bytes (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content
def dl_json(url, timeout=30):
    """Download *url* and return its body decoded as JSON.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait on the server before giving up.  Passed
            straight to ``requests.get``; without it a stalled connection
            would block the script forever.

    Returns:
        The decoded JSON document (``response.json()``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()
def dl_package_info(package):
    """Fetch and return the decoded PyPI JSON document for *package*."""
    info_url = PYPI_INFO.format(package)
    return dl_json(info_url)
# Command-line interface of the script; the arguments are parsed in main().
parser = argparse.ArgumentParser(
    description="Download N most popular PyPI packages"
)
# How many of the most-downloaded projects to fetch.
parser.add_argument(
    "-n", "--number", type=int, default=100, help="How many packages (default 100)"
)
# Directory that receives the downloaded archives (created if missing).
parser.add_argument(
    "-o", "--odir", default="packages", help="Where to download (default ./packages)"
)
# Source of the popularity ranking; the help text previously had an
# unbalanced quote character.
parser.add_argument(
    "-t",
    "--top-packages",
    default=TOP_PYPI_PACKAGES,
    help=f"URL for 'top PYPI packages' (default {TOP_PYPI_PACKAGES})",
)
def _latest_release(info):
    """Return the version string of the newest release in *info*, or None.

    The ``info.version`` field of the PyPI JSON document is preferred.  If
    it is absent or not a key of ``releases``, fall back to the last key of
    ``releases`` (assuming they are listed in chronological order).  None is
    returned when the project has no releases at all.
    """
    releases = info["releases"]
    if not releases:
        return None
    version = info.get("info", {}).get("version")
    if version in releases:
        return version
    return list(releases)[-1]


def _find_sdist(files):
    """Return the first entry of *files* whose name ends in .tar.gz, or None."""
    return next(
        (file for file in files if file["filename"].endswith(".tar.gz")), None
    )


def _write_file(path, data):
    """Write *data* to *path* through a temporary file and a final rename.

    An interrupted write therefore never leaves a truncated file under the
    final name, which a later run would skip as "already exists".
    """
    tmp_path = path + ".part"
    try:
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, path)
    except BaseException:
        # Also reached on KeyboardInterrupt: drop the partial file, then let
        # the original exception propagate.
        try:
            os.remove(tmp_path)
        except OSError:
            # Best effort only; the temporary file may never have been created.
            pass
        raise


def main():
    """Download the sdist of the latest release of the N most popular packages.

    Reads the command-line options from the module-level ``parser``, fetches
    the popularity ranking, and for each of the top N projects stores the
    ``.tar.gz`` file of its latest release in the output directory.  Files
    that already exist are skipped; projects without releases or without a
    ``.tar.gz`` file are counted as missed.  A summary is always printed,
    including after Ctrl-C or an error.
    """
    args = parser.parse_args()
    os.makedirs(args.odir, exist_ok=True)
    packages = dl_json(args.top_packages)
    print("Last update:", packages["last_update"])
    # Sort from high to low download count
    rows = sorted(
        packages["rows"], key=lambda row: row["download_count"], reverse=True
    )
    # Limit to top N packages; clamp so a negative N selects nothing instead
    # of slicing from the end of the list.
    rows = rows[: max(args.number, 0)]
    print(f"Downloading {len(rows)} packages...")
    index = 0  # stays 0 if interrupted before the first project
    count = 0
    skipped = 0
    missing = 0
    try:
        for index, row in enumerate(rows, start=1):
            print(
                f"Project {row['project']}"
                f" was downloaded {row['download_count']:,d} times"
            )
            info = dl_package_info(row["project"])
            last_release = _latest_release(info)
            if last_release is None:
                # No releases at all: nothing to download for this project.
                missing += 1
                continue
            print(f" Last release: {last_release}")
            # Download the sdist, which is the .tar.gz filename
            sdist = _find_sdist(info["releases"][last_release])
            if sdist is None:
                missing += 1
                continue
            filename = sdist["filename"]
            print(f" File name: {filename}")
            # basename() keeps the download inside the output directory even
            # if the reported file name contains path separators.
            dest = os.path.basename(filename)
            fulldest = os.path.join(args.odir, dest)
            if os.path.exists(fulldest):
                print(f" Skipping {fulldest} (already exists)")
                skipped += 1
                continue
            url = sdist["url"]
            print(f" URL: {url}")
            data = dl_data(url)
            print(f" Writing {len(data)} bytes to {fulldest} ")
            _write_file(fulldest, data)
            count += 1
    except KeyboardInterrupt:
        print(f"Interrupted at index {index}")
    finally:
        print(
            f"Out of {len(rows)} packages:"
            f" downloaded {count}, skipped {skipped}, missed {missing}"
        )
# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()