Skip to content

Commit 64df490

Browse files
PGijsbers authored and eddiebergman committed
Update fetching a bucket from MinIO
Previously, each dataset had its own bucket:
https://openml1.win.tue.nl/datasets61/dataset_61.pq
However, we were advised to reduce the number of buckets and instead host many objects in a hierarchical structure, so we now use prefixes to divide the dataset objects into separate subdirectories:
https://openml1.win.tue.nl/datasets/0000/0061/dataset_61.pq
This commit bypassed pre-commit. The tests should be updated too.
1 parent b06ecee commit 64df490

File tree

2 files changed

+5
-7
lines changed

2 files changed

+5
-7
lines changed

openml/_api_calls.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,17 +193,18 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
193193
parsed_url = urllib.parse.urlparse(source)
194194

195195
# expect path format: /BUCKET/path/to/file.ext
196-
bucket = parsed_url.path[1:]
196+
_, bucket, *prefixes, _file = parsed_url.path.split("/")
197+
prefix = "/".join(prefixes)
197198

198199
client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
199200

200-
for file_object in client.list_objects(bucket, recursive=True):
201+
for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
201202
if file_object.object_name is None:
202203
raise ValueError("Object name is None.")
203204

204205
_download_minio_file(
205-
source=source + "/" + file_object.object_name,
206-
destination=Path(destination, file_object.object_name),
206+
source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
207+
destination=Path(destination, file_object.object_name.rsplit("/", 1)[1]),
207208
exists_ok=True,
208209
)
209210

openml/datasets/functions.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1211,9 +1211,6 @@ def _get_dataset_parquet(
12111211
# For now, it would be the only way for the user to fetch the additional
12121212
# files in the bucket (no function exists on an OpenMLDataset to do this).
12131213
if download_all_files:
1214-
if url.endswith(".pq"):
1215-
url, _ = url.rsplit("/", maxsplit=1)
1216-
12171214
openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)
12181215

12191216
if not output_file_path.is_file():

0 commit comments

Comments
 (0)