Update fetching a bucket from MinIO

PGijsbers · eddiebergman · commit 64df49072691 · 2024-01-12T11:35:45.000+01:00
Previously, each dataset had its own bucket (https://openml1.win.tue.nl/datasets61/dataset_61.pq), but we were advised to reduce the number of buckets and favor hosting many objects in a hierarchical structure. We therefore now use prefixes to divide the dataset objects into separate subdirectories (https://openml1.win.tue.nl/datasets/0000/0061/dataset_61.pq). This commit has bypassed pre-commit. Tests should be updated too.
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -193,17 +193,18 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
     parsed_url = urllib.parse.urlparse(source)
 
     # expect path format: /BUCKET/path/to/file.ext
-    bucket = parsed_url.path[1:]
+    _, bucket, *prefixes, _file = parsed_url.path.split("/")
+    prefix = "/".join(prefixes)
 
     client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
 
-    for file_object in client.list_objects(bucket, recursive=True):
+    for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
         if file_object.object_name is None:
             raise ValueError("Object name is None.")
 
         _download_minio_file(
-            source=source + "/" + file_object.object_name,
-            destination=Path(destination, file_object.object_name),
+            source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
+            destination=Path(destination, file_object.object_name.rsplit("/", 1)[1]),
             exists_ok=True,
         )
 
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -1211,9 +1211,6 @@ def _get_dataset_parquet(
     # For now, it would be the only way for the user to fetch the additional
     # files in the bucket (no function exists on an OpenMLDataset to do this).
     if download_all_files:
-        if url.endswith(".pq"):
-            url, _ = url.rsplit("/", maxsplit=1)
-
         openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)
 
     if not output_file_path.is_file():