Skip to content

Commit 2696ea3

Browse files
Wauplinclaude
andcommitted
fix: match Unix cp -r nesting semantics in copy_files
When copying a folder to an existing directory, Unix `cp -r` nests the source folder inside it (e.g. `cp -r src dst` → `dst/src/...`). The previous implementation always copied contents directly (→ `dst/...`), which is rsync-like rather than cp-like. Track `destination_exists_as_directory` separately from the `destination_is_directory` flag (which also covers trailing-slash hints) so that folder copies can distinguish "destination exists" from "destination was requested as a directory". Nesting only applies when the destination actually exists as a prefix in the bucket. Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
1 parent 3b383db commit 2696ea3

2 files changed

Lines changed: 74 additions & 8 deletions

File tree

src/huggingface_hub/hf_api.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12621,15 +12621,21 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None
1262112621

1262212622
destination_bucket_id = destination_handle.bucket_id
1262312623
destination_path = destination_handle.path
12624-
if destination_path == "" or destination.endswith("/"):
12624+
destination_is_directory = False
12625+
destination_exists_as_directory = False
12626+
12627+
if destination_path == "":
12628+
# Bucket root always exists as a directory
1262512629
destination_is_directory = True
12630+
destination_exists_as_directory = True
1262612631
else:
12627-
# Check if destination path is an existing file or a directory in the bucket
12632+
# Check if destination matches an existing file
1262812633
dest_path_info = list(self.get_bucket_paths_info(destination_bucket_id, [destination_path], token=token))
1262912634
if dest_path_info:
1263012635
destination_is_directory = False
1263112636
else:
12632-
destination_is_directory = (
12637+
# Check if destination is an existing "directory" (prefix with children)
12638+
destination_exists_as_directory = (
1263312639
next(
1263412640
iter(
1263512641
self.list_bucket_tree(
@@ -12640,6 +12646,8 @@ def copy_files(self, source: str, destination: str, *, token: str | bool | None
1264012646
)
1264112647
is not None
1264212648
)
12649+
# Treat as directory if it exists as one, or if the user signaled with trailing slash
12650+
destination_is_directory = destination_exists_as_directory or destination.endswith("/")
1264312651

1264412652
all_adds: list[tuple[str, str]] = []
1264512653
all_copies: list[_BucketCopyFile] = []
@@ -12665,6 +12673,14 @@ def _resolve_target_path(src_file_path: str, src_root_path: str | None, is_singl
1266512673

1266612674
if rel_path == "":
1266712675
raise ValueError("Cannot copy an empty relative path.")
12676+
12677+
# Match Unix `cp -r` behavior: when the destination already exists as a
12678+
# directory, nest the source folder inside it (e.g. cp -r src dst → dst/src/...).
12679+
# When the destination does not exist, use rename semantics (cp -r src new → new/...).
12680+
if destination_exists_as_directory and src_root_path is not None:
12681+
src_dir_basename = src_root_path.rsplit("/", 1)[-1]
12682+
rel_path = f"{src_dir_basename}/{rel_path}"
12683+
1266812684
if destination_path == "":
1266912685
return rel_path
1267012686
return f"{destination_path.rstrip('/')}/{rel_path}"
@@ -12712,7 +12728,6 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None:
1271212728
)
1271312729
else:
1271412730
# Source path is a folder (or prefix) — list and copy all matching files
12715-
destination_is_directory = True
1271612731
for item in self.list_bucket_tree(
1271712732
source_handle.bucket_id, prefix=source_path or None, recursive=True, token=token
1271812733
):
@@ -12744,7 +12759,6 @@ def _add_repo_file(file: RepoFile, target_path: str) -> None:
1274412759
_add_repo_file(source_repo_path_info[0], target_path)
1274512760
else:
1274612761
# Source path is a folder — list and copy all files recursively
12747-
destination_is_directory = True
1274812762
for repo_item in self.list_repo_tree(
1274912763
repo_id=source_handle.repo_id,
1275012764
path_in_repo=source_path,

tests/test_buckets.py

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ def test_copy_files_folder_to_nonexistent_dest(api: HfApi, bucket_write: str, bu
388388

389389
@requires("hf_xet")
390390
def test_copy_files_folder_to_existing_folder_dest(api: HfApi, bucket_write: str, bucket_write_2: str):
391-
"""source=folder, dest is an existing folder => files merged under dest path."""
391+
"""source=folder, dest is an existing folder => source folder nested under dest (like `cp -r`)."""
392392
api.batch_bucket_files(bucket_write, add=[(b"a", "folder/a.txt"), (b"b", "folder/sub/b.txt")])
393393
api.batch_bucket_files(bucket_write_2, add=[(b"existing", "target-folder/existing.txt")])
394394

@@ -397,10 +397,11 @@ def test_copy_files_folder_to_existing_folder_dest(api: HfApi, bucket_write: str
397397
f"hf://buckets/{bucket_write_2}/target-folder",
398398
)
399399

400+
# Like `cp -r folder target-folder` when target-folder exists: nests as target-folder/folder/...
400401
destination_files = {entry.path for entry in api.list_bucket_tree(bucket_write_2)}
401402
assert "target-folder/existing.txt" in destination_files
402-
assert "target-folder/a.txt" in destination_files
403-
assert "target-folder/sub/b.txt" in destination_files
403+
assert "target-folder/folder/a.txt" in destination_files
404+
assert "target-folder/folder/sub/b.txt" in destination_files
404405

405406

406407
@requires("hf_xet")
@@ -449,6 +450,57 @@ def test_copy_files_file_to_folder_dest(api: HfApi, bucket_write: str, bucket_wr
449450
assert output_path.read_bytes() == b"content"
450451

451452

453+
@requires("hf_xet")
454+
def test_copy_files_folder_to_existing_folder_with_trailing_slash(api: HfApi, bucket_write: str, bucket_write_2: str):
455+
"""source=folder, dest is existing folder with trailing '/' => source folder nested (like `cp -r`)."""
456+
api.batch_bucket_files(bucket_write, add=[(b"a", "logs/a.txt"), (b"b", "logs/sub/b.txt")])
457+
api.batch_bucket_files(bucket_write_2, add=[(b"existing", "backup/existing.txt")])
458+
459+
api.copy_files(
460+
f"hf://buckets/{bucket_write}/logs",
461+
f"hf://buckets/{bucket_write_2}/backup/",
462+
)
463+
464+
# Like `cp -r logs backup/` when backup/ exists: nests as backup/logs/...
465+
destination_files = {entry.path for entry in api.list_bucket_tree(bucket_write_2)}
466+
assert "backup/existing.txt" in destination_files
467+
assert "backup/logs/a.txt" in destination_files
468+
assert "backup/logs/sub/b.txt" in destination_files
469+
470+
471+
@requires("hf_xet")
472+
def test_copy_files_folder_to_nonexistent_dest_with_trailing_slash(api: HfApi, bucket_write: str, bucket_write_2: str):
473+
"""source=folder, dest doesn't exist but has trailing '/' => rename semantics (no nesting)."""
474+
api.batch_bucket_files(bucket_write, add=[(b"a", "logs/a.txt"), (b"b", "logs/sub/b.txt")])
475+
476+
api.copy_files(
477+
f"hf://buckets/{bucket_write}/logs",
478+
f"hf://buckets/{bucket_write_2}/new-backup/",
479+
)
480+
481+
# Like `cp -r logs new-backup/` when new-backup/ doesn't exist:
482+
# in Unix this errors, but in object storage we create it with rename semantics.
483+
destination_files = {entry.path for entry in api.list_bucket_tree(bucket_write_2)}
484+
assert "new-backup/a.txt" in destination_files
485+
assert "new-backup/sub/b.txt" in destination_files
486+
487+
488+
@requires("hf_xet")
489+
def test_copy_files_folder_to_bucket_root(api: HfApi, bucket_write: str, bucket_write_2: str):
490+
"""source=folder, dest is bucket root => source folder nested at root (like `cp -r models /`)."""
491+
api.batch_bucket_files(bucket_write, add=[(b"a", "models/a.txt"), (b"b", "models/sub/b.txt")])
492+
493+
api.copy_files(
494+
f"hf://buckets/{bucket_write}/models",
495+
f"hf://buckets/{bucket_write_2}/",
496+
)
497+
498+
# Bucket root always "exists" as a directory, so nesting applies
499+
destination_files = {entry.path for entry in api.list_bucket_tree(bucket_write_2)}
500+
assert "models/a.txt" in destination_files
501+
assert "models/sub/b.txt" in destination_files
502+
503+
452504
@pytest.mark.parametrize(
453505
"source, destination, expected_content_type",
454506
[

0 commit comments

Comments
 (0)