Skip to content

Commit eb7fb87

Browse files
coryan and pitrou
authored and committed
ARROW-14918: [C++] Implement GcsFileSystem::GetFileInfo(FileSelector)
Closes #11943 from coryan/ARROW-14918-gcsfs-get-file-info-selector
Authored-by: Carlos O'Ryan <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 63060c8 commit eb7fb87

File tree

2 files changed

+127
-2
lines changed

2 files changed

+127
-2
lines changed

cpp/src/arrow/filesystem/gcsfs.cc

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,45 @@ class GcsFileSystem::Impl {
295295
path.object.back() == '/' ? FileType::Directory : FileType::File);
296296
}
297297

298+
// Lists the entries found under `select.base_dir`.
//
// The listing is recursive when `select.recursive` is set; otherwise it is limited
// with a "/" delimiter. Objects whose name ends in '/' are reported as directories,
// all other objects as files with their size and creation time.
//
// Returns an IOError when no object is found under the directory and
// `select.allow_not_found` is false. Any other listing failure is converted with
// internal::ToArrowStatus().
//
// NOTE(review): `select.max_recursion` is not consulted here -- confirm whether the
// recursion depth must be limited.
// NOTE(review): in the non-recursive case only the objects returned by ListObjects()
// are reported; presumably directory markers below the first level (e.g. "a/0/") are
// folded by the delimiter and therefore missing -- verify against the service.
Result<FileInfoVector> GetFileInfo(const FileSelector& select) {
  ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir));
  // Always list with a trailing '/'. Using "a" as the prefix would also match the
  // unrelated siblings "ab" and "ac/...", and would accept a plain object named "a"
  // as proof that the directory exists. An empty object name (the bucket root) is
  // left untouched.
  const auto canonical =
      p.object.empty() ? std::string() : internal::EnsureTrailingSlash(p.object);
  auto prefix = canonical.empty() ? gcs::Prefix() : gcs::Prefix(canonical);
  auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/");
  bool found_directory = false;
  FileInfoVector result;
  for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter)) {
    if (!o.ok()) {
      // A missing bucket/directory is tolerated only when the caller asked for it.
      if (select.allow_not_found &&
          o.status().code() == google::cloud::StatusCode::kNotFound) {
        continue;
      }
      return internal::ToArrowStatus(o.status());
    }
    // Any object under the prefix (including the marker) proves the directory exists.
    found_directory = true;
    // Skip the directory itself from the results
    if (o->name() == canonical) {
      continue;
    }
    auto path = internal::ConcatAbstractPath(o->bucket(), o->name());
    if (o->name().back() == '/') {
      // Marker objects emulate directories; they carry no size or time information.
      result.push_back(
          FileInfo(internal::EnsureTrailingSlash(path), FileType::Directory));
      continue;
    }
    auto info = FileInfo(path, FileType::File);
    info.set_size(static_cast<int64_t>(o->size()));
    // An object has multiple "time" attributes, including the time when its data was
    // created, and the time when its metadata was last updated. We use the object
    // creation time because the data for an object cannot be changed once created.
    info.set_mtime(o->time_created());
    result.push_back(std::move(info));
  }
  if (!found_directory && !select.allow_not_found) {
    return Status::IOError("No such file or directory '", select.base_dir, "'");
  }
  return result;
}
336+
298337
// GCS does not have directories or folders. But folders can be emulated (with some
299338
// limitations) using marker objects. That and listing with prefixes creates the
300339
// illusion of folders.
@@ -528,7 +567,7 @@ Result<FileInfo> GcsFileSystem::GetFileInfo(const std::string& path) {
528567
}
529568

530569
// Public entry point for listing with a FileSelector. In this diff the line marked
// "-" (the NotImplemented stub) is removed and the line marked "+" forwards the
// call to the private implementation object `impl_`.
Result<FileInfoVector> GcsFileSystem::GetFileInfo(const FileSelector& select) {
531-
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
570+
return impl_->GetFileInfo(select);
531571
}
533572

534573
Status GcsFileSystem::CreateDir(const std::string& path, bool recursive) {

cpp/src/arrow/filesystem/gcsfs_test.cc

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@
3939

4040
namespace arrow {
4141
namespace fs {
42+
/// Custom comparison for FileInfo, we need this to use complex googletest matchers.
43+
inline bool operator==(const FileInfo& a, const FileInfo& b) {
44+
return a.path() == b.path() && a.type() == b.type();
45+
}
46+
4247
namespace {
4348

4449
namespace bp = boost::process;
@@ -51,6 +56,7 @@ using ::testing::Not;
5156
using ::testing::NotNull;
5257
using ::testing::Pair;
5358
using ::testing::UnorderedElementsAre;
59+
using ::testing::UnorderedElementsAreArray;
5460

5561
auto const* kLoremIpsum = R"""(
5662
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
@@ -93,7 +99,7 @@ class GcsTestbench : public ::testing::Environment {
9399
error_ = std::move(error);
94100
}
95101

96-
~GcsTestbench() {
102+
~GcsTestbench() override {
97103
// Brutal shutdown, kill the full process group because the GCS testbench may launch
98104
// additional children.
99105
group_.terminate();
@@ -186,6 +192,29 @@ class GcsIntegrationTest : public ::testing::Test {
186192

187193
// Returns 32 random characters followed by a '/' so the name reads as a folder.
std::string RandomFolderName() {
  std::string name = RandomChars(32);
  name += "/";
  return name;
}
188194

195+
struct Hierarchy {
196+
std::string base_dir;
197+
std::vector<FileInfo> contents;
198+
};
199+
200+
Result<Hierarchy> CreateHierarchy(std::shared_ptr<arrow::fs::FileSystem> fs) {
201+
const char* const kTestFolders[] = {
202+
"a/", "a/0/", "a/0/0/", "a/1/", "a/2/",
203+
};
204+
auto result = Hierarchy{PreexistingBucketPath() + "a/", {}};
205+
for (auto const* f : kTestFolders) {
206+
const auto folder = PreexistingBucketPath() + f;
207+
RETURN_NOT_OK(fs->CreateDir(folder, true));
208+
result.contents.push_back(arrow::fs::Dir(folder));
209+
for (int i = 0; i != 64; ++i) {
210+
const auto filename = folder + "test-file-" + std::to_string(i);
211+
CreateFile(fs.get(), filename, filename);
212+
result.contents.push_back(arrow::fs::File(filename));
213+
}
214+
}
215+
return result;
216+
}
217+
189218
private:
190219
std::string RandomChars(std::size_t count) {
191220
auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789");
@@ -445,6 +474,63 @@ TEST_F(GcsIntegrationTest, GetFileInfoObject) {
445474
arrow::fs::AssertFileInfo(fs.get(), PreexistingObjectPath(), FileType::File);
446475
}
447476

477+
TEST_F(GcsIntegrationTest, GetFileInfoSelectorRecursive) {
  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
  ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs));

  // Everything that was created is expected back, except the listed directory itself.
  std::vector<arrow::fs::FileInfo> expected;
  for (const auto& entry : hierarchy.contents) {
    if (entry.path() != hierarchy.base_dir) {
      expected.push_back(entry);
    }
  }

  FileSelector selector{};
  selector.base_dir = hierarchy.base_dir;
  selector.recursive = true;
  selector.allow_not_found = false;
  ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector));
  EXPECT_THAT(results, UnorderedElementsAreArray(expected));
}
492+
493+
TEST_F(GcsIntegrationTest, GetFileInfoSelectorNonRecursive) {
  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
  ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs));

  // Keeps the entries whose parent path, once a trailing slash is added, equals the
  // listed directory; the listed directory itself is always rejected.
  const auto is_expected = [&hierarchy](const arrow::fs::FileInfo& info) {
    if (info.path() == hierarchy.base_dir) return false;
    const auto parent = internal::GetAbstractPathParent(info.path()).first;
    return internal::EnsureTrailingSlash(parent) == hierarchy.base_dir;
  };
  std::vector<arrow::fs::FileInfo> expected;
  for (const auto& entry : hierarchy.contents) {
    if (is_expected(entry)) {
      expected.push_back(entry);
    }
  }

  FileSelector selector{};
  selector.base_dir = hierarchy.base_dir;
  selector.recursive = false;
  selector.allow_not_found = false;
  ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector));
  EXPECT_THAT(results, UnorderedElementsAreArray(expected));
}
512+
513+
TEST_F(GcsIntegrationTest, GetFileInfoSelectorNotFoundTrue) {
  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());

  // With allow_not_found set, listing a directory that does not exist must succeed
  // and produce no entries.
  FileSelector selector{};
  selector.recursive = true;
  selector.allow_not_found = true;
  selector.base_dir = NotFoundObjectPath() + "/";
  ASSERT_OK_AND_ASSIGN(auto results, fs->GetFileInfo(selector));
  EXPECT_THAT(results, IsEmpty());
}
523+
524+
TEST_F(GcsIntegrationTest, GetFileInfoSelectorNotFoundFalse) {
  auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());

  // Without allow_not_found, listing a directory that does not exist must fail with
  // an IOError.
  FileSelector selector{};
  selector.recursive = false;
  selector.allow_not_found = false;
  selector.base_dir = NotFoundObjectPath() + "/";
  ASSERT_RAISES(IOError, fs->GetFileInfo(selector));
}
533+
448534
TEST_F(GcsIntegrationTest, CreateDirSuccessBucketOnly) {
449535
auto fs = internal::MakeGcsFileSystemForTest(TestGcsOptions());
450536
auto bucket_name = RandomBucketName();

0 commit comments

Comments
 (0)