1717
1818#include " ParquetMeta.h"
1919
20+ #include < Formats/FormatFactory.h>
2021#include < Formats/FormatSettings.h>
2122#include < Processors/Formats/Impl/ArrowBufferedStreams.h>
23+ #include < Processors/Formats/Impl/ArrowColumnToCHColumn.h>
2224#include < Processors/Formats/Impl/ArrowFieldIndexUtil.h>
2325#include < Storages/Parquet/ArrowUtils.h>
2426#include < parquet/arrow/reader.h>
2527#include < parquet/arrow/schema.h>
2628#include < parquet/metadata.h>
27- #include " Processors/Formats/Impl/ArrowColumnToCHColumn.h"
2829
2930namespace DB
3031{
@@ -34,13 +35,14 @@ extern const int BAD_ARGUMENTS;
3435}
3536}
3637
38+ using namespace DB ;
3739
3840namespace local_engine
3941{
4042
41- std::unique_ptr<parquet::ParquetFileReader> ParquetMetaBuilder::openInputParquetFile (DB:: ReadBuffer & read_buffer)
43+ std::unique_ptr<parquet::ParquetFileReader> ParquetMetaBuilder::openInputParquetFile (ReadBuffer & read_buffer)
4244{
43- const DB:: FormatSettings format_settings{
45+ const FormatSettings format_settings{
4446 .seekable_read = true ,
4547 };
4648 std::atomic<int > is_stopped{0 };
@@ -49,13 +51,28 @@ std::unique_ptr<parquet::ParquetFileReader> ParquetMetaBuilder::openInputParquet
4951 return parquet::ParquetFileReader::Open (arrow_file, parquet::default_reader_properties (), nullptr );
5052}
5153
54+ Block ParquetMetaBuilder::collectFileSchema (const ContextPtr & context, ReadBuffer & read_buffer)
55+ {
56+ assert (dynamic_cast <SeekableReadBuffer *>(&read_buffer) != nullptr );
57+
58+ FormatSettings format_settings = getFormatSettings (context);
59+ ParquetMetaBuilder metaBuilder{
60+ .case_insensitive = format_settings.parquet .case_insensitive_column_matching ,
61+ .allow_missing_columns = false ,
62+ .collectPageIndex = false ,
63+ .collectSchema = true };
64+ metaBuilder.build (read_buffer);
65+
66+ return metaBuilder.fileHeader ;
67+ }
68+
5269std::vector<Int32> ParquetMetaBuilder::pruneColumn (
53- const DB:: Block & header, const parquet::FileMetaData & metadata, bool case_insensitive, bool allow_missing_columns)
70+ const Block & header, const parquet::FileMetaData & metadata, bool case_insensitive, bool allow_missing_columns)
5471{
5572 std::shared_ptr<arrow::Schema> schema;
5673 THROW_ARROW_NOT_OK (parquet::arrow::FromParquetSchema (metadata.schema (), &schema));
5774
58- DB:: ArrowFieldIndexUtil field_util (case_insensitive, allow_missing_columns);
75+ ArrowFieldIndexUtil field_util (case_insensitive, allow_missing_columns);
5976 auto index_mapping = field_util.findRequiredIndices (header, *schema, metadata);
6077
6178 std::vector<Int32> column_indices;
@@ -93,7 +110,7 @@ ParquetMetaBuilder & ParquetMetaBuilder::buildSchema(const parquet::FileMetaData
93110 std::shared_ptr<arrow::Schema> schema;
94111 THROW_ARROW_NOT_OK (parquet::arrow::FromParquetSchema (file_meta.schema (), &schema));
95112
96- fileHeader = DB:: ArrowColumnToCHColumn::arrowSchemaToCHHeader (*schema, " Parquet" , false , true );
113+ fileHeader = ArrowColumnToCHColumn::arrowSchemaToCHHeader (*schema, " Parquet" , false , true );
97114 }
98115 return *this ;
99116}
@@ -175,7 +192,7 @@ ParquetMetaBuilder & ParquetMetaBuilder::buildAllRowRange(const parquet::FileMet
175192ParquetMetaBuilder & ParquetMetaBuilder::buildRowRange (
176193 parquet::ParquetFileReader & reader,
177194 const parquet::FileMetaData & file_meta,
178- const DB:: Block & readBlock,
195+ const Block & readBlock,
179196 const ColumnIndexFilter * column_index_filter)
180197{
181198 if (collectPageIndex)
@@ -200,8 +217,8 @@ ParquetMetaBuilder & ParquetMetaBuilder::buildRowRange(
200217}
201218
202219ParquetMetaBuilder & ParquetMetaBuilder::build (
203- DB:: ReadBuffer & read_buffer,
204- const DB:: Block & readBlock,
220+ ReadBuffer & read_buffer,
221+ const Block & readBlock,
205222 const ColumnIndexFilter * column_index_filter,
206223 const std::function<bool (UInt64)> & should_include_row_group)
207224{
@@ -213,7 +230,7 @@ ParquetMetaBuilder & ParquetMetaBuilder::build(
213230 .buildRowRange (*reader, *fileMetaData, readBlock, column_index_filter);
214231}
215232
216- ParquetMetaBuilder & ParquetMetaBuilder::build (DB:: ReadBuffer & read_buffer, const std::function<bool (UInt64)> & should_include_row_group)
233+ ParquetMetaBuilder & ParquetMetaBuilder::build (ReadBuffer & read_buffer, const std::function<bool (UInt64)> & should_include_row_group)
217234{
218235 auto reader = openInputParquetFile (read_buffer);
219236 fileMetaData = reader->metadata ();
0 commit comments