Skip to content

Commit a54cbef

Browse files
authored
Merge pull request #24341 from Avogar/arrow
Support complex types in Arrow/Parquet/ORC
2 parents 65ce392 + c723dd7 commit a54cbef

40 files changed

+858
-151
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@
103103
url = https://github.com/ClickHouse-Extras/fastops
104104
[submodule "contrib/orc"]
105105
path = contrib/orc
106-
url = https://github.com/apache/orc
106+
url = https://github.com/ClickHouse-Extras/orc
107107
[submodule "contrib/sparsehash-c11"]
108108
path = contrib/sparsehash-c11
109109
url = https://github.com/sparsehash/sparsehash-c11.git

contrib/arrow

Submodule arrow updated from 616b3dc to debf751

contrib/arrow-cmake/CMakeLists.txt

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -188,6 +188,7 @@ set(ARROW_SRCS
188188
"${LIBRARY_DIR}/array/util.cc"
189189
"${LIBRARY_DIR}/array/validate.cc"
190190

191+
"${LIBRARY_DIR}/compute/api_aggregate.cc"
191192
"${LIBRARY_DIR}/compute/api_scalar.cc"
192193
"${LIBRARY_DIR}/compute/api_vector.cc"
193194
"${LIBRARY_DIR}/compute/cast.cc"
@@ -198,8 +199,11 @@ set(ARROW_SRCS
198199

199200
"${LIBRARY_DIR}/compute/kernels/aggregate_basic.cc"
200201
"${LIBRARY_DIR}/compute/kernels/aggregate_mode.cc"
202+
"${LIBRARY_DIR}/compute/kernels/aggregate_quantile.cc"
203+
"${LIBRARY_DIR}/compute/kernels/aggregate_tdigest.cc"
201204
"${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc"
202205
"${LIBRARY_DIR}/compute/kernels/codegen_internal.cc"
206+
"${LIBRARY_DIR}/compute/kernels/hash_aggregate.cc"
203207
"${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc"
204208
"${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc"
205209
"${LIBRARY_DIR}/compute/kernels/scalar_cast_boolean.cc"
@@ -243,6 +247,7 @@ set(ARROW_SRCS
243247
"${LIBRARY_DIR}/io/interfaces.cc"
244248
"${LIBRARY_DIR}/io/memory.cc"
245249
"${LIBRARY_DIR}/io/slow.cc"
250+
"${LIBRARY_DIR}/io/transform.cc"
246251

247252
"${LIBRARY_DIR}/tensor/coo_converter.cc"
248253
"${LIBRARY_DIR}/tensor/csf_converter.cc"
@@ -256,25 +261,23 @@ set(ARROW_SRCS
256261
"${LIBRARY_DIR}/util/bitmap_builders.cc"
257262
"${LIBRARY_DIR}/util/bitmap_ops.cc"
258263
"${LIBRARY_DIR}/util/bpacking.cc"
264+
"${LIBRARY_DIR}/util/cancel.cc"
259265
"${LIBRARY_DIR}/util/compression.cc"
260-
"${LIBRARY_DIR}/util/compression_lz4.cc"
261-
"${LIBRARY_DIR}/util/compression_snappy.cc"
262-
"${LIBRARY_DIR}/util/compression_zlib.cc"
263-
"${LIBRARY_DIR}/util/compression_zstd.cc"
264266
"${LIBRARY_DIR}/util/cpu_info.cc"
265267
"${LIBRARY_DIR}/util/decimal.cc"
266268
"${LIBRARY_DIR}/util/delimiting.cc"
267269
"${LIBRARY_DIR}/util/formatting.cc"
268270
"${LIBRARY_DIR}/util/future.cc"
269271
"${LIBRARY_DIR}/util/int_util.cc"
270272
"${LIBRARY_DIR}/util/io_util.cc"
271-
"${LIBRARY_DIR}/util/iterator.cc"
272273
"${LIBRARY_DIR}/util/key_value_metadata.cc"
273274
"${LIBRARY_DIR}/util/logging.cc"
274275
"${LIBRARY_DIR}/util/memory.cc"
276+
"${LIBRARY_DIR}/util/mutex.cc"
275277
"${LIBRARY_DIR}/util/string_builder.cc"
276278
"${LIBRARY_DIR}/util/string.cc"
277279
"${LIBRARY_DIR}/util/task_group.cc"
280+
"${LIBRARY_DIR}/util/tdigest.cc"
278281
"${LIBRARY_DIR}/util/thread_pool.cc"
279282
"${LIBRARY_DIR}/util/time.cc"
280283
"${LIBRARY_DIR}/util/trie.cc"
@@ -368,14 +371,14 @@ set(PARQUET_SRCS
368371
"${LIBRARY_DIR}/column_reader.cc"
369372
"${LIBRARY_DIR}/column_scanner.cc"
370373
"${LIBRARY_DIR}/column_writer.cc"
371-
"${LIBRARY_DIR}/deprecated_io.cc"
372374
"${LIBRARY_DIR}/encoding.cc"
373-
"${LIBRARY_DIR}/encryption.cc"
374-
"${LIBRARY_DIR}/encryption_internal.cc"
375+
"${LIBRARY_DIR}/encryption/encryption.cc"
376+
"${LIBRARY_DIR}/encryption/encryption_internal.cc"
377+
"${LIBRARY_DIR}/encryption/internal_file_decryptor.cc"
378+
"${LIBRARY_DIR}/encryption/internal_file_encryptor.cc"
379+
"${LIBRARY_DIR}/exception.cc"
375380
"${LIBRARY_DIR}/file_reader.cc"
376381
"${LIBRARY_DIR}/file_writer.cc"
377-
"${LIBRARY_DIR}/internal_file_decryptor.cc"
378-
"${LIBRARY_DIR}/internal_file_encryptor.cc"
379382
"${LIBRARY_DIR}/level_conversion.cc"
380383
"${LIBRARY_DIR}/level_comparison.cc"
381384
"${LIBRARY_DIR}/metadata.cc"
@@ -385,6 +388,8 @@ set(PARQUET_SRCS
385388
"${LIBRARY_DIR}/properties.cc"
386389
"${LIBRARY_DIR}/schema.cc"
387390
"${LIBRARY_DIR}/statistics.cc"
391+
"${LIBRARY_DIR}/stream_reader.cc"
392+
"${LIBRARY_DIR}/stream_writer.cc"
388393
"${LIBRARY_DIR}/types.cc"
389394

390395
"${GEN_LIBRARY_DIR}/parquet_constants.cpp"

contrib/orc

Submodule orc updated from 5981208 to 0a936f6

src/Columns/ColumnLowCardinality.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -191,6 +191,7 @@ class ColumnLowCardinality final : public COWHelper<IColumn, ColumnLowCardinalit
191191
void nestedRemoveNullable() { dictionary.getColumnUnique().nestedRemoveNullable(); }
192192

193193
const IColumnUnique & getDictionary() const { return dictionary.getColumnUnique(); }
194+
IColumnUnique & getDictionary() { return dictionary.getColumnUnique(); }
194195
const ColumnPtr & getDictionaryPtr() const { return dictionary.getColumnUniquePtr(); }
195196
/// IColumnUnique & getUnique() { return static_cast<IColumnUnique &>(*column_unique); }
196197
/// ColumnPtr getUniquePtr() const { return column_unique; }

src/Core/Settings.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -564,7 +564,8 @@ class IColumn;
564564
M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \
565565
M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \
566566
M(Bool, cross_to_inner_join_rewrite, true, "Use inner join instead of comma/cross join if possible", 0) \
567-
567+
\
568+
M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \
568569

569570
// End of FORMAT_FACTORY_SETTINGS
570571
// Please add settings non-related to formats into the COMMON_SETTINGS above.

src/Formats/FormatFactory.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -112,6 +112,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
112112
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
113113
format_settings.with_names_use_header = settings.input_format_with_names_use_header;
114114
format_settings.write_statistics = settings.output_format_write_statistics;
115+
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
115116

116117
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
117118
if (format_settings.schema.is_server)

src/Formats/FormatSettings.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ struct FormatSettings
5252
struct
5353
{
5454
UInt64 row_group_size = 1000000;
55+
bool low_cardinality_as_dictionary = false;
5556
} arrow;
5657

5758
struct

src/Processors/Formats/Impl/ArrowBlockInputFormat.cpp

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#include "ArrowBlockInputFormat.h"
2+
23
#if USE_ARROW
34

45
#include <Formats/FormatFactory.h>
@@ -29,7 +30,6 @@ ArrowBlockInputFormat::ArrowBlockInputFormat(ReadBuffer & in_, const Block & hea
2930
Chunk ArrowBlockInputFormat::generate()
3031
{
3132
Chunk res;
32-
const Block & header = getPort().getHeader();
3333
arrow::Result<std::shared_ptr<arrow::RecordBatch>> batch_result;
3434

3535
if (stream)
@@ -63,7 +63,7 @@ Chunk ArrowBlockInputFormat::generate()
6363

6464
++record_batch_current;
6565

66-
ArrowColumnToCHColumn::arrowTableToCHChunk(res, *table_result, header, "Arrow");
66+
arrow_column_to_ch_column->arrowTableToCHChunk(res, *table_result);
6767

6868
return res;
6969
}
@@ -81,13 +81,16 @@ void ArrowBlockInputFormat::resetParser()
8181

8282
void ArrowBlockInputFormat::prepareReader()
8383
{
84+
std::shared_ptr<arrow::Schema> schema;
85+
8486
if (stream)
8587
{
8688
auto stream_reader_status = arrow::ipc::RecordBatchStreamReader::Open(std::make_unique<ArrowInputStreamFromReadBuffer>(in));
8789
if (!stream_reader_status.ok())
8890
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
8991
"Error while opening a table: {}", stream_reader_status.status().ToString());
9092
stream_reader = *stream_reader_status;
93+
schema = stream_reader->schema();
9194
}
9295
else
9396
{
@@ -96,8 +99,11 @@ void ArrowBlockInputFormat::prepareReader()
9699
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION,
97100
"Error while opening a table: {}", file_reader_status.status().ToString());
98101
file_reader = *file_reader_status;
102+
schema = file_reader->schema();
99103
}
100104

105+
arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(getPort().getHeader(), std::move(schema), "Arrow");
106+
101107
if (stream)
102108
record_batch_total = -1;
103109
else

src/Processors/Formats/Impl/ArrowBlockInputFormat.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ namespace DB
1111
{
1212

1313
class ReadBuffer;
14+
class ArrowColumnToCHColumn;
1415

1516
class ArrowBlockInputFormat : public IInputFormat
1617
{
@@ -32,6 +33,8 @@ class ArrowBlockInputFormat : public IInputFormat
3233
// The following fields are used only for Arrow format
3334
std::shared_ptr<arrow::ipc::RecordBatchFileReader> file_reader;
3435

36+
std::unique_ptr<ArrowColumnToCHColumn> arrow_column_to_ch_column;
37+
3538
int record_batch_total = 0;
3639
int record_batch_current = 0;
3740

0 commit comments

Comments
 (0)