Skip to content

Commit 6c4bf59

Browse files
committed
fix suggestions and enhance tests
1 parent 9a2d89e commit 6c4bf59

File tree

6 files changed

+118
-63
lines changed

6 files changed

+118
-63
lines changed

src/Formats/NumpyDataTypes.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ class NumpyDataTypeFloat : public NumpyDataType
8282
case 4: type_index = NumpyDataTypeIndex::Float32; break;
8383
case 8: type_index = NumpyDataTypeIndex::Float64; break;
8484
default:
85-
throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Incorrect float type with size {}", size);
85+
throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Numpy float type with size {} is not supported", size);
8686
}
8787
}
8888

src/Processors/Formats/Impl/NpyRowInputFormat.cpp

Lines changed: 19 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -1,39 +1,20 @@
1-
#include <IO/ReadHelpers.h>
2-
#include <cstddef>
3-
#include <iterator>
4-
#include <memory>
51
#include <string>
6-
#include <tuple>
72
#include <vector>
8-
#include <type_traits>
9-
#include <unordered_map>
103
#include <Processors/Formats/Impl/NpyRowInputFormat.h>
11-
#include <Formats/FormatFactory.h>
12-
#include <Formats/EscapingRuleUtils.h>
13-
#include <DataTypes/Serializations/SerializationNullable.h>
144
#include <DataTypes/DataTypeString.h>
155
#include <Common/assert_cast.h>
16-
#include <Common/typeid_cast.h>
176
#include <Common/Exception.h>
18-
#include "Formats/NumpyDataTypes.h"
7+
#include <DataTypes/DataTypeArray.h>
8+
#include <DataTypes/DataTypesNumber.h>
9+
#include <Formats/FormatFactory.h>
10+
#include <Formats/NumpyDataTypes.h>
1911
#include <Columns/ColumnFixedString.h>
20-
#include <Core/TypeId.h>
21-
#include <Core/Types_fwd.h>
2212
#include <Columns/ColumnString.h>
2313
#include <Columns/ColumnArray.h>
2414
#include <Columns/ColumnsNumber.h>
25-
#include <Storages/IStorage.h>
26-
#include <Columns/IColumn.h>
27-
#include <Core/Field.h>
28-
#include <Core/NamesAndTypes.h>
29-
#include <DataTypes/DataTypeArray.h>
30-
#include <DataTypes/DataTypesNumber.h>
3115
#include <DataTypes/IDataType.h>
32-
#include <DataTypes/Serializations/ISerialization.h>
3316
#include <IO/ReadBuffer.h>
34-
#include <IO/WriteHelpers.h>
3517
#include <Processors/Formats/IRowInputFormat.h>
36-
#include <base/types.h>
3718
#include <boost/algorithm/string/split.hpp>
3819
#include <IO/ReadBufferFromString.h>
3920

@@ -99,15 +80,11 @@ DataTypePtr createNestedArrayType(const DataTypePtr & nested_type, size_t depth)
9980

10081
size_t parseTypeSize(const std::string & size_str)
10182
{
102-
try
103-
{
104-
size_t size = std::stoi(size_str);
105-
return size;
106-
}
107-
catch (...)
108-
{
83+
ReadBufferFromString buf(size_str);
84+
size_t size;
85+
if (!tryReadIntText(size, buf))
10986
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid data type size: {}", size_str);
110-
}
87+
return size;
11188
}
11289

11390
std::shared_ptr<NumpyDataType> parseType(String type)
@@ -155,17 +132,14 @@ std::vector<int> parseShape(String shape_string)
155132
if (result_str[result_str.size()-1].empty())
156133
result_str.pop_back();
157134
shape.reserve(result_str.size());
158-
bool is_first_elem = true;
159135
for (const String & item : result_str)
160136
{
161137
int value;
162138
ReadBufferFromString buf(item);
163-
if (!is_first_elem)
164-
assertString(" ", buf);
139+
skipWhitespaceIfAny(buf);
165140
if (!tryReadIntText(value, buf))
166141
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid shape format: {}", shape_string);
167142
shape.push_back(value);
168-
is_first_elem = false;
169143
}
170144
return shape;
171145
}
@@ -316,8 +290,8 @@ void NpyRowInputFormat::readAndInsertInteger(IColumn * column, const DataTypePtr
316290
case NumpyDataTypeIndex::UInt32: readBinaryValueAndInsert<T, UInt32>(column->getPtr(), npy_type.getEndianness()); break;
317291
case NumpyDataTypeIndex::UInt64: readBinaryValueAndInsert<T, UInt64>(column->getPtr(), npy_type.getEndianness()); break;
318292
default:
319-
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert data type into column with type {}",
320-
data_type->getName());
293+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert Numpy value with type {} into column with type {}",
294+
magic_enum::enum_name(npy_type.getTypeIndex()), data_type->getName());
321295
}
322296
}
323297

@@ -329,8 +303,8 @@ void NpyRowInputFormat::readAndInsertFloat(IColumn * column, const DataTypePtr &
329303
case NumpyDataTypeIndex::Float32: readBinaryValueAndInsert<T, Float32>(column->getPtr(), npy_type.getEndianness()); break;
330304
case NumpyDataTypeIndex::Float64: readBinaryValueAndInsert<T, Float64>(column->getPtr(), npy_type.getEndianness()); break;
331305
default:
332-
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert data type into column with type {}",
333-
data_type->getName());
306+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert Numpy value with type {} into column with type {}",
307+
magic_enum::enum_name(npy_type.getTypeIndex()), data_type->getName());
334308
}
335309
}
336310

@@ -343,23 +317,19 @@ void NpyRowInputFormat::readAndInsertString(MutableColumnPtr column, const DataT
343317
else if (npy_type.getTypeIndex() == NumpyDataTypeIndex::Unicode)
344318
size = assert_cast<const NumpyDataTypeUnicode &>(npy_type).getSize();
345319
else
346-
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert data type into column with type {}",
347-
data_type->getName());
320+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert Numpy value with type {} into column with type {}",
321+
magic_enum::enum_name(npy_type.getTypeIndex()), data_type->getName());
348322

349323
if (is_fixed)
350324
{
351325
auto & fixed_string_column = assert_cast<ColumnFixedString &>(*column);
352326
size_t n = fixed_string_column.getN();
353327
if (size > n)
354328
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string for FixedString column");
355-
fixed_string_column.getChars().resize_fill(fixed_string_column.getChars().size() + n);
356-
357-
String tmp;
358-
tmp.resize(size);
359-
360-
in->readStrict(tmp.data(), size);
361-
tmp.erase(std::remove(tmp.begin(), tmp.end(), '\0'), tmp.end());
362-
fixed_string_column.insertData(tmp.c_str(), tmp.size());
329+
auto & chars = fixed_string_column.getChars();
330+
size_t prev_size = chars.size();
331+
chars.resize_fill(prev_size + n);
332+
in->readStrict(reinterpret_cast<char *>(chars.data() + prev_size), size);
363333
}
364334
else
365335
{
@@ -420,11 +390,6 @@ bool NpyRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & /*
420390
return true;
421391
}
422392

423-
void NpyRowInputFormat::resetParser()
424-
{
425-
IRowInputFormat::resetParser();
426-
}
427-
428393
NpySchemaReader::NpySchemaReader(ReadBuffer & in_)
429394
: ISchemaReader(in_) {}
430395

src/Processors/Formats/Impl/NpyRowInputFormat.h

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,15 @@
11
#pragma once
22

3-
#include <unordered_map>
43
#include <vector>
5-
#include <Core/Block.h>
64
#include <Processors/Formats/IRowInputFormat.h>
75
#include <Processors/Formats/ISchemaReader.h>
86
#include <Formats/FormatSettings.h>
9-
#include <Common/HashTable/HashMap.h>
107
#include <Columns/IColumn.h>
118
#include <Core/Field.h>
129
#include <Core/NamesAndTypes.h>
1310
#include <Core/Types.h>
1411
#include <Formats/NumpyDataTypes.h>
1512

16-
using NpySizeT = uint32_t;
17-
static const uint8_t NPY_DOCUMENT_END = 0x00;
18-
1913
namespace DB
2014
{
2115

@@ -34,8 +28,6 @@ class NpyRowInputFormat final : public IRowInputFormat
3428

3529
String getName() const override { return "NpyRowInputFormat"; }
3630

37-
void resetParser() override;
38-
3931
private:
4032
void readPrefix() override;
4133
bool readRow(MutableColumns & columns, RowReadExtension &) override;

tests/queries/0_stateless/02895_npy_format.reference

Lines changed: 58 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,3 +26,61 @@ c
2626
[0,0,0]
2727
[[1,2],[3,4]]
2828
[[5,6],[7,8]]
29+
array Int64
30+
array Float64
31+
array String
32+
array String
33+
array Array(Int64)
34+
array Array(Float64)
35+
array Array(String)
36+
array Array(String)
37+
array Array(UInt8)
38+
array Array(Int64)
39+
array Array(Array(Int64))
40+
1
41+
2
42+
3
43+
1
44+
2
45+
3
46+
1
47+
2
48+
3
49+
1
50+
2
51+
3
52+
1
53+
2
54+
3
55+
1
56+
2
57+
3
58+
1
59+
2
60+
3
61+
1
62+
2
63+
3
64+
1.1
65+
2.2
66+
3.3
67+
1.1
68+
2.2
69+
3.3
70+
1
71+
a
72+
c
73+
1
74+
a
75+
c
76+
[1,2,3]
77+
[4,5,6]
78+
[[1,2],[3,4]]
79+
[[5,6],[7,8]]
80+
0
81+
0
82+
0
83+
0
84+
0
85+
0
86+
1

tests/queries/0_stateless/02895_npy_format.sh

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,3 +16,43 @@ $CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_unicode.npy')
1616
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_bool.npy')"
1717
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim_null.npy')"
1818
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/three_dim.npy')"
19+
20+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/one_dim.npy')"
21+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/one_dim_float.npy')"
22+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/one_dim_str.npy')"
23+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/one_dim_unicode.npy')"
24+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/two_dim.npy')"
25+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/two_dim_float.npy')"
26+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/two_dim_str.npy')"
27+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/two_dim_unicode.npy')"
28+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/two_dim_bool.npy')"
29+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/two_dim_null.npy')"
30+
$CLICKHOUSE_LOCAL -q "describe file('$CURDIR/data_npy/three_dim.npy')"
31+
32+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value UInt8')"
33+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value UInt16')"
34+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value UInt32')"
35+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value UInt64')"
36+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value Int8')"
37+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value Int16')"
38+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value Int32')"
39+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim.npy', Npy, 'value Int64')"
40+
41+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy', Npy, 'value Float32')"
42+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy', Npy, 'value Float64')"
43+
44+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_str.npy', Npy, 'value FixedString(1)')"
45+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_str.npy', Npy, 'value String')"
46+
47+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/two_dim.npy', Npy, 'value Array(Int8)')"
48+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/three_dim.npy', Npy, 'value Array(Array(Int8))')"
49+
50+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy', Npy, 'value Array(Float32)')" 2>&1 | grep -c "BAD_ARGUMENTS"
51+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy', Npy, 'value UUID')" 2>&1 | grep -c "BAD_ARGUMENTS"
52+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy', Npy, 'value Tuple(UInt8)')" 2>&1 | grep -c "BAD_ARGUMENTS"
53+
54+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_float.npy', Npy, 'value Int8')" 2>&1 | grep -c "BAD_ARGUMENTS"
55+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_str.npy', Npy, 'value Int8')" 2>&1 | grep -c "BAD_ARGUMENTS"
56+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/one_dim_unicode.npy', Npy, 'value Float32')" 2>&1 | grep -c "BAD_ARGUMENTS"
57+
58+
$CLICKHOUSE_LOCAL -q "select * from file('$CURDIR/data_npy/complex.npy')" 2>&1 | grep -c "BAD_ARGUMENTS"
176 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)