Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
441958f
Proper parser.
excitoon Apr 14, 2021
a6e246b
Parsing.
excitoon Apr 14, 2021
1146efa
Partitioned writes for S3.
excitoon May 25, 2021
0879dbb
Test.
excitoon May 27, 2021
328213f
Fixes.
excitoon May 31, 2021
e99433e
Better functionality.
excitoon Jun 1, 2021
d1efe3a
Review fixes.
excitoon Jul 14, 2021
c008eab
Review fixes.
excitoon Jul 15, 2021
d13b98d
Add test for unsupported INSERT PARTITION BY
vdimir Jul 14, 2021
eb31b19
Add validation to insert partition by key to s3
vdimir Jul 16, 2021
f02ceed
Add 01944_insert_partition_by to arcadia and fasttest skip list
vdimir Jul 17, 2021
4c9fb7f
Fix error code in StorageS3.cpp
vdimir Jul 18, 2021
5ce9e26
Less strict validation.
excitoon Jul 23, 2021
b4484f4
Test fix.
excitoon Jul 26, 2021
6b75863
Test fix.
excitoon Jul 26, 2021
24fcf15
Minor fix.
excitoon Jul 28, 2021
be34376
Update src/Storages/StorageS3.cpp
excitoon Jul 28, 2021
b34f60a
Fixed `find`.
excitoon Jul 28, 2021
cebe1e7
Style fix.
excitoon Jul 28, 2021
f45dbd6
Fixed test.
excitoon Jul 29, 2021
4b1cc38
Fixes.
excitoon Jul 29, 2021
d8bc601
Minor.
excitoon Jul 29, 2021
067cc76
Update src/Storages/StorageS3.cpp
vdimir Jul 29, 2021
0b75c66
Revert "Update src/Storages/StorageS3.cpp "
excitoon Jul 29, 2021
1191725
Better fix.
excitoon Jul 29, 2021
ea67849
Typo.
excitoon Jul 29, 2021
2223ae4
Added integration test.
excitoon Jul 29, 2021
6e10d28
Fix.
excitoon Jul 29, 2021
0479edd
Improved validation of S3 buckets and keys.
excitoon Jul 25, 2021
a2bbf98
Update.
excitoon Jul 25, 2021
3c2e090
Build and style fix.
excitoon Jul 26, 2021
e203c33
Validation.
excitoon Jul 29, 2021
f6cb82a
Minor fix.
excitoon Jul 29, 2021
3c08a4a
Rolled back changes in `Functions`.
excitoon Jul 30, 2021
6e600df
Rolled back changes in `Functions`.
excitoon Jul 30, 2021
925c496
Rolled back changes in `Functions`.
excitoon Jul 30, 2021
2c725a1
Move isValidUTF8Naive from Functions to Common
vdimir Aug 19, 2021
c4c42b6
Validate utf8 in partition key from PARTITION BY for s3
vdimir Aug 19, 2021
6ba6577
Do not allow slashes in bucket formatted from PARTITION BY
vdimir Aug 19, 2021
2907c86
Generate ya.make for isValidUTF8.cpp
vdimir Aug 19, 2021
295c8d5
Remove unused Error Code from IO/S3Common.cpp
vdimir Aug 19, 2021
6fe63a8
Fix validateKey/Bucket for S3
vdimir Aug 20, 2021
98acccb
Merge branch 'master' into s3partitionedwrite
vdimir Aug 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docker/test/fasttest/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,9 @@ function run_tests
01853_s2_cells_intersect
01854_s2_cap_contains
01854_s2_cap_union

# needs s3
01944_insert_partition_by
)

time clickhouse-test --hung-check -j 8 --order=random --use-skip-list \
Expand Down
131 changes: 131 additions & 0 deletions src/Common/isValidUTF8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#include <Common/isValidUTF8.h>
#include <cstring>

/// inspired by https://github.com/cyb70289/utf8/

/*
MIT License

Copyright (c) 2019 Yibo Cai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
*
* Table 3-7. Well-Formed UTF-8 Byte Sequences
*
* +--------------------+------------+-------------+------------+-------------+
* | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
* +--------------------+------------+-------------+------------+-------------+
* | U+0000..U+007F | 00..7F | | | |
* +--------------------+------------+-------------+------------+-------------+
* | U+0080..U+07FF | C2..DF | 80..BF | | |
* +--------------------+------------+-------------+------------+-------------+
* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
* +--------------------+------------+-------------+------------+-------------+
* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
* +--------------------+------------+-------------+------------+-------------+
*/
namespace DB
{

namespace UTF8
{

/// Checks that the `len` bytes starting at `data` form well-formed UTF-8,
/// following the "Well-Formed UTF-8 Byte Sequences" table above.
/// Returns true (1) when every sequence is well-formed (an empty range is valid)
/// and false (0) as soon as a malformed or truncated sequence is met.
UInt8 isValidUTF8(const UInt8 * data, UInt64 len)
{
    /// Continuation bytes are 80..BF; viewed as signed, those are exactly the values <= (Int8)0xBF.
    const auto is_continuation = [](UInt8 c) { return static_cast<Int8>(c) <= static_cast<Int8>(0xBF); };

    const UInt8 * pos = data;
    UInt64 remaining = len;

    while (remaining != 0)
    {
        const UInt8 lead = pos[0];
        UInt64 width = 0;

        if (lead <= 0x7F)
        {
            /* 00..7F */
            width = 1;
        }
        else if (remaining >= 2 && lead >= 0xC2 && lead <= 0xDF && is_continuation(pos[1]))
        {
            /* C2..DF, 80..BF */
            width = 2;
        }
        else
        {
            /// Everything else needs at least three bytes, the second and third being continuation bytes.
            if (remaining < 3)
                return false;

            const UInt8 second = pos[1];
            if (!is_continuation(second) || !is_continuation(pos[2]))
                return false;

            bool three_byte_ok = false;
            switch (lead)
            {
                case 0xE0: /* E0, A0..BF, 80..BF */
                    three_byte_ok = second >= 0xA0;
                    break;
                case 0xED: /* ED, 80..9F, 80..BF */
                    three_byte_ok = second <= 0x9F;
                    break;
                default: /* E1..EC or EE..EF, 80..BF, 80..BF */
                    three_byte_ok = (lead >= 0xE1 && lead <= 0xEC) || (lead >= 0xEE && lead <= 0xEF);
                    break;
            }

            if (three_byte_ok)
            {
                width = 3;
            }
            else
            {
                /// Last chance: a four-byte sequence, which also needs a continuation byte in fourth position.
                if (remaining < 4 || !is_continuation(pos[3]))
                    return false;

                bool four_byte_ok = false;
                switch (lead)
                {
                    case 0xF0: /* F0, 90..BF, 80..BF, 80..BF */
                        four_byte_ok = second >= 0x90;
                        break;
                    case 0xF4: /* F4, 80..8F, 80..BF, 80..BF */
                        four_byte_ok = second <= 0x8F;
                        break;
                    default: /* F1..F3, 80..BF, 80..BF, 80..BF */
                        four_byte_ok = lead >= 0xF1 && lead <= 0xF3;
                        break;
                }

                if (!four_byte_ok)
                    return false;

                width = 4;
            }
        }

        remaining -= width;
        pos += width;
    }

    return true;
}

}
}
10 changes: 10 additions & 0 deletions src/Common/isValidUTF8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

#include <common/types.h>

namespace DB::UTF8
{

/// Scalar UTF-8 validator.
/// Returns 1 if the `len` bytes starting at `data` are well-formed UTF-8
/// (an empty range counts as valid) and 0 otherwise.
UInt8 isValidUTF8(const UInt8 * data, UInt64 len);

}
1 change: 1 addition & 0 deletions src/Common/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ SRCS(
hasLinuxCapability.cpp
hex.cpp
isLocalAddress.cpp
isValidUTF8.cpp
malloc.cpp
memory.cpp
new_delete.cpp
Expand Down
78 changes: 2 additions & 76 deletions src/Functions/isValidUTF8.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringOrArrayToT.h>

#include <cstring>

#ifdef __SSE4_1__
# include <emmintrin.h>
# include <smmintrin.h>
# include <tmmintrin.h>
#endif
#include <Common/isValidUTF8.h>

namespace DB
{
Expand Down Expand Up @@ -71,75 +64,8 @@ SOFTWARE.
* +--------------------+------------+-------------+------------+-------------+
*/

/// Byte-by-byte UTF-8 validation of the `len` bytes starting at `data`,
/// following the table of well-formed byte sequences above.
/// Returns true (1) for well-formed input (including an empty range), false (0) otherwise.
static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len)
{
    /// Continuation bytes are 80..BF; viewed as signed, those are exactly the values <= (Int8)0xBF.
    const auto is_continuation = [](UInt8 c) { return static_cast<Int8>(c) <= static_cast<Int8>(0xBF); };

    const UInt8 * pos = data;
    UInt64 remaining = len;

    while (remaining != 0)
    {
        const UInt8 lead = pos[0];
        UInt64 width = 0;

        if (lead <= 0x7F)
        {
            /* 00..7F */
            width = 1;
        }
        else if (remaining >= 2 && lead >= 0xC2 && lead <= 0xDF && is_continuation(pos[1]))
        {
            /* C2..DF, 80..BF */
            width = 2;
        }
        else
        {
            /// Everything else needs at least three bytes, the second and third being continuation bytes.
            if (remaining < 3)
                return false;

            const UInt8 second = pos[1];
            if (!is_continuation(second) || !is_continuation(pos[2]))
                return false;

            bool three_byte_ok = false;
            switch (lead)
            {
                case 0xE0: /* E0, A0..BF, 80..BF */
                    three_byte_ok = second >= 0xA0;
                    break;
                case 0xED: /* ED, 80..9F, 80..BF */
                    three_byte_ok = second <= 0x9F;
                    break;
                default: /* E1..EC or EE..EF, 80..BF, 80..BF */
                    three_byte_ok = (lead >= 0xE1 && lead <= 0xEC) || (lead >= 0xEE && lead <= 0xEF);
                    break;
            }

            if (three_byte_ok)
            {
                width = 3;
            }
            else
            {
                /// Last chance: a four-byte sequence, which also needs a continuation byte in fourth position.
                if (remaining < 4 || !is_continuation(pos[3]))
                    return false;

                bool four_byte_ok = false;
                switch (lead)
                {
                    case 0xF0: /* F0, 90..BF, 80..BF, 80..BF */
                        four_byte_ok = second >= 0x90;
                        break;
                    case 0xF4: /* F4, 80..8F, 80..BF, 80..BF */
                        four_byte_ok = second <= 0x8F;
                        break;
                    default: /* F1..F3, 80..BF, 80..BF, 80..BF */
                        four_byte_ok = lead >= 0xF1 && lead <= 0xF3;
                        break;
                }

                if (!four_byte_ok)
                    return false;

                width = 4;
            }
        }

        remaining -= width;
        pos += width;
    }

    return true;
}

#ifndef __SSE4_1__
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); }
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return DB::UTF8::isValidUTF8(data, len); }
#else
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len)
{
Expand Down
31 changes: 16 additions & 15 deletions src/IO/S3Common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

#if USE_AWS_S3

# include <IO/S3Common.h>

# include <Common/quoteString.h>

# include <IO/S3Common.h>
# include <IO/WriteBufferFromString.h>
# include <Storages/StorageS3Settings.h>

Expand Down Expand Up @@ -617,7 +618,7 @@ namespace S3
storage_name = S3;

if (uri.getHost().empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI: {}", uri.toString());
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");

String name;
String endpoint_authority_from_uri;
Expand All @@ -626,12 +627,7 @@ namespace S3
{
is_virtual_hosted_style = true;
endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;

/// S3 specification requires at least 3 and at most 63 characters in bucket name.
/// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html
if (bucket.length() < 3 || bucket.length() > 63)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString());
validateBucket(bucket, uri);

if (!uri.getPath().empty())
{
Expand All @@ -642,7 +638,7 @@ namespace S3
boost::to_upper(name);
if (name != S3 && name != COS)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {} ({})", quoteString(name), uri.toString());
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name));
}
if (name == S3)
{
Expand All @@ -657,14 +653,19 @@ namespace S3
{
is_virtual_hosted_style = false;
endpoint = uri.getScheme() + "://" + uri.getAuthority();

/// S3 specification requires at least 3 and at most 63 characters in bucket name.
/// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html
if (bucket.length() < 3 || bucket.length() > 63)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString());
validateBucket(bucket, uri);
}
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI: {}", uri.toString());
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI.");
}

/// Checks the length of a bucket name and throws BAD_ARGUMENTS when it is out of bounds.
/// `uri` is used only for the error message: when it is not empty it is appended in parentheses.
void URI::validateBucket(const String & bucket, const Poco::URI & uri)
{
    /// S3 specification requires at least 3 and at most 63 characters in bucket name.
    /// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html
    const auto bucket_length = bucket.length();
    if (bucket_length >= 3 && bucket_length <= 63)
        return;

    /// Only mention the URI when the caller actually supplied one.
    const String uri_suffix = uri.empty() ? String() : " (" + uri.toString() + ")";

    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}",
        quoteString(bucket), uri_suffix);
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/IO/S3Common.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ struct URI
bool is_virtual_hosted_style;

explicit URI(const Poco::URI & uri_);

/// Throws BAD_ARGUMENTS if `bucket` is shorter than 3 or longer than 63 characters.
/// A non-empty `uri` is added to the exception message for context.
static void validateBucket(const String & bucket, const Poco::URI & uri);
};

}
Expand Down
4 changes: 4 additions & 0 deletions src/Interpreters/InterpreterInsertQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ namespace DB

namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
extern const int NO_SUCH_COLUMN_IN_TABLE;
extern const int ILLEGAL_COLUMN;
extern const int DUPLICATE_COLUMN;
Expand Down Expand Up @@ -155,6 +156,9 @@ BlockIO InterpreterInsertQuery::execute()
BlockIO res;

StoragePtr table = getTable(query);
if (query.partition_by && !table->supportsPartitionBy())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "PARTITION BY clause is not supported by storage");

auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), settings.lock_acquire_timeout);
auto metadata_snapshot = table->getInMemoryMetadataPtr();

Expand Down
5 changes: 5 additions & 0 deletions src/Parsers/ASTInsertQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "FUNCTION ";
table_function->formatImpl(settings, state, frame);
if (partition_by)
{
settings.ostr << " PARTITION BY ";
partition_by->formatImpl(settings, state, frame);
}
}
else
settings.ostr << (settings.hilite ? hilite_none : "")
Expand Down
2 changes: 2 additions & 0 deletions src/Parsers/ASTInsertQuery.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class ASTInsertQuery : public IAST
ASTPtr infile;
ASTPtr watch;
ASTPtr table_function;
ASTPtr partition_by;
ASTPtr settings_ast;

/// Data to insert
Expand All @@ -44,6 +45,7 @@ class ASTInsertQuery : public IAST
if (select) { res->select = select->clone(); res->children.push_back(res->select); }
if (watch) { res->watch = watch->clone(); res->children.push_back(res->watch); }
if (table_function) { res->table_function = table_function->clone(); res->children.push_back(res->table_function); }
if (partition_by) { res->partition_by = partition_by->clone(); res->children.push_back(res->partition_by); }
if (settings_ast) { res->settings_ast = settings_ast->clone(); res->children.push_back(res->settings_ast); }

return res;
Expand Down
Loading