Skip to content

Commit c110bca

Browse files
Merge branch 'master' into fix-test-01287_max_execution_speed
2 parents 9986f4f + e900946 commit c110bca

File tree

6 files changed

+82
-18
lines changed

6 files changed

+82
-18
lines changed

src/Common/parseGlobs.cpp

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,27 @@ namespace ErrorCodes
1515
extern const int BAD_ARGUMENTS;
1616
}
1717

18-
static const re2::RE2 range_regex(R"({([\d]+\.\.[\d]+)})"); /// regexp for {M..N}, where M and N - non-negative integers
19-
static const re2::RE2 enum_regex(R"({([^{}*,]+[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3}, expr's should be without "{", "}", "*" and ","
18+
namespace
19+
{
20+
/// Holds the compiled RE2 patterns used by the glob-parsing functions in this file.
/// The patterns are members of a function-local static object (see instance()), so they are
/// constructed on the first call to instance() rather than during namespace-scope static initialization.
struct Regexps
21+
{
22+
/// Returns a reference to the single shared Regexps object, creating it on the first call.
static const Regexps & instance()
23+
{
24+
static Regexps regexps;
25+
return regexps;
26+
}
27+
28+
/// Regexp for {M..N}, where M and N are non-negative integers.
/// The capture group holds the `M..N` part without the surrounding braces.
29+
re2::RE2 range_regex{R"({([\d]+\.\.[\d]+)})"};
30+
31+
/// Regexp for {expr1,expr2,expr3}; the expressions must not contain "{", "}", "*" or ",".
/// The capture group holds the text between the braces.
32+
re2::RE2 enum_regex{R"({([^{}*,]+[^{}*]*[^{}*,])})"};
33+
};
34+
}
2035

2136
/// Returns true if `input` contains a range glob of the form {M..N} anywhere in the string
/// (RE2::PartialMatch against the range regexp).
bool containsRangeGlob(const std::string & input)
2237
{
23-
return RE2::PartialMatch(input, range_regex);
38+
return RE2::PartialMatch(input, Regexps::instance().range_regex);
2439
}
2540

2641
bool containsOnlyEnumGlobs(const std::string & input)
@@ -67,8 +82,8 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob
6782
std::string_view matched_range;
6883
std::string_view matched_enum;
6984

70-
auto did_match_range = RE2::PartialMatch(input, range_regex, &matched_range);
71-
auto did_match_enum = RE2::PartialMatch(input, enum_regex, &matched_enum);
85+
auto did_match_range = RE2::PartialMatch(input, Regexps::instance().range_regex, &matched_range);
86+
auto did_match_enum = RE2::PartialMatch(input, Regexps::instance().enum_regex, &matched_enum);
7287

7388
/// Enum regex matches ranges, so if they both match and point to the same data,
7489
/// it is a range.
@@ -78,7 +93,7 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob
7893
/// We matched a range, and range comes earlier than enum
7994
if (did_match_range && (!did_match_enum || matched_range.data() < matched_enum.data()))
8095
{
81-
RE2::FindAndConsume(&input, range_regex, &matched);
96+
RE2::FindAndConsume(&input, Regexps::instance().range_regex, &matched);
8297
std::string buffer(matched);
8398
oss_for_replacing << escaped_with_globs.substr(current_index, matched_range.data() - escaped_with_globs.data() - current_index - 1) << '(';
8499

@@ -122,7 +137,7 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob
122137
/// We matched enum, and it comes earlier than range.
123138
else if (did_match_enum && (!did_match_range || matched_enum.data() < matched_range.data()))
124139
{
125-
RE2::FindAndConsume(&input, enum_regex, &matched);
140+
RE2::FindAndConsume(&input, Regexps::instance().enum_regex, &matched);
126141
std::string buffer(matched);
127142

128143
oss_for_replacing << escaped_with_globs.substr(current_index, matched.data() - escaped_with_globs.data() - current_index - 1) << '(';

src/Common/parseGlobs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#pragma once
2+
23
#include <string>
34
#include <vector>
45

6+
57
namespace DB
68
{
79
bool containsRangeGlob(const std::string & input);

src/Processors/Sources/MongoDBSource.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ MongoDBSource::MongoDBSource(
166166
, cursor{collection.find(query, options)}
167167
, sample_block{sample_block_}
168168
, max_block_size{max_block_size_}
169+
, db_json_format_settings{.json= {.max_depth = 0, .quote_64bit_integers = false}}
170+
, json_format_settings{db_json_format_settings, 0, true, true}
169171
{
170172
for (const auto & idx : collections::range(0, sample_block.columns()))
171173
{

src/Processors/Sources/MongoDBSource.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,10 @@ class MongoDBSource final : public ISource
4848
Block sample_block;
4949
std::unordered_map<size_t, std::pair<size_t, std::pair<DataTypePtr, Field>>> arrays_info;
5050
const UInt64 max_block_size;
51-
52-
JSONBuilder::FormatSettings json_format_settings = {{}, 0, true, true};
5351
bool all_read = false;
52+
53+
const DB::FormatSettings db_json_format_settings;
54+
const JSONBuilder::FormatSettings json_format_settings;
5455
};
5556

5657
}

src/Storages/StorageURL.cpp

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,15 @@
99
#include <Parsers/ASTInsertQuery.h>
1010
#include <Parsers/ASTLiteral.h>
1111
#include <Parsers/ASTFunction.h>
12-
#include <Parsers/ASTIdentifier.h>
1312

1413
#include <IO/ConnectionTimeouts.h>
1514
#include <IO/WriteBufferFromHTTP.h>
16-
#include <IO/WriteHelpers.h>
1715

1816
#include <Formats/FormatFactory.h>
1917
#include <Formats/ReadSchemaUtils.h>
2018
#include <Processors/Formats/IInputFormat.h>
2119
#include <Processors/Formats/IOutputFormat.h>
2220
#include <Processors/Executors/PullingPipelineExecutor.h>
23-
#include <Processors/ISource.h>
2421
#include <Processors/Sources/NullSource.h>
2522
#include <Processors/Transforms/AddingDefaultsTransform.h>
2623
#include <Processors/Transforms/ExtractColumnsTransform.h>
@@ -111,12 +108,6 @@ static const std::unordered_set<std::string_view> optional_configuration_keys =
111108
"headers.header.value",
112109
};
113110

114-
/// Headers in config file will have structure "headers.header.name" and "headers.header.value".
115-
/// But Poco::AbstractConfiguration converts them into "header", "header[1]", "header[2]".
116-
static const std::vector<std::shared_ptr<re2::RE2>> optional_regex_keys = {
117-
std::make_shared<re2::RE2>(R"(headers.header\[[0-9]*\].name)"),
118-
std::make_shared<re2::RE2>(R"(headers.header\[[0-9]*\].value)"),
119-
};
120111

121112
bool urlWithGlobs(const String & uri)
122113
{
@@ -1569,6 +1560,14 @@ size_t StorageURL::evalArgsAndCollectHeaders(
15691560

15701561
void StorageURL::processNamedCollectionResult(Configuration & configuration, const NamedCollection & collection)
15711562
{
1563+
/// Headers in the config file have the structure "headers.header.name" and "headers.header.value",
1564+
/// but Poco::AbstractConfiguration converts them into "header", "header[1]", "header[2]".
1565+
static const std::vector<std::shared_ptr<re2::RE2>> optional_regex_keys
1566+
{
1567+
std::make_shared<re2::RE2>(R"(headers.header\[[0-9]*\].name)"),
1568+
std::make_shared<re2::RE2>(R"(headers.header\[[0-9]*\].value)"),
1569+
};
1570+
15721571
validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys, optional_regex_keys);
15731572

15741573
configuration.url = collection.get<String>("url");

tests/integration/test_storage_mongodb/test.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import base64
12
import datetime
23
import json
34
import uuid
@@ -1227,3 +1228,47 @@ def test_password_masking(started_cluster):
12271228
== "CREATE DICTIONARY default.mongodb_dictionary_password_masking (`_id` String) PRIMARY KEY _id SOURCE(MONGODB(HOST \\'127.0.0.1\\' PORT 27017 USER \\'testuser\\' PASSWORD \\'[HIDDEN]\\' DB \\'example\\' COLLECTION \\'test_clickhouse\\' OPTIONS \\'ssl=true\\')) LIFETIME(MIN 0 MAX 0) LAYOUT(FLAT())\n"
12281229
)
12291230
node.query("DROP DICTIONARY IF EXISTS mongodb_dictionary_password_masking;")
1231+
1232+
1233+
# Integration test: inserts one nested document covering many BSON value types into MongoDB,
# reads it back through a ClickHouse table with ENGINE = MongoDB as a single String column,
# and checks that the result equals the expected compact JSON text.
def test_json_serialization(started_cluster):
1234+
mongo_connection = get_mongo_connection(started_cluster)
1235+
db = mongo_connection["test"]
1236+
# Recreate the "root" user with the readWrite role on the "test" database.
db.command("dropAllUsersFromDatabase")
1237+
db.command("createUser", "root", pwd=mongo_pass, roles=["readWrite"])
1238+
json_serialization_table = db["json_serialization_table"]
1239+
1240+
# Fixed timestamp so the date fields have a known expected string form.
date = datetime.datetime.strptime("2025-05-17 13:14:15", "%Y-%m-%d %H:%M:%S")
1241+
1242+
# Builds the test document. With mongo=True the values are BSON objects to insert;
# with mongo=False they are the plain Python values expected in the JSON output.
# `level` is the number of further nesting levels generated for type_document / type_array.
def create_dataset(mongo, level) -> dict:
1243+
return {
1244+
"type_string": "Type string",
1245+
"type_oid": bson.ObjectId("60f7e65e16b1c1d1c8a2b6b3") if mongo else "60f7e65e16b1c1d1c8a2b6b3",
1246+
"type_binary": bson.Binary(b"binarydata", subtype=0) if mongo else base64.b64encode(b"binarydata").decode(),
1247+
"type_bool": True,
1248+
"type_int32": 123,
1249+
"type_int64": bson.int64.Int64(2**63 - 1) if mongo else int(2**63 - 1),
1250+
"type_double": float(3.141592653589793238),
1251+
"type_date": date if mongo else date.strftime("%Y-%m-%d %H:%M:%S"),
1252+
"type_timestamp": bson.timestamp.Timestamp(date, 1) if mongo else date.strftime("%Y-%m-%d %H:%M:%S"),
1253+
"type_document": {"nested_doc": create_dataset(mongo, level - 1)} if level > 0 else {},
1254+
"type_array": [create_dataset(mongo, level - 1)] if level > 0 else [],
1255+
"type_regex": bson.regex.Regex(r"^pattern.*$", "i") if mongo else {"^pattern.*$": "i"},
1256+
"type_null": None,
1257+
}
1258+
1259+
# Insert a single document nested 10 levels deep.
json_serialization_table.insert_one({"dataset": create_dataset(True, 10)})
1260+
node = started_cluster.instances["node"]
1261+
node.query(
1262+
f"""
1263+
CREATE OR REPLACE TABLE json_serialization_table(
1264+
dataset String
1265+
) ENGINE = MongoDB('mongo1:27017', 'test', 'json_serialization_table', 'root', '{mongo_pass}')
1266+
"""
1267+
)
1268+
1269+
assert node.query(f"SELECT COUNT() FROM json_serialization_table") == "1\n"
1270+
# [:-1] drops the last character of the query result (presumably the trailing newline, as in
# the COUNT() check above) before comparing with the compact (no-whitespace) JSON dump.
assert (node.query(f"SELECT dataset FROM json_serialization_table")[:-1]
1271+
== json.dumps(create_dataset(False, 10), separators=(',', ':')))
1272+
1273+
# Clean up both the ClickHouse table and the MongoDB collection.
node.query("DROP TABLE json_serialization_table")
1274+
json_serialization_table.drop()

0 commit comments

Comments
 (0)