Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 59 additions & 43 deletions src/Functions/ReplaceStringImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@
namespace DB
{

namespace ErrorCodes
{
extern const int ARGUMENT_OUT_OF_BOUND;
}

struct ReplaceStringTraits
{
enum class Replace : uint8_t
Expand All @@ -39,7 +34,11 @@ struct ReplaceStringImpl
size_t input_rows_count)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.assign(haystack_data.begin(), haystack_data.end());
res_offsets.assign(haystack_offsets.begin(), haystack_offsets.end());
return;
}

const UInt8 * const begin = haystack_data.data();
const UInt8 * const end = haystack_data.data() + haystack_data.size();
Expand Down Expand Up @@ -145,34 +144,34 @@ struct ReplaceStringImpl
const auto * const cur_needle_data = &needle_data[prev_needle_offset];
const size_t cur_needle_length = needle_offsets[i] - prev_needle_offset - 1;

if (cur_needle_length == 0)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);

/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);

const auto * last_match = static_cast<UInt8 *>(nullptr);
const auto * start_pos = cur_haystack_data;
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;

while (start_pos < cur_haystack_end)
if (cur_needle_length)
{
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);

while (start_pos < cur_haystack_end)
{
/// Copy prefix before match
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
{
/// Copy prefix before match
copyToOutput(start_pos, match - start_pos, res_data, res_offset);

/// Insert replacement for match
copyToOutput(replacement.data(), replacement.size(), res_data, res_offset);
/// Insert replacement for match
copyToOutput(replacement.data(), replacement.size(), res_data, res_offset);

last_match = match;
start_pos = match + cur_needle_length;
last_match = match;
start_pos = match + cur_needle_length;

if constexpr (replace == ReplaceStringTraits::Replace::First)
if constexpr (replace == ReplaceStringTraits::Replace::First)
break;
}
else
break;
}
else
break;
}

/// Copy suffix after last match
Expand Down Expand Up @@ -200,7 +199,11 @@ struct ReplaceStringImpl
chassert(haystack_offsets.size() == replacement_offsets.size());

if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
res_data.assign(haystack_data.begin(), haystack_data.end());
res_offsets.assign(haystack_offsets.begin(), haystack_offsets.end());
return;
}

res_data.reserve(haystack_data.size());
res_offsets.resize(input_rows_count);
Expand Down Expand Up @@ -291,36 +294,35 @@ struct ReplaceStringImpl
const auto * const cur_replacement_data = &replacement_data[prev_replacement_offset];
const size_t cur_replacement_length = replacement_offsets[i] - prev_replacement_offset - 1;

if (cur_needle_length == 0)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);

/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);

const auto * last_match = static_cast<UInt8 *>(nullptr);
const auto * start_pos = cur_haystack_data;
const auto * const cur_haystack_end = cur_haystack_data + cur_haystack_length;

while (start_pos < cur_haystack_end)
if (cur_needle_length)
{
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
/// Using "slow" "stdlib searcher instead of Volnitsky because there is a different pattern in each row
StdLibASCIIStringSearcher</*CaseInsensitive*/ false> searcher(cur_needle_data, cur_needle_length);

while (start_pos < cur_haystack_end)
{
/// Copy prefix before match
copyToOutput(start_pos, match - start_pos, res_data, res_offset);
if (const auto * const match = searcher.search(start_pos, cur_haystack_end); match != cur_haystack_end)
{
/// Copy prefix before match
copyToOutput(start_pos, match - start_pos, res_data, res_offset);

/// Insert replacement for match
copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);
/// Insert replacement for match
copyToOutput(cur_replacement_data, cur_replacement_length, res_data, res_offset);

last_match = match;
start_pos = match + cur_needle_length;
last_match = match;
start_pos = match + cur_needle_length;

if constexpr (replace == ReplaceStringTraits::Replace::First)
if constexpr (replace == ReplaceStringTraits::Replace::First)
break;
}
else
break;
}
else
break;
}

/// Copy suffix after last match
size_t bytes = (last_match == nullptr) ? (cur_haystack_end - cur_haystack_data + 1)
: (cur_haystack_end - last_match - cur_needle_length + 1);
Expand All @@ -346,7 +348,21 @@ struct ReplaceStringImpl
size_t input_rows_count)
{
if (needle.empty())
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Length of the pattern argument in function {} must be greater than 0.", name);
{
chassert(input_rows_count == haystack_data.size() / n);
/// Since ColumnFixedString does not have a zero byte at the end, while ColumnString does,
/// we need to split haystack_data into strings of length n, add 1 zero byte to the end of each string
/// and then copy to res_data, ref: ColumnString.h and ColumnFixedString.h
Copy link
Copy Markdown
Member

@vitlibar vitlibar Sep 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have a test with FixedString too

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know how to construct a ColumnConst with string type without modifying code useDefaultImplementationForConstants = false ? I have tested with changing this code locally, it works expected.

Copy link
Copy Markdown
Member

@vitlibar vitlibar Sep 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing of the following query

select replace(materialize(toFixedString('abcdef', 6)), '', '01')

will lead to this part of code.

Though it's strange that a more simple query

select replace(toFixedString('abcdef', 6), '', '01')

just throws exception ILLEGAL_COLUMN.

res_data.reserve(haystack_data.size() + input_rows_count);
res_offsets.resize(input_rows_count);
for (size_t i = 0; i < input_rows_count; ++i)
{
res_data.insert(res_data.end(), haystack_data.begin() + i * n, haystack_data.begin() + (i + 1) * n);
res_data.push_back(0);
res_offsets[i] = (i + 1) * n + 1;
}
return;
}

const UInt8 * const begin = haystack_data.data();
const UInt8 * const end = haystack_data.data() + haystack_data.size();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,6 @@ __.__ o_.__ o_.__ 1
__. o_. o_. 1
__.__ o_.__ o_.__ 1
__.__ o_.__ o_.__ 1
ABCabc
ABCabc
ABCabc
3 changes: 3 additions & 0 deletions tests/queries/0_stateless/00240_replace_substring_loop.sql
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,6 @@ SELECT s, replaceOne(s, '_', 'o') AS a, replaceRegexpOne(s, '_', 'o') AS b, a =
SELECT s, replaceOne(s, '_', 'o') AS a, replaceRegexpOne(s, '_', 'o') AS b, a = b FROM (SELECT arrayJoin(['__.__', '.__']) AS s);
SELECT s, replaceOne(s, '_', 'o') AS a, replaceRegexpOne(s, '_', 'o') AS b, a = b FROM (SELECT arrayJoin(['__.__', '__.']) AS s);
SELECT s, replaceOne(s, '_', 'o') AS a, replaceRegexpOne(s, '_', 'o') AS b, a = b FROM (SELECT arrayJoin(['__.__', '__.__']) AS s);
SELECT replace('ABCabc', '', 'DEF');
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is way too dirty. We

  • should not have 101 lines with tests where _ is used as pattern, and then suddenly test empty needles without explanation or comment what is being tested
  • and, even worse, without any connection to the test name (we are not looping anything),
  • and on top, only test the alias "replace" but not test the actual functions "replaceOne" and "replaceAll".

@zhanglistar Please move l. 102 into a new test 00240_replace_with_empty_needle and address these points.

SELECT replace(materialize('ABCabc'), materialize(''), 'DEF');
SELECT replace(materialize('ABCabc'), '', 'DEF');
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,16 @@
3 Hello World not_found x Hello World
4 Hello World [eo] x Hxllo World
5 Hello World . x xello World
Check that an exception is thrown if the needle is empty
Check that whether an exception is thrown if the needle is empty
Hexxo Worxd
Hello World
Hexlo World
Hello World
Hello World
Hello World
Hello World
Hello World
Hexxo Worxd
Hello World
Hexlo World
Hello World
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ SELECT id, haystack, needle, replacement, replaceRegexpOne('Hello World', needle
DROP TABLE IF EXISTS test_tab;


SELECT 'Check that an exception is thrown if the needle is empty';
SELECT 'Check that whether an exception is thrown if the needle is empty';
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bad grammar, please replace by "Check what happens if the needle is empty".


CREATE TABLE test_tab
(id UInt32, haystack String, needle String, replacement String)
Expand All @@ -80,20 +80,20 @@ CREATE TABLE test_tab
INSERT INTO test_tab VALUES (1, 'Hello World', 'l', 'x') (2, 'Hello World', '', 'y');

-- needle: non-const, replacement: const
SELECT replaceAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceAll(haystack, needle, 'x') FROM test_tab;
SELECT replaceOne(haystack, needle, 'x') FROM test_tab;
SELECT replaceRegexpAll(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, needle, 'x') FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }

-- needle: const, replacement: non-const
SELECT replaceAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceOne(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceAll(haystack, '', replacement) FROM test_tab;
SELECT replaceOne(haystack, '', replacement) FROM test_tab;
SELECT replaceRegexpAll(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, '', replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }

-- needle: non-const, replacement: non-const
SELECT replaceAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceAll(haystack, needle, replacement) FROM test_tab;
SELECT replaceOne(haystack, needle, replacement) FROM test_tab;
SELECT replaceRegexpAll(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }
SELECT replaceRegexpOne(haystack, needle, replacement) FROM test_tab; -- { serverError ARGUMENT_OUT_OF_BOUND }

Expand Down