Skip to content

Commit 2186aa8

Browse files
committed
Revert "Revert "Implement punycode encoding/decoding""
This reverts commit 345d29a.
1 parent ce13b21 commit 2186aa8

File tree

13 files changed

+369
-1
lines changed

13 files changed

+369
-1
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,3 +360,6 @@
360360
[submodule "contrib/sqids-cpp"]
361361
path = contrib/sqids-cpp
362362
url = https://github.com/sqids/sqids-cpp.git
363+
[submodule "contrib/idna"]
364+
path = contrib/idna
365+
url = https://github.com/ada-url/idna.git

contrib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ add_contrib (libpqxx-cmake libpqxx)
154154
add_contrib (libpq-cmake libpq)
155155
add_contrib (nuraft-cmake NuRaft)
156156
add_contrib (fast_float-cmake fast_float)
157+
add_contrib (idna-cmake idna)
157158
add_contrib (datasketches-cpp-cmake datasketches-cpp)
158159
add_contrib (incbin-cmake incbin)
159160
add_contrib (sqids-cpp-cmake sqids-cpp)

contrib/idna

Submodule idna added at 3c8be01

contrib/idna-cmake/CMakeLists.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Build script for the bundled idna library (sources live in contrib/idna).
# Defines the static/object library target `_idna` and exposes it under the alias `ch_contrib::idna`.
# The option defaults to the value of ENABLE_LIBRARIES.
option(ENABLE_IDNA "Enable idna support" ${ENABLE_LIBRARIES})
2+
# When disabled, stop processing this file early so that no `ch_contrib::idna` target is created.
if ((NOT ENABLE_IDNA))
3+
message (STATUS "Not using idna")
4+
return()
5+
endif()
6+
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/idna")
7+
8+
# Translation units compiled into the library, listed explicitly.
set (SRCS
9+
"${LIBRARY_DIR}/src/idna.cpp"
10+
"${LIBRARY_DIR}/src/mapping.cpp"
11+
"${LIBRARY_DIR}/src/mapping_tables.cpp"
12+
"${LIBRARY_DIR}/src/normalization.cpp"
13+
"${LIBRARY_DIR}/src/normalization_tables.cpp"
14+
"${LIBRARY_DIR}/src/punycode.cpp"
15+
"${LIBRARY_DIR}/src/to_ascii.cpp"
16+
"${LIBRARY_DIR}/src/to_unicode.cpp"
17+
"${LIBRARY_DIR}/src/unicode_transcoding.cpp"
18+
"${LIBRARY_DIR}/src/validity.cpp"
19+
)
20+
21+
add_library (_idna ${SRCS})
22+
# PUBLIC: targets linking against the library also get its include directory.
target_include_directories(_idna PUBLIC "${LIBRARY_DIR}/include")
23+
24+
# Alias under which other CMake files refer to the library (checked with `if (TARGET ch_contrib::idna)`).
add_library (ch_contrib::idna ALIAS _idna)

docs/en/sql-reference/functions/string-functions.md

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,71 @@ Result:
13831383
└──────────────────┘
13841384
```
13851385

1386+
## punycodeEncode
1387+
1388+
Returns the [Punycode](https://en.wikipedia.org/wiki/Punycode) representation of a string.
1389+
The string must be UTF8-encoded, otherwise results are undefined.
1390+
1391+
**Syntax**
1392+
1393+
``` sql
1394+
punycodeEncode(val)
1395+
```
1396+
1397+
**Arguments**
1398+
1399+
- `val` - Input value. [String](../data-types/string.md)
1400+
1401+
**Returned value**
1402+
1403+
- A Punycode representation of the input value. [String](../data-types/string.md)
1404+
1405+
**Example**
1406+
1407+
``` sql
1408+
select punycodeEncode('München');
1409+
```
1410+
1411+
Result:
1412+
1413+
```result
1414+
┌─punycodeEncode('München')─┐
1415+
│ Mnchen-3ya │
1416+
└───────────────────────────┘
1417+
```
1418+
1419+
## punycodeDecode
1420+
1421+
Returns the UTF8-encoded plaintext of a [Punycode](https://en.wikipedia.org/wiki/Punycode)-encoded string.
1422+
1423+
**Syntax**
1424+
1425+
``` sql
1426+
punycodeDecode(val)
1427+
```
1428+
1429+
**Arguments**
1430+
1431+
- `val` - Punycode-encoded string. [String](../data-types/string.md)
1432+
1433+
**Returned value**
1434+
1435+
- The plaintext of the input value. [String](../data-types/string.md)
1436+
1437+
**Example**
1438+
1439+
``` sql
1440+
select punycodeDecode('Mnchen-3ya');
1441+
```
1442+
1443+
Result:
1444+
1445+
```result
1446+
┌─punycodeDecode('Mnchen-3ya')─┐
1447+
│ München │
1448+
└──────────────────────────────┘
1449+
```
1450+
13861451
## byteHammingDistance
13871452

13881453
Calculates the [hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) between two byte strings.

src/Common/config.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#cmakedefine01 USE_S2_GEOMETRY
2929
#cmakedefine01 USE_FASTOPS
3030
#cmakedefine01 USE_SQIDS
31+
#cmakedefine01 USE_IDNA
3132
#cmakedefine01 USE_NLP
3233
#cmakedefine01 USE_VECTORSCAN
3334
#cmakedefine01 USE_LIBURING

src/Functions/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ if (TARGET ch_contrib::sqids)
8383
list (APPEND PRIVATE_LIBS ch_contrib::sqids)
8484
endif()
8585

86+
if (TARGET ch_contrib::idna)
87+
list (APPEND PRIVATE_LIBS ch_contrib::idna)
88+
endif()
89+
8690
if (TARGET ch_contrib::h3)
8791
list (APPEND PRIVATE_LIBS ch_contrib::h3)
8892
endif()

src/Functions/FunctionSqid.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "config.h"
22

3-
#ifdef ENABLE_SQIDS
3+
#if USE_SQIDS
44

55
#include <Columns/ColumnString.h>
66
#include <Columns/ColumnsNumber.h>

src/Functions/punycode.cpp

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
#include "config.h"
2+
3+
#if USE_IDNA
4+
5+
#include <Functions/FunctionFactory.h>
6+
#include <Functions/FunctionStringToString.h>
7+
8+
#ifdef __clang__
9+
# pragma clang diagnostic push
10+
# pragma clang diagnostic ignored "-Wnewline-eof"
11+
#endif
12+
# include <ada/idna/punycode.h>
13+
# include <ada/idna/unicode_transcoding.h>
14+
#ifdef __clang__
15+
# pragma clang diagnostic pop
16+
#endif
17+
18+
namespace DB
19+
{
20+
21+
namespace ErrorCodes
22+
{
23+
extern const int BAD_ARGUMENTS;
24+
extern const int ILLEGAL_COLUMN;
25+
}
26+
27+
struct PunycodeEncodeImpl
28+
{
29+
static void vector(
30+
const ColumnString::Chars & data,
31+
const ColumnString::Offsets & offsets,
32+
ColumnString::Chars & res_data,
33+
ColumnString::Offsets & res_offsets)
34+
{
35+
const size_t rows = offsets.size();
36+
res_data.reserve(data.size()); /// just a guess, assuming the input is all-ASCII
37+
res_offsets.reserve(rows);
38+
39+
size_t prev_offset = 0;
40+
std::u32string value_utf32;
41+
std::string value_puny;
42+
for (size_t row = 0; row < rows; ++row)
43+
{
44+
const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
45+
const size_t value_length = offsets[row] - prev_offset - 1;
46+
47+
const size_t value_utf32_length = ada::idna::utf32_length_from_utf8(value, value_length);
48+
value_utf32.resize(value_utf32_length);
49+
ada::idna::utf8_to_utf32(value, value_length, value_utf32.data());
50+
51+
const bool ok = ada::idna::utf32_to_punycode(value_utf32, value_puny);
52+
if (!ok)
53+
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Internal error during Punycode encoding");
54+
55+
res_data.insert(value_puny.c_str(), value_puny.c_str() + value_puny.size() + 1);
56+
res_offsets.push_back(res_data.size());
57+
58+
prev_offset = offsets[row];
59+
60+
value_utf32.clear();
61+
value_puny.clear(); /// utf32_to_punycode() appends to its output string
62+
}
63+
}
64+
65+
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
66+
{
67+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by punycodeEncode function");
68+
}
69+
};
70+
71+
struct PunycodeDecodeImpl
72+
{
73+
static void vector(
74+
const ColumnString::Chars & data,
75+
const ColumnString::Offsets & offsets,
76+
ColumnString::Chars & res_data,
77+
ColumnString::Offsets & res_offsets)
78+
{
79+
const size_t rows = offsets.size();
80+
res_data.reserve(data.size()); /// just a guess, assuming the input is all-ASCII
81+
res_offsets.reserve(rows);
82+
83+
size_t prev_offset = 0;
84+
std::u32string value_utf32;
85+
std::string value_utf8;
86+
for (size_t row = 0; row < rows; ++row)
87+
{
88+
const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
89+
const size_t value_length = offsets[row] - prev_offset - 1;
90+
91+
const std::string_view value_punycode(value, value_length);
92+
const bool ok = ada::idna::punycode_to_utf32(value_punycode, value_utf32);
93+
if (!ok)
94+
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Internal error during Punycode decoding");
95+
96+
const size_t utf8_length = ada::idna::utf8_length_from_utf32(value_utf32.data(), value_utf32.size());
97+
value_utf8.resize(utf8_length);
98+
ada::idna::utf32_to_utf8(value_utf32.data(), value_utf32.size(), value_utf8.data());
99+
100+
res_data.insert(value_utf8.c_str(), value_utf8.c_str() + value_utf8.size() + 1);
101+
res_offsets.push_back(res_data.size());
102+
103+
prev_offset = offsets[row];
104+
105+
value_utf32.clear(); /// punycode_to_utf32() appends to its output string
106+
value_utf8.clear();
107+
}
108+
}
109+
110+
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
111+
{
112+
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by punycodeDecode function");
113+
}
114+
};
115+
116+
/// Name tag for the encoder: `name` holds the function name "punycodeEncode".
/// Passed as the second template argument of FunctionStringToString in REGISTER_FUNCTION(Punycode).
struct NamePunycodeEncode
117+
{
118+
static constexpr auto name = "punycodeEncode";
119+
};
120+
121+
/// Name tag for the decoder: `name` holds the function name "punycodeDecode".
/// Passed as the second template argument of FunctionStringToString in REGISTER_FUNCTION(Punycode).
struct NamePunycodeDecode
122+
{
123+
static constexpr auto name = "punycodeDecode";
124+
};
125+
126+
/// Registers punycodeEncode() and punycodeDecode() in the function factory,
/// each with its built-in documentation (description, syntax, arguments, returned value, example).
REGISTER_FUNCTION(Punycode)
{
    factory.registerFunction<FunctionStringToString<PunycodeEncodeImpl, NamePunycodeEncode>>(FunctionDocumentation{
        .description=R"(
Computes a Punycode representation of a string.)",
        .syntax="punycodeEncode(str)",
        .arguments={{"str", "Input string"}},
        .returned_value="The punycode representation [String](/docs/en/sql-reference/data-types/string.md).",
        .examples={
            {"simple",
            "SELECT punycodeEncode('München') AS puny;",
            R"(
┌─puny───────┐
│ Mnchen-3ya │
└────────────┘
)"
            }}
    });

    /// The decoder's description previously repeated the encoder's text; it now describes decoding.
    factory.registerFunction<FunctionStringToString<PunycodeDecodeImpl, NamePunycodeDecode>>(FunctionDocumentation{
        .description=R"(
Computes the plaintext (UTF-8) representation of a Punycode-encoded string.)",
        .syntax="punycodeDecode(str)",
        .arguments={{"str", "A Punycode-encoded string"}},
        .returned_value="The plaintext representation [String](/docs/en/sql-reference/data-types/string.md).",
        .examples={
            {"simple",
            "SELECT punycodeDecode('Mnchen-3ya') AS plain;",
            R"(
┌─plain───┐
│ München │
└─────────┘
)"
            }}
    });
}
162+
163+
}
164+
165+
#endif

src/configure_config.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ endif()
129129
if (TARGET ch_contrib::sqids)
130130
set(USE_SQIDS 1)
131131
endif()
132+
if (TARGET ch_contrib::idna)
133+
set(USE_IDNA 1)
134+
endif()
132135
if (TARGET ch_contrib::vectorscan)
133136
set(USE_VECTORSCAN 1)
134137
endif()

0 commit comments

Comments
 (0)