Skip to content

Commit 7cd8315

Browse files
authored
Merge branch 'master' into aggregate_functions_6
2 parents 75f5e94 + b4ba0cb commit 7cd8315

22 files changed

+1494
-41
lines changed

src/AggregateFunctions/AggregateFunctionAvgWeighted.cpp

Lines changed: 64 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -184,7 +184,70 @@ createAggregateFunctionAvgWeighted(const std::string & name, const DataTypes & a
184184

185185
/// Registers the `avgWeighted` aggregate function in the given factory.
/// The registration carries the creator function, default-constructed properties and a
/// FunctionDocumentation object assembled below (description, syntax, arguments,
/// parameters, returned value, examples, introduced-in version and category).
void registerAggregateFunctionAvgWeighted(AggregateFunctionFactory & factory)
186186
{
187-
factory.registerFunction("avgWeighted", createAggregateFunctionAvgWeighted);
187+
FunctionDocumentation::Description description = R"(
188+
Calculates the [weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean).
189+
)";
190+
FunctionDocumentation::Syntax syntax = "avgWeighted(x, weight)";
191+
/// Each argument entry lists a name, a short description and the accepted type names.
FunctionDocumentation::Arguments arguments = {
192+
{"x", "Values.", {"(U)Int*", "Float*"}},
193+
{"weight", "Weights of the values.", {"(U)Int*", "Float*"}}
194+
};
195+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
196+
FunctionDocumentation::ReturnedValue returned_value = {"Returns `NaN` if all the weights are equal to 0 or the supplied weights parameter is empty, or the weighted mean otherwise.", {"Float64"}};
197+
/// Each example entry lists a title, a query and the output shown for that query.
FunctionDocumentation::Examples examples = {
198+
{
199+
"Usage example",
200+
R"(
201+
SELECT avgWeighted(x, w)
202+
FROM VALUES('x Int8, w Int8', (4, 1), (1, 0), (10, 2))
203+
)",
204+
R"(
205+
┌─avgWeighted(x, w)─┐
206+
│ 8 │
207+
└───────────────────┘
208+
)"
209+
},
210+
{
211+
"Mixed integer and float weights",
212+
R"(
213+
SELECT avgWeighted(x, w)
214+
FROM VALUES('x Int8, w Float64', (4, 1), (1, 0), (10, 2))
215+
)",
216+
R"(
217+
┌─avgWeighted(x, w)─┐
218+
│ 8 │
219+
└───────────────────┘
220+
)"
221+
},
222+
{
223+
"All weights are zero returns NaN",
224+
R"(
225+
SELECT avgWeighted(x, w)
226+
FROM VALUES('x Int8, w Int8', (0, 0), (1, 0), (10, 0))
227+
)",
228+
R"(
229+
┌─avgWeighted(x, w)─┐
230+
│ nan │
231+
└───────────────────┘
232+
)"
233+
},
234+
{
235+
"Empty table returns NaN",
236+
R"(
237+
CREATE TABLE test (t UInt8) ENGINE = Memory;
238+
SELECT avgWeighted(t, t) FROM test
239+
)",
240+
R"(
241+
┌─avgWeighted(t, t)─┐
242+
│ nan │
243+
└───────────────────┘
244+
)"
245+
}
246+
};
247+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
248+
/// NOTE(review): looks like a {major, minor} release pair — confirm against FunctionDocumentation.
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
249+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
250+
/// Registers the creator with default-constructed properties and the documentation built above.
factory.registerFunction("avgWeighted", {createAggregateFunctionAvgWeighted, AggregateFunctionProperties{}, documentation });
188251
}
189252

190253
}

src/AggregateFunctions/AggregateFunctionBitwise.cpp

Lines changed: 131 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -229,9 +229,137 @@ AggregateFunctionPtr createAggregateFunctionBitwise(const std::string & name, co
229229

230230
void registerAggregateFunctionsBitwise(AggregateFunctionFactory & factory)
231231
{
232-
factory.registerFunction("groupBitOr", createAggregateFunctionBitwise<AggregateFunctionGroupBitOrData>);
233-
factory.registerFunction("groupBitAnd", createAggregateFunctionBitwise<AggregateFunctionGroupBitAndData>);
234-
factory.registerFunction("groupBitXor", createAggregateFunctionBitwise<AggregateFunctionGroupBitXorData>);
232+
FunctionDocumentation::Description description_or = R"(
233+
Applies bitwise OR for series of numbers.
234+
)";
235+
FunctionDocumentation::Syntax syntax_or = R"(
236+
groupBitOr(expr)
237+
)";
238+
FunctionDocumentation::Arguments arguments_or = {
239+
{"expr", "Expression of `(U)Int*` type.", {"(U)Int*"}}
240+
};
241+
FunctionDocumentation::Parameters parameters_or = {};
242+
FunctionDocumentation::ReturnedValue returned_value_or = {"Returns a value of `(U)Int*` type.", {"(U)Int*"}};
243+
FunctionDocumentation::Examples examples_or = {
244+
{
245+
"Bitwise OR example",
246+
R"(
247+
CREATE TABLE t (num UInt32) ENGINE = Memory;
248+
INSERT INTO t VALUES (44), (28), (13), (85);
249+
250+
-- Test data:
251+
-- binary decimal
252+
-- 00101100 = 44
253+
-- 00011100 = 28
254+
-- 00001101 = 13
255+
-- 01010101 = 85
256+
257+
SELECT groupBitOr(num) FROM t;
258+
)",
259+
R"(
260+
-- Result:
261+
-- binary decimal
262+
-- 01111101 = 125
263+
264+
┌─groupBitOr(num)─┐
265+
│ 125 │
266+
└─────────────────┘
267+
)"
268+
}
269+
};
270+
FunctionDocumentation::IntroducedIn introduced_in_or = {1, 1};
271+
FunctionDocumentation::Category category_or = FunctionDocumentation::Category::AggregateFunction;
272+
FunctionDocumentation documentation_or = {description_or, syntax_or, arguments_or, parameters_or, returned_value_or, examples_or, introduced_in_or, category_or};
273+
274+
factory.registerFunction("groupBitOr", {createAggregateFunctionBitwise<AggregateFunctionGroupBitOrData>, {}, documentation_or});
275+
276+
FunctionDocumentation::Description description = R"(
277+
Applies bitwise AND for series of numbers.
278+
)";
279+
FunctionDocumentation::Syntax syntax = R"(
280+
groupBitAnd(expr)
281+
)";
282+
FunctionDocumentation::Arguments arguments = {
283+
{"expr", "Expression of `(U)Int*` type.", {"(U)Int*"}}
284+
};
285+
FunctionDocumentation::Parameters parameters = {};
286+
FunctionDocumentation::ReturnedValue returned_value = {"Returns a value of `(U)Int*` type.", {"(U)Int*"}};
287+
FunctionDocumentation::Examples examples = {
288+
{
289+
"Bitwise AND example",
290+
R"(
291+
CREATE TABLE t (num UInt32) ENGINE = Memory;
292+
INSERT INTO t VALUES (44), (28), (13), (85);
293+
294+
-- Test data:
295+
-- binary decimal
296+
-- 00101100 = 44
297+
-- 00011100 = 28
298+
-- 00001101 = 13
299+
-- 01010101 = 85
300+
301+
SELECT groupBitAnd(num) FROM t;
302+
)",
303+
R"(
304+
-- Result:
305+
-- binary decimal
306+
-- 00000100 = 4
307+
308+
┌─groupBitAnd(num)─┐
309+
│ 4 │
310+
└──────────────────┘
311+
)"
312+
}
313+
};
314+
FunctionDocumentation::IntroducedIn introduced_in = {1, 1};
315+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
316+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
317+
318+
factory.registerFunction("groupBitAnd", {createAggregateFunctionBitwise<AggregateFunctionGroupBitAndData>, {}, documentation});
319+
320+
FunctionDocumentation::Description description_xor = R"(
321+
Applies bitwise XOR for series of numbers.
322+
)";
323+
FunctionDocumentation::Syntax syntax_xor = R"(
324+
groupBitXor(expr)
325+
)";
326+
FunctionDocumentation::Arguments arguments_xor = {
327+
{"expr", "Expression of `(U)Int*` type.", {"(U)Int*"}}
328+
};
329+
FunctionDocumentation::Parameters parameters_xor = {};
330+
FunctionDocumentation::ReturnedValue returned_value_xor = {"Returns a value of `(U)Int*` type.", {"(U)Int*"}};
331+
FunctionDocumentation::Examples examples_xor = {
332+
{
333+
"Bitwise XOR example",
334+
R"(
335+
CREATE TABLE t (num UInt32) ENGINE = Memory;
336+
INSERT INTO t VALUES (44), (28), (13), (85);
337+
338+
-- Test data:
339+
-- binary decimal
340+
-- 00101100 = 44
341+
-- 00011100 = 28
342+
-- 00001101 = 13
343+
-- 01010101 = 85
344+
345+
SELECT groupBitXor(num) FROM t;
346+
)",
347+
R"(
348+
-- Result:
349+
-- binary decimal
350+
-- 01101000 = 104
351+
352+
┌─groupBitXor(num)─┐
353+
│ 104 │
354+
└──────────────────┘
355+
)"
356+
}
357+
};
358+
FunctionDocumentation::IntroducedIn introduced_in_xor = {1, 1};
359+
FunctionDocumentation::Category category_xor = FunctionDocumentation::Category::AggregateFunction;
360+
FunctionDocumentation documentation_xor = {description_xor, syntax_xor, arguments_xor, parameters_xor, returned_value_xor, examples_xor, introduced_in_xor, category_xor};
361+
362+
factory.registerFunction("groupBitXor", {createAggregateFunctionBitwise<AggregateFunctionGroupBitXorData>, {}, documentation_xor});
235363

236364
/// Aliases for compatibility with MySQL.
237365
factory.registerAlias("BIT_OR", "groupBitOr", AggregateFunctionFactory::Case::Insensitive);

src/AggregateFunctions/AggregateFunctionBoundingRatio.cpp

Lines changed: 51 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,57 @@ AggregateFunctionPtr createAggregateFunctionRate(const std::string & name, const
202202

203203
/// Registers the `boundingRatio` aggregate function in the given factory.
/// The registration carries the creator function, default-constructed properties and a
/// FunctionDocumentation object assembled below.
void registerAggregateFunctionRate(AggregateFunctionFactory & factory)
204204
{
205-
factory.registerFunction("boundingRatio", createAggregateFunctionRate);
205+
FunctionDocumentation::Description description = R"(
206+
Calculates the slope between the leftmost and rightmost points across a group of values.
207+
)";
208+
FunctionDocumentation::Syntax syntax = "boundingRatio(x, y)";
209+
/// Each argument entry lists a name, a short description and the accepted type names.
FunctionDocumentation::Arguments arguments = {
210+
{"x", "X-coordinate values.", {"(U)Int*", "Float*", "Decimal"}},
211+
{"y", "Y-coordinate values.", {"(U)Int*", "Float*", "Decimal"}}
212+
};
213+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
214+
FunctionDocumentation::ReturnedValue returned_value = {"Returns the slope of the line between the leftmost and rightmost points, otherwise returns `NaN` if the data is empty.", {"Float64"}};
215+
/// Two examples: the first only shows the sample data, the second applies boundingRatio to it.
FunctionDocumentation::Examples examples = {
216+
{
217+
"Sample data",
218+
R"(
219+
SELECT
220+
number,
221+
number * 1.5
222+
FROM numbers(10)
223+
)",
224+
R"(
225+
┌─number─┬─multiply(number, 1.5)─┐
226+
│ 0 │ 0 │
227+
│ 1 │ 1.5 │
228+
│ 2 │ 3 │
229+
│ 3 │ 4.5 │
230+
│ 4 │ 6 │
231+
│ 5 │ 7.5 │
232+
│ 6 │ 9 │
233+
│ 7 │ 10.5 │
234+
│ 8 │ 12 │
235+
│ 9 │ 13.5 │
236+
└────────┴───────────────────────┘
237+
)"
238+
},
239+
{
240+
"Usage example",
241+
R"(
242+
SELECT boundingRatio(number, number * 1.5)
243+
FROM numbers(10)
244+
)",
245+
R"(
246+
┌─boundingRatio(number, multiply(number, 1.5))─┐
247+
│ 1.5 │
248+
└──────────────────────────────────────────────┘
249+
)"
250+
}
251+
};
252+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
253+
/// NOTE(review): looks like a {major, minor} release pair — confirm against FunctionDocumentation.
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
254+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
255+
/// Registers the creator with default-constructed properties and the documentation built above.
factory.registerFunction("boundingRatio", {createAggregateFunctionRate, AggregateFunctionProperties{}, documentation});
206256
}
207257

208258
}

src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp

Lines changed: 54 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -180,8 +180,61 @@ AggregateFunctionPtr createAggregateFunctionCategoricalIV(
180180

181181
/// Registers the `categoricalInformationValue` aggregate function in the given factory.
/// The registration carries the creator function, a properties object with
/// `returns_default_when_only_null` set, and a FunctionDocumentation object assembled below.
void registerAggregateFunctionCategoricalIV(AggregateFunctionFactory & factory)
182182
{
183+
FunctionDocumentation::Description description = R"(
184+
Calculates the information value (IV) for categorical features in relation to a binary target variable.
185+
186+
For each category, the function computes: `(P(tag = 1) - P(tag = 0)) × (log(P(tag = 1)) - log(P(tag = 0)))`
187+
188+
where:
189+
- P(tag = 1) is the probability that the target equals 1 for the given category
190+
- P(tag = 0) is the probability that the target equals 0 for the given category
191+
192+
Information Value is a statistic used to measure the strength of a categorical feature's relationship with a binary target variable in predictive modeling.
193+
Higher absolute values indicate stronger predictive power.
194+
195+
The result indicates how much each discrete (categorical) feature `[category1, category2, ...]` contributes to a learning model which predicts the value of `tag`.
196+
)";
197+
FunctionDocumentation::Syntax syntax = "categoricalInformationValue(category1[, category2, ...,]tag)";
198+
/// Each argument entry lists a name, a short description and the accepted type names.
FunctionDocumentation::Arguments arguments = {
199+
{"category1, category2, ...", "One or more categorical features to analyze. Each category should contain discrete values.", {"UInt8"}},
200+
{"tag", "Binary target variable for prediction. Should contain values 0 and 1.", {"UInt8"}}
201+
};
202+
/// Left empty: no parameters are documented for this function.
FunctionDocumentation::Parameters parameters = {};
203+
FunctionDocumentation::ReturnedValue returned_value = {"Returns an array of Float64 values representing the information value for each unique combination of categories. Each value indicates the predictive strength of that category combination for the target variable.", {"Array(Float64)"}};
204+
/// Each example entry lists a title, a query and the output shown for that query.
FunctionDocumentation::Examples examples =
205+
{
206+
{
207+
"Basic usage analyzing age groups vs mobile usage",
208+
R"(
209+
-- Using the metrica.hits dataset (available on https://sql.clickhouse.com/) to analyze age-mobile relationship
210+
SELECT categoricalInformationValue(Age < 15, IsMobile)
211+
FROM metrica.hits;
212+
)",
213+
R"(
214+
[0.0014814694805292418]
215+
)"
216+
},
217+
{
218+
"Multiple categorical features with user demographics",
219+
R"(
220+
SELECT categoricalInformationValue(
221+
Sex, -- 0=male, 1=female
222+
toUInt8(Age < 25), -- 0=25+, 1=under 25
223+
toUInt8(IsMobile) -- 0=desktop, 1=mobile
224+
) AS iv_values
225+
FROM metrica.hits
226+
WHERE Sex IN (0, 1);
227+
)",
228+
R"(
229+
[0.00018965785460692887,0.004973668839403392]
230+
)"
231+
}
232+
};
233+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
234+
/// NOTE(review): looks like a {major, minor} release pair — confirm against FunctionDocumentation.
FunctionDocumentation::IntroducedIn introduced_in = {20, 1};
235+
FunctionDocumentation documentation = {description, syntax, arguments, parameters, returned_value, examples, introduced_in, category};
183236
/// Only the `returns_default_when_only_null` flag is set explicitly; it is passed alongside the creator.
AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
184-
factory.registerFunction("categoricalInformationValue", { createAggregateFunctionCategoricalIV, properties });
237+
/// Same registration as before, now with the documentation object as an additional member.
factory.registerFunction("categoricalInformationValue", { createAggregateFunctionCategoricalIV, properties, documentation });
185238
}
186239

187240
}

src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp

Lines changed: 42 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,13 +47,54 @@ struct ContingencyData : CrossTabData
4747

4848
/// Registers the aggregate function named by ContingencyData::getName() in the given factory.
/// The registration carries a creator lambda, default properties and a FunctionDocumentation
/// object assembled below.
void registerAggregateFunctionContingency(AggregateFunctionFactory & factory)
4949
{
50+
FunctionDocumentation::Description description = R"(
51+
The `contingency` function calculates the [contingency coefficient](https://en.wikipedia.org/wiki/Contingency_table#Cram%C3%A9r's_V_and_the_contingency_coefficient_C), a value that measures the association between two columns in a table.
52+
The computation is similar to the [`cramersV`](./cramersv.md) function but with a different denominator in the square root.
53+
)";
54+
FunctionDocumentation::Syntax syntax = "contingency(column1, column2)";
55+
/// Each argument entry lists a name, a short description and the accepted type names.
FunctionDocumentation::Arguments arguments = {
56+
{"column1", "First column to compare.", {"Any"}},
57+
{"column2", "Second column to compare.", {"Any"}}
58+
};
59+
/// Left empty: no parameters are documented for this function.
/// NOTE(review): presumably named `docs_parameters` to keep it distinct from the creator lambda's `parameters` argument below — confirm.
FunctionDocumentation::Parameters docs_parameters = {};
60+
FunctionDocumentation::ReturnedValue returned_value = {"Returns a value between 0 and 1. The larger the result, the closer the association of the two columns.", {"Float64"}};
61+
/// Single example: a title, a query and the output shown for that query.
FunctionDocumentation::Examples examples = {
62+
{
63+
"Comparison with cramersV",
64+
R"(
65+
SELECT
66+
cramersV(a, b),
67+
contingency(a, b)
68+
FROM
69+
(
70+
SELECT
71+
number % 10 AS a,
72+
number % 4 AS b
73+
FROM
74+
numbers(150)
75+
)
76+
)",
77+
R"(
78+
┌──────cramersV(a, b)─┬───contingency(a, b)─┐
79+
│ 0.41171788506213564 │ 0.05812725261759165 │
80+
└─────────────────────┴─────────────────────┘
81+
)"
82+
}
83+
};
84+
FunctionDocumentation::Category category = FunctionDocumentation::Category::AggregateFunction;
85+
/// NOTE(review): looks like a {major, minor} release pair — confirm against FunctionDocumentation.
FunctionDocumentation::IntroducedIn introduced_in = {22, 1};
86+
FunctionDocumentation documentation = {description, syntax, arguments, docs_parameters, returned_value, examples, introduced_in, category};
5087
factory.registerFunction(ContingencyData::getName(),
88+
{
5189
/// Creator lambda: calls assertBinary and assertNoParameters on its inputs (presumably
/// argument-count and parameter validation — confirm), then builds an
/// AggregateFunctionCrossTab<ContingencyData> from the argument types.
[](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
5290
{
5391
assertBinary(name, argument_types);
5492
assertNoParameters(name, parameters);
5593
return std::make_shared<AggregateFunctionCrossTab<ContingencyData>>(argument_types);
56-
});
94+
},
95+
/// Properties are left value-initialized; the documentation follows as the last member.
{},
96+
documentation
97+
});
5798
}
5899

59100
}

0 commit comments

Comments
 (0)