Written by @javihonza in firebird-support list https://groups.google.com/g/firebird-support/c/VCXnWp0IZVw:
Hello,
we have a speed problem when using national COLLATE on UTF8 columns.
If we use UTF8 without COLLATE the speed problem does not arise. The problem is not when using ANSI with national COLLATE.
The speed problem is both in CASE SENSITIVE and CASE INSENSITIVE. Tested on FB3 (UCI 6.9) and FB 4 (default UCI).
The speed problem prevents us from switching to unicode (ANSI -> UTF8).
Please who should we contact to solve the problem?
How to simulate the problem:
create collation UNICODE_CSCZ_CI
for UTF8
from UNICODE
case insensitive
'LOCALE=cs_CZ'
;
create collation UNICODE_CSCZ_CS
for UTF8
from UNICODE
case sensitive
'LOCALE=cs_CZ'
;
CREATE TABLE TEST1M (
ANSI_CZ VARCHAR(10) CHARACTER SET WIN1250 COLLATE PXW_CSY,
UNICODE_CS_CZ VARCHAR(10) CHARACTER SET UTF8 COLLATE UNICODE_CSCZ_CS,
UNICODE_CI_CZ VARCHAR(10) CHARACTER SET UTF8 COLLATE UNICODE_CSCZ_CI,
UNICODE_CS VARCHAR(10) CHARACTER SET UTF8,
UNICODE_CI VARCHAR(10) CHARACTER SET UTF8 COLLATE UNICODE_CI
);
CREATE OR ALTER PROCEDURE GetStr(AORDERID BIGINT)
RETURNS (AResult CHAR(10)) AS
declare variable Base36Chars CHAR(36);
declare variable mResult VARCHAR(10);
declare variable ID BIGINT;
declare variable I INT;
BEGIN
Base36Chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ';
mResult = '';
AResult = mResult;
ID = AORDERID;
WHILE (ID > 0) DO
BEGIN
I = MOD(ID, 36);
ID = ID / 36;
mResult = mResult || SubString(Base36Chars from I + 1 for 1);
END
AResult = LEFT(mResult || '0000000', 7);
Suspend;
END;
-- Generate test string data
-- 000000, 100000...900000...A00000...Z00000,
-- 010000, 110000...910000...A10000...Z10000,
-- ...
EXECUTE BLOCK
AS
DECLARE ROWSCOUNT INT = 1000000;
DECLARE I INT = 0;
DECLARE C INT = 0;
DECLARE Str VARCHAR(10);
BEGIN
WHILE (C < ROWSCOUNT) DO
BEGIN
SELECT AResult from GetStr(:I) into :Str;
-- Skip Y, Z
IF ((LEFT(Str, 1) <> 'Y') AND (LEFT(Str, 1) <> 'Z')) THEN BEGIN
INSERT INTO TEST1M(ANSI_CZ, UNICODE_CS_CZ, UNICODE_CI_CZ, UNICODE_CS, UNICODE_CI) VALUES (:Str, :Str, :Str, :Str, :Str);
C = C + 1;
END
I = I + 1;
END
END;
CREATE INDEX TEST1M_ANSI_CZ ON TEST1M (ANSI_CZ);
CREATE INDEX TEST1M_UNICODE_CS_CZ ON TEST1M (UNICODE_CS_CZ);
CREATE INDEX TEST1M_UNICODE_CI_CZ ON TEST1M (UNICODE_CI_CZ);
CREATE INDEX TEST1M_UNICODE_CS ON TEST1M (UNICODE_CS);
CREATE INDEX TEST1M_UNICODE_CI ON TEST1M (UNICODE_CI);
SELECT COUNT(*) FROM TEST1M;
-- Time 269ms <<<<<<< OK >>>>>>>
-- Result 1000000
--######################################### Scenario use WHERE >= #########################################
-- Problem only in CZECH collate case insensitive
SELECT ANSI_CZ FROM TEST1M
WHERE ANSI_CZ >= 'Z'
ORDER BY ANSI_CZ;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT UNICODE_CS FROM TEST1M
WHERE UNICODE_CS >= 'Z'
ORDER BY UNICODE_CS;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT UNICODE_CI FROM TEST1M
WHERE UNICODE_CI >= 'Z'
ORDER BY UNICODE_CI;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT UNICODE_CS_CZ FROM TEST1M
WHERE UNICODE_CS_CZ >= 'Z'
ORDER BY UNICODE_CS_CZ;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT UNICODE_CI_CZ FROM TEST1M
WHERE UNICODE_CI_CZ >= 'Z'
ORDER BY UNICODE_CI_CZ;
-- Time: 4,294s <<<<<<< COLLATE CASE INSENSITIVE HEAR IS PERFORMANCE PROBLEM FOR VALUE "Z" 4,294s WHY? VALUE "Y" IS OK 0ms >>>>>>>
-- Result nothing
SELECT UNICODE_CI_CZ FROM TEST1M
WHERE UNICODE_CI_CZ >= 'Y'
ORDER BY UNICODE_CI_CZ;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT FIRST 1 UNICODE_CI_CZ FROM TEST1M
WHERE UNICODE_CI_CZ >= 'C'
ORDER BY UNICODE_CI_CZ;
-- Time: 1,531s <<<<<<< collate case INSENSITIVE NEXT PERFOMANCE PROBLEM HAVE VALUE "C" 1,531s WHY? VALUE "D" IS OK 0ms >>>>>>>
-- Result C000000
SELECT FIRST 1 UNICODE_CI_CZ FROM TEST1M
WHERE UNICODE_CI_CZ >= 'D'
ORDER BY UNICODE_CI_CZ;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result D000000
--######################################### Scenario use WHERE like #########################################
-- Problem in CZECH collate case sensitive and insensitive
SELECT ANSI_CZ FROM TEST1M
WHERE ANSI_CZ LIKE 'Z%'
ORDER BY ANSI_CZ;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT UNICODE_CS FROM TEST1M
WHERE UNICODE_CS LIKE 'Z%'
ORDER BY UNICODE_CS;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT UNICODE_CI FROM TEST1M
WHERE UNICODE_CI LIKE 'Z%'
ORDER BY UNICODE_CI;
-- Time: 0ms <<<<<<< OK >>>>>>>
-- Result nothing
SELECT UNICODE_CS_CZ FROM TEST1M
WHERE UNICODE_CS_CZ LIKE 'Z%'
ORDER BY UNICODE_CS_CZ;
-- Time: 4,247s <<<<<<< collate case SENSITIVE HEAR IS PERFOMANCE PROBLEM FOR VALUE "Z" 4,247s WHY? VALUE "Y" IS OK 0ms >>>>>>>
-- Result nothing
SELECT UNICODE_CS_CZ FROM TEST1M
WHERE UNICODE_CS_CZ LIKE 'Y%'
ORDER BY UNICODE_CS_CZ;
-- Time: 0ms <<<<<<< FOR letter "Y" IS NOT PROBLEM :) WHY? >>>>>>>
-- Result nothing
SELECT UNICODE_CI_CZ FROM TEST1M
WHERE UNICODE_CI_CZ LIKE 'Z%'
ORDER BY UNICODE_CI_CZ;
-- Time: 4,52s <<<<<<< HEAR IS PERFORMANCE PROBLEM WHY? >>>>>>>
-- Result nothing
SELECT UNICODE_CI_CZ FROM TEST1M
WHERE UNICODE_CI_CZ LIKE 'Y%'
ORDER BY UNICODE_CI_CZ;
-- Time: 0ms <<<<<<< FOR letter "Y" IS NOT PROBLEM :) WHY? >>>>>>>
-- Result nothing
Written by @javihonza in firebird-support list https://groups.google.com/g/firebird-support/c/VCXnWp0IZVw:
Hello,
we have a speed problem when using national COLLATE on UTF8 columns.
If we use UTF8 without COLLATE the speed problem does not arise. The problem is not when using ANSI with national COLLATE.
The speed problem is both in CASE SENSITIVE and CASE INSENSITIVE. Tested on FB3 (UCI 6.9) and FB 4 (default UCI).
The speed problem prevents us from switching to unicode (ANSI -> UTF8).
Please who should we contact to solve the problem?
How to simulate the problem: