Skip to content

Commit d57481d

Browse files
Merge pull request #506 from kamil-tekiela/UtfString
Optimize offsetGet
2 parents 6fd2c59 + be2ca97 commit d57481d

File tree

4 files changed

+43
-228
lines changed

4 files changed

+43
-228
lines changed

src/Tools/CustomJsonSerializer.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ class CustomJsonSerializer extends JsonSerializer
4343
'viewOptions',
4444
'eventOptions',
4545
'userOptions',
46-
'asciiMap',
4746
];
4847

4948
/**

src/UtfString.php

Lines changed: 15 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010

1111
use function mb_check_encoding;
1212
use function mb_strlen;
13+
use function mb_substr;
1314
use function ord;
15+
use function strlen;
16+
use function substr;
1417

1518
/**
1619
* Implementation for UTF-8 strings.
@@ -68,119 +71,6 @@ class UtfString implements ArrayAccess, Stringable
6871
*/
6972
public $charLen = 0;
7073

71-
/**
72-
* A map of ASCII binary values to their ASCII code
73-
* This is to improve performance and avoid calling ord($byte)
74-
*
75-
* Source: https://www.freecodecamp.org/news/ascii-table-hex-to-ascii-value-character-code-chart-2/
76-
*
77-
* @var array<int|string,int>
78-
*/
79-
protected static $asciiMap = [
80-
"\0" => 0, // (00000000) NUL Null
81-
"\t" => 9, // (00001001) HT Horizontal Tab
82-
"\n" => 10, // (00001010) LF Newline / Line Feed
83-
"\v" => 11, // (00001011) VT Vertical Tab
84-
"\f" => 12, // (00001100) FF Form Feed
85-
"\r" => 13, // (00001101) CR Carriage Return
86-
' ' => 32, // (00100000) SP Space
87-
'!' => 33, // (00100001) ! Exclamation mark
88-
'"' => 34, // (00100010) " Double quote
89-
'#' => 35, // (00100011) # Number
90-
'$' => 36, // (00100100) $ Dollar
91-
'%' => 37, // (00100101) % Percent
92-
'&' => 38, // (00100110) & Ampersand
93-
'\'' => 39, // (00100111) ' Single quote
94-
'(' => 40, // (00101000) ( Left parenthesis
95-
')' => 41, // (00101001) ) Right parenthesis
96-
'*' => 42, // (00101010) * Asterisk
97-
'+' => 43, // (00101011) + Plus
98-
',' => 44, // (00101100) , Comma
99-
'-' => 45, // (00101101) - Minus
100-
'.' => 46, // (00101110) . Period
101-
'/' => 47, // (00101111) / Slash
102-
'0' => 48, // (00110000) 0 Zero
103-
'1' => 49, // (00110001) 1 One
104-
'2' => 50, // (00110010) 2 Two
105-
'3' => 51, // (00110011) 3 Three
106-
'4' => 52, // (00110100) 4 Four
107-
'5' => 53, // (00110101) 5 Five
108-
'6' => 54, // (00110110) 6 Six
109-
'7' => 55, // (00110111) 7 Seven
110-
'8' => 56, // (00111000) 8 Eight
111-
'9' => 57, // (00111001) 9 Nine
112-
':' => 58, // (00111010) : Colon
113-
';' => 59, // (00111011) ; Semicolon
114-
'<' => 60, // (00111100) < Less than
115-
'=' => 61, // (00111101) = Equal sign
116-
'>' => 62, // (00111110) > Greater than
117-
'?' => 63, // (00111111) ? Question mark
118-
'@' => 64, // (01000000) @ At sign
119-
'A' => 65, // (01000001) A Uppercase A
120-
'B' => 66, // (01000010) B Uppercase B
121-
'C' => 67, // (01000011) C Uppercase C
122-
'D' => 68, // (01000100) D Uppercase D
123-
'E' => 69, // (01000101) E Uppercase E
124-
'F' => 70, // (01000110) F Uppercase F
125-
'G' => 71, // (01000111) G Uppercase G
126-
'H' => 72, // (01001000) H Uppercase H
127-
'I' => 73, // (01001001) I Uppercase I
128-
'J' => 74, // (01001010) J Uppercase J
129-
'K' => 75, // (01001011) K Uppercase K
130-
'L' => 76, // (01001100) L Uppercase L
131-
'M' => 77, // (01001101) M Uppercase M
132-
'N' => 78, // (01001110) N Uppercase N
133-
'O' => 79, // (01001111) O Uppercase O
134-
'P' => 80, // (01010000) P Uppercase P
135-
'Q' => 81, // (01010001) Q Uppercase Q
136-
'R' => 82, // (01010010) R Uppercase R
137-
'S' => 83, // (01010011) S Uppercase S
138-
'T' => 84, // (01010100) T Uppercase T
139-
'U' => 85, // (01010101) U Uppercase U
140-
'V' => 86, // (01010110) V Uppercase V
141-
'W' => 87, // (01010111) W Uppercase W
142-
'X' => 88, // (01011000) X Uppercase X
143-
'Y' => 89, // (01011001) Y Uppercase Y
144-
'Z' => 90, // (01011010) Z Uppercase Z
145-
'[' => 91, // (01011011) [ Left square bracket
146-
'\\' => 92, // (01011100) \ backslash
147-
']' => 93, // (01011101) ] Right square bracket
148-
'^' => 94, // (01011110) ^ Caret / circumflex
149-
'_' => 95, // (01011111) _ Underscore
150-
'`' => 96, // (01100000) ` Grave / accent
151-
'a' => 97, // (01100001) a Lowercase a
152-
'b' => 98, // (01100010) b Lowercase b
153-
'c' => 99, // (01100011) c Lowercase c
154-
'd' => 100, // (01100100) d Lowercase d
155-
'e' => 101, // (01100101) e Lowercase e
156-
'f' => 102, // (01100110) f Lowercase
157-
'g' => 103, // (01100111) g Lowercase g
158-
'h' => 104, // (01101000) h Lowercase h
159-
'i' => 105, // (01101001) i Lowercase i
160-
'j' => 106, // (01101010) j Lowercase j
161-
'k' => 107, // (01101011) k Lowercase k
162-
'l' => 108, // (01101100) l Lowercase l
163-
'm' => 109, // (01101101) m Lowercase m
164-
'n' => 110, // (01101110) n Lowercase n
165-
'o' => 111, // (01101111) o Lowercase o
166-
'p' => 112, // (01110000) p Lowercase p
167-
'q' => 113, // (01110001) q Lowercase q
168-
'r' => 114, // (01110010) r Lowercase r
169-
's' => 115, // (01110011) s Lowercase s
170-
't' => 116, // (01110100) t Lowercase t
171-
'u' => 117, // (01110101) u Lowercase u
172-
'v' => 118, // (01110110) v Lowercase v
173-
'w' => 119, // (01110111) w Lowercase w
174-
'x' => 120, // (01111000) x Lowercase x
175-
'y' => 121, // (01111001) y Lowercase y
176-
'z' => 122, // (01111010) z Lowercase z
177-
'{' => 123, // (01111011) { Left curly bracket
178-
'|' => 124, // (01111100) | Vertical bar
179-
'}' => 125, // (01111101) } Right curly bracket
180-
'~' => 126, // (01111110) ~ Tilde
181-
"\x7f" => 127, // (01111111) DEL Delete
182-
];
183-
18474
/**
18575
* @param string $str the string
18676
*/
@@ -212,6 +102,12 @@ public function offsetExists($offset): bool
212102
*/
213103
public function offsetGet($offset): string|null
214104
{
105+
// This function moves the internal byte and character pointer to the requested offset.
106+
// This function is part of hot code so the aim is to do the following
107+
// operations as efficiently as possible.
108+
// UTF-8 character encoding is a variable length encoding that encodes Unicode
109+
// characters in 1-4 bytes. Thus we fetch 4 bytes from the current offset and then use mb_substr
110+
// to get the first UTF-8 character in it. We then use strlen to get the character's size in bytes.
215111
if (($offset < 0) || ($offset >= $this->charLen)) {
216112
return null;
217113
}
@@ -220,13 +116,13 @@ public function offsetGet($offset): string|null
220116

221117
if ($delta > 0) {
222118
// Fast forwarding.
223-
while ($delta-- > 0) {
224-
$this->byteIdx += static::getCharLength($this->str[$this->byteIdx]);
225-
++$this->charIdx;
226-
}
119+
$this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta));
120+
$this->charIdx += $delta;
227121
} elseif ($delta < 0) {
228122
// Rewinding.
229123
while ($delta++ < 0) {
124+
// We rewind byte by byte and only count characters that are not continuation bytes,
125+
// i.e. ASCII characters and first octets of multibyte characters
230126
do {
231127
$byte = ord($this->str[--$this->byteIdx]);
232128
} while (($byte >= 128) && ($byte < 192));
@@ -235,14 +131,8 @@ public function offsetGet($offset): string|null
235131
}
236132
}
237133

238-
$bytesCount = static::getCharLength($this->str[$this->byteIdx]);
239-
240-
$ret = '';
241-
for ($i = 0; $bytesCount-- > 0; ++$i) {
242-
$ret .= $this->str[$this->byteIdx + $i];
243-
}
244-
245-
return $ret;
134+
// Fetch the first Unicode character within the next 4 bytes in the string.
135+
return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1);
246136
}
247137

248138
/**
@@ -270,52 +160,6 @@ public function offsetUnset($offset): void
270160
throw new Exception('Not implemented.');
271161
}
272162

273-
/**
274-
* Gets the length of an UTF-8 character.
275-
*
276-
* According to RFC 3629, a UTF-8 character can have at most 4 bytes.
277-
* However, this implementation supports UTF-8 characters containing up to 6
278-
* bytes.
279-
*
280-
* @see https://tools.ietf.org/html/rfc3629
281-
*
282-
* @param string $byte the byte to be analyzed
283-
*/
284-
public static function getCharLength($byte): int
285-
{
286-
// Use the default ASCII map as queries are mostly ASCII chars
287-
// ord($byte) has a performance cost
288-
289-
if (! isset(static::$asciiMap[$byte])) {
290-
// Complete the cache with missing items
291-
static::$asciiMap[$byte] = ord($byte);
292-
}
293-
294-
$byte = static::$asciiMap[$byte];
295-
296-
if ($byte < 128) {
297-
return 1;
298-
}
299-
300-
if ($byte < 224) {
301-
return 2;
302-
}
303-
304-
if ($byte < 240) {
305-
return 3;
306-
}
307-
308-
if ($byte < 248) {
309-
return 4;
310-
}
311-
312-
if ($byte < 252) {
313-
return 5; // unofficial
314-
}
315-
316-
return 6; // unofficial
317-
}
318-
319163
/**
320164
* Returns the length in characters of the string.
321165
*/

tests/Misc/UtfStringTest.php

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
use PHPUnit\Framework\Attributes\DataProvider;
1010
use Throwable;
1111

12-
use function chr;
13-
1412
class UtfStringTest extends TestCase
1513
{
1614
/**
@@ -55,27 +53,6 @@ public function testUnset(): void
5553
unset($str[0]);
5654
}
5755

58-
public function testGetCharLength(): void
59-
{
60-
$this->assertEquals(1, UtfString::getCharLength(chr(0x00))); // 00000000
61-
$this->assertEquals(1, UtfString::getCharLength(chr(0x7F))); // 01111111
62-
63-
$this->assertEquals(2, UtfString::getCharLength(chr(0xC0))); // 11000000
64-
$this->assertEquals(2, UtfString::getCharLength(chr(0xDF))); // 11011111
65-
66-
$this->assertEquals(3, UtfString::getCharLength(chr(0xE0))); // 11100000
67-
$this->assertEquals(3, UtfString::getCharLength(chr(0xEF))); // 11101111
68-
69-
$this->assertEquals(4, UtfString::getCharLength(chr(0xF0))); // 11110000
70-
$this->assertEquals(4, UtfString::getCharLength(chr(0xF7))); // 11110111
71-
72-
$this->assertEquals(5, UtfString::getCharLength(chr(0xF8))); // 11111000
73-
$this->assertEquals(5, UtfString::getCharLength(chr(0xFB))); // 11111011
74-
75-
$this->assertEquals(6, UtfString::getCharLength(chr(0xFC))); // 11111100
76-
$this->assertEquals(6, UtfString::getCharLength(chr(0xFD))); // 11111101
77-
}
78-
7956
public function testToString(): void
8057
{
8158
$str = new UtfString(self::TEST_PHRASE);
@@ -112,7 +89,7 @@ public static function utf8StringsProvider(): array
11289
'č',
11390
],
11491
'emoji' => [
115-
'😂😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
92+
'🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
11693
'😂',
11794
'😋',
11895
],
@@ -121,6 +98,11 @@ public static function utf8StringsProvider(): array
12198
null,
12299
null,
123100
],
101+
'random' => [
102+
'xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞',
103+
'',
104+
'',
105+
],
124106
];
125107
}
126108
}

tests/benchmarks/UtfStringBench.php

Lines changed: 22 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
use PhpMyAdmin\SqlParser\UtfString;
88

9-
use function chr;
109
use function file_get_contents;
1110

1211
class UtfStringBench
@@ -19,8 +18,7 @@ class UtfStringBench
1918
* @Iterations(20)
2019
* @Revs(4)
2120
* @OutputTimeUnit("milliseconds")
22-
* @Assert("mode(variant.time.avg) < 100 milliseconds +/- 10%")
23-
* @Assert("mode(variant.time.avg) > 30 milliseconds +/- 10%")
21+
* @Assert("mode(variant.time.avg) < 40 milliseconds +/- 10%")
2422
*/
2523
public function benchBuildUtfString(): void
2624
{
@@ -30,38 +28,30 @@ public function benchBuildUtfString(): void
3028
}
3129
}
3230

33-
/**
34-
* @BeforeMethods("setUp")
35-
* @Iterations(2)
36-
* @Revs(2)
37-
* @OutputTimeUnit("microseconds")
38-
* @Assert("mode(variant.time.avg) < 800 microseconds +/- 20%")
39-
* @Assert("mode(variant.time.avg) > 100 microseconds +/- 10%")
40-
*/
41-
public function benchGetCharLength(): void
42-
{
43-
UtfString::getCharLength(chr(0x00)); // 00000000
44-
UtfString::getCharLength(chr(0x7F)); // 01111111
45-
46-
UtfString::getCharLength(chr(0xC0)); // 11000000
47-
UtfString::getCharLength(chr(0xDF)); // 11011111
48-
49-
UtfString::getCharLength(chr(0xE0)); // 11100000
50-
UtfString::getCharLength(chr(0xEF)); // 11101111
51-
52-
UtfString::getCharLength(chr(0xF0)); // 11110000
53-
UtfString::getCharLength(chr(0xF7)); // 11110111
54-
55-
UtfString::getCharLength(chr(0xF8)); // 11111000
56-
UtfString::getCharLength(chr(0xFB)); // 11111011
57-
58-
UtfString::getCharLength(chr(0xFC)); // 11111100
59-
UtfString::getCharLength(chr(0xFD)); // 11111101
60-
}
61-
6231
public function setUp(): void
6332
{
6433
$contentsPath = __DIR__ . '/../../LICENSE.txt';
6534
$this->testContents = (string) file_get_contents($contentsPath);
6635
}
36+
37+
/**
38+
* @Iterations(20)
39+
* @Revs(4)
40+
* @OutputTimeUnit("microseconds")
41+
* @Assert("mode(variant.time.avg) < 120 microseconds +/- 10%")
42+
*/
43+
public function benchUtfStringRandomAccessWithUnicode(): void
44+
{
45+
$text = 'abcdefghijklmnopqrstuvwxyz
46+
áéíóúýěřťǔǐǒǎšďȟǰǩľžčǚň
47+
🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯
48+
P\xf8\xed\xb9ern\xec \xbelu\xbbou\xe8k\xfd k\xf3d \xfap\xecl \xef\xe1belsk\xe9 k\xf3dy
49+
xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞';
50+
51+
$str1 = new UtfString($text);
52+
$str1->offsetGet(10);
53+
$str1->offsetGet(100);
54+
$str1->offsetGet(20);
55+
$str1->offsetGet(0);
56+
}
6757
}

0 commit comments

Comments
 (0)