1010
1111use function mb_check_encoding ;
1212use function mb_strlen ;
13+ use function mb_substr ;
1314use function ord ;
15+ use function strlen ;
16+ use function substr ;
1417
1518/**
1619 * Implementation for UTF-8 strings.
@@ -68,119 +71,6 @@ class UtfString implements ArrayAccess, Stringable
6871 */
6972 public $ charLen = 0 ;
7073
71- /**
72- * A map of ASCII binary values to their ASCII code
73- * This is to improve performance and avoid calling ord($byte)
74- *
75- * Source: https://www.freecodecamp.org/news/ascii-table-hex-to-ascii-value-character-code-chart-2/
76- *
77- * @var array<int|string,int>
78- */
79- protected static $ asciiMap = [
80- "\0" => 0 , // (00000000) NUL Null
81- "\t" => 9 , // (00001001) HT Horizontal Tab
82- "\n" => 10 , // (00001010) LF Newline / Line Feed
83- "\v" => 11 , // (00001011) VT Vertical Tab
84- "\f" => 12 , // (00001100) FF Form Feed
85- "\r" => 13 , // (00001101) CR Carriage Return
86- ' ' => 32 , // (00100000) SP Space
87- '! ' => 33 , // (00100001) ! Exclamation mark
88- '" ' => 34 , // (00100010) " Double quote
89- '# ' => 35 , // (00100011) # Number
90- '$ ' => 36 , // (00100100) $ Dollar
91- '% ' => 37 , // (00100101) % Percent
92- '& ' => 38 , // (00100110) & Ampersand
93- '\'' => 39 , // (00100111) ' Single quote
94- '( ' => 40 , // (00101000) ( Left parenthesis
95- ') ' => 41 , // (00101001) ) Right parenthesis
96- '* ' => 42 , // (00101010) * Asterisk
97- '+ ' => 43 , // (00101011) + Plus
98- ', ' => 44 , // (00101100) , Comma
99- '- ' => 45 , // (00101101) - Minus
100- '. ' => 46 , // (00101110) . Period
101- '/ ' => 47 , // (00101111) / Slash
102- '0 ' => 48 , // (00110000) 0 Zero
103- '1 ' => 49 , // (00110001) 1 One
104- '2 ' => 50 , // (00110010) 2 Two
105- '3 ' => 51 , // (00110011) 3 Three
106- '4 ' => 52 , // (00110100) 4 Four
107- '5 ' => 53 , // (00110101) 5 Five
108- '6 ' => 54 , // (00110110) 6 Six
109- '7 ' => 55 , // (00110111) 7 Seven
110- '8 ' => 56 , // (00111000) 8 Eight
111- '9 ' => 57 , // (00111001) 9 Nine
112- ': ' => 58 , // (00111010) : Colon
113- '; ' => 59 , // (00111011) ; Semicolon
114- '< ' => 60 , // (00111100) < Less than
115- '= ' => 61 , // (00111101) = Equal sign
116- '> ' => 62 , // (00111110) > Greater than
117- '? ' => 63 , // (00111111) ? Question mark
118- '@ ' => 64 , // (01000000) @ At sign
119- 'A ' => 65 , // (01000001) A Uppercase A
120- 'B ' => 66 , // (01000010) B Uppercase B
121- 'C ' => 67 , // (01000011) C Uppercase C
122- 'D ' => 68 , // (01000100) D Uppercase D
123- 'E ' => 69 , // (01000101) E Uppercase E
124- 'F ' => 70 , // (01000110) F Uppercase F
125- 'G ' => 71 , // (01000111) G Uppercase G
126- 'H ' => 72 , // (01001000) H Uppercase H
127- 'I ' => 73 , // (01001001) I Uppercase I
128- 'J ' => 74 , // (01001010) J Uppercase J
129- 'K ' => 75 , // (01001011) K Uppercase K
130- 'L ' => 76 , // (01001100) L Uppercase L
131- 'M ' => 77 , // (01001101) M Uppercase M
132- 'N ' => 78 , // (01001110) N Uppercase N
133- 'O ' => 79 , // (01001111) O Uppercase O
134- 'P ' => 80 , // (01010000) P Uppercase P
135- 'Q ' => 81 , // (01010001) Q Uppercase Q
136- 'R ' => 82 , // (01010010) R Uppercase R
137- 'S ' => 83 , // (01010011) S Uppercase S
138- 'T ' => 84 , // (01010100) T Uppercase T
139- 'U ' => 85 , // (01010101) U Uppercase U
140- 'V ' => 86 , // (01010110) V Uppercase V
141- 'W ' => 87 , // (01010111) W Uppercase W
142- 'X ' => 88 , // (01011000) X Uppercase X
143- 'Y ' => 89 , // (01011001) Y Uppercase Y
144- 'Z ' => 90 , // (01011010) Z Uppercase Z
145- '[ ' => 91 , // (01011011) [ Left square bracket
146- '\\' => 92 , // (01011100) \ backslash
147- '] ' => 93 , // (01011101) ] Right square bracket
148- '^ ' => 94 , // (01011110) ^ Caret / circumflex
149- '_ ' => 95 , // (01011111) _ Underscore
150- '` ' => 96 , // (01100000) ` Grave / accent
151- 'a ' => 97 , // (01100001) a Lowercase a
152- 'b ' => 98 , // (01100010) b Lowercase b
153- 'c ' => 99 , // (01100011) c Lowercase c
154- 'd ' => 100 , // (01100100) d Lowercase d
155- 'e ' => 101 , // (01100101) e Lowercase e
156- 'f ' => 102 , // (01100110) f Lowercase
157- 'g ' => 103 , // (01100111) g Lowercase g
158- 'h ' => 104 , // (01101000) h Lowercase h
159- 'i ' => 105 , // (01101001) i Lowercase i
160- 'j ' => 106 , // (01101010) j Lowercase j
161- 'k ' => 107 , // (01101011) k Lowercase k
162- 'l ' => 108 , // (01101100) l Lowercase l
163- 'm ' => 109 , // (01101101) m Lowercase m
164- 'n ' => 110 , // (01101110) n Lowercase n
165- 'o ' => 111 , // (01101111) o Lowercase o
166- 'p ' => 112 , // (01110000) p Lowercase p
167- 'q ' => 113 , // (01110001) q Lowercase q
168- 'r ' => 114 , // (01110010) r Lowercase r
169- 's ' => 115 , // (01110011) s Lowercase s
170- 't ' => 116 , // (01110100) t Lowercase t
171- 'u ' => 117 , // (01110101) u Lowercase u
172- 'v ' => 118 , // (01110110) v Lowercase v
173- 'w ' => 119 , // (01110111) w Lowercase w
174- 'x ' => 120 , // (01111000) x Lowercase x
175- 'y ' => 121 , // (01111001) y Lowercase y
176- 'z ' => 122 , // (01111010) z Lowercase z
177- '{ ' => 123 , // (01111011) { Left curly bracket
178- '| ' => 124 , // (01111100) | Vertical bar
179- '} ' => 125 , // (01111101) } Right curly bracket
180- '~ ' => 126 , // (01111110) ~ Tilde
181- "\x7f" => 127 , // (01111111) DEL Delete
182- ];
183-
18474 /**
18575 * @param string $str the string
18676 */
@@ -212,6 +102,12 @@ public function offsetExists($offset): bool
212102 */
213103 public function offsetGet ($ offset ): string |null
214104 {
105+ // This function moves the internal byte and character pointer to the requested offset.
106+ // This function is part of hot code so the aim is to do the following
107+ // operations as efficiently as possible.
108+ // UTF-8 character encoding is a variable length encoding that encodes Unicode
109+ // characters in 1-4 bytes. Thus we fetch 4 bytes from the current offset and then use mb_substr
110+ // to get the first UTF-8 character in it. We then use strlen to get the character's size in bytes.
215111 if (($ offset < 0 ) || ($ offset >= $ this ->charLen )) {
216112 return null ;
217113 }
@@ -220,13 +116,13 @@ public function offsetGet($offset): string|null
220116
221117 if ($ delta > 0 ) {
222118 // Fast forwarding.
223- while ($ delta -- > 0 ) {
224- $ this ->byteIdx += static ::getCharLength ($ this ->str [$ this ->byteIdx ]);
225- ++$ this ->charIdx ;
226- }
119+ $ this ->byteIdx += strlen (mb_substr (substr ($ this ->str , $ this ->byteIdx , 4 * $ delta ), 0 , $ delta ));
120+ $ this ->charIdx += $ delta ;
227121 } elseif ($ delta < 0 ) {
228122 // Rewinding.
229123 while ($ delta ++ < 0 ) {
124+ // We rewind byte by byte and only count characters that are not continuation bytes,
125+ // i.e. ASCII characters and first octets of multibyte characters
230126 do {
231127 $ byte = ord ($ this ->str [--$ this ->byteIdx ]);
232128 } while (($ byte >= 128 ) && ($ byte < 192 ));
@@ -235,14 +131,8 @@ public function offsetGet($offset): string|null
235131 }
236132 }
237133
238- $ bytesCount = static ::getCharLength ($ this ->str [$ this ->byteIdx ]);
239-
240- $ ret = '' ;
241- for ($ i = 0 ; $ bytesCount -- > 0 ; ++$ i ) {
242- $ ret .= $ this ->str [$ this ->byteIdx + $ i ];
243- }
244-
245- return $ ret ;
134+ // Fetch the first Unicode character within the next 4 bytes in the string.
135+ return mb_substr (substr ($ this ->str , $ this ->byteIdx , 4 ), 0 , 1 );
246136 }
247137
248138 /**
@@ -270,52 +160,6 @@ public function offsetUnset($offset): void
270160 throw new Exception ('Not implemented. ' );
271161 }
272162
273- /**
274- * Gets the length of an UTF-8 character.
275- *
276- * According to RFC 3629, a UTF-8 character can have at most 4 bytes.
277- * However, this implementation supports UTF-8 characters containing up to 6
278- * bytes.
279- *
280- * @see https://tools.ietf.org/html/rfc3629
281- *
282- * @param string $byte the byte to be analyzed
283- */
284- public static function getCharLength ($ byte ): int
285- {
286- // Use the default ASCII map as queries are mostly ASCII chars
287- // ord($byte) has a performance cost
288-
289- if (! isset (static ::$ asciiMap [$ byte ])) {
290- // Complete the cache with missing items
291- static ::$ asciiMap [$ byte ] = ord ($ byte );
292- }
293-
294- $ byte = static ::$ asciiMap [$ byte ];
295-
296- if ($ byte < 128 ) {
297- return 1 ;
298- }
299-
300- if ($ byte < 224 ) {
301- return 2 ;
302- }
303-
304- if ($ byte < 240 ) {
305- return 3 ;
306- }
307-
308- if ($ byte < 248 ) {
309- return 4 ;
310- }
311-
312- if ($ byte < 252 ) {
313- return 5 ; // unofficial
314- }
315-
316- return 6 ; // unofficial
317- }
318-
319163 /**
320164 * Returns the length in characters of the string.
321165 */
0 commit comments