Skip to content

Commit eb248b3

Browse files
committed
Add explanatory comments
Signed-off-by: Kamil Tekiela <tekiela246@gmail.com>
1 parent a66f0d5 commit eb248b3

1 file changed

Lines changed: 10 additions & 1 deletion

File tree

src/UtfString.php

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ public function offsetExists($offset): bool
102102
*/
103103
public function offsetGet($offset): string|null
104104
{
105+
// This function moves the internal byte and character pointers to the requested offset.
106+
// This function is part of hot code so the aim is to do the following
107+
// operations as efficiently as possible.
108+
// UTF-8 character encoding is a variable length encoding that encodes Unicode
109+
// characters in 1-4 bytes. Thus we fetch 4 bytes from the current offset and then use mb_substr
110+
// to get the first UTF-8 character in it. We then use strlen to get the character's size in bytes.
105111
if (($offset < 0) || ($offset >= $this->charLen)) {
106112
return null;
107113
}
@@ -117,14 +123,17 @@ public function offsetGet($offset): string|null
117123
} elseif ($delta < 0) {
118124
// Rewinding.
119125
while ($delta++ < 0) {
126+
// We rewind byte by byte and only count bytes that are not continuation bytes,
127+
// i.e. ASCII characters and first octets of multibyte characters
120128
do {
121129
$byte = ord($this->str[--$this->byteIdx]);
122130
} while (($byte >= 128) && ($byte < 192));
123131

124132
--$this->charIdx;
125133
}
126134
}
127-
135+
136+
// Fetch the first Unicode character within the next 4 bytes in the string.
128137
return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1);
129138
}
130139

0 commit comments

Comments
 (0)