@@ -188,12 +188,28 @@ pub fn scan<'a>(
188188
189189 // The target encoding is always UTF-8.
190190 if decoder_written > 0 {
191- // We check if the previous scan left some remaining bytes in
192- // the Decoder. This is a complicated corner case. First we
193- // check if this is the first round. The remaining bytes have to
194- // taken into account only if they had been completed by this
195- // run to a multibyte UTF-8. If the first character is a
196- // multibyte UTF-8, then the first bit of the first byte is set.
191+ // With the following `if`, we check if the previous scan has
192+ // potentially left some remaining bytes in the Decoder's inner
193+ // state. This is a complicated corner case, because the inner
194+ // state of the `encoding_rs` decoder is private and there is
195+ // not yet a method to query if the decoder is in a neutral state.
196+ // Read the related Issue [Enhancement: get read access to the
197+ // decoder's inner state · Issue #48 ·
198+ // hsivonen/encoding_rs](https://github.com/hsivonen/encoding_rs/issues/48)
199+ //
200+ // As a workaround, we first check if this is the first round
201+ // (`decoder_input_start == 0`). Since we only know the
202+ // `ByteCounter` precisely at that point, and all other
203+ // rounds' findings will be tagged `Precision::After` anyway,
204+ // there is no need to investigate further in these cases.
205+ //
206+ // We can reduce the cases of double decoding by checking if the
207+ // first decoded character is a multi-byte UTF-8 character. If
208+ // yes, this means (in most cases) that no bytes had been stored in the
209+ // decoder's inner state and therefore we can assume that the
210+ // first character was found exactly at `decoder_input_start`.
211+ // If so, we can then tag this string-finding with
212+ // `Precision::exact`.
197213 if decoder_input_start == 0 && starts_with_multibyte_char ( output_buffer_slice) {
198214 // The only way to find out from which scan() run the first
199215 // bytes came, is to scan again with a new Decoder and compare
@@ -212,13 +228,12 @@ pub fn scan<'a>(
212228 & mut buffer[ ..] ,
213229 true ,
214230 ) ;
215- // When the result of the two decoders is not the same, as
216- // the bytes originating from the previous run, we know the
217- // extra bytes come from the previous run. Unfortunately
218- // there is no way to determine how many the decoder had
219- // internally stored. I can be one, two, or three. We only
220- // know that the multibyte sequence started some byte before
221- // 0.
231+ // When the result of the two decoders is not the same, as the
232+ // bytes originating from the previous run, we know the extra
233+ // bytes come from the previous run. Unfortunately there is no
234+ // way to determine how many the decoder had internally stored.
235+ // It can be one, two, or three. We only know that the multibyte
236+ // sequence started some byte before 0.
222237
223238 if ( written == 0 )
224239 || ( fc. output_buffer_bytes [ 0 ..written] != buffer_bytes[ 0 ..written] )
0 commit comments