@@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
51475147}
51485148
51495149
5150+ /* UTF-8 encoder using the surrogateescape error handler .
5151+
5152+ On success, return a pointer to a newly allocated character string (use
5153+ PyMem_Free() to free the memory).
5154+
5155+ On encoding failure, return NULL and write the position of the invalid
5156+ surrogate character into *error_pos (if error_pos is set).
5157+
5158+ On memory allocation failure, return NULL and write (size_t)-1 into
5159+ *error_pos (if error_pos is set). */
5160+ char *
5161+ _Py_EncodeUTF8_surrogateescape (const wchar_t * text , size_t * error_pos )
5162+ {
5163+ const Py_ssize_t max_char_size = 4 ;
5164+ Py_ssize_t len = wcslen (text );
5165+
5166+ assert (len >= 0 );
5167+
5168+ char * bytes ;
5169+ if (len <= PY_SSIZE_T_MAX / max_char_size - 1 ) {
5170+ bytes = PyMem_Malloc ((len + 1 ) * max_char_size );
5171+ }
5172+ else {
5173+ bytes = NULL ;
5174+ }
5175+ if (bytes == NULL ) {
5176+ if (error_pos != NULL ) {
5177+ * error_pos = (size_t )-1 ;
5178+ }
5179+ return NULL ;
5180+ }
5181+
5182+ char * p = bytes ;
5183+ Py_ssize_t i ;
5184+ for (i = 0 ; i < len ;) {
5185+ Py_UCS4 ch = text [i ++ ];
5186+
5187+ if (ch < 0x80 ) {
5188+ /* Encode ASCII */
5189+ * p ++ = (char ) ch ;
5190+
5191+ }
5192+ else if (ch < 0x0800 ) {
5193+ /* Encode Latin-1 */
5194+ * p ++ = (char )(0xc0 | (ch >> 6 ));
5195+ * p ++ = (char )(0x80 | (ch & 0x3f ));
5196+ }
5197+ else if (Py_UNICODE_IS_SURROGATE (ch )) {
5198+ /* surrogateescape error handler */
5199+ if (!(0xDC80 <= ch && ch <= 0xDCFF )) {
5200+ if (error_pos != NULL ) {
5201+ * error_pos = (size_t )i - 1 ;
5202+ }
5203+ goto error ;
5204+ }
5205+ * p ++ = (char )(ch & 0xff );
5206+ }
5207+ else if (ch < 0x10000 ) {
5208+ * p ++ = (char )(0xe0 | (ch >> 12 ));
5209+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
5210+ * p ++ = (char )(0x80 | (ch & 0x3f ));
5211+ }
5212+ else { /* ch >= 0x10000 */
5213+ assert (ch <= MAX_UNICODE );
5214+ /* Encode UCS4 Unicode ordinals */
5215+ * p ++ = (char )(0xf0 | (ch >> 18 ));
5216+ * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
5217+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
5218+ * p ++ = (char )(0x80 | (ch & 0x3f ));
5219+ }
5220+ }
5221+ * p ++ = '\0' ;
5222+
5223+ size_t final_size = (p - bytes );
5224+ char * bytes2 = PyMem_Realloc (bytes , final_size );
5225+ if (bytes2 == NULL ) {
5226+ if (error_pos != NULL ) {
5227+ * error_pos = (size_t )-1 ;
5228+ }
5229+ goto error ;
5230+ }
5231+ return bytes2 ;
5232+
5233+ error :
5234+ PyMem_Free (bytes );
5235+ return NULL ;
5236+ }
5237+
5238+
51505239/* Primary internal function which creates utf8 encoded bytes objects.
51515240
51525241 Allocation strategy: if the string is short, convert into a stack buffer
0 commit comments