Skip to content

Commit e47e698

Browse files
authored
bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)
Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead of using temporary unicode and bytes objects. So Py_EncodeLocale() doesn't use the Python C API anymore.
1 parent fbd6051 commit e47e698

File tree

2 files changed

+93
-38
lines changed

2 files changed

+93
-38
lines changed

Objects/unicodeobject.c

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
51475147
}
51485148

51495149

5150+
/* UTF-8 encoder using the surrogateescape error handler .
5151+
5152+
On success, return a pointer to a newly allocated character string (use
5153+
PyMem_Free() to free the memory).
5154+
5155+
On encoding failure, return NULL and write the position of the invalid
5156+
surrogate character into *error_pos (if error_pos is set).
5157+
5158+
On memory allocation failure, return NULL and write (size_t)-1 into
5159+
*error_pos (if error_pos is set). */
5160+
char*
5161+
_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
5162+
{
5163+
const Py_ssize_t max_char_size = 4;
5164+
Py_ssize_t len = wcslen(text);
5165+
5166+
assert(len >= 0);
5167+
5168+
char *bytes;
5169+
if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
5170+
bytes = PyMem_Malloc((len + 1) * max_char_size);
5171+
}
5172+
else {
5173+
bytes = NULL;
5174+
}
5175+
if (bytes == NULL) {
5176+
if (error_pos != NULL) {
5177+
*error_pos = (size_t)-1;
5178+
}
5179+
return NULL;
5180+
}
5181+
5182+
char *p = bytes;
5183+
Py_ssize_t i;
5184+
for (i = 0; i < len;) {
5185+
Py_UCS4 ch = text[i++];
5186+
5187+
if (ch < 0x80) {
5188+
/* Encode ASCII */
5189+
*p++ = (char) ch;
5190+
5191+
}
5192+
else if (ch < 0x0800) {
5193+
/* Encode Latin-1 */
5194+
*p++ = (char)(0xc0 | (ch >> 6));
5195+
*p++ = (char)(0x80 | (ch & 0x3f));
5196+
}
5197+
else if (Py_UNICODE_IS_SURROGATE(ch)) {
5198+
/* surrogateescape error handler */
5199+
if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
5200+
if (error_pos != NULL) {
5201+
*error_pos = (size_t)i - 1;
5202+
}
5203+
goto error;
5204+
}
5205+
*p++ = (char)(ch & 0xff);
5206+
}
5207+
else if (ch < 0x10000) {
5208+
*p++ = (char)(0xe0 | (ch >> 12));
5209+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5210+
*p++ = (char)(0x80 | (ch & 0x3f));
5211+
}
5212+
else { /* ch >= 0x10000 */
5213+
assert(ch <= MAX_UNICODE);
5214+
/* Encode UCS4 Unicode ordinals */
5215+
*p++ = (char)(0xf0 | (ch >> 18));
5216+
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5217+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5218+
*p++ = (char)(0x80 | (ch & 0x3f));
5219+
}
5220+
}
5221+
*p++ = '\0';
5222+
5223+
size_t final_size = (p - bytes);
5224+
char *bytes2 = PyMem_Realloc(bytes, final_size);
5225+
if (bytes2 == NULL) {
5226+
if (error_pos != NULL) {
5227+
*error_pos = (size_t)-1;
5228+
}
5229+
goto error;
5230+
}
5231+
return bytes2;
5232+
5233+
error:
5234+
PyMem_Free(bytes);
5235+
return NULL;
5236+
}
5237+
5238+
51505239
/* Primary internal function which creates utf8 encoded bytes objects.
51515240
51525241
Allocation strategy: if the string is short, convert into a stack buffer

Python/fileutils.c

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ extern int winerror_to_errno(int);
2222

2323
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
2424
size_t *p_wlen);
25+
extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
26+
size_t *error_pos);
2527

2628
#ifdef O_CLOEXEC
2729
/* Does open() support the O_CLOEXEC flag? Possible values:
@@ -418,42 +420,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
418420
#endif /* __APPLE__ or __ANDROID__ */
419421
}
420422

421-
static char*
422-
_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
423-
{
424-
Py_ssize_t len;
425-
PyObject *unicode, *bytes = NULL;
426-
char *cpath;
427-
428-
unicode = PyUnicode_FromWideChar(text, wcslen(text));
429-
if (unicode == NULL) {
430-
return NULL;
431-
}
432-
433-
bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
434-
Py_DECREF(unicode);
435-
if (bytes == NULL) {
436-
PyErr_Clear();
437-
if (error_pos != NULL) {
438-
*error_pos = (size_t)-1;
439-
}
440-
return NULL;
441-
}
442-
443-
len = PyBytes_GET_SIZE(bytes);
444-
cpath = PyMem_Malloc(len+1);
445-
if (cpath == NULL) {
446-
PyErr_Clear();
447-
Py_DECREF(bytes);
448-
if (error_pos != NULL) {
449-
*error_pos = (size_t)-1;
450-
}
451-
return NULL;
452-
}
453-
memcpy(cpath, PyBytes_AsString(bytes), len + 1);
454-
Py_DECREF(bytes);
455-
return cpath;
456-
}
457423

458424
#if !defined(__APPLE__) && !defined(__ANDROID__)
459425
static char*
@@ -537,10 +503,10 @@ char*
537503
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
538504
{
539505
#if defined(__APPLE__) || defined(__ANDROID__)
540-
return _Py_EncodeLocaleUTF8(text, error_pos);
506+
return _Py_EncodeUTF8_surrogateescape(text, error_pos);
541507
#else /* __APPLE__ */
542508
if (Py_UTF8Mode == 1) {
543-
return _Py_EncodeLocaleUTF8(text, error_pos);
509+
return _Py_EncodeUTF8_surrogateescape(text, error_pos);
544510
}
545511

546512
#ifndef MS_WINDOWS

0 commit comments

Comments
 (0)