bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)

vstinner · web-flow · commit e47e698da6bd · 2017-12-21T15:45:16.000+01:00
Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead
of using temporary unicode and bytes objects. So Py_EncodeLocale()
doesn't use the Python C API anymore.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
 }
 
 
+/* UTF-8 encoder using the surrogateescape error handler .
+
+   On success, return a pointer to a newly allocated character string (use
+   PyMem_Free() to free the memory).
+
+   On encoding failure, return NULL and write the position of the invalid
+   surrogate character into *error_pos (if error_pos is set).
+
+   On memory allocation failure, return NULL and write (size_t)-1 into
+   *error_pos (if error_pos is set). */
+char*
+_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
+{
+    const Py_ssize_t max_char_size = 4;
+    Py_ssize_t len = wcslen(text);
+
+    assert(len >= 0);
+
+    char *bytes;
+    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
+        bytes = PyMem_Malloc((len + 1) * max_char_size);
+    }
+    else {
+        bytes = NULL;
+    }
+    if (bytes == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        return NULL;
+    }
+
+    char *p = bytes;
+    Py_ssize_t i;
+    for (i = 0; i < len;) {
+        Py_UCS4 ch = text[i++];
+
+        if (ch < 0x80) {
+            /* Encode ASCII */
+            *p++ = (char) ch;
+
+        }
+        else if (ch < 0x0800) {
+            /* Encode Latin-1 */
+            *p++ = (char)(0xc0 | (ch >> 6));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else if (Py_UNICODE_IS_SURROGATE(ch)) {
+            /* surrogateescape error handler */
+            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+                if (error_pos != NULL) {
+                    *error_pos = (size_t)i - 1;
+                }
+                goto error;
+            }
+            *p++ = (char)(ch & 0xff);
+        }
+        else if (ch < 0x10000) {
+            *p++ = (char)(0xe0 | (ch >> 12));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else {  /* ch >= 0x10000 */
+            assert(ch <= MAX_UNICODE);
+            /* Encode UCS4 Unicode ordinals */
+            *p++ = (char)(0xf0 | (ch >> 18));
+            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+    }
+    *p++ = '\0';
+
+    size_t final_size = (p - bytes);
+    char *bytes2 = PyMem_Realloc(bytes, final_size);
+    if (bytes2 == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        goto error;
+    }
+    return bytes2;
+
+ error:
+    PyMem_Free(bytes);
+    return NULL;
+}
+
+
 /* Primary internal function which creates utf8 encoded bytes objects.
 
    Allocation strategy:  if the string is short, convert into a stack buffer
diff --git a/Python/fileutils.c b/Python/fileutils.c
@@ -22,6 +22,8 @@ extern int winerror_to_errno(int);
 
 extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
                                                size_t *p_wlen);
+extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
+                                            size_t *error_pos);
 
 #ifdef O_CLOEXEC
 /* Does open() support the O_CLOEXEC flag? Possible values:
@@ -418,42 +420,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
-static char*
-_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
-{
-    Py_ssize_t len;
-    PyObject *unicode, *bytes = NULL;
-    char *cpath;
-
-    unicode = PyUnicode_FromWideChar(text, wcslen(text));
-    if (unicode == NULL) {
-        return NULL;
-    }
-
-    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
-    Py_DECREF(unicode);
-    if (bytes == NULL) {
-        PyErr_Clear();
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
-    }
-
-    len = PyBytes_GET_SIZE(bytes);
-    cpath = PyMem_Malloc(len+1);
-    if (cpath == NULL) {
-        PyErr_Clear();
-        Py_DECREF(bytes);
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
-    }
-    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
-    Py_DECREF(bytes);
-    return cpath;
-}
 
 #if !defined(__APPLE__) && !defined(__ANDROID__)
 static char*
@@ -537,10 +503,10 @@ char*
 Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
 {
 #if defined(__APPLE__) || defined(__ANDROID__)
-    return _Py_EncodeLocaleUTF8(text, error_pos);
+    return _Py_EncodeUTF8_surrogateescape(text, error_pos);
 #else   /* __APPLE__ */
     if (Py_UTF8Mode == 1) {
-        return _Py_EncodeLocaleUTF8(text, error_pos);
+        return _Py_EncodeUTF8_surrogateescape(text, error_pos);
     }
 
 #ifndef MS_WINDOWS