Skip to content

Commit b2457ef

Browse files
authored
bpo-34523: Add _PyCoreConfig.filesystem_encoding (GH-8963)
_PyCoreConfig_Read() is now responsible for choosing the filesystem encoding and error handler. When using Py_Main(), the encoding is now chosen even before calling Py_Initialize(). _PyCoreConfig.filesystem_encoding is now the reference, instead of Py_FileSystemDefaultEncoding, for the Python filesystem encoding.

Changes:

* Add filesystem_encoding and filesystem_errors to _PyCoreConfig.
* _PyCoreConfig_Read() now reads the locale encoding for the filesystem encoding.
* PyUnicode_EncodeFSDefault() and PyUnicode_DecodeFSDefaultAndSize() now use the interpreter configuration rather than the Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors global configuration variables.
* Add the _Py_SetFileSystemEncoding() and _Py_ClearFileSystemEncoding() private functions so that Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors are only modified in coreconfig.c.
* _Py_CoerceLegacyLocale() now takes an int (the warning flag) rather than a _PyCoreConfig pointer.
1 parent dfe0dc7 commit b2457ef

File tree

12 files changed

+296
-100
lines changed

12 files changed

+296
-100
lines changed

Include/coreconfig.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,17 @@ typedef struct {
6666
int coerce_c_locale; /* PYTHONCOERCECLOCALE, -1 means unknown */
6767
int coerce_c_locale_warn; /* PYTHONCOERCECLOCALE=warn */
6868

69+
/* Python filesystem encoding and error handler: see
70+
sys.getfilesystemencoding() and sys.getfilesystemencodeerrors().
71+
72+
Updated later by initfsencoding(). On Windows, can be updated by
73+
sys._enablelegacywindowsfsencoding() at runtime.
74+
75+
See Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors.
76+
*/
77+
char *filesystem_encoding;
78+
char *filesystem_errors;
79+
6980
/* Enable UTF-8 mode?
7081
Set by -X utf8 command line option and PYTHONUTF8 environment variable.
7182
If set to -1 (default), inherit Py_UTF8Mode value. */
@@ -325,6 +336,14 @@ PyAPI_FUNC(int) _PyCoreConfig_GetEnvDup(
325336
#endif
326337

327338

339+
#ifdef Py_BUILD_CORE
340+
PyAPI_FUNC(int) _Py_SetFileSystemEncoding(
341+
const char *encoding,
342+
const char *errors);
343+
PyAPI_FUNC(void) _Py_ClearFileSystemEncoding(void);
344+
#endif
345+
346+
328347
#ifdef __cplusplus
329348
}
330349
#endif

Include/pylifecycle.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size);
175175

176176
/* Legacy locale support */
177177
#ifndef Py_LIMITED_API
178-
PyAPI_FUNC(void) _Py_CoerceLegacyLocale(const _PyCoreConfig *config);
178+
PyAPI_FUNC(void) _Py_CoerceLegacyLocale(int warn);
179179
PyAPI_FUNC(int) _Py_LegacyLocaleDetected(void);
180180
PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category);
181181
#endif

Lib/test/test_embed.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,8 @@ def test_initialize_pymain(self):
251251

252252
class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
253253
maxDiff = 4096
254+
UTF8_MODE_ERRORS = ('surrogatepass' if sys.platform == 'win32'
255+
else 'surrogateescape')
254256
DEFAULT_CONFIG = {
255257
'install_signal_handlers': 1,
256258
'use_environment': 1,
@@ -265,8 +267,12 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
265267
'show_alloc_count': 0,
266268
'dump_refs': 0,
267269
'malloc_stats': 0,
268-
'utf8_mode': 0,
269270

271+
# None means that the default encoding is read at runtime:
272+
# see get_locale_encoding().
273+
'filesystem_encoding': None,
274+
'filesystem_errors': sys.getfilesystemencodeerrors(),
275+
'utf8_mode': 0,
270276
'coerce_c_locale': 0,
271277
'coerce_c_locale_warn': 0,
272278

@@ -297,6 +303,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
297303
'_frozen': 0,
298304
}
299305

306+
300307
def get_stdio_encoding(self, env):
301308
code = 'import sys; print(sys.stdout.encoding, sys.stdout.errors)'
302309
args = (sys.executable, '-c', code)
@@ -308,6 +315,29 @@ def get_stdio_encoding(self, env):
308315
out = proc.stdout.rstrip()
309316
return out.split()
310317

318+
def get_locale_encoding(self, isolated):
319+
if sys.platform in ('win32', 'darwin') or support.is_android:
320+
# Windows, macOS and Android use UTF-8
321+
return "utf-8"
322+
323+
code = ('import codecs, locale, sys',
324+
'locale.setlocale(locale.LC_CTYPE, "")',
325+
'enc = locale.nl_langinfo(locale.CODESET)',
326+
'enc = codecs.lookup(enc).name',
327+
'print(enc)')
328+
args = (sys.executable, '-c', '; '.join(code))
329+
env = dict(os.environ)
330+
if not isolated:
331+
env['PYTHONCOERCECLOCALE'] = '0'
332+
env['PYTHONUTF8'] = '0'
333+
proc = subprocess.run(args, text=True, env=env,
334+
stdout=subprocess.PIPE,
335+
stderr=subprocess.PIPE)
336+
if proc.returncode:
337+
raise Exception(f"failed to get the locale encoding: "
338+
f"stdout={proc.stdout!r} stderr={proc.stderr!r}")
339+
return proc.stdout.rstrip()
340+
311341
def check_config(self, testname, expected):
312342
expected = dict(self.DEFAULT_CONFIG, **expected)
313343

@@ -326,6 +356,8 @@ def check_config(self, testname, expected):
326356
expected['stdio_encoding'] = res[0]
327357
if expected['stdio_errors'] is None:
328358
expected['stdio_errors'] = res[1]
359+
if expected['filesystem_encoding'] is None:
360+
expected['filesystem_encoding'] = self.get_locale_encoding(expected['isolated'])
329361
for key, value in expected.items():
330362
expected[key] = str(value)
331363

@@ -357,7 +389,8 @@ def test_init_global_config(self):
357389
'utf8_mode': 1,
358390
'stdio_encoding': 'utf-8',
359391
'stdio_errors': 'surrogateescape',
360-
392+
'filesystem_encoding': 'utf-8',
393+
'filesystem_errors': self.UTF8_MODE_ERRORS,
361394
'user_site_directory': 0,
362395
'_frozen': 1,
363396
}
@@ -378,6 +411,8 @@ def test_init_from_config(self):
378411
'utf8_mode': 1,
379412
'stdio_encoding': 'iso8859-1',
380413
'stdio_errors': 'replace',
414+
'filesystem_encoding': 'utf-8',
415+
'filesystem_errors': self.UTF8_MODE_ERRORS,
381416

382417
'pycache_prefix': 'conf_pycache_prefix',
383418
'program_name': './conf_program_name',
@@ -409,6 +444,8 @@ def test_init_env(self):
409444
'import_time': 1,
410445
'malloc_stats': 1,
411446
'utf8_mode': 1,
447+
'filesystem_encoding': 'utf-8',
448+
'filesystem_errors': self.UTF8_MODE_ERRORS,
412449
'inspect': 1,
413450
'optimization_level': 2,
414451
'pycache_prefix': 'env_pycache_prefix',

Lib/test/test_sys.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,16 @@ def check(tracebacklimit, expected):
861861
def test_no_duplicates_in_meta_path(self):
862862
self.assertEqual(len(sys.meta_path), len(set(sys.meta_path)))
863863

864+
@unittest.skipUnless(hasattr(sys, "_enablelegacywindowsfsencoding"),
865+
'needs sys._enablelegacywindowsfsencoding()')
866+
def test__enablelegacywindowsfsencoding(self):
867+
code = ('import sys',
868+
'sys._enablelegacywindowsfsencoding()',
869+
'print(sys.getfilesystemencoding(), sys.getfilesystemencodeerrors())')
870+
rc, out, err = assert_python_ok('-c', '; '.join(code))
871+
out = out.decode('ascii', 'replace').rstrip()
872+
self.assertEqual(out, 'mbcs replace')
873+
864874

865875
@test.support.cpython_only
866876
class SizeofTest(unittest.TestCase):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The Python filesystem encoding is now read earlier during the Python
2+
initialization.

Modules/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1339,7 +1339,7 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config,
13391339
*/
13401340
if (config->coerce_c_locale && !locale_coerced) {
13411341
locale_coerced = 1;
1342-
_Py_CoerceLegacyLocale(config);
1342+
_Py_CoerceLegacyLocale(config->coerce_c_locale_warn);
13431343
encoding_changed = 1;
13441344
}
13451345

Objects/unicodeobject.c

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3410,27 +3410,24 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
34103410
PyObject *
34113411
PyUnicode_EncodeFSDefault(PyObject *unicode)
34123412
{
3413+
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3414+
const _PyCoreConfig *config = &interp->core_config;
34133415
#if defined(__APPLE__)
3414-
return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3416+
return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
34153417
#else
3416-
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
34173418
/* Bootstrap check: if the filesystem codec is implemented in Python, we
34183419
cannot use it to encode and decode filenames before it is loaded. Load
34193420
the Python codec requires to encode at least its own filename. Use the C
3420-
version of the locale codec until the codec registry is initialized and
3421-
the Python codec is loaded.
3422-
3423-
Py_FileSystemDefaultEncoding is shared between all interpreters, we
3424-
cannot only rely on it: check also interp->fscodec_initialized for
3425-
subinterpreters. */
3426-
if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3421+
implementation of the locale codec until the codec registry is
3422+
initialized and the Python codec is loaded. See initfsencoding(). */
3423+
if (interp->fscodec_initialized) {
34273424
return PyUnicode_AsEncodedString(unicode,
3428-
Py_FileSystemDefaultEncoding,
3429-
Py_FileSystemDefaultEncodeErrors);
3425+
config->filesystem_encoding,
3426+
config->filesystem_errors);
34303427
}
34313428
else {
34323429
return unicode_encode_locale(unicode,
3433-
Py_FileSystemDefaultEncodeErrors, 0);
3430+
config->filesystem_errors, 0);
34343431
}
34353432
#endif
34363433
}
@@ -3636,27 +3633,24 @@ PyUnicode_DecodeFSDefault(const char *s) {
36363633
PyObject*
36373634
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
36383635
{
3636+
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3637+
const _PyCoreConfig *config = &interp->core_config;
36393638
#if defined(__APPLE__)
3640-
return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3639+
return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
36413640
#else
3642-
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
36433641
/* Bootstrap check: if the filesystem codec is implemented in Python, we
36443642
cannot use it to encode and decode filenames before it is loaded. Load
36453643
the Python codec requires to encode at least its own filename. Use the C
3646-
version of the locale codec until the codec registry is initialized and
3647-
the Python codec is loaded.
3648-
3649-
Py_FileSystemDefaultEncoding is shared between all interpreters, we
3650-
cannot only rely on it: check also interp->fscodec_initialized for
3651-
subinterpreters. */
3652-
if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3644+
implementation of the locale codec until the codec registry is
3645+
initialized and the Python codec is loaded. See initfsencoding(). */
3646+
if (interp->fscodec_initialized) {
36533647
return PyUnicode_Decode(s, size,
3654-
Py_FileSystemDefaultEncoding,
3655-
Py_FileSystemDefaultEncodeErrors);
3648+
config->filesystem_encoding,
3649+
config->filesystem_errors);
36563650
}
36573651
else {
36583652
return unicode_decode_locale(s, size,
3659-
Py_FileSystemDefaultEncodeErrors, 0);
3653+
config->filesystem_errors, 0);
36603654
}
36613655
#endif
36623656
}

Programs/_freeze_importlib.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,15 @@ main(int argc, char *argv[])
8181
config.program_name = L"./_freeze_importlib";
8282
/* Don't install importlib, since it could execute outdated bytecode. */
8383
config._install_importlib = 0;
84-
config.install_signal_handlers = 1;
8584
config._frozen = 1;
85+
#ifdef MS_WINDOWS
86+
/* bpo-34523: initfsencoding() is not called if _install_importlib=0,
87+
so interp->fscodec_initialized value remains 0.
88+
PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error
89+
handler in such case, whereas it's the default error handler on Windows.
90+
Force the "strict" error handler to work around this bootstrap issue. */
91+
config.filesystem_errors = "strict";
92+
#endif
8693

8794
_PyInitError err = _Py_InitializeFromConfig(&config);
8895
/* No need to call _PyCoreConfig_Clear() since we didn't allocate any

Programs/_testembed.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,8 @@ dump_config(void)
328328
printf("dump_refs = %i\n", config->dump_refs);
329329
printf("malloc_stats = %i\n", config->malloc_stats);
330330

331+
printf("filesystem_encoding = %s\n", config->filesystem_encoding);
332+
printf("filesystem_errors = %s\n", config->filesystem_errors);
331333
printf("coerce_c_locale = %i\n", config->coerce_c_locale);
332334
printf("coerce_c_locale_warn = %i\n", config->coerce_c_locale_warn);
333335
printf("utf8_mode = %i\n", config->utf8_mode);

0 commit comments

Comments
 (0)