Skip to content

Commit c1e2c28

Browse files
bpo-36312: Fix decoders for some code pages. (GH-12369)
1 parent cc60cdd commit c1e2c28

File tree

3 files changed

+27
-5
lines changed

3 files changed

+27
-5
lines changed

Lib/test/test_codecs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3066,6 +3066,15 @@ def test_multibyte_encoding(self):
30663066
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
30673067
))
30683068

3069+
def test_code_page_decode_flags(self):
3070+
# Issue #36312: For some code pages (e.g. UTF-7) flags for
3071+
# MultiByteToWideChar() must be set to 0.
3072+
for cp in (50220, 50221, 50222, 50225, 50227, 50229,
3073+
*range(57002, 57011+1), 65000):
3074+
self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3))
3075+
self.assertEqual(codecs.code_page_decode(42, b'abc'),
3076+
('\uf061\uf062\uf063', 3))
3077+
30693078
def test_incremental(self):
30703079
decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
30713080
self.assertEqual(decoded, ('', 0))
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fixed decoders for the following code pages: 50220, 50221, 50222, 50225,
2+
50227, 50229, 57002 through 57011, 65000 and 42.

Objects/unicodeobject.c

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7083,15 +7083,21 @@ decode_code_page_strict(UINT code_page,
70837083
const char *in,
70847084
int insize)
70857085
{
7086-
const DWORD flags = decode_code_page_flags(code_page);
7086+
DWORD flags = MB_ERR_INVALID_CHARS;
70877087
wchar_t *out;
70887088
DWORD outsize;
70897089

70907090
/* First get the size of the result */
70917091
assert(insize > 0);
7092-
outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7093-
if (outsize <= 0)
7094-
goto error;
7092+
while ((outsize = MultiByteToWideChar(code_page, flags,
7093+
in, insize, NULL, 0)) <= 0)
7094+
{
7095+
if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7096+
goto error;
7097+
}
7098+
/* For some code pages (e.g. UTF-7) flags must be set to 0. */
7099+
flags = 0;
7100+
}
70957101

70967102
/* Extend a wchar_t* buffer */
70977103
Py_ssize_t n = *bufsize; /* Get the current length */
@@ -7129,7 +7135,7 @@ decode_code_page_errors(UINT code_page,
71297135
{
71307136
const char *startin = in;
71317137
const char *endin = in + size;
7132-
const DWORD flags = decode_code_page_flags(code_page);
7138+
DWORD flags = MB_ERR_INVALID_CHARS;
71337139
/* Ideally, we should get reason from FormatMessage. This is the Windows
71347140
2000 English version of the message. */
71357141
const char *reason = "No mapping for the Unicode character exists "
@@ -7187,6 +7193,11 @@ decode_code_page_errors(UINT code_page,
71877193
if (outsize > 0)
71887194
break;
71897195
err = GetLastError();
7196+
if (err == ERROR_INVALID_FLAGS && flags) {
7197+
/* For some code pages (e.g. UTF-7) flags must be set to 0. */
7198+
flags = 0;
7199+
continue;
7200+
}
71907201
if (err != ERROR_NO_UNICODE_TRANSLATION
71917202
&& err != ERROR_INSUFFICIENT_BUFFER)
71927203
{

0 commit comments

Comments
 (0)