Skip to content

Commit 74829b7

Browse files
bpo-36312: Fix decoders for some code pages. (GH-12369)
(cherry picked from commit c1e2c28) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 65b9849 commit 74829b7

File tree

3 files changed

+27
-5
lines changed

3 files changed

+27
-5
lines changed

Lib/test/test_codecs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3159,6 +3159,15 @@ def test_multibyte_encoding(self):
31593159
('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
31603160
))
31613161

3162+
def test_code_page_decode_flags(self):
3163+
# Issue #36312: For some code pages (e.g. UTF-7) flags for
3164+
# MultiByteToWideChar() must be set to 0.
3165+
for cp in (50220, 50221, 50222, 50225, 50227, 50229,
3166+
*range(57002, 57011+1), 65000):
3167+
self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3))
3168+
self.assertEqual(codecs.code_page_decode(42, b'abc'),
3169+
('\uf061\uf062\uf063', 3))
3170+
31623171
def test_incremental(self):
31633172
decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
31643173
self.assertEqual(decoded, ('', 0))
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fixed decoders for the following code pages: 50220, 50221, 50222, 50225,
2+
50227, 50229, 57002 through 57011, 65000 and 42.

Objects/unicodeobject.c

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7123,15 +7123,21 @@ decode_code_page_strict(UINT code_page,
71237123
const char *in,
71247124
int insize)
71257125
{
7126-
const DWORD flags = decode_code_page_flags(code_page);
7126+
DWORD flags = MB_ERR_INVALID_CHARS;
71277127
wchar_t *out;
71287128
DWORD outsize;
71297129

71307130
/* First get the size of the result */
71317131
assert(insize > 0);
7132-
outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7133-
if (outsize <= 0)
7134-
goto error;
7132+
while ((outsize = MultiByteToWideChar(code_page, flags,
7133+
in, insize, NULL, 0)) <= 0)
7134+
{
7135+
if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7136+
goto error;
7137+
}
7138+
/* For some code pages (e.g. UTF-7) flags must be set to 0. */
7139+
flags = 0;
7140+
}
71357141

71367142
if (*v == NULL) {
71377143
/* Create unicode object */
@@ -7177,7 +7183,7 @@ decode_code_page_errors(UINT code_page,
71777183
{
71787184
const char *startin = in;
71797185
const char *endin = in + size;
7180-
const DWORD flags = decode_code_page_flags(code_page);
7186+
DWORD flags = MB_ERR_INVALID_CHARS;
71817187
/* Ideally, we should get reason from FormatMessage. This is the Windows
71827188
2000 English version of the message. */
71837189
const char *reason = "No mapping for the Unicode character exists "
@@ -7248,6 +7254,11 @@ decode_code_page_errors(UINT code_page,
72487254
if (outsize > 0)
72497255
break;
72507256
err = GetLastError();
7257+
if (err == ERROR_INVALID_FLAGS && flags) {
7258+
/* For some code pages (e.g. UTF-7) flags must be set to 0. */
7259+
flags = 0;
7260+
continue;
7261+
}
72517262
if (err != ERROR_NO_UNICODE_TRANSLATION
72527263
&& err != ERROR_INSUFFICIENT_BUFFER)
72537264
{

0 commit comments

Comments
 (0)