Skip to content

Commit 62a435c

Browse files
Fix handling of Unicode escapes in string data in textproto files.
PiperOrigin-RevId: 573926175
1 parent 4354846 commit 62a435c

4 files changed

Lines changed: 14 additions & 57 deletions

File tree

conformance/text_format_failure_list_python.txt

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,6 @@
33
# TODO: These should be fixed.
44
Required.Proto3.TextFormatInput.FloatFieldMaxValue.ProtobufOutput
55
Required.Proto3.TextFormatInput.FloatFieldMaxValue.TextFormatOutput
6-
7-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
8-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
9-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
10-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
11-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
12-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
13-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
14-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
15-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
16-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
17-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
18-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
19-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
20-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
21-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
22-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
23-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
24-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
25-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
26-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
27-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
28-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
29-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
30-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
316
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
327
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
338
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput

conformance/text_format_failure_list_python_cpp.txt

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,3 @@
1-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
2-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
3-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
4-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
5-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
6-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
7-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
8-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
9-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
10-
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
11-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
12-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
13-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
14-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
15-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
16-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
17-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
18-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
19-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
20-
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
21-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
22-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
23-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
24-
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
251
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
262
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
273
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput

python/google/protobuf/internal/text_format_test.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -854,10 +854,11 @@ def testParseMultipleOneof(self, message_module):
854854
# itself for string fields. It also demonstrates escaped binary data.
855855
# The ur"" string prefix is unfortunately missing from Python 3
856856
# so we resort to double escaping our \s so that they come through.
857-
_UNICODE_SAMPLE = u"""
857+
_UNICODE_SAMPLE = """
858858
optional_bytes: 'Á short desçription'
859859
optional_string: 'Á short desçription'
860860
repeated_bytes: '\\303\\201 short des\\303\\247ription'
861+
repeated_bytes: '\\u00c1 short des\\u00e7ription'
861862
repeated_bytes: '\\x12\\x34\\x56\\x78\\x90\\xab\\xcd\\xef'
862863
repeated_string: '\\xd0\\x9f\\xd1\\x80\\xd0\\xb8\\xd0\\xb2\\xd0\\xb5\\xd1\\x82'
863864
"""
@@ -873,8 +874,9 @@ def testParseUnicode(self, message_module):
873874
self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
874875
self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
875876
self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
876-
# repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data.
877-
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1)
877+
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
878+
# repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data.
879+
self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
878880
# repeated_string[0] contained \ escaped data representing the UTF-8
879881
# representation of _GOLDEN_STR_0 - it needs to decode as such.
880882
self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)
@@ -885,8 +887,9 @@ def testParseBytes(self, message_module):
885887
self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
886888
self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
887889
self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
890+
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
888891
# repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data.
889-
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1)
892+
self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
890893
# repeated_string[0] contained \ escaped data representing the UTF-8
891894
# representation of _GOLDEN_STR_0 - it needs to decode as such.
892895
self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)

python/google/protobuf/text_encoding.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ def ReplaceHex(m):
7979
# allow single-digit hex escapes (like '\xf').
8080
result = _CUNESCAPE_HEX.sub(ReplaceHex, text)
8181

82-
return (result.encode('utf-8') # Make it bytes to allow decode.
83-
.decode('unicode_escape')
84-
# Make it bytes again to return the proper type.
85-
.encode('raw_unicode_escape'))
82+
# Replaces Unicode escape sequences with their character equivalents.
83+
result = result.encode('raw_unicode_escape').decode('raw_unicode_escape')
84+
# Encode Unicode characters as UTF-8, then decode with unicode_escape, which
85+
# resolves backslash escapes and reads every other byte as Latin-1.
86+
result = result.encode('utf-8').decode('unicode_escape')
87+
# Convert the Latin-1 text back to a byte string.
88+
return result.encode('latin-1')

0 commit comments

Comments
 (0)