Skip to content

Commit 1ac8c04

Browse files
haberman authored and copybara-github committed
Fixed UTF-8 TextFormat output to protect against invalid UTF-8 in string fields.
This guarantees that the output of TextFormat is always valid UTF-8. Prior to this change, any invalid UTF-8 in a string field would pass through unescaped into the TextFormat output if users enabled `SetUseUtf8StringEscaping(true)` or called `Utf8DebugString()`. This change currently only affects users who explicitly set `SetUseUtf8StringEscaping(true)` or use `Utf8DebugString()`, but the hope is to flip the default so that this mode is enabled unless overridden.
PiperOrigin-RevId: 589844142
1 parent d605b48 commit 1ac8c04

2 files changed

Lines changed: 84 additions & 2 deletions

File tree

src/google/protobuf/text_format.cc

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <atomic>
1919
#include <climits>
2020
#include <cmath>
21+
#include <cstddef>
2122
#include <limits>
2223
#include <string>
2324
#include <utility>
@@ -47,6 +48,7 @@
4748
#include "google/protobuf/repeated_field.h"
4849
#include "google/protobuf/unknown_field_set.h"
4950
#include "google/protobuf/wire_format_lite.h"
51+
#include "utf8_validity.h"
5052

5153
// Must be included last.
5254
#include "google/protobuf/port_def.inc"
@@ -1647,6 +1649,83 @@ class TextFormat::Printer::DebugStringFieldValuePrinter
16471649
}
16481650
};
16491651

1652+
namespace {
1653+
1654+
// Returns true if `ch` needs to be escaped in TextFormat, independent of any
// UTF-8 validity issues.
//
// The bytes that always need escaping are:
//   * ASCII control characters: 0x00-0x1F and DEL (0x7F). DEL is not
//     printable, so it must not be passed through verbatim.
//   * The quote and backslash characters `"`, `'` and `\`.
//
// High bytes (>= 0x80) are never reported here; whether they need escaping
// depends on UTF-8 validity, which the caller checks separately.
bool DefinitelyNeedsEscape(unsigned char ch) {
  if (ch < 0x20 || ch == 0x7F) return true;
  switch (ch) {
    case '\"':
    case '\'':
    case '\\':
      return true;
    default:
      return false;
  }
}
1666+
1667+
// Returns true if this is a high byte that requires UTF-8 validation. If the
1668+
// UTF-8 validation fails, we must escape the byte.
1669+
bool NeedsUtf8Validation(unsigned char ch) {
  // Any byte with its top bit set (0x80..0xFF) lies outside the ASCII range.
  return ch >= 0x80;
}
1670+
1671+
// Returns the number of bytes in the prefix of `val` that do not need escaping.
1672+
// This is like utf8_range::SpanStructurallyValid(), except that it also
1673+
// terminates at any ASCII char that needs to be escaped in TextFormat (any char
1674+
// that has `DefinitelyNeedsEscape(ch) == true`).
1675+
//
1676+
// If we could get a variant of utf8_range::SpanStructurallyValid() that could
1677+
// terminate on any of these chars, that might be more efficient, but it would
1678+
// be much more complicated to modify that heavily SIMD code.
1679+
size_t SkipPassthroughBytes(absl::string_view val) {
1680+
for (size_t i = 0; i < val.size(); i++) {
1681+
unsigned char uc = val[i];
1682+
if (DefinitelyNeedsEscape(uc)) return i;
1683+
if (NeedsUtf8Validation(uc)) {
1684+
// Find the end of this region of consecutive high bytes, so that we only
1685+
// give high bytes to the UTF-8 checker. This avoids needing to perform
1686+
// a second scan of the ASCII characters looking for characters that
1687+
// need escaping.
1688+
//
1689+
// We assume that high bytes are less frequent than plain, printable ASCII
1690+
// bytes, so we accept the double-scan of high bytes.
1691+
size_t end = i + 1;
1692+
for (; end < val.size(); end++) {
1693+
if (!NeedsUtf8Validation(val[end])) break;
1694+
}
1695+
size_t n = end - i;
1696+
size_t ok = utf8_range::SpanStructurallyValid(val.substr(i, n));
1697+
if (ok != n) return i + ok;
1698+
i += ok - 1;
1699+
}
1700+
}
1701+
return val.size();
1702+
}
1703+
1704+
// Writes `src` to `generator` as UTF-8 while guarding against invalid UTF-8
// in the string field: runs of bytes that are safe to pass through are
// printed verbatim, and every other byte is printed individually in its
// absl::CEscape() form.
//
// If a future invariant guarantees that invalid UTF-8 can never be present,
// the UTF-8 check performed here could be dropped.
void HardenedPrintString(absl::string_view src,
                         TextFormat::BaseTextGenerator* generator) {
  while (!src.empty()) {
    const size_t passthrough_len = SkipPassthroughBytes(src);
    if (passthrough_len > 0) {
      generator->PrintString(src.substr(0, passthrough_len));
      src.remove_prefix(passthrough_len);
      if (src.empty()) return;
    }

    // The leading byte of `src` now needs escaping. If calling CEscape() and
    // PrintString() once per byte turns out to be expensive, the calls could
    // be batched at the cost of some complexity.
    const absl::string_view offending_byte = src.substr(0, 1);
    generator->PrintString(absl::CEscape(offending_byte));
    src.remove_prefix(1);
  }
}
1726+
1727+
} // namespace
1728+
16501729
// ===========================================================================
16511730
// An internal field value printer that escapes UTF8 strings.
16521731
class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
@@ -1655,7 +1734,7 @@ class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
16551734
// Prints `val` to `generator` as a double-quoted string literal: an opening
// quote, the (escaped) value, then a closing quote.
void PrintString(const std::string& val,
16561735
TextFormat::BaseTextGenerator* generator) const override {
16571736
generator->PrintLiteral("\"");
// Removed by this commit: the value was escaped with absl::Utf8SafeCEscape.
1658-
generator->PrintString(absl::Utf8SafeCEscape(val));
// Added by this commit: the value is written via HardenedPrintString, which
// guards against invalid UTF-8 in the string field.
1737+
HardenedPrintString(val, generator);
16591738
generator->PrintLiteral("\"");
16601739
}
16611740
void PrintBytes(const std::string& val,
@@ -1956,7 +2035,9 @@ void TextFormat::FastFieldValuePrinter::PrintEnum(
19562035
// Prints `val` to `generator` as a double-quoted string literal whose contents
// are escaped with absl::CEscape.
void TextFormat::FastFieldValuePrinter::PrintString(
19572036
const std::string& val, BaseTextGenerator* generator) const {
19582037
generator->PrintLiteral("\"");
// Removed by this commit: the escaped value was printed unconditionally.
1959-
generator->PrintString(absl::CEscape(val));
// Added by this commit: the escape-and-print step is skipped for an empty
// value, so only the two quote literals are emitted.
// NOTE(review): presumably this avoids a zero-length PrintString call; the
// motivation is not stated here -- confirm.
2038+
if (!val.empty()) {
2039+
generator->PrintString(absl::CEscape(val));
2040+
}
19602041
generator->PrintLiteral("\"");
19612042
}
19622043
void TextFormat::FastFieldValuePrinter::PrintBytes(

src/google/protobuf/text_format_unittest.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include "google/protobuf/unittest_mset.pb.h"
4848
#include "google/protobuf/unittest_mset_wire_format.pb.h"
4949
#include "google/protobuf/unittest_proto3.pb.h"
50+
#include "utf8_validity.h"
5051

5152

5253
// Must be included last.

0 commit comments

Comments
 (0)