1818#include < atomic>
1919#include < climits>
2020#include < cmath>
21+ #include < cstddef>
2122#include < limits>
2223#include < string>
2324#include < utility>
4748#include " google/protobuf/repeated_field.h"
4849#include " google/protobuf/unknown_field_set.h"
4950#include " google/protobuf/wire_format_lite.h"
51+ #include " utf8_validity.h"
5052
5153// Must be included last.
5254#include " google/protobuf/port_def.inc"
@@ -1647,6 +1649,83 @@ class TextFormat::Printer::DebugStringFieldValuePrinter
16471649 }
16481650};
16491651
1652+ namespace {
1653+
// Returns true if `ch` must always be escaped in TextFormat output, no matter
// whether the bytes around it form valid UTF-8.
//
// Three groups of bytes qualify:
//   * the C0 control characters (0x00-0x1F),
//   * DEL (0x7F), which is an unprintable ASCII control character as well,
//   * the printable characters `"`, `'` and `\`, which are significant inside
//     a quoted string literal.
//
// Bytes >= 0x80 are never reported here; whether they need escaping depends
// on UTF-8 validity, which the caller checks separately.
bool DefinitelyNeedsEscape(unsigned char ch) {
  // `ch < 32` alone would let DEL through unescaped even though it is not
  // printable, so it is tested explicitly.
  if (ch < 32 || ch == 127) return true;
  switch (ch) {
    case '\"':
    case '\'':
    case '\\':
      return true;
    default:
      return false;
  }
}
1666+
// Reports whether `ch` is a non-ASCII ("high") byte, i.e. one with its top bit
// set. Such a byte may only be passed through unescaped when UTF-8 validation
// of the sequence it belongs to succeeds; otherwise it has to be escaped.
bool NeedsUtf8Validation(unsigned char ch) {
  constexpr unsigned char kHighBit = 0x80;
  return (ch & kHighBit) != 0;
}
1670+
1671+ // Returns the number of bytes in the prefix of `val` that do not need escaping.
1672+ // This is like utf8_range::SpanStructurallyValid(), except that it also
1673+ // terminates at any ASCII char that needs to be escaped in TextFormat (any char
1674+ // that has `DefinitelyNeedsEscape(ch) == true`).
1675+ //
1676+ // If we could get a variant of utf8_range::SpanStructurallyValid() that could
1677+ // terminate on any of these chars, that might be more efficient, but it would
1678+ // be much more complicated to modify that heavily SIMD code.
1679+ size_t SkipPassthroughBytes (absl::string_view val) {
1680+ for (size_t i = 0 ; i < val.size (); i++) {
1681+ unsigned char uc = val[i];
1682+ if (DefinitelyNeedsEscape (uc)) return i;
1683+ if (NeedsUtf8Validation (uc)) {
1684+ // Find the end of this region of consecutive high bytes, so that we only
1685+ // give high bytes to the UTF-8 checker. This avoids needing to perform
1686+ // a second scan of the ASCII characters looking for characters that
1687+ // need escaping.
1688+ //
1689+ // We assume that high bytes are less frequent than plain, printable ASCII
1690+ // bytes, so we accept the double-scan of high bytes.
1691+ size_t end = i + 1 ;
1692+ for (; end < val.size (); end++) {
1693+ if (!NeedsUtf8Validation (val[end])) break ;
1694+ }
1695+ size_t n = end - i;
1696+ size_t ok = utf8_range::SpanStructurallyValid (val.substr (i, n));
1697+ if (ok != n) return i + ok;
1698+ i += ok - 1 ;
1699+ }
1700+ }
1701+ return val.size ();
1702+ }
1703+
1704+ void HardenedPrintString (absl::string_view src,
1705+ TextFormat::BaseTextGenerator* generator) {
1706+ // Print as UTF-8, while guarding against any invalid UTF-8 in the string
1707+ // field.
1708+ //
1709+ // If in the future we have a guaranteed invariant that invalid UTF-8 will
1710+ // never be present, we could avoid the UTF-8 check here.
1711+
1712+ while (!src.empty ()) {
1713+ size_t n = SkipPassthroughBytes (src);
1714+ if (n != 0 ) {
1715+ generator->PrintString (src.substr (0 , n));
1716+ src.remove_prefix (n);
1717+ if (src.empty ()) break ;
1718+ }
1719+
1720+ // If repeated calls to CEscape() and PrintString() are expensive, we could
1721+ // consider batching them, at the cost of some complexity.
1722+ generator->PrintString (absl::CEscape (src.substr (0 , 1 )));
1723+ src.remove_prefix (1 );
1724+ }
1725+ }
1726+
1727+ } // namespace
1728+
16501729// ===========================================================================
16511730// An internal field value printer that escapes UTF-8 strings.
16521731class TextFormat ::Printer::FastFieldValuePrinterUtf8Escaping
@@ -1655,7 +1734,7 @@ class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
// Prints the string field value `val` to `generator`, wrapped in a pair of
// double-quote literals.
//
// This span is a diff hunk: the `-` line is the former body, which printed
// absl::Utf8SafeCEscape(val) in one PrintString() call; the `+` line is its
// replacement, which delegates the body to HardenedPrintString().
16551734 void PrintString (const std::string& val,
16561735 TextFormat::BaseTextGenerator* generator) const override {
16571736 generator->PrintLiteral (" \" " );
1658- generator-> PrintString ( absl::Utf8SafeCEscape ( val) );
1737+ HardenedPrintString ( val, generator );
16591738 generator->PrintLiteral (" \" " );
16601739 }
16611740 void PrintBytes (const std::string& val,
@@ -1956,7 +2035,9 @@ void TextFormat::FastFieldValuePrinter::PrintEnum(
// Prints the string field value `val` to `generator`, wrapped in a pair of
// double-quote literals, with the contents passed through absl::CEscape().
//
// This span is a diff hunk: the `-` line is the former unconditional
// PrintString() call; the `+` lines replace it with the same call guarded by
// `!val.empty()`, so an empty value produces only the two quote literals.
19562035void TextFormat::FastFieldValuePrinter::PrintString (
19572036 const std::string& val, BaseTextGenerator* generator) const {
19582037 generator->PrintLiteral (" \" " );
1959- generator->PrintString (absl::CEscape (val));
2038+ if (!val.empty ()) {
2039+ generator->PrintString (absl::CEscape (val));
2040+ }
19602041 generator->PrintLiteral (" \" " );
19612042}
19622043void TextFormat::FastFieldValuePrinter::PrintBytes (
0 commit comments