Fixed UTF-8 TextFormat output to protect against invalid UTF-8 in string fields.

haberman · copybara-github · commit 1ac8c04604fe · 2023-12-11T09:13:52.000-08:00
This will guarantee that the output of TextFormat is always valid UTF-8.  Prior to this change, any invalid UTF-8 would escape into the TextFormat output if users enabled `SetUseUtf8StringEscaping(true)` or called `Utf8DebugString()`.

This change currently only affects users who explicitly set `SetUseUtf8StringEscaping(true)` or use `Utf8DebugString()`, but the hope is to flip the default so that this mode is enabled unless overridden.

PiperOrigin-RevId: 589844142
diff --git a/src/google/protobuf/text_format.cc b/src/google/protobuf/text_format.cc
@@ -18,6 +18,7 @@
 #include <atomic>
 #include <climits>
 #include <cmath>
+#include <cstddef>
 #include <limits>
 #include <string>
 #include <utility>
@@ -47,6 +48,7 @@
 #include "google/protobuf/repeated_field.h"
 #include "google/protobuf/unknown_field_set.h"
 #include "google/protobuf/wire_format_lite.h"
+#include "utf8_validity.h"
 
 // Must be included last.
 #include "google/protobuf/port_def.inc"
@@ -1647,6 +1649,83 @@ class TextFormat::Printer::DebugStringFieldValuePrinter
   }
 };
 
+namespace {
+
+// Returns true if `ch` needs to be escaped in TextFormat, independent of any
+// UTF-8 validity issues.
+bool DefinitelyNeedsEscape(unsigned char ch) {
+  if (ch < 32) return true;
+  switch (ch) {
+    case '\"':
+    case '\'':
+    case '\\':
+      return true;
+  }
+  return false;
+}
+
+// Returns true if this is a high byte that requires UTF-8 validation.  If the
+// UTF-8 validation fails, we must escape the byte.
+bool NeedsUtf8Validation(unsigned char ch) { return ch > 127; }
+
+// Returns the number of bytes in the prefix of `val` that do not need escaping.
+// This is like utf8_range::SpanStructurallyValid(), except that it also
+// terminates at any ASCII char that needs to be escaped in TextFormat (any char
+// that has `DefinitelyNeedsEscape(ch) == true`).
+//
+// If we could get a variant of utf8_range::SpanStructurallyValid() that could
+// terminate on any of these chars, that might be more efficient, but it would
+// be much more complicated to modify that heavily SIMD code.
+size_t SkipPassthroughBytes(absl::string_view val) {
+  for (size_t i = 0; i < val.size(); i++) {
+    unsigned char uc = val[i];
+    if (DefinitelyNeedsEscape(uc)) return i;
+    if (NeedsUtf8Validation(uc)) {
+      // Find the end of this region of consecutive high bytes, so that we only
+      // give high bytes to the UTF-8 checker.  This avoids needing to perform
+      // a second scan of the ASCII characters looking for characters that
+      // need escaping.
+      //
+      // We assume that high bytes are less frequent than plain, printable ASCII
+      // bytes, so we accept the double-scan of high bytes.
+      size_t end = i + 1;
+      for (; end < val.size(); end++) {
+        if (!NeedsUtf8Validation(val[end])) break;
+      }
+      size_t n = end - i;
+      size_t ok = utf8_range::SpanStructurallyValid(val.substr(i, n));
+      if (ok != n) return i + ok;
+      i += ok - 1;
+    }
+  }
+  return val.size();
+}
+
+void HardenedPrintString(absl::string_view src,
+                         TextFormat::BaseTextGenerator* generator) {
+  // Print as UTF-8, while guarding against any invalid UTF-8 in the string
+  // field.
+  //
+  // If in the future we have a guaranteed invariant that invalid UTF-8 will
+  // never be present, we could avoid the UTF-8 check here.
+
+  while (!src.empty()) {
+    size_t n = SkipPassthroughBytes(src);
+    if (n != 0) {
+      generator->PrintString(src.substr(0, n));
+      src.remove_prefix(n);
+      if (src.empty()) break;
+    }
+
+    // If repeated calls to CEscape() and PrintString() are expensive, we could
+    // consider batching them, at the cost of some complexity.
+    generator->PrintString(absl::CEscape(src.substr(0, 1)));
+    src.remove_prefix(1);
+  }
+}
+
+}  // namespace
+
 // ===========================================================================
 //  An internal field value printer that escape UTF8 strings.
 class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
@@ -1655,7 +1734,7 @@ class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
   void PrintString(const std::string& val,
                    TextFormat::BaseTextGenerator* generator) const override {
     generator->PrintLiteral("\"");
-    generator->PrintString(absl::Utf8SafeCEscape(val));
+    HardenedPrintString(val, generator);
     generator->PrintLiteral("\"");
   }
   void PrintBytes(const std::string& val,
@@ -1956,7 +2035,9 @@ void TextFormat::FastFieldValuePrinter::PrintEnum(
 void TextFormat::FastFieldValuePrinter::PrintString(
     const std::string& val, BaseTextGenerator* generator) const {
   generator->PrintLiteral("\"");
-  generator->PrintString(absl::CEscape(val));
+  if (!val.empty()) {
+    generator->PrintString(absl::CEscape(val));
+  }
   generator->PrintLiteral("\"");
 }
 void TextFormat::FastFieldValuePrinter::PrintBytes(
diff --git a/src/google/protobuf/text_format_unittest.cc b/src/google/protobuf/text_format_unittest.cc
@@ -47,6 +47,7 @@
 #include "google/protobuf/unittest_mset.pb.h"
 #include "google/protobuf/unittest_mset_wire_format.pb.h"
 #include "google/protobuf/unittest_proto3.pb.h"
+#include "utf8_validity.h"
 
 
 // Must be included last.