|
29 | 29 | #include "upb/wire/eps_copy_input_stream.h" |
30 | 30 | #include "upb/wire/reader.h" |
31 | 31 | #include "upb/wire/types.h" |
| 32 | +#include "utf8_range.h" |
32 | 33 |
|
33 | 34 | // Must be last. |
34 | 35 | #include "upb/port/def.inc" |
@@ -108,42 +109,121 @@ static void txtenc_enum(int32_t val, const upb_FieldDef* f, txtenc* e) { |
108 | 109 | } |
109 | 110 | } |
110 | 111 |
|
111 | | -static void txtenc_string(txtenc* e, upb_StringView str, bool bytes) { |
112 | | - const char* ptr = str.data; |
113 | | - const char* end = ptr + str.size; |
114 | | - txtenc_putstr(e, "\""); |
| 112 | +static void txtenc_escaped(txtenc* e, unsigned char ch) { |
| 113 | + switch (ch) { |
| 114 | + case '\n': |
| 115 | + txtenc_putstr(e, "\\n"); |
| 116 | + break; |
| 117 | + case '\r': |
| 118 | + txtenc_putstr(e, "\\r"); |
| 119 | + break; |
| 120 | + case '\t': |
| 121 | + txtenc_putstr(e, "\\t"); |
| 122 | + break; |
| 123 | + case '\"': |
| 124 | + txtenc_putstr(e, "\\\""); |
| 125 | + break; |
| 126 | + case '\'': |
| 127 | + txtenc_putstr(e, "\\'"); |
| 128 | + break; |
| 129 | + case '\\': |
| 130 | + txtenc_putstr(e, "\\\\"); |
| 131 | + break; |
| 132 | + default: |
| 133 | + txtenc_printf(e, "\\%03o", ch); |
| 134 | + break; |
| 135 | + } |
| 136 | +} |
| 137 | + |
// Reports whether `ch` must always be escaped in TextFormat, regardless of
// whether the surrounding bytes form valid UTF-8.
//
// This covers the ASCII control characters (0-31 and 127), the double and
// single quote characters, and the backslash.
static bool upb_DefinitelyNeedsEscape(unsigned char ch) {
  return ch < 32 || ch == 127 || ch == '\"' || ch == '\'' || ch == '\\';
}
| 151 | + |
// Reports whether `ch` lies in the printable ASCII range, i.e. from space (32)
// through tilde (126) inclusive.  Unlike isprint(), the result does not depend
// on the current locale.
static bool upb_AsciiIsPrint(unsigned char ch) {
  return 32 <= ch && ch <= 126;
}
| 153 | + |
// Reports whether `ch` is a high (non-ASCII) byte, i.e. has a value of 0x80 or
// above.  Such bytes may only be emitted verbatim when they are part of valid
// UTF-8; if UTF-8 validation fails, the byte must be escaped instead.
static bool upb_NeedsUtf8Validation(unsigned char ch) {
  return ch >= 0x80;
}
| 157 | + |
// Returns the length of the longest prefix of `ptr[0, size)` that can be
// written to the output verbatim, without any escaping.
//
// The scan behaves like utf8_range_ValidPrefix(), except that it additionally
// stops at any ASCII byte for which upb_DefinitelyNeedsEscape() is true.
//
// A UTF-8 validator that could itself stop on those bytes might be more
// efficient, but modifying that heavily SIMD code would be much more
// complicated.
static size_t SkipPassthroughBytes(const char* ptr, size_t size) {
  size_t pos = 0;
  while (pos < size) {
    unsigned char byte = (unsigned char)ptr[pos];
    if (upb_DefinitelyNeedsEscape(byte)) return pos;

    if (!upb_NeedsUtf8Validation(byte)) {
      // Plain ASCII that needs no escaping: step over it.
      pos++;
      continue;
    }

    // Locate the end of this run of consecutive high bytes, so that only high
    // bytes are handed to the UTF-8 checker.  That avoids a second pass over
    // the ASCII bytes looking for characters that need escaping.
    //
    // High bytes are assumed to be rarer than plain printable ASCII, so
    // scanning them twice (once here, once in the validator) is acceptable.
    size_t run_end = pos + 1;
    while (run_end < size &&
           upb_NeedsUtf8Validation((unsigned char)ptr[run_end])) {
      run_end++;
    }

    size_t run_len = run_end - pos;
    size_t valid = utf8_range_ValidPrefix(ptr + pos, run_len);
    // The validator accepted only part of the run: the passthrough prefix
    // ends at the first byte it did not accept.
    if (valid != run_len) return pos + valid;

    pos = run_end;
  }
  return size;
}
115 | 190 |
|
| 191 | +static void upb_HardenedPrintString(txtenc* e, const char* ptr, size_t len) { |
| 192 | + // Print as UTF-8, while guarding against any invalid UTF-8 in the string |
| 193 | + // field. |
| 194 | + // |
| 195 | + // If in the future we have a guaranteed invariant that invalid UTF-8 will |
| 196 | + // never be present, we could avoid the UTF-8 check here. |
| 197 | + txtenc_putstr(e, "\""); |
| 198 | + const char* end = ptr + len; |
116 | 199 | while (ptr < end) { |
117 | | - switch (*ptr) { |
118 | | - case '\n': |
119 | | - txtenc_putstr(e, "\\n"); |
120 | | - break; |
121 | | - case '\r': |
122 | | - txtenc_putstr(e, "\\r"); |
123 | | - break; |
124 | | - case '\t': |
125 | | - txtenc_putstr(e, "\\t"); |
126 | | - break; |
127 | | - case '\"': |
128 | | - txtenc_putstr(e, "\\\""); |
129 | | - break; |
130 | | - case '\'': |
131 | | - txtenc_putstr(e, "\\'"); |
132 | | - break; |
133 | | - case '\\': |
134 | | - txtenc_putstr(e, "\\\\"); |
135 | | - break; |
136 | | - default: |
137 | | - if ((bytes || (uint8_t)*ptr < 0x80) && !isprint(*ptr)) { |
138 | | - txtenc_printf(e, "\\%03o", (int)(uint8_t)*ptr); |
139 | | - } else { |
140 | | - txtenc_putbytes(e, ptr, 1); |
141 | | - } |
142 | | - break; |
| 200 | + size_t n = SkipPassthroughBytes(ptr, end - ptr); |
| 201 | + if (n != 0) { |
| 202 | + txtenc_putbytes(e, ptr, n); |
| 203 | + ptr += n; |
| 204 | + if (ptr == end) break; |
143 | 205 | } |
| 206 | + |
| 207 | + // If repeated calls to CEscape() and PrintString() are expensive, we could |
| 208 | + // consider batching them, at the cost of some complexity. |
| 209 | + txtenc_escaped(e, *ptr); |
144 | 210 | ptr++; |
145 | 211 | } |
| 212 | + txtenc_putstr(e, "\""); |
| 213 | +} |
146 | 214 |
|
| 215 | +static void txtenc_bytes(txtenc* e, upb_StringView data) { |
| 216 | + const char* ptr = data.data; |
| 217 | + const char* end = ptr + data.size; |
| 218 | + txtenc_putstr(e, "\""); |
| 219 | + for (; ptr < end; ptr++) { |
| 220 | + unsigned char uc = *ptr; |
| 221 | + if (upb_AsciiIsPrint(uc)) { |
| 222 | + txtenc_putbytes(e, ptr, 1); |
| 223 | + } else { |
| 224 | + txtenc_escaped(e, uc); |
| 225 | + } |
| 226 | + } |
147 | 227 | txtenc_putstr(e, "\""); |
148 | 228 | } |
149 | 229 |
|
@@ -206,10 +286,10 @@ static void txtenc_field(txtenc* e, upb_MessageValue val, |
206 | 286 | txtenc_printf(e, "%" PRIu64, val.uint64_val); |
207 | 287 | break; |
208 | 288 | case kUpb_CType_String: |
209 | | - txtenc_string(e, val.str_val, false); |
| 289 | + upb_HardenedPrintString(e, val.str_val.data, val.str_val.size); |
210 | 290 | break; |
211 | 291 | case kUpb_CType_Bytes: |
212 | | - txtenc_string(e, val.str_val, true); |
| 292 | + txtenc_bytes(e, val.str_val); |
213 | 293 | break; |
214 | 294 | case kUpb_CType_Enum: |
215 | 295 | txtenc_enum(val.int32_val, f, e); |
@@ -378,7 +458,7 @@ static const char* txtenc_unknown(txtenc* e, const char* ptr, |
378 | 458 | const char* str = ptr; |
379 | 459 | ptr = upb_EpsCopyInputStream_ReadString(stream, &str, size, NULL); |
380 | 460 | UPB_ASSERT(ptr); |
381 | | - txtenc_string(e, (upb_StringView){.data = str, .size = size}, true); |
| 461 | + txtenc_bytes(e, (upb_StringView){.data = str, .size = size}); |
382 | 462 | } |
383 | 463 | break; |
384 | 464 | } |
|
0 commit comments