Skip to content

Commit 5c59f8a

Browse files
[pyupgrade] Ignore strings with string-only escapes (UP012) (#16058)
## Summary

Resolves #12753.

After this change, `UP012` will no longer report strings containing any of the following:

* Name escapes (`\N{NAME}`)
* Short (`\u0000`) and long (`\U00000000`) Unicode escapes
* Octal escapes (`\0`, `\00`, `\000`) where the codepoint value is greater than 255 (377<sub>8</sub>)

## Test Plan

`cargo nextest run` and `cargo insta test`.
1 parent 34d54b6 commit 5c59f8a

3 files changed

Lines changed: 457 additions & 2 deletions

File tree

crates/ruff_linter/resources/test/fixtures/pyupgrade/UP012.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,44 @@ def _match_ignore(line):
8888
# AttributeError for t-strings so skip lint
8989
(t"foo{bar}").encode("utf-8")
9090
(t"foo{bar}").encode(encoding="utf-8")
91+
92+
93+
# https://github.com/astral-sh/ruff/issues/12753
94+
95+
## Errors
96+
("a" "b").encode()
97+
98+
'''\
99+
'''.encode()
100+
101+
'\x20\\'.encode()
102+
'\0\b0'.encode()
103+
'\01\fc'.encode()
104+
'\143\\'.encode()
105+
106+
("a" "\b").encode()
107+
("\a" "b").encode()
108+
("\a" r"\b").encode()
109+
(r"\a" "\b").encode()
110+
111+
'\"'.encode()
112+
"\'".encode()
113+
114+
'\\\\\\\\ '.encode() # 4 backslashes
115+
'\\\\\\\ '.encode() # `\ ` is invalid but only causes a SyntaxWarning
116+
117+
'\\a'.encode()
118+
'a\\\b'.encode()
119+
120+
'\\ u0000 '.encode()
121+
122+
123+
## No errors
124+
"\N{DIGIT ONE}".encode()
125+
"\u0031".encode()
126+
"\U00000031".encode()
127+
128+
'\477'.encode()
129+
130+
"\
131+
" "\u0001".encode()

crates/ruff_linter/src/rules/pyupgrade/rules/unnecessary_encode_utf8.rs

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@ use std::fmt::Write as _;
22

33
use ruff_macros::{ViolationMetadata, derive_message_formats};
44
use ruff_python_ast::token::{TokenKind, Tokens};
5-
use ruff_python_ast::{self as ast, Arguments, Expr, Keyword};
6-
use ruff_text_size::{Ranged, TextRange};
5+
use ruff_python_ast::{self as ast, Arguments, Expr, Keyword, StringLiteral, StringLiteralValue};
6+
use ruff_python_trivia::Cursor;
7+
use ruff_text_size::{Ranged, TextLen, TextRange};
78

89
use crate::Locator;
910
use crate::checkers::ast::Checker;
@@ -158,6 +159,10 @@ pub(crate) fn unnecessary_encode_utf8(checker: &Checker, call: &ast::ExprCall) {
158159
};
159160
match variable {
160161
Expr::StringLiteral(ast::ExprStringLiteral { value: literal, .. }) => {
162+
if string_contains_string_only_escapes(literal, checker.locator()) {
163+
return;
164+
}
165+
161166
// Ex) `"str".encode()`, `"str".encode("utf-8")`
162167
if let Some(encoding_arg) = match_encoding_arg(&call.arguments) {
163168
if literal.to_str().is_ascii() {
@@ -259,3 +264,82 @@ pub(crate) fn unnecessary_encode_utf8(checker: &Checker, call: &ast::ExprCall) {
259264
_ => {}
260265
}
261266
}
267+
/// In a string, there are two kinds of escape sequences: "single" and "multi".
268+
///
269+
/// A "single" escape sequence is formed if a backslash is followed by
270+
/// a newline, another backslash, `'`, `"`, `a`, `b`, `f`, `n`, `t`, or `v`.
271+
/// A "multi" escape sequence is formed if a backslash is followed by
272+
/// `x` and 2 hex digits, `N` and a Unicode character name enclosed in a pair of braces,
273+
/// `u` and 4 hex digits, `U` and 8 hex digits, or 1 to 3 oct digits.
274+
///
275+
/// Out of the aforementioned, `u`, `U` and `N` are only valid in a string.
276+
/// However, an octal escape `\ooo` where `ooo` is greater than 377 base 8
277+
/// currently raises a `SyntaxWarning` (will eventually be a `SyntaxError`)
278+
/// in both strings and bytes and thus is not considered `bytes`-compatible.
279+
///
280+
/// An unrecognized escape sequence is ignored, resulting in both
281+
/// the backslash and the following character being part of the string.
282+
///
283+
/// Reference: [Lexical analysis &sect; 2.4.1.1. Escape sequences][escape-sequences]
284+
///
285+
/// [escape-sequences]: https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences
286+
fn string_contains_string_only_escapes(string: &StringLiteralValue, locator: &Locator) -> bool {
287+
for literal in string {
288+
let flags = literal.flags;
289+
290+
if flags.prefix().is_raw() {
291+
continue;
292+
}
293+
294+
if literal.content_range().len() > literal.as_str().text_len()
295+
&& literal_contains_string_only_escapes(literal, locator)
296+
{
297+
return true;
298+
}
299+
}
300+
301+
false
302+
}
303+
304+
fn literal_contains_string_only_escapes(literal: &StringLiteral, locator: &Locator) -> bool {
305+
let inner_in_source = locator.slice(literal.content_range());
306+
307+
let mut cursor = Cursor::new(inner_in_source);
308+
309+
while let Some(backslash_offset) = memchr::memchr(b'\\', cursor.as_bytes()) {
310+
cursor.skip_bytes(backslash_offset + "\\".len());
311+
312+
let Some(escaped) = cursor.bump() else {
313+
continue;
314+
};
315+
316+
match escaped {
317+
'N' | 'u' | 'U' => return true,
318+
'x' => {
319+
cursor.skip_bytes(2);
320+
}
321+
'0'..='7' => {
322+
let (second, third) = (cursor.first(), cursor.second());
323+
324+
let octal_codepoint = match (is_octal_digit(second), is_octal_digit(third)) {
325+
(false, _) => escaped.to_string(),
326+
(true, false) => format!("{escaped}{second}"),
327+
(true, true) => format!("{escaped}{second}{third}"),
328+
};
329+
330+
if octal_codepoint.parse::<u8>().is_err() {
331+
return true;
332+
}
333+
334+
cursor.skip_bytes(octal_codepoint.len());
335+
}
336+
_ => {}
337+
}
338+
}
339+
340+
false
341+
}
342+
343+
/// Returns `true` if `char` is one of the ASCII octal digits `0` through `7`.
const fn is_octal_digit(char: char) -> bool {
    // Subtracting `'0'` maps the octal digits onto `0..8`; anything below `'0'`
    // wraps around to a huge value and anything above `'7'` lands at 8 or more.
    (char as u32).wrapping_sub('0' as u32) < 8
}

0 commit comments

Comments
 (0)