@@ -2,8 +2,9 @@ use std::fmt::Write as _;
22
33use ruff_macros:: { ViolationMetadata , derive_message_formats} ;
44use ruff_python_ast:: token:: { TokenKind , Tokens } ;
5- use ruff_python_ast:: { self as ast, Arguments , Expr , Keyword } ;
6- use ruff_text_size:: { Ranged , TextRange } ;
5+ use ruff_python_ast:: { self as ast, Arguments , Expr , Keyword , StringLiteral , StringLiteralValue } ;
6+ use ruff_python_trivia:: Cursor ;
7+ use ruff_text_size:: { Ranged , TextLen , TextRange } ;
78
89use crate :: Locator ;
910use crate :: checkers:: ast:: Checker ;
@@ -158,6 +159,10 @@ pub(crate) fn unnecessary_encode_utf8(checker: &Checker, call: &ast::ExprCall) {
158159 } ;
159160 match variable {
160161 Expr :: StringLiteral ( ast:: ExprStringLiteral { value : literal, .. } ) => {
162+ if string_contains_string_only_escapes ( literal, checker. locator ( ) ) {
163+ return ;
164+ }
165+
161166 // Ex) `"str".encode()`, `"str".encode("utf-8")`
162167 if let Some ( encoding_arg) = match_encoding_arg ( & call. arguments ) {
163168 if literal. to_str ( ) . is_ascii ( ) {
@@ -259,3 +264,82 @@ pub(crate) fn unnecessary_encode_utf8(checker: &Checker, call: &ast::ExprCall) {
259264 _ => { }
260265 }
261266}
267+ /// In a string, there are two kinds of escape sequences: "single" and "multi".
268+ ///
269+ /// A "single" escape sequence is formed if a backslash is followed by
270+ /// a newline, another backslash, `'`, `"`, `a`, `b`, `f`, `n`, `t`, or `v`.
271+ /// A "multi" escape sequence is formed if a backslash is followed by
272+ /// `x` and 2 hex digits, `N` and a Unicode character name enclosed in a pair of braces,
273+ /// `u` and 4 hex digits, `U` and 8 hex digits, or 1 to 3 oct digits.
274+ ///
275+ /// Out of the aforementioned, `u`, `U` and `N` are only valid in a string.
276+ /// However, an octal escape `\ooo` where `ooo` is greater than 377 base 8
277+ /// currently raises a `SyntaxWarning` (will eventually be a `SyntaxError`)
278+ /// in both strings and bytes and thus is not considered `bytes`-compatible.
279+ ///
280+ /// An unrecognized escape sequence is ignored, resulting in both
281+ /// the backslash and the following character being part of the string.
282+ ///
283+ /// Reference: [Lexical analysis § 2.4.1.1. Escape sequences][escape-sequences]
284+ ///
285+ /// [escape-sequences]: https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences
286+ fn string_contains_string_only_escapes ( string : & StringLiteralValue , locator : & Locator ) -> bool {
287+ for literal in string {
288+ let flags = literal. flags ;
289+
290+ if flags. prefix ( ) . is_raw ( ) {
291+ continue ;
292+ }
293+
294+ if literal. content_range ( ) . len ( ) > literal. as_str ( ) . text_len ( )
295+ && literal_contains_string_only_escapes ( literal, locator)
296+ {
297+ return true ;
298+ }
299+ }
300+
301+ false
302+ }
303+
304+ fn literal_contains_string_only_escapes ( literal : & StringLiteral , locator : & Locator ) -> bool {
305+ let inner_in_source = locator. slice ( literal. content_range ( ) ) ;
306+
307+ let mut cursor = Cursor :: new ( inner_in_source) ;
308+
309+ while let Some ( backslash_offset) = memchr:: memchr ( b'\\' , cursor. as_bytes ( ) ) {
310+ cursor. skip_bytes ( backslash_offset + "\\ " . len ( ) ) ;
311+
312+ let Some ( escaped) = cursor. bump ( ) else {
313+ continue ;
314+ } ;
315+
316+ match escaped {
317+ 'N' | 'u' | 'U' => return true ,
318+ 'x' => {
319+ cursor. skip_bytes ( 2 ) ;
320+ }
321+ '0' ..='7' => {
322+ let ( second, third) = ( cursor. first ( ) , cursor. second ( ) ) ;
323+
324+ let octal_codepoint = match ( is_octal_digit ( second) , is_octal_digit ( third) ) {
325+ ( false , _) => escaped. to_string ( ) ,
326+ ( true , false ) => format ! ( "{escaped}{second}" ) ,
327+ ( true , true ) => format ! ( "{escaped}{second}{third}" ) ,
328+ } ;
329+
330+ if octal_codepoint. parse :: < u8 > ( ) . is_err ( ) {
331+ return true ;
332+ }
333+
334+ cursor. skip_bytes ( octal_codepoint. len ( ) ) ;
335+ }
336+ _ => { }
337+ }
338+ }
339+
340+ false
341+ }
342+
/// Returns `true` if `char` is one of the octal digits `0` through `7`.
const fn is_octal_digit(char: char) -> bool {
    char.to_digit(8).is_some()
}
0 commit comments