@@ -970,122 +970,138 @@ public static StringBuilder decodeEscapes(ErrorCallback errors, String string, S
970970 }
971971
972972 private static <T > T unescapeString (SourceRange sourceRange , ErrorCallback errorCallback , String st , PythonStringFactory <T > stringFactory ) {
973- if (!st .contains ("\\ " )) {
973+ int backslashIndex = st .indexOf ('\\' );
974+ if (backslashIndex < 0 ) {
974975 return stringFactory .fromJavaString (st );
975976 }
976977 PythonStringFactory .PythonStringBuilder <T > sb = stringFactory .createBuilder (st .length ());
977- boolean wasDeprecationWarning = false ;
978- for (int i = 0 ; i < st .length (); i ++) {
979- char ch = st .charAt (i );
980- if (ch == '\\' ) {
981- char nextChar = (i == st .length () - 1 ) ? '\\' : st .charAt (i + 1 );
982- // Octal escape?
983- if (nextChar >= '0' && nextChar <= '7' ) {
984- String code = "" + nextChar ;
978+ boolean emittedDeprecationWarning = false ;
979+ int substringStart = 0 ;
980+ do {
981+ if (backslashIndex != 0 ) {
982+ sb .appendString (st .substring (substringStart , backslashIndex ));
983+ }
984+ if (backslashIndex + 1 < st .length ()) {
985+ substringStart = processEscapeSequence (sourceRange , errorCallback , st , sb , backslashIndex + 1 );
986+ if (substringStart == backslashIndex + 1 ) {
987+ sb .appendCodePoint ('\\' );
988+ if (!emittedDeprecationWarning ) {
989+ emittedDeprecationWarning = true ;
990+ warnInvalidEscapeSequence (errorCallback , sourceRange , st .codePointAt (substringStart ));
991+ }
992+ }
993+ } else {
994+ // Lone backslash at the end, can occur in f-strings
995+ substringStart = backslashIndex ;
996+ break ;
997+ }
998+ } while ((backslashIndex = st .indexOf ('\\' , substringStart )) >= 0 );
999+ if (substringStart < st .length ()) {
1000+ sb .appendString (st .substring (substringStart ));
1001+ }
1002+ return sb .build ();
1003+ }
1004+
1005+ private static <T > int processEscapeSequence (SourceRange sourceRange , ErrorCallback errorCallback , String st , PythonStringFactory .PythonStringBuilder <T > sb , int startIndex ) {
1006+ int cp = st .codePointAt (startIndex );
1007+ int i = startIndex + Character .charCount (cp );
1008+ return switch (cp ) {
1009+ case '\\' -> {
1010+ sb .appendCodePoint ('\\' );
1011+ yield i ;
1012+ }
1013+ case 'a' -> {
1014+ sb .appendCodePoint ('\u0007' );
1015+ yield i ;
1016+ }
1017+ case 'b' -> {
1018+ sb .appendCodePoint ('\b' );
1019+ yield i ;
1020+ }
1021+ case 'f' -> {
1022+ sb .appendCodePoint ('\f' );
1023+ yield i ;
1024+ }
1025+ case 'n' -> {
1026+ sb .appendCodePoint ('\n' );
1027+ yield i ;
1028+ }
1029+ case 'r' -> {
1030+ sb .appendCodePoint ('\r' );
1031+ yield i ;
1032+ }
1033+ case 't' -> {
1034+ sb .appendCodePoint ('\t' );
1035+ yield i ;
1036+ }
1037+ case 'v' -> {
1038+ sb .appendCodePoint ('\u000b' );
1039+ yield i ;
1040+ }
1041+ case '\"' -> {
1042+ sb .appendCodePoint ('\"' );
1043+ yield i ;
1044+ }
1045+ case '\'' -> {
1046+ sb .appendCodePoint ('\'' );
1047+ yield i ;
1048+ }
1049+ case '\r' , '\n' -> i ;
1050+ // Octal code point
1051+ case '0' , '1' , '2' , '3' , '4' , '5' , '6' , '7' -> {
1052+ int octalValue = cp - '0' ;
1053+ cp = i < st .length () ? st .codePointAt (i ) : 0 ;
1054+ if (cp >= '0' && cp <= '7' ) {
9851055 i ++;
986- if ((i < st .length () - 1 ) && st .charAt (i + 1 ) >= '0' && st .charAt (i + 1 ) <= '7' ) {
987- code += st .charAt (i + 1 );
1056+ octalValue = octalValue * 8 + cp - '0' ;
1057+ cp = i < st .length () ? st .codePointAt (i ) : 0 ;
1058+ if (cp >= '0' && cp <= '7' ) {
9881059 i ++;
989- if ((i < st .length () - 1 ) && st .charAt (i + 1 ) >= '0' && st .charAt (i + 1 ) <= '7' ) {
990- code += st .charAt (i + 1 );
991- i ++;
992- }
1060+ octalValue = octalValue * 8 + cp - '0' ;
9931061 }
994- sb .appendCodePoint (Integer .parseInt (code , 8 ));
995- continue ;
9961062 }
997- switch (nextChar ) {
998- case '\\' :
999- ch = '\\' ;
1000- break ;
1001- case 'a' :
1002- ch = '\u0007' ;
1003- break ;
1004- case 'b' :
1005- ch = '\b' ;
1006- break ;
1007- case 'f' :
1008- ch = '\f' ;
1009- break ;
1010- case 'n' :
1011- ch = '\n' ;
1012- break ;
1013- case 'r' :
1014- ch = '\r' ;
1015- break ;
1016- case 't' :
1017- ch = '\t' ;
1018- break ;
1019- case 'v' :
1020- ch = '\u000b' ;
1021- break ;
1022- case '\"' :
1023- ch = '\"' ;
1024- break ;
1025- case '\'' :
1026- ch = '\'' ;
1027- break ;
1028- case '\r' :
1029- nextChar = (i == st .length () - 2 ) ? '\\' : st .charAt (i + 2 );
1030- if (nextChar == '\n' ) {
1031- i ++;
1032- }
1033- i ++;
1034- continue ;
1035- case '\n' :
1036- i ++;
1037- continue ;
1038- // Hex Unicode: u????
1039- case 'u' :
1040- int code = getHexValue (st , sourceRange , i + 2 , 4 , errorCallback );
1041- if (code < 0 ) {
1042- return stringFactory .fromJavaString (st );
1043- }
1044- sb .appendCodePoint (code );
1045- i += 5 ;
1046- continue ;
1047- // Hex Unicode: U????????
1048- case 'U' :
1049- code = getHexValue (st , sourceRange , i + 2 , 8 , errorCallback );
1050- if (Character .isValidCodePoint (code )) {
1051- sb .appendCodePoint (code );
1052- } else {
1053- errorCallback .onError (ErrorCallback .ErrorType .Encoding , sourceRange , String .format (UNICODE_ERROR + ILLEGAL_CHARACTER , i , i + 9 ));
1054- return stringFactory .fromJavaString (st );
1055- }
1056- i += 9 ;
1057- continue ;
1058- // Hex Unicode: x??
1059- case 'x' :
1060- code = getHexValue (st , sourceRange , i + 2 , 2 , errorCallback );
1061- if (code < 0 ) {
1062- return stringFactory .fromJavaString (st );
1063- }
1064- sb .appendCodePoint (code );
1065- i += 3 ;
1066- continue ;
1067- case 'N' :
1068- // a character from Unicode Data Database
1069- i = doCharacterName (st , sourceRange , sb , i + 2 , errorCallback );
1070- if (i < 0 ) {
1071- return stringFactory .fromJavaString (st );
1072- }
1073- continue ;
1074- default :
1075- if (!wasDeprecationWarning ) {
1076- wasDeprecationWarning = true ;
1077- warnInvalidEscapeSequence (errorCallback , sourceRange , nextChar );
1078- }
1079- sb .appendCodePoint (ch );
1080- sb .appendCodePoint (nextChar );
1081- i ++;
1082- continue ;
1063+ sb .appendCodePoint (octalValue );
1064+ yield i ;
1065+ }
1066+ // Hex Unicode: u????
1067+ case 'u' -> {
1068+ int code = getHexValue (st , sourceRange , i , 4 , errorCallback );
1069+ if (code < 0 ) {
1070+ yield startIndex ;
10831071 }
1084- i ++;
1072+ sb .appendCodePoint (code );
1073+ yield i + 4 ;
10851074 }
1086- sb .appendCodePoint (ch );
1087- }
1088- return sb .build ();
1075+ // Hex Unicode: U????????
1076+ case 'U' -> {
1077+ int code = getHexValue (st , sourceRange , i , 8 , errorCallback );
1078+ if (Character .isValidCodePoint (code )) {
1079+ sb .appendCodePoint (code );
1080+ } else {
1081+ errorCallback .onError (ErrorCallback .ErrorType .Encoding , sourceRange , String .format (UNICODE_ERROR + ILLEGAL_CHARACTER , i , i + 9 ));
1082+ yield startIndex ;
1083+ }
1084+ yield i + 8 ;
1085+ }
1086+ // Hex Unicode: x??
1087+ case 'x' -> {
1088+ int code = getHexValue (st , sourceRange , i , 2 , errorCallback );
1089+ if (code < 0 ) {
1090+ yield startIndex ;
1091+ }
1092+ sb .appendCodePoint (code );
1093+ yield i + 2 ;
1094+ }
1095+ case 'N' -> {
1096+ i = doCharacterName (st , sourceRange , sb , i , errorCallback );
1097+ if (i < 0 ) {
1098+ yield startIndex ;
1099+ }
1100+ yield i ;
1101+ // a character from Unicode Data Database
1102+ }
1103+ default -> startIndex ;
1104+ };
10891105 }
10901106
10911107 private static int getHexValue (String text , SourceRange sourceRange , int start , int len , ErrorCallback errorCb ) {
@@ -1130,7 +1146,7 @@ private static int createTruncatedError(SourceRange sourceRange, int startIndex,
11301146 * @param text a text that contains \N{...} escape sequence
11311147 * @param sb string builder where the result code point will be written
11321148 * @param offset this is offset of the open brace
1133- * @return offset of the close brace or {@code -1} if an error was signaled
1149+ * @return offset after the close brace or {@code -1} if an error was signaled
11341150 */
11351151 private static int doCharacterName (String text , SourceRange sourceRange , PythonStringFactory .PythonStringBuilder <?> sb , int offset , ErrorCallback errorCallback ) {
11361152 if (offset >= text .length ()) {
@@ -1155,7 +1171,7 @@ private static int doCharacterName(String text, SourceRange sourceRange, PythonS
11551171 errorCallback .onError (ErrorCallback .ErrorType .Encoding , sourceRange , UNICODE_ERROR + UNKNOWN_UNICODE_ERROR , offset - 2 , closeIndex );
11561172 return -1 ;
11571173 }
1158- return closeIndex ;
1174+ return closeIndex + 1 ;
11591175 }
11601176
11611177 // Names for most control characters that mean 0-31, not some symbol
@@ -1216,8 +1232,8 @@ public static int getCodePoint(String charName) {
12161232 return -1 ;
12171233 }
12181234
/**
 * Reports a {@code WarningType.Deprecation} warning about an invalid escape sequence through
 * {@code errorCallback.onWarning}.
 *
 * @param errorCallback receiver of the warning
 * @param sourceRange source range passed along with the warning
 * @param nextCodePoint the code point following the backslash; passed as the argument for the
 *            {@code %c} placeholder of the warning message
 */
1219- public static void warnInvalidEscapeSequence (ErrorCallback errorCallback , SourceRange sourceRange , char nextChar ) {
1220- errorCallback .onWarning (WarningType .Deprecation , sourceRange , "invalid escape sequence '\\ %c'" , nextChar );
1235+ public static void warnInvalidEscapeSequence (ErrorCallback errorCallback , SourceRange sourceRange , int nextCodePoint ) {
1236+ errorCallback .onWarning (WarningType .Deprecation , sourceRange , "invalid escape sequence '\\ %c'" , nextCodePoint );
12211237 }
12221238
12231239 private static final String UNICODE_ERROR = "(unicode error) 'unicodeescape' codec can't decode bytes in position %d-%d:" ;
0 commit comments