Skip to content

Commit a400f66

Browse files
committed
Improve lexing of numbers with underscores.
1 parent c32b714 commit a400f66

File tree

4 files changed

+110
-67
lines changed

4 files changed

+110
-67
lines changed

examples/parse_folder.rs

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use clap::{App, Arg};
1515

1616
use rustpython_parser::{ast, parser};
1717
use std::path::{Path, PathBuf};
18-
use std::time::Instant;
18+
use std::time::{Duration, Instant};
1919

2020
fn main() {
2121
env_logger::init();
@@ -61,30 +61,45 @@ fn parse_folder(path: &Path) -> std::io::Result<Vec<ParsedFile>> {
6161
}
6262

6363
if metadata.is_file() && path.extension().and_then(|s| s.to_str()) == Some("py") {
64-
let result = parse_python_file(&path);
65-
match &result {
64+
let parsed_file = parse_python_file(&path);
65+
match &parsed_file.result {
6666
Ok(_) => {}
6767
Err(y) => error!("Erreur in file {:?} {:?}", path, y),
6868
}
69-
res.push(ParsedFile {
70-
filename: Box::new(path),
71-
result,
72-
});
69+
70+
res.push(parsed_file);
7371
}
7472
}
7573
Ok(res)
7674
}
7775

78-
fn parse_python_file(filename: &Path) -> ParseResult {
76+
fn parse_python_file(filename: &Path) -> ParsedFile {
7977
info!("Parsing file {:?}", filename);
80-
let source = std::fs::read_to_string(filename).map_err(|e| e.to_string())?;
81-
parser::parse_program(&source).map_err(|e| e.to_string())
78+
match std::fs::read_to_string(filename) {
79+
Err(e) => ParsedFile {
80+
filename: Box::new(filename.to_path_buf()),
81+
code: "".to_string(),
82+
num_lines: 0,
83+
result: Err(e.to_string()),
84+
},
85+
Ok(source) => {
86+
let num_lines = source.to_string().lines().count();
87+
let result = parser::parse_program(&source).map_err(|e| e.to_string());
88+
ParsedFile {
89+
filename: Box::new(filename.to_path_buf()),
90+
code: source.to_string(),
91+
num_lines,
92+
result,
93+
}
94+
}
95+
}
8296
}
8397

8498
fn statistics(results: ScanResult) {
8599
// println!("Processed {:?} files", res.len());
86100
println!("Scanned a total of {} files", results.parsed_files.len());
87-
let total = results.parsed_files.len();
101+
let total: usize = results.parsed_files.len();
102+
let total_lines: usize = results.parsed_files.iter().map(|p| p.num_lines).sum();
88103
let failed = results
89104
.parsed_files
90105
.iter()
@@ -103,11 +118,21 @@ fn statistics(results: ScanResult) {
103118
let duration = results.t2 - results.t1;
104119
println!("Total time spend: {:?}", duration);
105120
println!(
106-
"File processing rate: {} files/second",
107-
(total * 1_000_000) as f64 / duration.as_micros() as f64
121+
"Processed {} files. That's {} files/second",
122+
total,
123+
rate(total, duration)
124+
);
125+
println!(
126+
"Processed {} lines of python code. That's {} lines/second",
127+
total_lines,
128+
rate(total_lines, duration)
108129
);
109130
}
110131

132+
fn rate(counter: usize, duration: Duration) -> f64 {
133+
(counter * 1_000_000) as f64 / duration.as_micros() as f64
134+
}
135+
111136
struct ScanResult {
112137
t1: Instant,
113138
t2: Instant,
@@ -116,6 +141,8 @@ struct ScanResult {
116141

117142
struct ParsedFile {
118143
filename: Box<PathBuf>,
144+
code: String,
145+
num_lines: usize,
119146
result: ParseResult,
120147
}
121148

parser/src/lexer.rs

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -340,18 +340,7 @@ where
340340

341341
/// Lex a hex/octal/decimal/binary number without a decimal point.
342342
fn lex_number_radix(&mut self, start_pos: Location, radix: u32) -> LexResult {
343-
let mut value_text = String::new();
344-
345-
loop {
346-
if let Some(c) = self.take_number(radix) {
347-
value_text.push(c);
348-
} else if self.chr0 == Some('_') {
349-
self.next_char();
350-
} else {
351-
break;
352-
}
353-
}
354-
343+
let value_text = self.radix_run(radix);
355344
let end_pos = self.get_pos();
356345
let value = BigInt::from_str_radix(&value_text, radix).map_err(|e| LexicalError {
357346
error: LexicalErrorType::OtherError(format!("{:?}", e)),
@@ -360,24 +349,19 @@ where
360349
Ok((start_pos, Tok::Int { value }, end_pos))
361350
}
362351

352+
/// Lex a normal number, that is, no octal, hex or binary number.
363353
fn lex_normal_number(&mut self) -> LexResult {
364354
let start_pos = self.get_pos();
365355

366-
let mut value_text = String::new();
367-
368356
// Normal number:
369-
while let Some(c) = self.take_number(10) {
370-
value_text.push(c);
371-
}
357+
let mut value_text = self.radix_run(10);
372358

373359
// If float:
374360
if self.chr0 == Some('.') || self.at_exponent() {
375361
// Take '.':
376362
if self.chr0 == Some('.') {
377363
value_text.push(self.next_char().unwrap());
378-
while let Some(c) = self.take_number(10) {
379-
value_text.push(c);
380-
}
364+
value_text.push_str(&self.radix_run(10));
381365
}
382366

383367
// 1e6 for example:
@@ -389,9 +373,7 @@ where
389373
value_text.push(self.next_char().unwrap());
390374
}
391375

392-
while let Some(c) = self.take_number(10) {
393-
value_text.push(c);
394-
}
376+
value_text.push_str(&self.radix_run(10));
395377
}
396378

397379
let value = f64::from_str(&value_text).unwrap();
@@ -426,6 +408,57 @@ where
426408
}
427409
}
428410

411+
/// Consume a sequence of numbers with the given radix,
412+
/// the digits can be decorated with underscores
413+
/// like this: '1_2_3_4' == '1234'
414+
fn radix_run(&mut self, radix: u32) -> String {
415+
let mut value_text = String::new();
416+
loop {
417+
if let Some(c) = self.take_number(radix) {
418+
value_text.push(c);
419+
} else if self.chr0 == Some('_') && Lexer::<T>::is_digit_of_radix(&self.chr1, radix) {
420+
self.next_char();
421+
} else {
422+
break;
423+
}
424+
}
425+
value_text
426+
}
427+
428+
/// Consume a single digit of the given radix, if the current character is one.
429+
fn take_number(&mut self, radix: u32) -> Option<char> {
430+
let take_char = Lexer::<T>::is_digit_of_radix(&self.chr0, radix);
431+
432+
if take_char {
433+
Some(self.next_char().unwrap())
434+
} else {
435+
None
436+
}
437+
}
438+
439+
/// Test if a character is a valid digit in the given radix.
440+
fn is_digit_of_radix(c: &Option<char>, radix: u32) -> bool {
441+
match radix {
442+
2 => match c {
443+
Some('0'..='1') => true,
444+
_ => false,
445+
},
446+
8 => match c {
447+
Some('0'..='7') => true,
448+
_ => false,
449+
},
450+
10 => match c {
451+
Some('0'..='9') => true,
452+
_ => false,
453+
},
454+
16 => match c {
455+
Some('0'..='9') | Some('a'..='f') | Some('A'..='F') => true,
456+
_ => false,
457+
},
458+
x => unimplemented!("Radix not implemented: {}", x),
459+
}
460+
}
461+
429462
/// Test if we face '[eE][-+]?[0-9]+'
430463
fn at_exponent(&self) -> bool {
431464
match self.chr0 {
@@ -626,34 +659,6 @@ where
626659
}
627660
}
628661

629-
fn take_number(&mut self, radix: u32) -> Option<char> {
630-
let take_char = match radix {
631-
2 => match self.chr0 {
632-
Some('0'..='1') => true,
633-
_ => false,
634-
},
635-
8 => match self.chr0 {
636-
Some('0'..='7') => true,
637-
_ => false,
638-
},
639-
10 => match self.chr0 {
640-
Some('0'..='9') => true,
641-
_ => false,
642-
},
643-
16 => match self.chr0 {
644-
Some('0'..='9') | Some('a'..='f') | Some('A'..='F') => true,
645-
_ => false,
646-
},
647-
x => unimplemented!("Radix not implemented: {}", x),
648-
};
649-
650-
if take_char {
651-
Some(self.next_char().unwrap())
652-
} else {
653-
None
654-
}
655-
}
656-
657662
/// This is the main entry point. Call this function to retrieve the next token.
658663
/// This function is used by the iterator implementation.
659664
fn inner_next(&mut self) -> LexResult {

parser/src/python.lalrpop

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ ImportDots: usize = {
247247

248248
ImportAsNames: Vec<ast::ImportSymbol> = {
249249
<i:OneOrMore<ImportAsAlias<Identifier>>> => i,
250-
"(" <i:OneOrMore<ImportAsAlias<Identifier>>> ")" => i,
250+
"(" <i:OneOrMore<ImportAsAlias<Identifier>>> ","? ")" => i,
251251
"*" => {
252252
// Star import all
253253
vec![ast::ImportSymbol { symbol: "*".to_string(), alias: None }]
@@ -952,11 +952,11 @@ Atom: ast::Expression = {
952952
};
953953

954954
ListLiteralValues: Vec<ast::Expression> = {
955-
<e:OneOrMore<TestOrStarExpr>> <_trailing_comma:","?> => e,
955+
<e:OneOrMore<TestOrStarExpr>> ","? => e,
956956
};
957957

958958
DictLiteralValues: Vec<(Option<ast::Expression>, ast::Expression)> = {
959-
<elements:OneOrMore<DictElement>> <_trailing_comma:","?> => elements,
959+
<elements:OneOrMore<DictElement>> ","? => elements,
960960
};
961961

962962
DictEntry: (ast::Expression, ast::Expression) = {

tests/snippets/numbers.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from testutils import assertRaises
2+
13
x = 5
24
x.__init__(6)
35
assert x == 5
@@ -42,3 +44,12 @@ class A(int):
4244
assert int(1).__rxor__(1) == 0
4345
assert int(3).__rxor__(-3) == -2
4446
assert int(3).__rxor__(4) == 7
47+
48+
# Test underscores in numbers:
49+
assert 1_2 == 12
50+
assert 1_2_3 == 123
51+
assert 1_2.3_4 == 12.34
52+
assert 1_2.3_4e0_0 == 12.34
53+
54+
with assertRaises(SyntaxError):
55+
eval('1__2')

0 commit comments

Comments
 (0)