11pub use super :: token:: Tok ;
22use std:: collections:: HashMap ;
3- use std:: str:: CharIndices ;
43
5- pub struct Lexer < ' input > {
6- chars : CharIndices < ' input > ,
4+ pub struct Lexer < T : Iterator < Item = char > > {
5+ chars : T ,
76 at_begin_of_line : bool ,
87 nesting : usize , // Amount of parenthesis
98 indentation_stack : Vec < usize > ,
@@ -85,10 +84,129 @@ pub fn get_keywords() -> HashMap<String, Tok> {
8584
8685pub type Spanned < Tok > = Result < ( Location , Tok , Location ) , LexicalError > ;
8786
88- impl < ' input > Lexer < ' input > {
89- pub fn new ( input : & ' input str ) -> Self {
87+ pub fn make_tokenizer < ' a > ( source : & ' a str ) -> impl Iterator < Item = Spanned < Tok > > + ' a {
88+ let nlh = NewlineHandler :: new ( source. chars ( ) ) ;
89+ let lch = LineContinationHandler :: new ( nlh) ;
90+ let lexer = Lexer :: new ( lch) ;
91+ lexer
92+ }
93+
// The newline handler is an iterator which collapses different newline
// types (\r\n, \r) into \n always.
pub struct NewlineHandler<T: Iterator<Item = char>> {
    source: T,
    // Two-character lookahead window into `source`:
    // chr0 is the next character to emit, chr1 the one after it.
    chr0: Option<char>,
    chr1: Option<char>,
}

impl<T> NewlineHandler<T>
where
    T: Iterator<Item = char>,
{
    /// Create a handler and prime the two-character lookahead window.
    pub fn new(source: T) -> Self {
        let mut nlh = NewlineHandler {
            source,
            chr0: None,
            chr1: None,
        };
        // Two shifts fill chr0 and chr1 from the underlying iterator.
        nlh.shift();
        nlh.shift();
        nlh
    }

    /// Advance the lookahead window by one character and return the
    /// character that fell out of the front (the old chr0).
    fn shift(&mut self) -> Option<char> {
        let result = self.chr0;
        self.chr0 = self.chr1;
        self.chr1 = self.source.next();
        result
    }
}

impl<T> Iterator for NewlineHandler<T>
where
    T: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        // Normalize \r\n (Windows) and a lone \r (classic Mac) into \n.
        while self.chr0 == Some('\r') {
            if self.chr1 == Some('\n') {
                // Drop the \r of a Windows EOL; the \n that follows is emitted.
                self.shift();
            } else {
                // Rewrite a lone Mac EOL \r into \n (loop then terminates).
                self.chr0 = Some('\n');
            }
        }

        self.shift()
    }
}
150+
// Glues \ and \n into a single line: a backslash immediately followed by
// a newline is removed from the stream (line continuation).
pub struct LineContinationHandler<T: Iterator<Item = char>> {
    source: T,
    // Two-character lookahead window into `source`:
    // chr0 is the next character to emit, chr1 the one after it.
    chr0: Option<char>,
    chr1: Option<char>,
}

impl<T> LineContinationHandler<T>
where
    T: Iterator<Item = char>,
{
    /// Create a handler and prime the two-character lookahead window.
    pub fn new(source: T) -> Self {
        let mut nlh = LineContinationHandler {
            source,
            chr0: None,
            chr1: None,
        };
        // Two shifts fill chr0 and chr1 from the underlying iterator.
        nlh.shift();
        nlh.shift();
        nlh
    }

    /// Advance the lookahead window by one character and return the
    /// character that fell out of the front (the old chr0).
    fn shift(&mut self) -> Option<char> {
        let result = self.chr0;
        self.chr0 = self.chr1;
        self.chr1 = self.source.next();
        result
    }
}

impl<T> Iterator for LineContinationHandler<T>
where
    T: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        // Drop every backslash-newline pair; repeat in case continuations
        // are stacked back to back.
        while self.chr0 == Some('\\') && self.chr1 == Some('\n') {
            // Skip backslash and newline.
            self.shift();
            self.shift();
        }

        self.shift()
    }
}
202+
203+ impl < T > Lexer < T >
204+ where
205+ T : Iterator < Item = char > ,
206+ {
207+ pub fn new ( input : T ) -> Self {
90208 let mut lxr = Lexer {
91- chars : input. char_indices ( ) ,
209+ chars : input,
92210 at_begin_of_line : true ,
93211 nesting : 0 ,
94212 indentation_stack : vec ! [ 0 ] ,
@@ -155,16 +273,20 @@ impl<'input> Lexer<'input> {
155273 Some ( '\n' ) => {
156274 return ;
157275 }
158- Some ( '\r' ) => {
159- return ;
160- }
161276 Some ( _) => { }
162277 None => return ,
163278 }
164279 }
165280 }
166281
167282 fn lex_string ( & mut self ) -> Spanned < Tok > {
283+ let type_char = match self . chr0 {
284+ Some ( 'u' ) | Some ( 'f' ) | Some ( 'r' ) => self . next_char ( ) ,
285+ _ => None ,
286+ } ;
287+
288+ let is_raw = type_char == Some ( 'r' ) ;
289+
168290 let quote_char = self . next_char ( ) . unwrap ( ) ;
169291 let mut string_content = String :: new ( ) ;
170292 let start_pos = self . get_pos ( ) ;
@@ -182,43 +304,36 @@ impl<'input> Lexer<'input> {
182304 loop {
183305 match self . next_char ( ) {
184306 Some ( '\\' ) => {
185- match self . next_char ( ) {
186- Some ( '\\' ) => {
187- string_content. push ( '\\' ) ;
188- }
189- Some ( '\'' ) => string_content. push ( '\'' ) ,
190- Some ( '\"' ) => string_content. push ( '\"' ) ,
191- Some ( '\n' ) => {
192- // Ignore Unix EOL character
193- }
194- Some ( '\r' ) => {
195- match self . chr0 {
196- Some ( '\n' ) => {
197- // Ignore Windows EOL characters (2 bytes)
198- self . next_char ( ) ;
199- }
200- _ => {
201- // Ignore Mac EOL character
202- }
307+ if is_raw {
308+ string_content. push ( '\\' ) ;
309+ } else {
310+ match self . next_char ( ) {
311+ Some ( '\\' ) => {
312+ string_content. push ( '\\' ) ;
313+ }
314+ Some ( '\'' ) => string_content. push ( '\'' ) ,
315+ Some ( '\"' ) => string_content. push ( '\"' ) ,
316+ Some ( '\n' ) => {
317+ // Ignore Unix EOL character
318+ }
319+ Some ( 'a' ) => string_content. push ( '\x07' ) ,
320+ Some ( 'b' ) => string_content. push ( '\x08' ) ,
321+ Some ( 'f' ) => string_content. push ( '\x0c' ) ,
322+ Some ( 'n' ) => {
323+ string_content. push ( '\n' ) ;
324+ }
325+ Some ( 'r' ) => string_content. push ( '\r' ) ,
326+ Some ( 't' ) => {
327+ string_content. push ( '\t' ) ;
328+ }
329+ Some ( 'v' ) => string_content. push ( '\x0b' ) ,
330+ Some ( c) => {
331+ string_content. push ( '\\' ) ;
332+ string_content. push ( c) ;
333+ }
334+ None => {
335+ return Err ( LexicalError :: StringError ) ;
203336 }
204- }
205- Some ( 'a' ) => string_content. push ( '\x07' ) ,
206- Some ( 'b' ) => string_content. push ( '\x08' ) ,
207- Some ( 'f' ) => string_content. push ( '\x0c' ) ,
208- Some ( 'n' ) => {
209- string_content. push ( '\n' ) ;
210- }
211- Some ( 'r' ) => string_content. push ( '\r' ) ,
212- Some ( 't' ) => {
213- string_content. push ( '\t' ) ;
214- }
215- Some ( 'v' ) => string_content. push ( '\x0b' ) ,
216- Some ( c) => {
217- string_content. push ( '\\' ) ;
218- string_content. push ( c) ;
219- }
220- None => {
221- return Err ( LexicalError :: StringError ) ;
222337 }
223338 }
224339 }
@@ -281,7 +396,7 @@ impl<'input> Lexer<'input> {
281396 let c = self . chr0 ;
282397 let nxt = self . chars . next ( ) ;
283398 self . chr0 = self . chr1 ;
284- self . chr1 = nxt. map ( |x| x . 1 ) ;
399+ self . chr1 = nxt;
285400 self . location . column += 1 ;
286401 c
287402 }
@@ -318,17 +433,6 @@ impl<'input> Lexer<'input> {
318433 self . at_begin_of_line = true ;
319434 continue ' top_loop;
320435 }
321- Some ( '\r' ) => {
322- // Empty line!
323- self . next_char ( ) ;
324- if self . chr0 == Some ( '\n' ) {
325- // absorb two bytes if Windows line ending
326- self . next_char ( ) ;
327- }
328- self . at_begin_of_line = true ;
329- self . new_line ( ) ;
330- continue ' top_loop;
331- }
332436 Some ( '\n' ) => {
333437 // Empty line!
334438 self . next_char ( ) ;
@@ -376,7 +480,18 @@ impl<'input> Lexer<'input> {
376480
377481 match self . chr0 {
378482 Some ( '0' ...'9' ) => return Some ( self . lex_number ( ) ) ,
379- Some ( '_' ) | Some ( 'a' ...'z' ) | Some ( 'A' ...'Z' ) => return Some ( self . lex_identifier ( ) ) ,
483+ Some ( '_' ) | Some ( 'a' ...'z' ) | Some ( 'A' ...'Z' ) => {
484+ // Detect r"", f"" and u""
485+ match self . chr0 {
486+ Some ( 'r' ) | Some ( 'u' ) | Some ( 'f' ) => match self . chr1 {
487+ Some ( '\'' ) | Some ( '\"' ) => {
488+ return Some ( self . lex_string ( ) ) ;
489+ }
490+ _ => return Some ( self . lex_identifier ( ) ) ,
491+ } ,
492+ _ => return Some ( self . lex_identifier ( ) ) ,
493+ }
494+ }
380495 Some ( '#' ) => {
381496 self . lex_comment ( ) ;
382497 continue ;
@@ -691,20 +806,6 @@ impl<'input> Lexer<'input> {
691806 let tok_end = self . get_pos ( ) ;
692807 return Some ( Ok ( ( tok_start, Tok :: Dot , tok_end) ) ) ;
693808 }
694- Some ( '\r' ) => {
695- let tok_start = self . get_pos ( ) ;
696- self . next_char ( ) ;
697- let tok_end = self . get_pos ( ) ;
698- self . new_line ( ) ;
699-
700- // Depending on the nesting level, we emit newline or not:
701- if self . nesting == 0 {
702- self . at_begin_of_line = true ;
703- return Some ( Ok ( ( tok_start, Tok :: Newline , tok_end) ) ) ;
704- } else {
705- continue ;
706- }
707- }
708809 Some ( '\n' ) => {
709810 let tok_start = self . get_pos ( ) ;
710811 self . next_char ( ) ;
@@ -746,7 +847,10 @@ impl<'input> Lexer<'input> {
746847Calling the next element in the iterator will yield the next lexical
747848token.
748849*/
749- impl < ' input > Iterator for Lexer < ' input > {
850+ impl < T > Iterator for Lexer < T >
851+ where
852+ T : Iterator < Item = char > ,
853+ {
750854 type Item = Spanned < Tok > ;
751855
752856 fn next ( & mut self ) -> Option < Self :: Item > {
@@ -766,18 +870,46 @@ impl<'input> Iterator for Lexer<'input> {
766870
767871#[ cfg( test) ]
768872mod tests {
769- use super :: { Lexer , Tok } ;
873+ use super :: { make_tokenizer , NewlineHandler , Tok } ;
770874 use std:: iter:: FromIterator ;
875+ use std:: iter:: Iterator ;
771876
772877 const WINDOWS_EOL : & str = "\r \n " ;
773878 const MAC_EOL : & str = "\r " ;
774879 const UNIX_EOL : & str = "\n " ;
775880
776881 pub fn lex_source ( source : & String ) -> Vec < Tok > {
777- let lexer = Lexer :: new ( source) ;
882+ let lexer = make_tokenizer ( source) ;
778883 Vec :: from_iter ( lexer. map ( |x| x. unwrap ( ) . 1 ) )
779884 }
780885
886+ #[ test]
887+ fn test_newline_processor ( ) {
888+ // Escape \ followed by \n (by removal):
889+ let src = "b\\ \r \n " ;
890+ assert_eq ! ( 4 , src. len( ) ) ;
891+ let nlh = NewlineHandler :: new ( src. chars ( ) ) ;
892+ let x: Vec < char > = nlh. collect ( ) ;
893+ assert_eq ! ( vec![ 'b' , '\\' , '\n' ] , x) ;
894+ }
895+
896+ #[ test]
897+ fn test_raw_string ( ) {
898+ let source = String :: from ( "r\" \\ \\ \" \" \\ \\ \" " ) ;
899+ let tokens = lex_source ( & source) ;
900+ assert_eq ! (
901+ tokens,
902+ vec![
903+ Tok :: String {
904+ value: "\\ \\ " . to_string( )
905+ } ,
906+ Tok :: String {
907+ value: "\\ " . to_string( )
908+ }
909+ ]
910+ ) ;
911+ }
912+
781913 macro_rules! test_line_comment {
782914 ( $( $name: ident: $eol: expr, ) * ) => {
783915 $(
0 commit comments