forked from python-openxml/cxml
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlexer.py
More file actions
139 lines (107 loc) · 3.41 KB
/
lexer.py
File metadata and controls
139 lines (107 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# encoding: utf-8
"""
Lexical analyzer, (a.k.a lexer, tokenizer) for CXML language.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
from .lib.lexer import Lexer
from .symbols import (
COLON, COMMA, EQUAL, LBRACE, LPAREN, NAME, RBRACE, RPAREN, SLASH, SNTL,
TEXT
)
# ASCII letters, lower- then upper-case (adjacent literals concatenate)
alphas = 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# decimal digits
nums = '0123456789'
# a NAME token must begin with a letter or underscore
name_start_chars = '%s_' % alphas
# subsequent NAME characters may also include digits, hyphen, and dot
name_chars = '%s%s_-.' % (alphas, nums)
# the single-character punctuation tokens recognized by the lexer
punctuation = ':,=/{}()'
class CxmlLexer(Lexer):
    """
    Lexer object for CXML.

    Implemented as a small state machine: each ``_lex_*`` method consumes
    part of the input, possibly emits a token, and returns the next state
    method to run (or |None| to stop).
    """
    def _lex_start(self):
        """
        Starting and fallback state, entered only between tokens; returns
        the state method appropriate to the next character of input.
        """
        # invariant: no partially-consumed token on entry
        assert self._start == self._pos
        ch = self._peek
        # EOF must be tested before any `in` membership test on `ch`
        if ch is None:
            return self._lex_eof
        # whitespace takes priority over everything else
        if ch == ' ':
            return self._lex_whitespace
        if ch == '"':
            return self._lex_quoted_string
        if ch in name_start_chars:
            return self._lex_name
        if ch in punctuation:
            return self._lex_punctuation
        raise SyntaxError(
            "at character '%s' in '%s'" % (ch, self._input)
        )
    def _lex_eof(self):
        """
        Emit the `SNTL` (sentinel) token and end lexing by returning |None|.
        """
        # entire input must have been consumed
        assert self._start == self._pos == self._len
        self._emit(SNTL)
    def _lex_name(self):
        """
        Emit a NAME token spanning the maximal run of name characters.
        """
        self._accept_run(name_chars)
        self._emit(NAME)
        return self._lex_start
    def _lex_punctuation(self):
        """
        Consume one punctuation character and emit its token (e.g. COLON).
        """
        token_types = {
            ':': COLON, ',': COMMA, '=': EQUAL, '/': SLASH,
            '{': LBRACE, '}': RBRACE, '(': LPAREN, ')': RPAREN,
        }
        char = self._next()
        self._emit(token_types[char])
        # '=' introduces an attribute value and '}' may be followed by
        # trailing text, so both hand off to the text state
        if char == '=' or char == '}':
            return self._lex_text
        return self._lex_start
    def _lex_quoted_string(self):
        """
        Emit the body of a double-quoted string as a TEXT token; the
        surrounding quote characters are discarded.
        """
        self._skip()                   # discard opening quote
        self._accept_until('"')        # body runs to next quote or EOF
        self._emit(TEXT)
        # EOF reached before a closing quote appeared
        if self._peek != '"':
            raise SyntaxError("unterminated quote")
        self._skip()                   # discard closing quote
        return self._lex_start
    def _lex_text(self):
        """
        Lex a string value, either quoted or raw; a raw value ends at a
        comma, closing brace, slash, or right paren (possibly empty).
        """
        ch = self._peek
        if ch is None:
            return self._lex_eof
        if ch == '"':
            return self._lex_quoted_string
        terminators = ',}/)'
        if ch not in terminators:
            self._accept_until(terminators)
        # an immediate terminator still yields an (empty) TEXT token
        self._emit(TEXT)
        return self._lex_start
    def _lex_whitespace(self):
        """
        Consume and discard the run of spaces at the current position.
        """
        self._accept_run(' ')
        self._ignore()
        return self._lex_start