forked from Kozea/tinycss
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspeedups.pyx
More file actions
189 lines (167 loc) · 6.95 KB
/
speedups.pyx
File metadata and controls
189 lines (167 loc) · 6.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# coding: utf8
"""
tinycss.speedups
----------------
Cython module for speeding up inner loops.
Right now only :func:`tokenize_flat` has a second implementation.
:copyright: (c) 2010 by Simon Sapin.
:license: BSD, see LICENSE for more details.
"""
from __future__ import unicode_literals
from .token_data import (
COMPILED_TOKEN_REGEXPS, UNICODE_UNESCAPE, NEWLINE_UNESCAPE,
SIMPLE_UNESCAPE, FIND_NEWLINES, TOKEN_DISPATCH)
# Map each token name to its position in COMPILED_TOKEN_REGEXPS, so that
# the tokenizer can compare small integers instead of strings.
COMPILED_TOKEN_INDEXES = {
    token_name: index
    for index, (token_name, _regexp) in enumerate(COMPILED_TOKEN_REGEXPS)
}
cdef class CToken:
    """Token object produced by the Cython tokenizer.

    Intended as a drop-in equivalent of :class:`~.token_data.Token`.

    """
    is_container = False

    # Type name, source text, parsed value and unit (or None) of the token.
    cdef public object type, _as_css, value, unit
    # Position of the token in the source, as passed to the constructor.
    cdef public Py_ssize_t line, column

    def __init__(self, type_, css_value, value, unit, line, column):
        self.type, self._as_css = type_, css_value
        self.value, self.unit = value, unit
        self.line, self.column = line, column

    def as_css(self):
        """
        Return the CSS text of this token exactly as it was stored at
        construction time (an Unicode string taken from the source).

        """
        return self._as_css

    def __repr__(self):
        # A missing unit (None or empty) adds nothing after the value.
        unit_suffix = self.unit or ''
        return '<Token {0} at {1}:{2} {3!r}{4}>'.format(
            self.type, self.line, self.column, self.value, unit_suffix)
def tokenize_flat(css_source, int ignore_comments=1):
    """
    Split a CSS source string into a flat list of tokens.

    :param css_source:
        CSS as an unicode string
    :param ignore_comments:
        if true (the default) comments will not be included in the
        return value
    :return:
        A list of :class:`CToken`, in source order. The whole list is
        built before returning; it is not a lazy iterator.

    """
    # Make these local variables to avoid global lookups in the loop
    tokens_dispatch = TOKEN_DISPATCH
    compiled_token_indexes = COMPILED_TOKEN_INDEXES
    unicode_unescape = UNICODE_UNESCAPE
    newline_unescape = NEWLINE_UNESCAPE
    simple_unescape = SIMPLE_UNESCAPE
    find_newlines = FIND_NEWLINES

    # Use the integer indexes instead of string markers
    cdef Py_ssize_t BAD_COMMENT = compiled_token_indexes['BAD_COMMENT']
    cdef Py_ssize_t BAD_STRING = compiled_token_indexes['BAD_STRING']
    cdef Py_ssize_t PERCENTAGE = compiled_token_indexes['PERCENTAGE']
    cdef Py_ssize_t DIMENSION = compiled_token_indexes['DIMENSION']
    cdef Py_ssize_t ATKEYWORD = compiled_token_indexes['ATKEYWORD']
    cdef Py_ssize_t FUNCTION = compiled_token_indexes['FUNCTION']
    cdef Py_ssize_t COMMENT = compiled_token_indexes['COMMENT']
    cdef Py_ssize_t NUMBER = compiled_token_indexes['NUMBER']
    cdef Py_ssize_t STRING = compiled_token_indexes['STRING']
    cdef Py_ssize_t IDENT = compiled_token_indexes['IDENT']
    cdef Py_ssize_t HASH = compiled_token_indexes['HASH']
    cdef Py_ssize_t URI = compiled_token_indexes['URI']
    # DELIM has no regexp of its own, hence no index: use a sentinel.
    cdef Py_ssize_t DELIM = -1

    cdef Py_ssize_t pos = 0
    cdef Py_ssize_t line = 1
    cdef Py_ssize_t column = 1
    cdef Py_ssize_t source_len = len(css_source)
    cdef Py_ssize_t length, next_pos, type_
    cdef CToken token

    tokens = []
    while pos < source_len:
        char = css_source[pos]
        if char in ':;{}()[]':
            # Single-character punctuation: the character is its own
            # type name and needs no regexp.
            type_ = -1  # not parsed further anyway
            type_name = char
            css_value = char
        else:
            # The dispatch table is indexed by codepoint; every codepoint
            # from 160 up shares the entry at index 160.
            codepoint = min(ord(char), 160)
            for type_, type_name, regexp in tokens_dispatch[codepoint]:
                match = regexp(css_source, pos)
                if match:
                    # First match is the longest (see the comments on
                    # TOKENS, presumably in token_data).
                    css_value = match.group()
                    break
            else:
                # No match.
                # "Any other character not matched by the above rules,
                #  and neither a single nor a double quote."
                # ... but quotes at the start of a token are always matched
                # by STRING or BAD_STRING. So DELIM is any single character.
                type_ = DELIM
                type_name = 'DELIM'
                css_value = char
        length = len(css_value)
        next_pos = pos + length

        # A BAD_COMMENT is a comment at EOF. Ignore it too.
        if not (ignore_comments and type_ in (COMMENT, BAD_COMMENT)):
            # Parse numbers, extract strings and URIs, unescape
            unit = None
            if type_ == DIMENSION:
                value = match.group(1)
                value = float(value) if '.' in value else int(value)
                unit = match.group(2)
                unit = unicode_unescape(unit)
                unit = simple_unescape(unit)
                unit = unit.lower()  # normalize
            elif type_ == PERCENTAGE:
                value = css_value[:-1]  # Remove the trailing '%'
                value = float(value) if '.' in value else int(value)
                unit = '%'
            elif type_ == NUMBER:
                value = css_value
                if '.' in value:
                    value = float(value)
                else:
                    # Numbers without a decimal point get their own
                    # type name.
                    value = int(value)
                    type_name = 'INTEGER'
            elif type_ in (IDENT, ATKEYWORD, HASH, FUNCTION):
                value = unicode_unescape(css_value)
                value = simple_unescape(value)
            elif type_ == URI:
                value = match.group(1)
                if value and value[0] in '"\'':
                    value = value[1:-1]  # Remove quotes
                    value = newline_unescape(value)
                value = unicode_unescape(value)
                value = simple_unescape(value)
            elif type_ == STRING:
                value = css_value[1:-1]  # Remove quotes
                value = newline_unescape(value)
                value = unicode_unescape(value)
                value = simple_unescape(value)
            # BAD_STRING can only be one of:
            # * Unclosed string at the end of the stylesheet:
            #   Close the string, but this is not an error.
            #   Make it a "good" STRING token.
            # * Unclosed string at the (unescaped) end of the line:
            #   Close the string, but this is an error.
            #   Leave it as a BAD_STRING, don't bother parsing it.
            # See http://www.w3.org/TR/CSS21/syndata.html#parsing-errors
            elif type_ == BAD_STRING and next_pos == source_len:
                type_name = 'STRING'
                value = css_value[1:]  # Remove quote
                value = newline_unescape(value)
                value = unicode_unescape(value)
                value = simple_unescape(value)
            else:
                value = css_value
            token = CToken(type_name, css_value, value, unit, line, column)
            tokens.append(token)

        # Advance the position even for skipped comments, so that line and
        # column stay correct for the tokens that follow.
        pos = next_pos
        newlines = list(find_newlines(css_value))
        if newlines:
            line += len(newlines)
            # Add 1 to have lines start at column 1, not 0
            column = length - newlines[-1].end() + 1
        else:
            column += length
    return tokens