forked from Surachai-kent/css-sanitizer
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathMatcher.php
More file actions
240 lines (222 loc) · 8.14 KB
/
Matcher.php
File metadata and controls
240 lines (222 loc) · 8.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
<?php
/**
* @file
* @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
*/
namespace Wikimedia\CSS\Grammar;
use Iterator;
use Wikimedia\CSS\Objects\ComponentValueList;
use Wikimedia\CSS\Objects\CSSFunction;
use Wikimedia\CSS\Objects\SimpleBlock;
use Wikimedia\CSS\Objects\Token;
/**
* Base class for grammar matchers.
*
* The [CSS Syntax Level 3][SYN3] and [Values Level 3][VAL3] specifications use
* a mostly context-free grammar to define what things like selectors and
* property values look like. The Matcher classes allow for constructing an
* object that will determine whether a ComponentValueList actually matches
* this grammar.
*
* [SYN3]: https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/
* [VAL3]: https://www.w3.org/TR/2019/CR-css-values-3-20190606/
*/
abstract class Matcher {

	/** @var string|null Name to set on GrammarMatch objects */
	protected $captureName = null;

	/**
	 * @var array Default options for self::matchAgainst()
	 *  - skip-whitespace: (bool) Allow whitespace in between any two tokens
	 *  - nonterminal: (bool) Don't require that the whole of $values be matched
	 *  - mark-significance: (bool) On a successful match, replace T_WHITESPACE
	 *    tokens as necessary to indicate significant whitespace.
	 */
	protected $defaultOptions = [
		'skip-whitespace' => true,
		'nonterminal' => false,
		'mark-significance' => false,
	];

	/**
	 * Create an instance.
	 *
	 * Convenience factory equivalent to `new static( ...$args )`, so the
	 * concrete subclass it is called on is the one that gets instantiated.
	 *
	 * @param mixed ...$args See static::__construct()
	 * @return static
	 */
	public static function create( ...$args ) {
		// @phan-suppress-next-line PhanParamTooManyUnpack,PhanTypeInstantiateAbstractStatic
		return new static( ...$args );
	}

	/**
	 * Return a copy of this matcher that will capture its matches
	 *
	 * A "capturing" Matcher will produce GrammarMatches that return a value from
	 * the GrammarMatch::getName() method. The GrammarMatch::getCapturedMatches()
	 * method may be used to retrieve them from the top-level GrammarMatch.
	 *
	 * The concept is similar to capturing groups in PCRE and other regex
	 * languages.
	 *
	 * The receiver is not modified: the name is set on a clone, which is
	 * what gets returned.
	 *
	 * @param string|null $captureName Name to apply to captured GrammarMatch objects
	 * @return static
	 */
	public function capture( $captureName ) {
		$ret = clone $this;
		$ret->captureName = $captureName;
		return $ret;
	}

	/**
	 * Match against a list of ComponentValues
	 *
	 * Iterates over the candidates produced by generateMatches() and returns
	 * the first acceptable one: any candidate when the 'nonterminal' option
	 * is set, otherwise the first candidate that consumes all of $values.
	 *
	 * @param ComponentValueList $values
	 * @param array $options Matching options, see self::$defaultOptions.
	 *  Options not supplied are filled in from getDefaultOptions().
	 * @return GrammarMatch|null The accepted match, or null if none was found
	 */
	public function matchAgainst( ComponentValueList $values, array $options = [] ) {
		// Array union: caller-supplied keys win, defaults fill the gaps.
		$options += $this->getDefaultOptions();
		// Starting from -1 makes next() return the first usable position.
		$start = $this->next( $values, -1, $options );
		$l = count( $values );
		foreach ( $this->generateMatches( $values, $start, $options ) as $match ) {
			if ( $options['nonterminal'] || $match->getNext() === $l ) {
				if ( $options['mark-significance'] ) {
					$significantWS = self::collectSignificantWhitespace( $match );
					self::markSignificantWhitespace( $values, $match, $significantWS, $match->getNext() );
				}
				return $match;
			}
		}
		return null;
	}

	/**
	 * Collect any 'significantWhitespace' matches
	 *
	 * Walks $match and, recursively, its captured matches, gathering the
	 * values of every match whose name is 'significantWhitespace'.
	 *
	 * @param GrammarMatch $match
	 * @param Token[] &$ret Accumulator shared across the recursion
	 * @return Token[]
	 */
	private static function collectSignificantWhitespace( GrammarMatch $match, &$ret = [] ) {
		if ( $match->getName() === 'significantWhitespace' ) {
			$ret = array_merge( $ret, $match->getValues() );
		}
		foreach ( $match->getCapturedMatches() as $m ) {
			self::collectSignificantWhitespace( $m, $ret );
		}
		return $ret;
	}

	/**
	 * Mark whitespace as significant or not
	 *
	 * Every T_WHITESPACE token in positions [0, $end) of $list whose
	 * significance flag disagrees with its membership in $significantWS is
	 * replaced, in $list, by a copy carrying the right flag. The contents of
	 * CSSFunction and SimpleBlock values are processed recursively in full.
	 *
	 * @param ComponentValueList $list
	 * @param GrammarMatch $match
	 * @param Token[] $significantWS
	 * @param int $end
	 */
	private static function markSignificantWhitespace( $list, $match, $significantWS, $end ) {
		for ( $i = 0; $i < $end; $i++ ) {
			$cv = $list[$i];
			if ( $cv instanceof Token && $cv->type() === Token::T_WHITESPACE ) {
				// Strict in_array() compares objects by identity, so only the
				// exact token instances that were collected count as significant.
				$significant = in_array( $cv, $significantWS, true );
				if ( $significant !== $cv->significant() ) {
					$newCv = $cv->copyWithSignificance( $significant );
					// Tell the match about the replacement (presumably so it
					// refers to the new token rather than the old one).
					$match->fixWhitespace( $cv, $newCv );
					$list[$i] = $newCv;
				}
			} elseif ( $cv instanceof CSSFunction || $cv instanceof SimpleBlock ) {
				// Nested lists are processed over their whole length.
				$inner = $cv->getValue();
				self::markSignificantWhitespace( $inner, $match, $significantWS, count( $inner ) );
			}
		}
	}

	/**
	 * Fetch the default options for this Matcher
	 * @return array See self::$defaultOptions
	 */
	public function getDefaultOptions() {
		return $this->defaultOptions;
	}

	/**
	 * Set the default options for this Matcher
	 *
	 * The supplied options are merged over the current defaults; keys that
	 * are not supplied keep their current values.
	 *
	 * @param array $options See self::$defaultOptions
	 * @return static $this
	 */
	public function setDefaultOptions( array $options ) {
		$this->defaultOptions = $options + $this->defaultOptions;
		return $this;
	}

	/**
	 * Find the next ComponentValue in the input, possibly skipping whitespace
	 *
	 * Always advances by at least one position. When the 'skip-whitespace'
	 * option is set, keeps advancing past T_WHITESPACE tokens. The result may
	 * equal count( $values ) when the input is exhausted.
	 *
	 * @param ComponentValueList $values Input values
	 * @param int $start Current position in the input. May be -1, in which
	 *  case the first position in the input should be returned.
	 * @param array $options See self::$defaultOptions
	 * @return int Next token index
	 */
	protected function next( ComponentValueList $values, $start, array $options ) {
		$skipWS = $options['skip-whitespace'];
		$i = $start;
		$l = count( $values );
		do {
			$i++;
		} while ( $skipWS && $i < $l &&
			// @phan-suppress-next-line PhanNonClassMethodCall False positive
			$values[$i] instanceof Token && $values[$i]->type() === Token::T_WHITESPACE
		);
		return $i;
	}

	/**
	 * Create a GrammarMatch
	 *
	 * The returned match is named with this matcher's capture name and
	 * covers $end - $start values beginning at $start.
	 *
	 * @param ComponentValueList $list
	 * @param int $start
	 * @param int $end First position after the match
	 * @param GrammarMatch|null $submatch Sub-match, for capturing. If $submatch
	 *  is itself named it will be kept as a capture in the returned GrammarMatch,
	 *  otherwise its captured matches (if any) as returned by getCapturedMatches()
	 *  will be kept as captures in the returned GrammarMatch.
	 * @param array $stack Stack from which to fetch more submatches for
	 *  capturing (see $submatch). The stack is expected to be an array of
	 *  arrays, with the first element of each subarray being a GrammarMatch.
	 * @return GrammarMatch
	 */
	protected function makeMatch(
		ComponentValueList $list, $start, $end, ?GrammarMatch $submatch = null, array $stack = []
	) {
		// Candidates, in order: the stack's matches first, then $submatch.
		$matches = array_column( $stack, 0 );
		$matches[] = $submatch;

		$keptMatches = [];
		while ( $matches ) {
			$m = array_shift( $matches );
			if ( !$m instanceof GrammarMatch ) {
				// skip it, probably null
			} elseif ( $m->getName() !== null ) {
				$keptMatches[] = $m;
			} elseif ( $m->getCapturedMatches() ) {
				// Unnamed match: examine its captures next, ahead of the
				// remaining candidates, so the original order is preserved.
				$matches = array_merge( $m->getCapturedMatches(), $matches );
			}
		}

		return new GrammarMatch( $list, $start, $end - $start, $this->captureName, $keptMatches );
	}

	/**
	 * Match against a list of ComponentValues
	 *
	 * The job of a Matcher is to determine all the ways its particular grammar
	 * fragment can consume ComponentValues starting at a particular location
	 * in the ComponentValueList, represented by returning GrammarMatch objects.
	 * For example, a matcher implementing `IDENT*` at a starting position where
	 * there are three IDENT tokens in a row would be able to match 0, 1, 2, or
	 * all 3 of those IDENT tokens, and therefore should return an iterator
	 * over that set of GrammarMatch objects.
	 *
	 * Some matchers take other matchers as input, for example `IDENT*` is
	 * probably going to be implemented as a matcher for `*` that repeatedly
	 * applies a matcher for `IDENT`. The `*` matcher would call the `IDENT`
	 * matcher's generateMatches() method directly.
	 *
	 * Most Matchers implement this method as a generator to not build up
	 * the full set of results when it's reasonably likely the caller is going
	 * to terminate early.
	 *
	 * @param ComponentValueList $values
	 * @param int $start Starting position in $values
	 * @param array $options See self::$defaultOptions.
	 *  Always use the options passed in, don't use $this->defaultOptions yourself.
	 * @return Iterator<GrammarMatch> Iterates over the set of GrammarMatch
	 *  objects defining all the ways this matcher can match.
	 */
	abstract protected function generateMatches( ComponentValueList $values, $start, array $options );
}