Skip to content

Commit 91c54cd

Browse files
committed
Merge remote-tracking branch 'noprompt/noprompt-regexp' into noprompt-regexp
Changed all var to all-fn * noprompt/noprompt-regexp: Add clojureRegexpBoundary and @clojureRegexpEscapes to clojureRegexpGroup Add sanity check for control characters Link clojureRegexpQuote to Character Add tests for all current regex patterns Fix octal escapes and back references Drop @clojureRegexpChars containment from clojureRegexpCharClass Add clojureRegexpQuantifier to clojureRegexpGroup and link to Special
2 parents 3a54951 + 3d7dd43 commit 91c54cd

2 files changed

Lines changed: 187 additions & 34 deletions

File tree

clj/test/syntax_test.clj

Lines changed: 169 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,26 @@
1010
(def !regexp (complement regexp))
1111
(def regexp-escape (all-fn :clojureRegexpEscape))
1212
(def !regexp-escape (complement regexp-escape))
13+
(def regexp-char-class (all-fn :clojureRegexpCharClass))
14+
(def !regexp-char-class (complement regexp-char-class))
15+
(def regexp-predefined-char-class (all-fn :clojureRegexpPredefinedCharClass))
16+
(def !regexp-predefined-char-class (complement regexp-predefined-char-class))
17+
(def regexp-posix-char-class (all-fn :clojureRegexpPosixCharClass))
18+
(def !regexp-posix-char-class (complement regexp-posix-char-class))
19+
(def regexp-boundary (all-fn :clojureRegexpBoundary))
20+
(def !regexp-boundary (complement regexp-boundary))
21+
(def regexp-quantifier (all-fn :clojureRegexpQuantifier))
22+
(def !regexp-quantifier (complement regexp-quantifier))
23+
(def regexp-back-ref (all-fn :clojureRegexpBackRef))
24+
(def !regexp-back-ref (complement regexp-back-ref))
25+
(def regexp-quote (all-fn :clojureRegexpQuote))
26+
(def !regexp-quote (complement regexp-quote))
27+
(def regexp-or (all-fn :clojureRegexpOr))
28+
(def !regexp-or (complement regexp-or))
29+
(def regexp-group (all-fn :clojureRegexpGroup))
30+
(def !regexp-group (complement regexp-group))
31+
(defn regexp-mod
  "Returns true when the second element of xs is :clojureRegexpMod.
  Only the second element is inspected; all other elements are ignored."
  [xs]
  (= :clojureRegexpMod (second xs)))
32+
(def !regexp-mod (complement regexp-mod))
1333

1434
(defsyntaxtest number-literals-test
1535
(with-format "%s"
@@ -119,113 +139,236 @@
119139
"\\e" regexp-escape
120140
"\\E" !regexp-escape
121141
;; \cx The control character corresponding to x
122-
;;
123-
;; Character classes
142+
"\\cA" regexp-escape
143+
"\\c1" !regexp-escape
144+
"\\c" !regexp-escape
145+
;; Additional escape sequences not mentioned in the official documentation.
146+
"\\." regexp-escape
147+
"\\*" regexp-escape
148+
"\\?" regexp-escape
149+
"\\{" regexp-escape
150+
"\\}" regexp-escape
151+
"\\[" regexp-escape
152+
"\\]" regexp-escape
153+
"\\(" regexp-escape
154+
"\\)" regexp-escape
155+
156+
;;;; Character classes
157+
124158
;; [abc] a, b, or c (simple class)
159+
"[abc]" regexp-char-class
125160
;; [^abc] Any character except a, b, or c (negation)
161+
"[^abc]" regexp-char-class
126162
;; [a-zA-Z] a through z or A through Z, inclusive (range)
127163
;; [a-d[m-p]] a through d, or m through p: [a-dm-p] (union)
128164
;; [a-z&&[def]] d, e, or f (intersection)
129165
;; [a-z&&[^bc]] a through z, except for b and c: [ad-z] (subtraction)
130166
;; [a-z&&[^m-p]] a through z, and not m through p: [a-lq-z](subtraction)
131-
;;
132-
;; Predefined character classes
167+
168+
;;;; Predefined character classes
169+
133170
;; . Any character (may or may not match line terminators)
171+
"." regexp-predefined-char-class
134172
;; \d A digit: [0-9]
173+
"\\d" regexp-predefined-char-class
135174
;; \D A non-digit: [^0-9]
175+
"\\D" regexp-predefined-char-class
136176
;; \s A whitespace character: [ \t\n\x0B\f\r]
177+
"\\s" regexp-predefined-char-class
137178
;; \S A non-whitespace character: [^\s]
179+
"\\S" regexp-predefined-char-class
138180
;; \w A word character: [a-zA-Z_0-9]
181+
"\\w" regexp-predefined-char-class
139182
;; \W A non-word character: [^\w]
140-
;;
141-
;; POSIX character classes (US-ASCII only)
183+
"\\W" regexp-predefined-char-class
184+
185+
;;;; POSIX character classes (US-ASCII only)
186+
142187
;; \p{Lower} A lower-case alphabetic character: [a-z]
188+
"\\p{Lower}" regexp-posix-char-class
143189
;; \p{Upper} An upper-case alphabetic character:[A-Z]
190+
"\\p{Upper}" regexp-posix-char-class
144191
;; \p{ASCII} All ASCII:[\x00-\x7F]
192+
"\\p{ASCII}" regexp-posix-char-class
145193
;; \p{Alpha} An alphabetic character:[\p{Lower}\p{Upper}]
194+
"\\p{Alpha}" regexp-posix-char-class
146195
;; \p{Digit} A decimal digit: [0-9]
196+
"\\p{Digit}" regexp-posix-char-class
147197
;; \p{Alnum} An alphanumeric character:[\p{Alpha}\p{Digit}]
198+
"\\p{Alnum}" regexp-posix-char-class
148199
;; \p{Punct} Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
200+
"\\p{Punct}" regexp-posix-char-class
149201
;; \p{Graph} A visible character: [\p{Alnum}\p{Punct}]
202+
"\\p{Graph}" regexp-posix-char-class
150203
;; \p{Print} A printable character: [\p{Graph}\x20]
204+
"\\p{Print}" regexp-posix-char-class
151205
;; \p{Blank} A space or a tab: [ \t]
206+
"\\p{Blank}" regexp-posix-char-class
152207
;; \p{Cntrl} A control character: [\x00-\x1F\x7F]
208+
"\\p{Cntrl}" regexp-posix-char-class
153209
;; \p{XDigit} A hexadecimal digit: [0-9a-fA-F]
210+
"\\p{XDigit}" regexp-posix-char-class
154211
;; \p{Space} A whitespace character: [ \t\n\x0B\f\r]
155-
;;
156-
;; java.lang.Character classes (simple java character type)
212+
"\\p{Space}" regexp-posix-char-class
213+
214+
;;;; java.lang.Character classes (simple java character type)
215+
157216
;; \p{javaLowerCase} Equivalent to java.lang.Character.isLowerCase()
217+
"\\p{javaLowerCase}" regexp-posix-char-class
158218
;; \p{javaUpperCase} Equivalent to java.lang.Character.isUpperCase()
219+
"\\p{javaUpperCase}" regexp-posix-char-class
159220
;; \p{javaWhitespace} Equivalent to java.lang.Character.isWhitespace()
221+
"\\p{javaWhitespace}" regexp-posix-char-class
160222
;; \p{javaMirrored} Equivalent to java.lang.Character.isMirrored()
161-
;;
162-
;; Classes for Unicode scripts, blocks, categories and binary properties
223+
"\\p{javaMirrored}" regexp-posix-char-class
224+
225+
;;;; Classes for Unicode scripts, blocks, categories and binary properties
226+
163227
;; \p{IsLatin} A Latin script character (script)
228+
"\\p{IsLatin}" regexp-posix-char-class
164229
;; \p{InGreek} A character in the Greek block (block)
230+
"\\p{InGreek}" regexp-posix-char-class
165231
;; \p{Lu} An uppercase letter (category)
232+
"\\p{Lu}" regexp-posix-char-class
166233
;; \p{IsAlphabetic} An alphabetic character (binary property)
234+
"\\p{IsAlphabetic}" regexp-posix-char-class
167235
;; \p{Sc} A currency symbol
236+
"\\p{Sc}" regexp-posix-char-class
168237
;; \P{InGreek} Any character except one in the Greek block (negation)
238+
"\\P{InGreek}" regexp-posix-char-class
169239
;; [\p{L}&&[^\p{Lu}]] Any letter except an uppercase letter (subtraction)
170-
;;
171-
;; Boundary matchers
240+
241+
;;;; Invalid classes
242+
243+
"\\P{Xzibit}" !regexp-posix-char-class
244+
"\\p{YoDawg}" !regexp-posix-char-class
245+
246+
;;;; Boundary matchers
247+
172248
;; ^ The beginning of a line
249+
"^" regexp-boundary
173250
;; $ The end of a line
251+
"$" regexp-boundary
174252
;; \b A word boundary
253+
"\\b" regexp-boundary
175254
;; \B A non-word boundary
255+
"\\B" regexp-boundary
176256
;; \A The beginning of the input
257+
"\\A" regexp-boundary
177258
;; \G The end of the previous match
259+
"\\G" regexp-boundary
178260
;; \Z The end of the input but for the final terminator, if any
261+
"\\Z" regexp-boundary
179262
;; \z The end of the input
180-
;;
181-
;; Greedy quantifiers
263+
"\\z" regexp-boundary
264+
265+
;;;; Greedy quantifiers
266+
182267
;; X? X, once or not at all
268+
"?" regexp-quantifier
183269
;; X* X, zero or more times
270+
"*" regexp-quantifier
184271
;; X+ X, one or more times
272+
"+" regexp-quantifier
185273
;; X{n} X, exactly n times
274+
"{0}" regexp-quantifier
186275
;; X{n,} X, at least n times
276+
"{0,}" regexp-quantifier
187277
;; X{n,m} X, at least n but not more than m times
188-
;;
189-
;; Reluctant quantifiers
278+
"{0,1}" regexp-quantifier
279+
280+
;;;; Reluctant quantifiers
281+
190282
;; X?? X, once or not at all
283+
"??" regexp-quantifier
191284
;; X*? X, zero or more times
285+
"*?" regexp-quantifier
192286
;; X+? X, one or more times
287+
"+?" regexp-quantifier
193288
;; X{n}? X, exactly n times
289+
"{0}?" regexp-quantifier
194290
;; X{n,}? X, at least n times
291+
"{0,}?" regexp-quantifier
195292
;; X{n,m}? X, at least n but not more than m times
196-
;;
197-
;; Possessive quantifiers
293+
"{0,1}?" regexp-quantifier
294+
295+
;;;; Possessive quantifiers
296+
198297
;; X?+ X, once or not at all
298+
"?+" regexp-quantifier
199299
;; X*+ X, zero or more times
300+
"*+" regexp-quantifier
200301
;; X++ X, one or more times
302+
"++" regexp-quantifier
201303
;; X{n}+ X, exactly n times
304+
"{0}+" regexp-quantifier
202305
;; X{n,}+ X, at least n times
306+
"{0,}+" regexp-quantifier
203307
;; X{n,m}+ X, at least n but not more than m times
204-
;;
205-
;; Logical operators
308+
"{0,1}+" regexp-quantifier
309+
310+
"{-1}" !regexp-quantifier
311+
"{-1,}" !regexp-quantifier
312+
"{-1,-2}" !regexp-quantifier
313+
"{-1}?" !regexp-quantifier
314+
"{-1,}?" !regexp-quantifier
315+
"{-1,-2}?" !regexp-quantifier
316+
;; Possessive forms with invalid (negative) bounds; the reluctant "?" forms
;; are already covered by the three cases directly above.
"{-1}+" !regexp-quantifier
317+
"{-1,}+" !regexp-quantifier
318+
"{-1,-2}+" !regexp-quantifier
319+
320+
;;;; Logical operators
206321
;; XY X followed by Y
322+
;; XXX: Tested above (regexp)
323+
207324
;; X|Y Either X or Y
325+
"|" regexp-or
326+
208327
;; (X) X, as a capturing group
209-
;;
210-
;; Back references
328+
"(X)" regexp-group
329+
330+
;;;; Back references
331+
211332
;; \n Whatever the nth capturing group matched
333+
"\\1" regexp-back-ref
212334
;; \k<name> Whatever the named-capturing group "name" matched
213-
;;
214-
;; Quotation
335+
"\\k<name>" regexp-back-ref
336+
337+
;;;; Quotation
338+
215339
;; \ Nothing, but quotes the following character
340+
;; XXX: Tested above
341+
216342
;; \Q Nothing, but quotes all characters until \E
217343
;; \E Nothing, but ends quoting started by \Q
218-
;;
219-
;; Special constructs (named-capturing and non-capturing)
344+
"\\Qabc\\E" regexp-quote
345+
"\\qabc\\E" !regexp-quote
346+
347+
;;;; Special constructs (named-capturing and non-capturing)
220348
;; (?<name>X) X, as a named-capturing group
349+
"(?<name>X)" regexp-mod
221350
;; (?:X) X, as a non-capturing group
351+
"(?:X)" regexp-mod
222352
;; (?idmsuxU-idmsuxU) Nothing, but turns match flags i d m s u x U on - off
353+
"(?idmsuxU-idmsuxU)" regexp-mod
354+
"(?idmsuxU)" regexp-mod
355+
"(?-idmsuxU)" regexp-mod
223356
;; (?idmsux-idmsux:X) X, as a non-capturing group with the given flags i d m s u x on - off
357+
"(?idmsuxU-idmsuxU:X)" regexp-mod
358+
"(?idmsuxU:)" regexp-mod
359+
"(?-idmsuxU:)" regexp-mod
224360
;; (?=X) X, via zero-width positive lookahead
361+
"(?=X)" regexp-mod
225362
;; (?!X) X, via zero-width negative lookahead
363+
"(?!X)" regexp-mod
226364
;; (?<=X) X, via zero-width positive lookbehind
365+
"(?<=X)" regexp-mod
227366
;; (?<!X) X, via zero-width negative lookbehind
367+
"(?<!X)" regexp-mod
228368
;; (?>X) X, as an independent, non-capturing group
369+
"(?>X)" regexp-mod
370+
371+
"(?X)" !regexp-mod
229372
))
230373

231374
;; (test #'java-regexp-literals-test)

syntax/clojure.vim

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,18 @@ syntax match clojureDispatch "\v#[\^'=<_]?"
7575
" Clojure permits no more than 20 params.
7676
syntax match clojureAnonArg "%\(20\|1\d\|[1-9]\|&\)\?"
7777

78-
syntax match clojureRegexpEscape "\v\\{2}|\\%([tnrfae]|c[A-Z]|0%([0-7]{1,2}|[0-3][0-7]{2})|x\x{2}|u\x{4})" contained
78+
" Note: Although not mentioned in the official documentation, prefixing the
79+
" characters ".", "+", "*", "?", "{", "}", "[", "]", "(", and ")" with a "\"
80+
" forms a legal escape sequence.
81+
syntax match clojureRegexpEscape "\v\\%(\\|[tnrfae]|c[A-Z]|0[0-3]?[0-7]{1,2}|x\x{2}|u\x{4}|[.+*?{}[\]()])" contained
82+
syntax region clojureRegexpQuote start="\v\<@!\\Q" end="\\E"
83+
syntax cluster clojureRegexpEscapes contains=clojureRegexpEscape,clojureRegexpQuote
7984
" Character classes
80-
syntax match clojureRegexpPredefinedCharClass "\\[dDsSwW]" contained
85+
syntax match clojureRegexpPredefinedCharClass "\v%(\\[dDsSwW]|\.)" contained
86+
" XXX: Should we distinguish between posix, java, and unicode character
87+
" classes as in the documenation?
8188
syntax match clojureRegexpPosixCharClass "\v\\[pP]\{%(Lower|Upper|ASCII|Alpha|Digit|Alnum|Punct|Graph|Print|Blank|Cntrl|XDigit|Space|IsLatin|InGreek|Lu|IsAlphabetic|Sc|java%(LowerCase|UpperCase|Whitespace|Mirrored))\}" contained
82-
syntax region clojureRegexpCharClass start="\\\@<!\[" end="\\\@<!\]" contained contains=clojureRegexpCharClasses,@clojureRegexpChars
89+
syntax region clojureRegexpCharClass start="\\\@<!\[" end="\\\@<!\]" contained contains=clojureRegexpSpecialChar,clojureRegexpPredefinedCharClass,clojureRegexpPosixCharClass
8390
syntax cluster clojureRegexpCharClasses contains=clojureRegexpPredefinedCharClass,clojureRegexpPosixCharClass,clojureRegexpCharClass
8491
" Boundary
8592
syntax match clojureRegexpBoundary "\v\\[bBAGZz]" contained
@@ -89,15 +96,16 @@ syntax match clojureRegexpQuantifier "\v\\@<![?*+]\??" contained
8996
syntax match clojureRegexpQuantifier "\v\\@<!\{\d+%(,|,\d+)?}\??" contained
9097
syntax match clojureRegexpOr "\v\<@!\|" contained
9198
" Back references
92-
syntax match clojureRegexpBackRef "\v\\%(\d+|k\<[a-zA-z]+\>)" contained
99+
" Numbered back reference (\1, \12, ... -- never a leading zero, which would
" be an octal escape) or a named back reference (\k<name>). The name class is
" [a-zA-Z]; the range A-z would also admit the punctuation [ \ ] ^ _ `.
syntax match clojureRegexpBackRef "\v\\%([1-9]\d*|k\<[a-zA-Z]+\>)" contained
93100
" Mode modifiers, mode-modified spans, lookaround, regular and atomic
94101
" grouping, and named-capturing.
95-
syntax match clojureRegexpMod "\v\(@<=\?[xdsmiu]*%(-[xdsmiu]*)?:?" contained
96-
syntax match clojureRegexpMod "\v\(@<=\?[=!>]" contained
102+
syntax match clojureRegexpMod "\v\(@<=\?:" contained
103+
syntax match clojureRegexpMod "\v\(@<=\?[xdsmiuU]*-?[xdsmiuU]+:?" contained
104+
syntax match clojureRegexpMod "\v\(@<=\?%(\<?[=!]|\>)" contained
97105
syntax match clojureRegexpMod "\v\(@<=\?\<[a-zA-Z]+\>" contained
98106

99-
syntax region clojureRegexpGroup start="\\\@<!(" matchgroup=clojureRegexpGroup end="\\\@<!)" contained contains=clojureRegexpMod,@clojureRegexpCharClasses
100-
syntax region clojureRegexp start=/\#"/ skip=/\\"/ end=/"/ contains=clojureRegexpEscape,@clojureRegexpCharClasses,clojureRegexpBoundary,clojureRegexpQuantifier,clojureRegexpOr,clojureRegexpBackRef,clojureRegexpGroup
107+
syntax region clojureRegexpGroup start="\\\@<!(" matchgroup=clojureRegexpGroup end="\\\@<!)" contained contains=clojureRegexpMod,clojureRegexpQuantifier,clojureRegexpBoundary,@clojureRegexpEscapes,@clojureRegexpCharClasses
108+
syntax region clojureRegexp start=/\#"/ skip=/\\"/ end=/"/ contains=@clojureRegexpEscapes,@clojureRegexpCharClasses,clojureRegexpBoundary,clojureRegexpQuantifier,clojureRegexpOr,clojureRegexpBackRef,clojureRegexpGroup
101109

102110
syntax match clojureComment ";.*$" contains=clojureTodo,@Spell
103111
syntax match clojureComment "#!.*$"
@@ -123,10 +131,12 @@ highlight default link clojureStringEscape Character
123131

124132
highlight default link clojureRegexp Constant
125133
highlight default link clojureRegexpEscape Character
134+
highlight default link clojureRegexpQuote Character
126135
highlight default link clojureRegexpCharClass SpecialChar
127136
highlight default link clojureRegexpPosixCharClass SpecialChar
128137
highlight default link clojureRegexpPredefinedCharClass SpecialChar
129138
highlight default link clojureRegexpBoundary SpecialChar
139+
highlight default link clojureRegexpQuantifier SpecialChar
130140
highlight default link clojureRegexpMod SpecialChar
131141
highlight default link clojureRegexpOr SpecialChar
132142
highlight default link clojureRegexpBackRef SpecialChar

0 commit comments

Comments
 (0)