Skip to content

Commit cf1bce3

Browse files
committed
Generate valid Java and Unicode bracket expressions
Two new syntax match groups have been created: clojureRegexpUnicodeCharClass and clojureRegexpJavaCharClass. Both groups match all valid \p{..} patterns.
1 parent 361ae2a commit cf1bce3

2 files changed

Lines changed: 77 additions & 26 deletions

File tree

clj/src/vim_clojure_static/generate.clj

Lines changed: 69 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -69,26 +69,74 @@
6969
sort
7070
(string/join \,)))))
7171

72-
(def java-char-class-names
73-
"Returns a list of valid java character class names (excluding the \"java\"
74-
prefix) for use in a regular expression literal."
75-
;; java.lang.Character/is* methods.
76-
(let [is-ms (->> java.lang.Character
77-
r/reflect
78-
:members
79-
(map (comp name :name))
80-
(filter #(.startsWith % "is"))
81-
set
82-
sort)]
83-
(reduce
84-
(fn [pats is-m]
85-
(let [c-name (second (string/split is-m #"is" 2))]
86-
(try
87-
(re-pattern (format "\\p{java%s}" c-name))
88-
(conj pats c-name)
89-
(catch java.util.regex.PatternSyntaxException e pats))))
90-
[]
91-
is-ms)))
72+
;; Helper functions (should probably be moved to a util ns).
73+
74+
(defn syntax-match
  "Returns a Vimscript literal `syntax match` statement. The content of pattern
  is automatically wrapped in quotes.

  group is passed through `name`, so a keyword, symbol, or string is accepted.
  When contained? is truthy, the `contained` argument is appended to the
  statement."
  [group pattern contained?]
  ;; The docstring must precede the parameter vector; placed after it, the
  ;; string is just a discarded body expression and never becomes :doc metadata.
  (let [parts ["syntax match" (name group) (format "\"%s\"" pattern)]
        parts (if contained?
                (conj parts "contained")
                parts)]
    (string/join \space parts)))
82+
83+
(defn re-pattern?
  "Returns true if s is a valid regular expression pattern, false otherwise.

  Validity is decided by attempting to compile s with `re-pattern`; only
  java.util.regex.PatternSyntaxException is treated as invalid, any other
  exception propagates to the caller."
  [s]
  (try
    (re-pattern s)
    true
    (catch java.util.regex.PatternSyntaxException _ false)))
89+
90+
(defn pipe-join
  "Returns the string formed by concatenating the items of ss, separated by a
  literal | character."
  [ss]
  (apply str (interpose \| ss)))
92+
93+
;;;; clojureRegexp*CharClass generation ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
94+
95+
(defn bracket-char-class?
  "Returns true if s is a valid posix, java, or unicode character class.

  s is the bare class name; it is wrapped in a p{...} property escape and
  checked with `re-pattern?`."
  [s]
  ;; Docstring moved ahead of the parameter vector so it is attached as :doc
  ;; metadata instead of being evaluated and discarded.
  (re-pattern? (format "\\p{%s}" s)))
98+
99+
;; This helps cut down on line noise.
100+
(defn unicode-char-class-pattern
  "Wraps s, a Vim very-magic regex fragment, in the surrounding p{...} / P{...}
  property-escape syntax and returns the complete pattern string."
  [s]
  (let [template "\\v\\\\[pP]\\{%s\\}"]
    (format template s)))
102+
103+
(def unicode-char-classes
  "Vimscript literal syntax match for unicode regex character classes.

  The value is a delay and must be dereferenced (@unicode-char-classes);
  forcing it downloads a reference page and keeps only the class names that
  compile as a regex property class on the running JVM."
  (delay ;; Since we need to hit the network.
    (let [page (slurp "http://www.regular-expressions.info/unicode.html")
          ;; Every distinct class name mentioned on the page, sorted. re-seq
          ;; replaces the previous hand-written matcher loop, which ignored the
          ;; return value of conj! on a transient (transients must not be
          ;; bashed in place).
          cs (->> (re-seq #"\\p\{([a-zA-Z_]+)\}" page)
                  (map second)
                  distinct
                  sort)
          cs (filter bracket-char-class? cs)
          ;; This complicates things mildly but apparently not every unicode
          ;; class can be prefixed with "Is".
          {cs1 true cs2 false} (group-by #(bracket-char-class? (str "Is" %)) cs)]
      (syntax-match
        :clojureRegexpUnicodeCharClass
        (unicode-char-class-pattern (format "%%(%%(Is)?%%(%s)|%%(%s))" (pipe-join cs1) (pipe-join cs2)))
        true))))
122+
123+
(def java-char-classes
  "Vimscript literal syntax match for (Is)java* regex character classes.

  Candidate names are taken from the is* methods of java.lang.Character, found
  by reflection. A name is kept only when the class java<Name> compiles, and
  the optional Is prefix is emitted only for names where Isjava<Name> also
  compiles."
  (let [is-methods (->> java.lang.Character
                        r/reflect
                        :members
                        (map (comp name :name))
                        (filter #(.startsWith % "is"))
                        distinct
                        sort)
        cs (filter #(bracket-char-class? (str "java" %))
                   (map #(second (string/split % #"is" 2)) is-methods))
        ;; Probe the exact name the emitted pattern accepts ("Isjava<Name>").
        ;; Probing "Is<Name>" would test a different class than the one the
        ;; generated syntax match highlights.
        {cs1 true cs2 false} (group-by #(bracket-char-class? (str "Isjava" %)) cs)
        ;; Skip empty groups: an empty %() alternative would also match the
        ;; bare prefix "java", which is not a valid class name.
        alts (remove nil?
                     [(when (seq cs1) (format "%%(Is)?java%%(%s)" (pipe-join cs1)))
                      (when (seq cs2) (format "java%%(%s)" (pipe-join cs2)))])]
    (syntax-match
      :clojureRegexpJavaCharClass
      (unicode-char-class-pattern (format "%%(%s)" (pipe-join alts)))
      true)))
92139

93140
(comment
94-
(spit "/tmp/clojure-defs.vim" (str syntax-keywords "\n\n" completion-words)))
141+
(spit "/tmp/clojure-defs.vim" (str syntax-keywords "\n\n" completion-words))
142+
(spit "/tmp/clojure-char-classes.vim" (str java-char-classes "\n" @unicode-char-classes)))

syntax/clojure.vim

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,14 @@ syntax match clojureRegexpEscape "\v\\%(\\|[tnrfae]|c[A-Z]|0[0-3]?[0-7]{1,2}|x\x
7979
syntax region clojureRegexpQuoted start=/\v\<@!\\Q/ms=e+1 skip=/\v\\\\|\\"/ end=/\\E/me=s-1 end=/"/me=s-1 contained
8080
syntax region clojureRegexpQuote start=/\v\<@!\\Q/ skip=/\v\\\\|\\"/ end=/\\E/ end=/"/me=s-1 contains=clojureRegexpQuoted keepend contained
8181
syntax cluster clojureRegexpEscapes contains=clojureRegexpEscape,clojureRegexpQuote
82-
" Character classes
82+
" Character classes and bracket expressions.
8383
syntax match clojureRegexpPredefinedCharClass "\v%(\\[dDsSwW]|\.)" contained
8484
syntax match clojureRegexpPosixCharClass "\v\\[pP]\{%(Lower|Upper|ASCII|Alpha|Digit|Alnum|Punct|Graph|Print|Blank|Cntrl|XDigit|Space|IsLatin|InGreek|Lu|IsAlphabetic|Sc)\}" contained
85-
syntax match clojureRegexpPosixCharClass "\v\\[pP]\{%(Is)?%(Cn|Cc|Cf|Co|Cs|Lu|Ll|Lt|Lm|Lo|Mn|Me|Mc|Nd|Nl|No|Pd|Ps|Pe|Pc|Pi|Pf|Po|Sm|Sc|Sk|So|Zs|Zl|Zp)\}"
86-
syntax match clojureRegexpPosixCharClass "\v\\[pP]\{java%(Defined|Digit|ISOControl|IdentifierIgnorable|JavaIdentifierPart|JavaIdentifierStart|Letter|LetterOrDigit|LowerCase|Mirrored|SpaceChar|TitleCase|UnicodeIdentifierPart|UnicodeIdentifierStart|UpperCase|Whitespace)\}" contained
87-
syntax region clojureRegexpCharClass start="\\\@<!\[" end="\\\@<!\]" contained contains=clojureRegexpSpecialChar,clojureRegexpPredefinedCharClass,clojureRegexpPosixCharClass
88-
syntax cluster clojureRegexpCharClasses contains=clojureRegexpPredefinedCharClass,clojureRegexpPosixCharClass,clojureRegexpCharClass
85+
syntax match clojureRegexpJavaCharClass "\v\\[pP]\{%(%(Is)?java%(Alphabetic|Digit|Ideographic|Letter|LowerCase|TitleCase|UpperCase|Whitespace)|java%(Defined|ISOControl|IdentifierIgnorable|JavaIdentifierPart|JavaIdentifierStart|LetterOrDigit|Mirrored|SpaceChar|UnicodeIdentifierPart|UnicodeIdentifierStart))\}" contained
86+
syntax match clojureRegexpUnicodeCharClass "\v\\[pP]\{%(%(Is)?%(C|Cc|Cf|Cn|Co|Cs|L|Ll|Lm|Lo|Lt|Lu|M|Mc|Me|Mn|N|Nd|Nl|No|P|Pc|Pd|Pe|Pf|Pi|Po|Ps|S|Sc|Sk|Sm|So|Z|Zl|Zp|Zs)|%(InAlphabetic_Presentation_Forms|InArabic|InArmenian|InArrows|InBasic_Latin|InBengali|InBlock_Elements|InBopomofo|InBopomofo_Extended|InBox_Drawing|InBraille_Patterns|InBuhid|InCJK_Compatibility|InCJK_Compatibility_Forms|InCJK_Compatibility_Ideographs|InCJK_Radicals_Supplement|InCJK_Symbols_and_Punctuation|InCJK_Unified_Ideographs|InCJK_Unified_Ideographs_Extension_A|InCherokee|InCombining_Diacritical_Marks|InCombining_Half_Marks|InControl_Pictures|InCurrency_Symbols|InCyrillic|InCyrillic_Supplementary|InDevanagari|InDingbats|InEnclosed_Alphanumerics|InEnclosed_CJK_Letters_and_Months|InEthiopic|InGeneral_Punctuation|InGeometric_Shapes|InGeorgian|InGreek_Extended|InGujarati|InGurmukhi|InHalfwidth_and_Fullwidth_Forms|InHangul_Compatibility_Jamo|InHangul_Jamo|InHangul_Syllables|InHanunoo|InHebrew|InHigh_Private_Use_Surrogates|InHigh_Surrogates|InHiragana|InIPA_Extensions|InIdeographic_Description_Characters|InKanbun|InKangxi_Radicals|InKannada|InKatakana|InKatakana_Phonetic_Extensions|InKhmer|InKhmer_Symbols|InLao|InLatin_Extended_Additional|InLetterlike_Symbols|InLimbu|InLow_Surrogates|InMalayalam|InMathematical_Operators|InMiscellaneous_Symbols|InMiscellaneous_Symbols_and_Arrows|InMiscellaneous_Technical|InMongolian|InMyanmar|InNumber_Forms|InOgham|InOptical_Character_Recognition|InOriya|InPhonetic_Extensions|InPrivate_Use_Area|InRunic|InSinhala|InSmall_Form_Variants|InSpacing_Modifier_Letters|InSpecials|InSuperscripts_and_Subscripts|InSupplemental_Mathematical_Operators|InSyriac|InTagalog|InTagbanwa|InTai_Le|InTamil|InTelugu|InThaana|InThai|InTibetan|InUnified_Canadian_Aboriginal_Syllabics|InVariation_Selectors|InYi_Radicals|InYi_Syllables|InYijing_Hexagram_Symbols|IsL|IsLatin))\}" contained
87+
syntax cluster clojureRegexpBracketExp contains=clojureRegexpPosixCharClass,clojureRegexpUnicodeCharClass,clojureRegexpJavaCharClass
88+
syntax cluster clojureRegexpCharClasses contains=clojureRegexpPredefinedCharClass,clojureRegexpCharClass,@clojureRegexpBracketExp
89+
syntax region clojureRegexpCharClass start="\\\@<!\[" end="\\\@<!\]" contained contains=clojureRegexpSpecialChar,clojureRegexpPredefinedCharClass,@clojureRegexpBracketExp
8990
" Boundary
9091
syntax match clojureRegexpBoundary "\v\\[bBAGZz]" contained
9192
syntax match clojureRegexpBoundary "\v\<@![\^$]" contained
@@ -131,6 +132,8 @@ highlight default link clojureRegexp Constant
131132
highlight default link clojureRegexpEscape Character
132133
highlight default link clojureRegexpCharClass SpecialChar
133134
highlight default link clojureRegexpPosixCharClass SpecialChar
135+
highlight default link clojureRegexpJavaCharClass SpecialChar
136+
highlight default link clojureRegexpUnicodeCharClass SpecialChar
134137
highlight default link clojureRegexpPredefinedCharClass SpecialChar
135138
highlight default link clojureRegexpBoundary SpecialChar
136139
highlight default link clojureRegexpQuantifier SpecialChar

0 commit comments

Comments
 (0)