55
66import yaml
77
def _word_patterns(*keywords: str) -> list[str]:
    """Return whole-word regex patterns (\\bKW\\b) for each keyword."""
    return [rf"\b{kw}\b" for kw in keywords]


# Keyword/function trigger patterns per SQL category, used to pick the
# relevant examples and docs. "types" carries no patterns because it is
# always considered a match.
_CATEGORIES: dict[str, list[str]] = {
    "types": [],
    "string": _word_patterns(
        "UPPER", "LOWER", "TRIM", "CONCAT", "SUBSTRING", "REPLACE", "LIKE",
        "REGEXP", "LENGTH", "INITCAP", "REVERSE", "REPEAT", "SPLIT", "LPAD",
        "RPAD",
    ),
    "datetime": _word_patterns(
        "DATE", "TIMESTAMP", "INTERVAL", "YEAR", "MONTH", "DAY", "HOUR",
        "EXTRACT", "DATE_ADD", "DATE_SUB", "DATEDIFF", "DATE_TRUNC",
        "CURRENT_DATE", "CURRENT_TIMESTAMP",
    ),
    "json": _word_patterns(
        "JSON", "PARSE_JSON", "VARIANT", "get_json_object", "from_json",
        "json_tuple", "TO_JSON",
    ),
    # GROUP BY / HAVING / OVER( are syntax forms, not whole-word names.
    "aggregates": _word_patterns("COUNT", "SUM", "AVG")
    + [r"\bGROUP\s+BY\b", r"\bHAVING\b", r"\bOVER\s*\("]
    + _word_patterns("ROW_NUMBER", "RANK", "LAG", "LEAD", "WINDOW"),
    "array": _word_patterns(
        "ARRAY", "EXPLODE", "UNNEST", "array_contains", "sort_array",
        "array_distinct", "CARDINALITY",
    )
    + [r"\bsize\s*\("],
    "map": [r"\bMAP\s*<", r"\bMAP\s*\("] + _word_patterns("map_keys", "map_values"),
    "decimal": _word_patterns("DECIMAL", "NUMERIC", "ROUND", "CEIL", "FLOOR", "TRUNCATE"),
    "float": _word_patterns("FLOAT", "DOUBLE", "POWER", "SQRT", "LOG", "LN", "SIN", "COS"),
    "casts": [r"\bCAST\s*\(", r"::"],
    "comparisons": _word_patterns("BETWEEN")
    + [r"\bCASE\s+WHEN\b"]
    + _word_patterns("COALESCE", "NULLIF"),
}
98-
998# Map each category to its doc file.
1009_DOC_FILES : dict [str , str ] = {
10110 "types" : "types.md" ,
11120 "comparisons" : "comparisons.md" ,
11221}
11322
114- _doc_cache : dict [str , str ] = {}
# SQL construct patterns that cannot be derived from the function index:
# these are keywords, operators, and syntax forms rather than named
# functions, so they must be listed by hand.
_EXTRA_PATTERNS: dict[str, list[str]] = {
    # "types" is always matched, so it needs no trigger keywords.
    "types": [],
    "datetime": [r"\bDATE\b", r"\bTIMESTAMP\b", r"\bINTERVAL\b"],
    "aggregates": [r"\bGROUP\s+BY\b", r"\bHAVING\b", r"\bOVER\s*\("],
    "array": [r"\bARRAY\b", r"\bEXPLODE\b", r"\bUNNEST\b", r"\bsize\s*\("],
    "map": [r"\bMAP\s*<", r"\bMAP\s*\("],
    "json": [r"\bJSON\b", r"\bVARIANT\b"],
    "casts": [r"\bCAST\s*\(", r"::"],
    "comparisons": [r"\bCASE\s+WHEN\b"],
}
35+
# Spark function/type names that show up in incoming SQL but are absent
# from the Feldera function index, so the index scan alone would miss them.
_SPARK_ALIASES: dict[str, list[str]] = {
    "json": [r"\bget_json_object\b", r"\bfrom_json\b", r"\bjson_tuple\b"],
    "array": [r"\barray_contains\b", r"\bsort_array\b", r"\barray_distinct\b"],
    "decimal": [r"\bNUMERIC\b"],
    "float": [r"\bFLOAT\b"],
}
43+
44+
45+ def _build_categories_from_index (index_path : Path ) -> dict [str , list [str ]]:
46+ """Parse function-index.md and return category → \\ bFUNC\\ b pattern lists.
47+
48+ Only populates categories that appear in _DOC_FILES. Falls back to an
49+ empty dict if the index file is not found.
50+ """
51+ known = set (_DOC_FILES ) - {"types" }
52+ cats : dict [str , list [str ]] = {cat : [] for cat in _DOC_FILES }
53+
54+ if not index_path .is_file ():
55+ return cats
56+
57+ func_re = re .compile (r"^\* `([A-Z_][A-Z_0-9 ]*)`" , re .IGNORECASE )
58+ link_re = re .compile (r"\[([a-z]+)\]\([^)]+\)" )
59+
60+ for line in index_path .read_text ().splitlines ():
61+ m = func_re .match (line )
62+ if not m :
63+ continue
64+ func_name = m .group (1 ).strip ()
65+ for link_m in link_re .finditer (line ):
66+ cat = link_m .group (1 )
67+ if cat in known :
68+ keyword = rf"\b{ re .escape (func_name )} \b"
69+ if keyword not in cats [cat ]:
70+ cats [cat ].append (keyword )
71+
72+ return cats
73+
74+
def _make_categories() -> dict[str, list[str]]:
    """Build the full category -> pattern table.

    Starts from the patterns derived from the Feldera function index, then
    folds in the hand-maintained _EXTRA_PATTERNS and _SPARK_ALIASES tables,
    skipping any pattern a category already contains.
    """
    repo_root = Path(__file__).resolve().parents[3]
    index_path = repo_root / "docs.feldera.com" / "docs" / "sql" / "function-index.md"
    categories = _build_categories_from_index(index_path)
    for table in (_EXTRA_PATTERNS, _SPARK_ALIASES):
        for name, patterns in table.items():
            bucket = categories.setdefault(name, [])
            for pattern in patterns:
                if pattern not in bucket:
                    bucket.append(pattern)
    return categories
89+
90+
# Category -> regex-pattern table used for selecting relevant docs and
# examples. Assembled at import time from the Feldera function index and
# supplemented with _EXTRA_PATTERNS (SQL construct keywords) plus
# _SPARK_ALIASES (Spark names missing from the Feldera index).
_CATEGORIES: dict[str, list[str]] = _make_categories()

# Doc-file contents keyed by path; presumably filled lazily by load_docs so
# repeated calls avoid re-reading the same file — confirm against load_docs.
_doc_cache: dict[Path, str] = {}
11699
117100def _detect_categories (sql : str ) -> set [str ]:
@@ -128,10 +111,7 @@ def _detect_categories(sql: str) -> set[str]:
128111
129112
130113def load_docs (sql : str , docs_dir : Path | None = None ) -> str :
131- """Load relevant Feldera doc files based on SQL content.
132-
133- Only loads docs not already covered by skills (currently just types.md).
134- """
114+ """Load relevant Feldera doc files based on SQL content."""
135115 if docs_dir is None :
136116 # Use the canonical docs from the repo root (docs.feldera.com/docs/sql/).
137117 docs_dir = Path (__file__ ).resolve ().parents [3 ] / "docs.feldera.com" / "docs" / "sql"
0 commit comments