Skip to content

Commit 83d7731

Browse files
committed
Revised doc indexing
1 parent 219f8b7 commit 83d7731

File tree

3 files changed

+77
-99
lines changed

3 files changed

+77
-99
lines changed

python/felderize/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ Environment variables (set in `.env`):
7979

8080
## How it works
8181

82-
1. Loads translation rules from skill files (`spark/data/skills/`)
82+
1. Loads translation rules from a single skill file (`spark/data/skills/spark_skills.md`)
8383
2. Sends Spark SQL to the LLM with rules, validated examples, and relevant Feldera SQL documentation (from `docs.feldera.com/docs/sql/`)
8484
3. Parses the translated Feldera SQL from the LLM response
8585
4. Optionally validates output against the Feldera compiler, retrying with error feedback if needed

python/felderize/pyproject.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ felderize = [
2626
"data/samples/*.md",
2727
"data/demo/*.sql",
2828
"data/demo/expected/*.sql",
29-
"data/compiler/sql-to-dbsp",
30-
"data/compiler/*.jar",
3129
]
3230

3331
[project.scripts]

python/felderize/spark/docs.py

Lines changed: 76 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -5,97 +5,6 @@
55

66
import yaml
77

8-
# Categories used for selecting relevant examples and docs.
9-
_CATEGORIES: dict[str, list[str]] = {
10-
"types": [], # Always matched
11-
"string": [
12-
r"\bUPPER\b",
13-
r"\bLOWER\b",
14-
r"\bTRIM\b",
15-
r"\bCONCAT\b",
16-
r"\bSUBSTRING\b",
17-
r"\bREPLACE\b",
18-
r"\bLIKE\b",
19-
r"\bREGEXP\b",
20-
r"\bLENGTH\b",
21-
r"\bINITCAP\b",
22-
r"\bREVERSE\b",
23-
r"\bREPEAT\b",
24-
r"\bSPLIT\b",
25-
r"\bLPAD\b",
26-
r"\bRPAD\b",
27-
],
28-
"datetime": [
29-
r"\bDATE\b",
30-
r"\bTIMESTAMP\b",
31-
r"\bINTERVAL\b",
32-
r"\bYEAR\b",
33-
r"\bMONTH\b",
34-
r"\bDAY\b",
35-
r"\bHOUR\b",
36-
r"\bEXTRACT\b",
37-
r"\bDATE_ADD\b",
38-
r"\bDATE_SUB\b",
39-
r"\bDATEDIFF\b",
40-
r"\bDATE_TRUNC\b",
41-
r"\bCURRENT_DATE\b",
42-
r"\bCURRENT_TIMESTAMP\b",
43-
],
44-
"json": [
45-
r"\bJSON\b",
46-
r"\bPARSE_JSON\b",
47-
r"\bVARIANT\b",
48-
r"\bget_json_object\b",
49-
r"\bfrom_json\b",
50-
r"\bjson_tuple\b",
51-
r"\bTO_JSON\b",
52-
],
53-
"aggregates": [
54-
r"\bCOUNT\b",
55-
r"\bSUM\b",
56-
r"\bAVG\b",
57-
r"\bGROUP\s+BY\b",
58-
r"\bHAVING\b",
59-
r"\bOVER\s*\(",
60-
r"\bROW_NUMBER\b",
61-
r"\bRANK\b",
62-
r"\bLAG\b",
63-
r"\bLEAD\b",
64-
r"\bWINDOW\b",
65-
],
66-
"array": [
67-
r"\bARRAY\b",
68-
r"\bEXPLODE\b",
69-
r"\bUNNEST\b",
70-
r"\barray_contains\b",
71-
r"\bsort_array\b",
72-
r"\barray_distinct\b",
73-
r"\bCARDINALITY\b",
74-
r"\bsize\s*\(",
75-
],
76-
"map": [r"\bMAP\s*<", r"\bMAP\s*\(", r"\bmap_keys\b", r"\bmap_values\b"],
77-
"decimal": [
78-
r"\bDECIMAL\b",
79-
r"\bNUMERIC\b",
80-
r"\bROUND\b",
81-
r"\bCEIL\b",
82-
r"\bFLOOR\b",
83-
r"\bTRUNCATE\b",
84-
],
85-
"float": [
86-
r"\bFLOAT\b",
87-
r"\bDOUBLE\b",
88-
r"\bPOWER\b",
89-
r"\bSQRT\b",
90-
r"\bLOG\b",
91-
r"\bLN\b",
92-
r"\bSIN\b",
93-
r"\bCOS\b",
94-
],
95-
"casts": [r"\bCAST\s*\(", r"::"],
96-
"comparisons": [r"\bBETWEEN\b", r"\bCASE\s+WHEN\b", r"\bCOALESCE\b", r"\bNULLIF\b"],
97-
}
98-
998
# Map each category to its doc file.
1009
_DOC_FILES: dict[str, str] = {
10110
"types": "types.md",
@@ -111,7 +20,81 @@
11120
"comparisons": "comparisons.md",
11221
}
11322

114-
_doc_cache: dict[str, str] = {}
23+
# SQL construct patterns that cannot be derived from the function index
# (keywords, operators, syntax forms rather than named functions).
# Keys are category names (matching _DOC_FILES); values are regular
# expressions looked for in the input SQL.
_EXTRA_PATTERNS: dict[str, list[str]] = {
    "types": [],  # always matched — no keywords needed
    "datetime": [r"\bDATE\b", r"\bTIMESTAMP\b", r"\bINTERVAL\b"],
    "aggregates": [r"\bGROUP\s+BY\b", r"\bHAVING\b", r"\bOVER\s*\("],
    "array": [r"\bARRAY\b", r"\bEXPLODE\b", r"\bUNNEST\b", r"\bsize\s*\("],
    "map": [r"\bMAP\s*<", r"\bMAP\s*\("],
    "json": [r"\bJSON\b", r"\bVARIANT\b"],
    "casts": [r"\bCAST\s*\(", r"::"],
    "comparisons": [r"\bCASE\s+WHEN\b"],
}
35+
36+
# Spark function names that appear in SQL but are not in the Feldera index.
# These are layered into the category map on top of the index-derived
# patterns (see _make_categories).
_SPARK_ALIASES: dict[str, list[str]] = {
    "json": [r"\bget_json_object\b", r"\bfrom_json\b", r"\bjson_tuple\b"],
    "array": [r"\barray_contains\b", r"\bsort_array\b", r"\barray_distinct\b"],
    "decimal": [r"\bNUMERIC\b"],
    "float": [r"\bFLOAT\b"],
}
43+
44+
def _build_categories_from_index(index_path: Path) -> dict[str, list[str]]:
    """Parse function-index.md into category -> ``\\bFUNC\\b`` pattern lists.

    Scans each bullet line of the Feldera function index that looks like
    ``* `FUNC` ... [category](link)`` and appends a word-boundary pattern
    for FUNC to every linked category that also appears in _DOC_FILES
    ("types" is excluded: it is always matched without keywords).

    Returns a dict containing a pattern list for every category in
    _DOC_FILES; the lists are all empty when the index file does not
    exist.  (Previous docstring claimed an empty dict was returned on a
    missing file — the code has always returned the full key set.)
    """
    # "types" never receives keyword patterns — it is matched unconditionally.
    known = set(_DOC_FILES) - {"types"}
    cats: dict[str, list[str]] = {cat: [] for cat in _DOC_FILES}

    if not index_path.is_file():
        # No index available: every category keeps an empty pattern list.
        return cats

    # Bullet lines look like: * `FUNC_NAME` ... [category](file.md) ...
    func_re = re.compile(r"^\* `([A-Z_][A-Z_0-9 ]*)`", re.IGNORECASE)
    link_re = re.compile(r"\[([a-z]+)\]\([^)]+\)")

    # Explicit encoding so parsing does not depend on the locale default.
    for line in index_path.read_text(encoding="utf-8").splitlines():
        m = func_re.match(line)
        if not m:
            continue
        func_name = m.group(1).strip()
        for link_m in link_re.finditer(line):
            cat = link_m.group(1)
            if cat in known:
                keyword = rf"\b{re.escape(func_name)}\b"
                # De-duplicate: a function may link to a category twice.
                if keyword not in cats[cat]:
                    cats[cat].append(keyword)

    return cats
73+
74+
75+
def _make_categories() -> dict[str, list[str]]:
    """Assemble the final category -> regex-pattern mapping.

    Starts from the patterns derived from the Feldera function index,
    then layers in _EXTRA_PATTERNS (SQL construct keywords) and
    _SPARK_ALIASES (Spark-only names), skipping any pattern a category
    already holds so nothing is duplicated.
    """
    repo_root = Path(__file__).resolve().parents[3]
    index_file = (
        repo_root / "docs.feldera.com" / "docs" / "sql" / "function-index.md"
    )
    categories = _build_categories_from_index(index_file)

    for supplement in (_EXTRA_PATTERNS, _SPARK_ALIASES):
        for name, patterns in supplement.items():
            present = set(categories.get(name, []))
            for pattern in patterns:
                if pattern in present:
                    continue  # already contributed by an earlier source
                categories.setdefault(name, []).append(pattern)
                present.add(pattern)

    return categories
89+
90+
# Categories used for selecting relevant docs and examples.
# Built automatically from the Feldera function index, supplemented by
# _EXTRA_PATTERNS (SQL construct keywords) and _SPARK_ALIASES (Spark names
# not in the Feldera index).
_CATEGORIES: dict[str, list[str]] = _make_categories()

# Path -> file-text cache; presumably populated by load_docs below — confirm.
_doc_cache: dict[Path, str] = {}
11598

11699

117100
def _detect_categories(sql: str) -> set[str]:
@@ -128,10 +111,7 @@ def _detect_categories(sql: str) -> set[str]:
128111

129112

130113
def load_docs(sql: str, docs_dir: Path | None = None) -> str:
131-
"""Load relevant Feldera doc files based on SQL content.
132-
133-
Only loads docs not already covered by skills (currently just types.md).
134-
"""
114+
"""Load relevant Feldera doc files based on SQL content."""
135115
if docs_dir is None:
136116
# Use the canonical docs from the repo root (docs.feldera.com/docs/sql/).
137117
docs_dir = Path(__file__).resolve().parents[3] / "docs.feldera.com" / "docs" / "sql"

0 commit comments

Comments
 (0)