Changes from 1 commit (of 45 in this pull request)
046f795
[migrations] Spark-to-Feldera migration tool PoC.
wilmaontherun Mar 16, 2026
bb1b24c
[ci] apply automatic fixes
feldera-bot Mar 16, 2026
871d79c
Intermediate progress based on Mihai's comments.
wilmaontherun Mar 19, 2026
e856080
fixed comments on skills
wilmaontherun Mar 19, 2026
b672679
Fixed all comments before we refactor skills
wilmaontherun Mar 19, 2026
3d8536f
Merge skills
wilmaontherun Mar 19, 2026
219f8b7
Fixed the rest of the code w.r.t. new skill file
wilmaontherun Mar 19, 2026
83d7731
Revised doc indexing
wilmaontherun Mar 19, 2026
98f7dee
merged skills
wilmaontherun Mar 19, 2026
56bd911
add --verbose flag, translate-file, combined demos, and Feldera PK/qu…
wilmaontherun Mar 19, 2026
b10f7a4
more demo files
wilmaontherun Mar 19, 2026
7d56201
revised samples and skills
wilmaontherun Mar 20, 2026
9e17a33
[ci] apply automatic fixes
feldera-bot Mar 20, 2026
3a75739
add --compiler option, fix no-compiler handling, improve example list…
wilmaontherun Mar 20, 2026
ab4f746
[ci] apply automatic fixes
feldera-bot Mar 20, 2026
6672146
fixed readme
wilmaontherun Mar 20, 2026
9975f33
[ci] apply automatic fixes
feldera-bot Mar 20, 2026
14e7cc6
Add --model option, remove OpenAI support and hardcoded compiler path
wilmaontherun Mar 20, 2026
7ac2fb9
Use sqlparse for SQL splitting, fix README inconsistencies
wilmaontherun Mar 20, 2026
6655f7c
Add prompt caching and rate limit retry; skip examples on first pass
wilmaontherun Mar 20, 2026
8fa1210
Clean up code quality: fix imports, types, and consistency issues
wilmaontherun Mar 21, 2026
4358104
Fix spark_skills.md inconsistencies
wilmaontherun Mar 24, 2026
80a5e7f
[ci] apply automatic fixes
feldera-bot Mar 24, 2026
7da4c30
Verify and fix spark_skills.md against Apache Spark SQL reference
wilmaontherun Mar 24, 2026
589406c
Overhaul spark/data/samples: fix errors, add new patterns, remove tri…
wilmaontherun Mar 24, 2026
b53a97a
Fix skills inconsistencies: QUARTER unsupported, contains/binary, pmo…
wilmaontherun Mar 25, 2026
05388f9
Rename misnamed sample files to match their content
wilmaontherun Mar 25, 2026
643ba2b
Improve and expand sample demos
wilmaontherun Mar 25, 2026
d1b0c95
Fix demo files: remove unsupported patterns, add dates and arithmetic…
wilmaontherun Mar 25, 2026
d7057b9
Fix aggregations and arithmetic demos to use only supported Feldera f…
wilmaontherun Mar 25, 2026
b438037
split_part skill & base_url config
anandbraman Mar 27, 2026
aaf02dd
Update AVG(integer) rule: rewrite to AVG(CAST(col AS DOUBLE)) for int…
wilmaontherun Mar 30, 2026
9de37f9
Untrack known_unsupported.yaml (ignored by .gitignore)
wilmaontherun Mar 30, 2026
e088426
Add dialect subcommand structure: felderize spark <cmd>
wilmaontherun Mar 30, 2026
d596584
Move data/skills one level above spark/ to felderize root
wilmaontherun Mar 30, 2026
d5bf6e0
Move skills file to spark/skills/
wilmaontherun Mar 30, 2026
e714f19
Address review comments: lateral aliases, size/CARDINALITY, JSON, sem…
wilmaontherun Mar 30, 2026
621dd95
Merge branch 'gh-readonly-queue/main/pr-5953-63f28bb9543f28137c91d7a2…
wilmaontherun Mar 30, 2026
a84c82f
Reorganize spark_skills.md: fix section placement and add missing ent…
wilmaontherun Mar 31, 2026
06b7bc6
Fix and improve spark_skills.md translation rules
wilmaontherun Apr 2, 2026
425d5ab
Reorganize spark_skills.md: structured subsections with emoji markers
wilmaontherun Apr 2, 2026
2b98d4e
Verify spark_skills.md claims against test evidence; fix errors found
wilmaontherun Apr 2, 2026
4faca22
Fix skills: STDDEV rewrite rule, GBD-REGEX-ESCAPE for RLIKE, CAST(num…
wilmaontherun Apr 2, 2026
c1f740d
README: note that Spark is current dialect, more planned
wilmaontherun Apr 2, 2026
d8a1ed6
[ci] apply automatic fixes
feldera-bot Apr 2, 2026
add --verbose flag, translate-file, combined demos, and Feldera PK/quoting rule
wilmaontherun committed Mar 19, 2026
commit 56bd911b8b5957c5b8710dfab25c147d5cb39e73
37 changes: 33 additions & 4 deletions python/felderize/README.md
@@ -27,16 +27,32 @@ echo 'ANTHROPIC_API_KEY=your-key-here' > .env
# List available examples
felderize example

# Translate an example (validates by default)
felderize example simple

# Without compiler validation
felderize example simple --no-validate

# Log SQL submitted to the validator at each attempt
felderize example json --verbose

# Output as JSON
felderize example simple --json-output
```

Available examples:

| Name | Description |
|------|-------------|
| `simple` | Date truncation, GROUP BY |
| `strings` | INITCAP, LPAD, NVL, CONCAT_WS |
| `arrays` | array_contains, size, element_at |
| `joins` | Null-safe equality (`<=>`) |
| `windows` | LAG, running SUM OVER |
| `aggregations` | COUNT DISTINCT, HAVING (includes unsupported: COLLECT_LIST, PERCENTILE_APPROX) |
| `json` | get_json_object → PARSE_JSON + VARIANT access *(combined file)* |
| `topk` | ROW_NUMBER TopK, QUALIFY, DATEDIFF → TIMESTAMPDIFF *(combined file)* |

The JSON output contains:

```json
…
```

### Translate your own SQL

Two input formats are supported:

**Separate schema and query files:**
```bash
felderize translate path/to/schema.sql path/to/query.sql
felderize translate path/to/schema.sql path/to/query.sql --validate
```

**Single combined file** (CREATE TABLE and CREATE VIEW statements in one file):
```bash
felderize translate-file path/to/combined.sql
felderize translate-file path/to/combined.sql --validate
```

> **Note:** Running without `--validate` prints a warning — the output SQL has not been verified against the Feldera compiler.

Both commands accept `--verbose` to log the SQL submitted to the validator at each repair attempt.
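
The combined-file split can be sketched in a few lines of Python. This is a hypothetical toy (the shipped tool splits statements with sqlparse, and `split_combined` is an invented name), shown only to illustrate how schema and query parts are separated:

```python
import re

def split_combined(sql: str) -> tuple[str, str]:
    # Toy splitter: the naive ';' split breaks on semicolons inside
    # string literals, which is why the real tool uses a SQL parser.
    statements = [s.strip() for s in sql.split(";") if s.strip()]
    schema = [s + ";" for s in statements if re.match(r"(?i)CREATE\s+TABLE\b", s)]
    query = [s + ";" for s in statements if re.match(r"(?i)CREATE\s+(TEMPORARY\s+)?VIEW\b", s)]
    return "\n".join(schema), "\n".join(query)

combined = """
CREATE TABLE t (id BIGINT NOT NULL, PRIMARY KEY (id));
CREATE VIEW v AS SELECT id FROM t;
"""
schema_sql, query_sql = split_combined(combined)
```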

### Batch translation

```bash
1 change: 0 additions & 1 deletion python/felderize/pyproject.toml
@@ -25,7 +25,6 @@ felderize = [
"data/skills/**/*.md",
"data/samples/*.md",
"data/demo/*.sql",
"data/demo/expected/*.sql",
]

[project.scripts]
57 changes: 41 additions & 16 deletions python/felderize/spark/cli.py
@@ -22,10 +22,13 @@ def cli():
@click.option("--validate", is_flag=True, help="Validate against Feldera instance")
@click.option("--json-output", is_flag=True, help="Output as JSON")
@click.option("--no-docs", is_flag=True, help="Disable Feldera doc inclusion in prompt")
@click.option("--verbose", is_flag=True, help="Log SQL submitted to validator at each attempt")
def translate(
schema_file: str, query_file: str, validate: bool, json_output: bool, no_docs: bool, verbose: bool
):
"""Translate a single Spark SQL schema + query pair to Feldera SQL."""
if not validate:
click.echo("Warning: running without validation — output SQL is not verified against the Feldera compiler.", err=True)
config = Config.from_env()
schema_sql = Path(schema_file).read_text()
query_sql = Path(query_file).read_text()
@@ -36,6 +39,7 @@ def translate(
config,
validate=validate,
include_docs=not no_docs,
verbose=verbose,
)

if json_output:
@@ -49,8 +53,11 @@
@click.option("--validate", is_flag=True, help="Validate against Feldera instance")
@click.option("--json-output", is_flag=True, help="Output as JSON")
@click.option("--no-docs", is_flag=True, help="Disable Feldera doc inclusion in prompt")
@click.option("--verbose", is_flag=True, help="Log SQL submitted to validator at each attempt")
def translate_file(sql_file: str, validate: bool, json_output: bool, no_docs: bool, verbose: bool):
"""Translate a single combined Spark SQL file (schema + views) to Feldera SQL."""
if not validate:
click.echo("Warning: running without validation — output SQL is not verified against the Feldera compiler.", err=True)
config = Config.from_env()
combined_sql = Path(sql_file).read_text()
schema_sql, query_sql = split_combined_sql(combined_sql)
@@ -61,6 +68,7 @@ def translate_file(
config,
validate=validate,
include_docs=not no_docs,
verbose=verbose,
)

if json_output:
@@ -140,7 +148,8 @@ def batch(data_dir: str, validate: bool, output_dir: str | None, no_docs: bool):
)
@click.option("--json-output", is_flag=True, help="Output as JSON")
@click.option("--no-docs", is_flag=True, help="Disable Feldera doc inclusion in prompt")
@click.option("--verbose", is_flag=True, help="Log SQL submitted to validator at each attempt")
def example(name: str | None, validate: bool, json_output: bool, no_docs: bool, verbose: bool):
"""Run a built-in example translation.

Without NAME, lists available examples. With NAME, translates that example.
@@ -150,43 +159,59 @@
felderize example # list available examples
felderize example simple # translate the 'simple' example
"""
# Discover available examples: schema+query pairs and combined files
pairs: dict[str, tuple[Path, Path] | Path] = {}
for schema_file in sorted(_EXAMPLES_DIR.glob("*_schema.sql")):
example_name = schema_file.name.replace("_schema.sql", "")
query_file = _EXAMPLES_DIR / f"{example_name}_query.sql"
if query_file.is_file():
pairs[example_name] = (schema_file, query_file)
for combined_file in sorted(_EXAMPLES_DIR.glob("*_combined.sql")):
example_name = combined_file.name.replace("_combined.sql", "")
pairs[example_name] = combined_file

if not name:
click.echo("Available examples:\n")
for ex_name, files in pairs.items():
if isinstance(files, Path):
preview = files.read_text().strip().split("\n")[0]
click.echo(f" {ex_name:20s} {preview} [combined]")
else:
preview = files[0].read_text().strip().split("\n")[0]
click.echo(f" {ex_name:20s} {preview}")
click.echo("\nRun one with: felderize example <name>")
return

if name not in pairs:
click.echo(f"Unknown example '{name}'. Available: {', '.join(pairs)}", err=True)
sys.exit(1)

files = pairs[name]
if isinstance(files, Path):
combined_sql = files.read_text()
schema_sql, query_sql = split_combined_sql(combined_sql)
click.echo(f"-- Spark SQL ({name}) --", err=True)
click.echo(combined_sql.strip(), err=True)
else:
schema_file, query_file = files
schema_sql = schema_file.read_text()
query_sql = query_file.read_text()
click.echo(f"-- Spark Schema ({name}) --", err=True)
click.echo(schema_sql.strip(), err=True)
click.echo(f"\n-- Spark Query ({name}) --", err=True)
click.echo(query_sql.strip(), err=True)
click.echo("\nTranslating...\n", err=True)

if not validate:
click.echo("Warning: running without validation — output SQL is not verified against the Feldera compiler.", err=True)
config = Config.from_env()
result = translate_spark_to_feldera(
schema_sql,
query_sql,
config,
validate=validate,
include_docs=not no_docs,
verbose=verbose,
)

if json_output:
21 changes: 0 additions & 21 deletions python/felderize/spark/data/demo/expected/aggregations.sql

This file was deleted.

15 changes: 0 additions & 15 deletions python/felderize/spark/data/demo/expected/arrays.sql

This file was deleted.

23 changes: 0 additions & 23 deletions python/felderize/spark/data/demo/expected/joins.sql

This file was deleted.

18 changes: 0 additions & 18 deletions python/felderize/spark/data/demo/expected/simple.sql

This file was deleted.

18 changes: 0 additions & 18 deletions python/felderize/spark/data/demo/expected/strings.sql

This file was deleted.

17 changes: 0 additions & 17 deletions python/felderize/spark/data/demo/expected/windows.sql

This file was deleted.

60 changes: 60 additions & 0 deletions python/felderize/spark/data/skills/spark_skills.md
@@ -52,6 +52,64 @@ If the compiler reports `Encountered "<" ... ARRAY<VARCHAR>`: rewrite ALL `ARRAY
| `CREATE TEMPORARY VIEW` | → `CREATE VIEW` |
| `USING parquet` / `delta` / `csv` | Remove clause |
| `PARTITIONED BY (...)` | Remove clause |
| `CONSTRAINT name PRIMARY KEY (cols)` | → `PRIMARY KEY (cols)` — drop the `CONSTRAINT name` wrapper; Feldera rejects the named constraint syntax |
| PK column without `NOT NULL` | Add `NOT NULL` — Feldera requires all PRIMARY KEY columns to be NOT NULL |
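
As a rough illustration of the clause-level rewrites in this table, a regex pass could look like the following. This is hypothetical (`rewrite_ddl` is an invented helper; the actual translation is model-driven, and this sketch deliberately skips the NOT NULL rule, which needs column-level analysis):

```python
import re

def rewrite_ddl(sql: str) -> str:
    # Clause-level rewrites from the table above (illustrative only).
    sql = re.sub(r"(?i)CREATE\s+TEMPORARY\s+VIEW", "CREATE VIEW", sql)
    sql = re.sub(r"(?i)\s+USING\s+(parquet|delta|csv)\b", "", sql)
    sql = re.sub(r"(?i)\s+PARTITIONED\s+BY\s*\([^)]*\)", "", sql)
    sql = re.sub(r"(?i)CONSTRAINT\s+\w+\s+(PRIMARY\s+KEY)", r"\1", sql)
    return sql
```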

### PRIMARY KEY rules

Two constraints Feldera enforces that Spark does not:

1. **No `CONSTRAINT name` wrapper** — Feldera rejects `CONSTRAINT pk PRIMARY KEY (col)`. Use bare `PRIMARY KEY (col)`.
2. **All PK columns must be `NOT NULL`** — Feldera rejects nullable PK columns:

```
error: PRIMARY KEY cannot be nullable: PRIMARY KEY column 'borrowerid' has type VARCHAR, which is nullable
```

```sql
-- Spark (both issues)
CREATE TABLE orders (
order_id STRING,
item_id STRING,
CONSTRAINT orders_pk PRIMARY KEY (order_id, item_id)
);

-- Feldera (fixed)
CREATE TABLE orders (
order_id VARCHAR NOT NULL,
item_id VARCHAR NOT NULL,
PRIMARY KEY (order_id, item_id)
);
```

### Reserved words as column names must be quoted

**Only quote column names that are SQL reserved words** — do not quote ordinary identifiers. Quoting non-reserved words is wrong: it makes identifiers case-sensitive and adds unnecessary noise.

Quote a column name only when the compiler rejects it unquoted:
```
error: Error parsing SQL: Encountered ", TimeStamp" at line 33, column 25.
```

When quoting is needed, apply it consistently in both `CREATE TABLE` and every query reference:

```sql
-- Schema: only "TimeStamp" is quoted — it clashes with the TIMESTAMP type keyword
CREATE TABLE events (
id BIGINT NOT NULL,
source VARCHAR,
"TimeStamp" TIMESTAMP,
PRIMARY KEY (id)
);

-- Query: quote "TimeStamp" everywhere it appears, leave other columns unquoted
SELECT e.source, e."TimeStamp" as ts,
MAX(e."TimeStamp") OVER (PARTITION BY e.id) as max_ts
FROM events e
WHERE e."TimeStamp" >= TIMESTAMP '2024-01-01 00:00:00'
```

Known column names that clash with SQL keywords: `TimeStamp`, `Date`, `Time`, `Value`, `Type`, `Name`, `Language`.
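
A hedged sketch of applying this rule mechanically (hypothetical helper, not part of felderize; the keyword set is a partial illustration, not Feldera's full reserved-word list):

```python
# Partial, illustrative keyword set; not the full Feldera reserved-word list.
CLASHING_KEYWORDS = {"TIMESTAMP", "DATE", "TIME", "VALUE", "TYPE", "NAME", "LANGUAGE"}

def quote_if_reserved(column: str) -> str:
    # Double-quote only names that clash with SQL keywords; quoted
    # identifiers become case-sensitive, so quoting everything is wrong.
    if column.upper() in CLASHING_KEYWORDS:
        return f'"{column}"'
    return column
```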

### DDL Examples

@@ -578,6 +636,8 @@ When the Feldera compiler rejects translated SQL, check these common causes first
| `No match found for function signature day(<TIMESTAMP>)` | Used `DAY(ts)` on a TIMESTAMP | Use `DAYOFMONTH(ts)` or `EXTRACT(DAY FROM ts)` |
| `No match found for function signature X` | Function is unsupported | Check this reference; if listed as unsupported, return immediately — do NOT retry |
| `Encountered "<" ... ARRAY<VARCHAR>` | Used Spark array syntax | Rewrite ALL `ARRAY<T>` to `T ARRAY` suffix form |
| `Error parsing SQL: Encountered ", ColumnName"` | Column name is a SQL reserved word | Double-quote the column name in schema and all query references, e.g. `"TimeStamp"` |
| `PRIMARY KEY cannot be nullable: column 'x' has type T, which is nullable` | PK column missing `NOT NULL` | Add `NOT NULL` to every column listed in the PRIMARY KEY |

## Important rules
