Skip to content

Commit c892e06

Browse files
committed
[SQL] Improve usability of VARIANT type
Signed-off-by: Mihai Budiu <mbudiu@feldera.com>
1 parent 2e0a77e commit c892e06

File tree

20 files changed

+376
-64
lines changed

20 files changed

+376
-64
lines changed

crates/adapters/src/format/avro/schema.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,10 @@ pub fn validate_field_schema(
200200
// This type currently cannot occur in SQL table declarations.
201201
return Err("not implemented: Avro deserialization for 'INTERVAL' type".to_string());
202202
}
203+
SqlType::Variant => {
204+
// This type currently cannot occur in SQL table declarations.
205+
return Err("not implemented: Avro deserialization for 'VARIANT' type".to_string());
206+
}
203207
SqlType::Array => {
204208
// This schema is generated by the SQL compiler, so this should never happen.
205209
if field_schema.component.is_none() {
@@ -434,6 +438,9 @@ impl AvroSchemaBuilder {
434438
SqlType::Interval(_) => {
435439
return Err("not implemented: Avro encoding for the SQL interval type".to_string())
436440
}
441+
SqlType::Variant => {
442+
return Err("not implemented: Avro encoding for the SQL VARIANT type".to_string())
443+
}
437444
SqlType::Array => {
438445
let component = column_type
439446
.component

crates/adapters/src/format/json/schema.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ mod kafka_connect_json_converter {
266266
}
267267
}
268268
SqlType::Interval(_) => RepresentationType::String,
269+
SqlType::Variant => RepresentationType::String,
269270
SqlType::Null => RepresentationType::String,
270271
}
271272
}

crates/adapters/src/format/parquet/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ pub fn relation_to_arrow_fields(fields: &[Field], delta_lake: bool) -> Vec<Arrow
277277
SqlType::Interval(
278278
IntervalUnit::YearToMonth | IntervalUnit::Year | IntervalUnit::Month,
279279
) => DataType::Interval(ArrowIntervalUnit::YearMonth),
280+
SqlType::Variant => unimplemented!(),
280281
SqlType::Interval(_) => DataType::Interval(ArrowIntervalUnit::DayTime),
281282
SqlType::Array => {
282283
// SqlType::Array implies c.component.is_some()

crates/adapters/src/transport/datagen.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,7 @@ impl RecordGenerator {
542542
| SqlType::Varchar
543543
| SqlType::Timestamp
544544
| SqlType::Date
545+
| SqlType::Variant
545546
| SqlType::Time => Value::String(String::new()),
546547
SqlType::Interval(_unit) => Value::Null,
547548
SqlType::Array => Value::Array(Vec::new()),
@@ -616,6 +617,11 @@ impl RecordGenerator {
616617
*obj = Value::Null;
617618
Ok(())
618619
}
620+
SqlType::Variant => {
621+
// I don't think this can show up in a table schema
622+
*obj = Value::Null;
623+
Ok(())
624+
}
619625
SqlType::Array => self.generate_array(field, settings, rng, obj),
620626
SqlType::Map => self.generate_map(field, settings, rng, obj),
621627
SqlType::Struct => self.generate_fields(

crates/feldera-types/src/program_schema.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,9 @@ pub enum SqlType {
404404
/// SQL `NULL` type.
405405
#[serde(rename = "NULL")]
406406
Null,
407+
/// SQL `VARIANT` type.
408+
#[serde(rename = "VARIANT")]
409+
Variant,
407410
}
408411

409412
impl Display for SqlType {
@@ -444,6 +447,7 @@ impl<'de> Deserialize<'de> for SqlType {
444447
"varchar" => Ok(SqlType::Varchar),
445448
"binary" => Ok(SqlType::Binary),
446449
"varbinary" => Ok(SqlType::Varbinary),
450+
"variant" => Ok(SqlType::Variant),
447451
"time" => Ok(SqlType::Time),
448452
"date" => Ok(SqlType::Date),
449453
"timestamp" => Ok(SqlType::Timestamp),
@@ -481,6 +485,7 @@ impl From<SqlType> for &'static str {
481485
SqlType::Array => "ARRAY",
482486
SqlType::Struct => "STRUCT",
483487
SqlType::Map => "MAP",
488+
SqlType::Variant => "VARIANT",
484489
SqlType::Null => "NULL",
485490
}
486491
}

docs/contributors/compiler.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ Usage: sql-to-dbsp [options] Input file to compile
7575
How unquoted identifiers are treated. Choices are: 'upper', 'lower',
7676
'unchanged'
7777
Default: upper
78+
--no-restrict-io
79+
Do not restrict the types of columns allowed in tables and views
80+
Default: false
7881
-O
7982
Optimization level (0, 1, or 2)
8083
Default: 2

docs/sql/json.md

Lines changed: 114 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,40 @@ Any such value holds at runtime two pieces of information:
77
- the data type
88
- the data value
99

10-
Values of `VARIANT` type can be created by casting any other value to a `VARIANT`: e.g.
11-
`SELECT CAST(x AS VARIANT)`. Conversely, values of type `VARIANT` can be cast to any other data type
12-
`SELECT CAST(variant AS INT)`. A cast of a value of type `VARIANT` to target type T
13-
will compare the runtime type with T. If the types are identical, the
14-
original value is returned. Otherwise the `CAST` returns `NULL`.
10+
::: warning
1511

16-
Values of type `ARRAY`, `MAP`, and `ROW` type can be cast to `VARIANT`. `VARIANT` values
17-
also offer the following operations:
12+
Currently `VARIANT` values are not supported in table or output views;
13+
they can only be used in intermediate computations.
14+
15+
:::
16+
17+
18+
Values of `VARIANT` type can be created by casting any other value to
19+
a `VARIANT`: e.g. `SELECT CAST(x AS VARIANT)`. Conversely, values of
20+
type `VARIANT` can be cast to any other data type `SELECT CAST(variant
21+
AS INT)`. A cast of a value of type `VARIANT` to target type T will
22+
compare the runtime type with T. If the types are identical or there
23+
is a natural conversion from the runtime type to T, the original value
24+
is returned. Otherwise the `CAST` returns `NULL`.
25+
26+
::: warning
27+
28+
Remember that the `DECIMAL` type specified without precision is the
29+
same as `DECIMAL(0)`, with no digits after the decimal point. When
30+
you cast a `VARIANT` value to `DECIMAL` you should specify a precision
31+
and scale good enough for the values that you expect in the data.
32+
33+
:::
34+
35+
Values of type `ARRAY` and `MAP` can be cast to `VARIANT`.
36+
37+
There exists a special value of `VARIANT` type called `null`. This
38+
value is different from the SQL `NULL` value. It is used to implement
39+
the JSON `null` value. An important difference is that two `VARIANT`
40+
`null` values are equal, whereas `NULL` in SQL is not equal (or
41+
different!) to anything.
42+
43+
`VARIANT` values also offer the following operations:
1844

1945
- indexing using array indexing notation `variant[index]`. If the `VARIANT` is
2046
obtained from an `ARRAY` value, the indexing operation returns a `VARIANT` whose value element
@@ -30,11 +56,7 @@ also offer the following operations:
3056
is subject to the capitalization rules of the SQL dialect, so for correct
3157
operation the field may need to be quoted: `variant."field"`
3258

33-
The runtime types do not need to match exactly the compile-time types.
34-
As a compiler front-end, Calcite does not mandate exactly how the runtime types
35-
are represented. Calcite does include one particular implementation in
36-
Java runtime, which is used for testing. In this representation
37-
the runtime types are represented as follows:
59+
The runtime types do not match exactly the compile-time types:
3860

3961
- The scalar types do not include information about precision and scale. Thus all `DECIMAL`
4062
compile-time types are represented by a single run-time type.
@@ -43,20 +65,64 @@ the runtime types are represented as follows:
4365
- `FLOAT` and `DOUBLE` are both represented by the same runtime type.
4466
- All "short interval" types (from days to seconds) are represented by a single type.
4567
- All "long interval" types (from years to months) are represented by a single type.
46-
- Generic types such as `INT ARRAY`, `MULTISET`, and `MAP` do carry runtime
68+
- Generic types such as `INT ARRAY`, `MULTISET`, and `MAP` do not carry runtime
4769
information about the element types
48-
- The `ROW` type does have information about all field types (currently not yet supported)
4970

5071
## Functions that operate on `VARIANT` values
5172

5273
| Function | Description |
5374
| `VARIANTNULL()` | Can be used to create an instance of the `VARIANT` `null` value. |
54-
| `TYPEOF(` _variant_ `)` | Argument must be a `VARIANT` value. Returns the runtime type of the value |
75+
| `TYPEOF(` _variant_ `)` | Argument must be a `VARIANT` value. Returns a string describing the runtime type of the value |
5576
| `PARSE_JSON(` _string_ `)` | Parses a string that represents a JSON value, returns a `VARIANT` object, or `NULL` if parsing fails |
56-
| `UNPARSE_JSON(` _variant_ `)` | Returns a string that represents the serialization of a `VARIANT` value. If the value cannot be represented as JSON, the result is `NULL` |
77+
| `UNPARSE_JSON(` _variant_ `)` | Argument must be a `VARIANT` value. Returns a string that represents the serialization of a `VARIANT` value. If the value cannot be represented as JSON, the result is `NULL` |
78+
79+
### `PARSE_JSON`
80+
81+
`PARSE_JSON` converts a JSON value as follows:
82+
83+
- JSON `null` is converted to a `VARIANT` `null` value (not a SQL `NULL`!); see above the description of `VARIANT` `null`
84+
- JSON Booleans are converted to `BOOLEAN` values (wrapped in `VARIANT` values)
85+
- JSON numbers are converted to `DECIMAL` values (wrapped in `VARIANT` values)
86+
- JSON strings are converted to `VARCHAR` values (wrapped in `VARIANT` values)
87+
- JSON arrays are converted to `VARIANT ARRAY` values (wrapped in `VARIANT` values). Each array element is a `VARIANT`
88+
- JSON objects are converted to `MAP<VARIANT, VARIANT>` values (wrapped in `VARIANT` values). Each key and each value is a `VARIANT`
89+
90+
For example, `PARSE_JSON("{"a": 1, "b": [2, 3.3, null]}")` generates the same SQL value that would be generated by the following code:
91+
92+
```sql
93+
SELECT CAST(
94+
MAP[
95+
CAST('a' AS VARIANT), CAST(1.0 AS VARIANT),
96+
CAST('b' AS VARIANT), CAST(ARRAY[
97+
CAST(2.0 AS VARIANT),
98+
CAST(3.3 AS VARIANT),
99+
VARIANTNULL()
100+
] AS VARIANT)
101+
] AS VARIANT)
102+
```
57103

58-
Here are some examples using `VARIANT` and `JSON` values:
59104

105+
### `UNPARSE_JSON`
106+
107+
`PARSE_JSON` converts a `VARIANT` value to a `VARCHAR`. The
108+
assumption is that the `VARIANT` was has the structure as produced by
109+
the `PARSE_JSON` function:
110+
111+
- the `VARIANT` `null` value is converted to the string `null`
112+
- a `VARIANT` wrapping a Boolean value is converted to the respective Boolean string `true` or `false`
113+
- a `VARIANT` wrapping any numeric value (`DECIMAL`, `TINYINT`, `SMALLINT`, `INTEGER`, `BIGINT`, `REAL`, `DOUBLE`, `DECIMAL`) is converted to the string representation of the value as produced using a `CAST(value AS VARCHAR)`
114+
- a `VARIANT` wrapping a `VARCHAR` value is converted to a string with double quotes, and with escape sequences, as mandated by the JSON grammar
115+
- a `VARIANT` wrapping an `ARRAY` with elements of any type is converted to a JSON array, and the elements are recursively converted
116+
- a `VARIANT` wrapping a `MAP` whose keys have any SQL `CHAR` type, or `VARIANT` values wrapping `CHAR` values will generate a JSON object, by recursively converting each key-value pair.
117+
- any other data value is a conversion error, and causes the `UNPARSE_JSON` function to produce a `NULL` result
118+
119+
## Examples
120+
121+
Here are some simple SQL query examples using `VARIANT` and JSON
122+
values and the expected output values. (Note that these examples
123+
cannot be executed directly, since they involve no views.)
124+
125+
```sql
60126
SELECT CAST(1 AS VARIANT)
61127
1
62128

@@ -128,7 +194,7 @@ null
128194
SELECT CAST(ARRAY[1,2,3] AS VARIANT)[1]
129195
1
130196

131-
-- Acessing items in a VARIANT array returns VARIANT values,
197+
-- Accessing items in a VARIANT array returns VARIANT values,
132198
-- even if the array itself does not contain VARIANT values
133199
-- (Otherwise TYPEOF would not compile)
134200
SELECT TYPEOF(CAST(ARRAY[1,2,3] AS VARIANT)[1])
@@ -240,4 +306,33 @@ true
240306

241307
-- JSON cannot contain dates, result is NULL
242308
SELECT UNPARSE_JSON(CAST(DATE '2020-01-01' AS VARIANT))
243-
NULL
309+
NULL
310+
```
311+
312+
### Example SQL program manipulating JSON values
313+
314+
```sql
315+
CREATE TABLE json (json VARCHAR);
316+
317+
CREATE LOCAL VIEW tmp AS SELECT PARSE_JSON(json) AS json FROM json;
318+
319+
CREATE VIEW average AS SELECT
320+
CAST(json['name'] AS VARCHAR) as name,
321+
((CAST(json['scores'][1] AS DECIMAL(8, 2)) + CAST(json['scores'][2] AS DECIMAL(8, 2))) / 2) as average
322+
FROM tmp;
323+
```
324+
325+
The input table has a single column, with type `VARCHAR`.
326+
327+
The input data in table is parsed using `PARSE_JSON` and stored in the
328+
intermediate view `tmp`.
329+
330+
The query that defines the output view `average` accesses fields of
331+
the JSON values of `tmp`. Note how object fields are accessed using
332+
map indexing operators `['scores']`, `['name']`, and how array
333+
elements are accessed using indexing with numeric values `[1]`.
334+
Recall that array indexes in SQL start from 1!
335+
336+
Finally, notice how the `DECIMAL` values that are retrieved need to
337+
specify the precision and scale: `CAST(... AS DECIMAL(8, 2))`. Using
338+
`CAST(... AS DECIMAL)` would loose all digits after the decimal point.

docs/sql/types.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ The compiler supports the following SQL data types:
2525
| `GEOMETRY` | A geographic data type (only rudimentary support at this point). | |
2626
| `ARRAY` | An array with element of the specified type. Used as a suffix for another type (e.g., `INT ARRAY`) | |
2727
| `MAP` | A map with keys and values of specified types. The syntax is `MAP<KEYTYPE, VALUETYPE>` | |
28-
| `VARIANT` | A dynamically-typed value that can have as value any other SQL type | |
28+
| `VARIANT` | A dynamically-typed value that can wrap any other SQL type | |
2929

3030
- For `DECIMAL` types: 23.456 has a precision of 5 and a scale of 3.
3131
If scale is missing it is assumed to be 0.
@@ -43,6 +43,10 @@ The compiler supports the following SQL data types:
4343
user-defined types as field types. They can only used in
4444
expressions.
4545

46+
- `VARIANT` is used to implement JSON. See [JSON support](json.md)
47+
Currently `VARIANT` values are not supported in table or output
48+
views; they can only be used in intermediate computations.
49+
4650
## Computations on nullable types
4751

4852
A type is nullable if it can represent the `NULL` value. For input

python/tests/grades-json.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
JSON
2+
"{""name"": ""Bob"", ""scores"": [8, 10]}"
3+
"{""name"": ""John"", ""scores"": [9, 10]}"
4+
"{""name"": ""Dunce"", ""scores"": [3, 4]}"

python/tests/test_variant.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import os
2+
import unittest
3+
import pandas as pd
4+
5+
import unittest
6+
from feldera import PipelineBuilder, Pipeline
7+
from tests import TEST_CLIENT
8+
from decimal import Decimal
9+
10+
class TestVariant(unittest.TestCase):
11+
def test_local(self):
12+
sql = f"""
13+
CREATE TABLE json (json VARCHAR);
14+
15+
CREATE LOCAL VIEW tmp AS SELECT PARSE_JSON(json) AS json FROM json;
16+
17+
CREATE VIEW average AS SELECT
18+
CAST(json['name'] AS VARCHAR) as name,
19+
((CAST(json['scores'][1] AS DECIMAL(8, 2)) + CAST(json['scores'][2] AS DECIMAL(8, 2))) / 2) as average
20+
FROM tmp;
21+
"""
22+
23+
dir_path = os.path.dirname(os.path.realpath(__file__))
24+
pipeline = PipelineBuilder(TEST_CLIENT, name="test_variant", sql=sql).create_or_replace()
25+
df_grades = pd.read_csv(dir_path + '/grades-json.csv')
26+
27+
expected_data = [
28+
{ "name": "Bob", "average": Decimal(9) },
29+
{ "name": "Dunce", "average": Decimal(3.5) },
30+
{ "name": "John", "average": Decimal(9.5) }
31+
]
32+
for datum in expected_data:
33+
datum.update({"insert_delete": 1})
34+
35+
out = pipeline.listen("average")
36+
37+
pipeline.start()
38+
pipeline.input_pandas('json', df_grades)
39+
pipeline.wait_for_completion(True)
40+
out_data = out.to_dict()
41+
42+
assert expected_data == out_data
43+
pipeline.delete()
44+
45+
if __name__ == '__main__':
46+
unittest.main()

0 commit comments

Comments
 (0)