feldera · mihaibudiu · Dec 14, 2023 · Dec 7, 2023 · Dec 11, 2023 · Dec 11, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - SQL: support for trigonometric functions `sin` and `cos` ([#1118](https://github.com/feldera/feldera/pull/1118))
 - SQL: support for mathematical constant `PI` ([#1123](https://github.com/feldera/feldera/pull/1123))
 - WebConsole: 'Inspect connector' button in the connector list in Pipeline Builder that opens a non-editable popup
+- SQL: Support for user-defined functions, declared in SQL and implemented
+        in Rust ([#1129](https://github.com/feldera/feldera/pull/1129))
 
 ## [0.5.0] - 2023-12-05
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -13,7 +13,7 @@ with `git commit -s`.
 Our team develops and tests using Linux and MacOS. Windows Subsystem for Linux works fine too.
 
 The Feldera container images and CI workflows use Linux. You can see our setup in
-our [Dockerfile](deploy/Dockerfile) and [Earthfile](Earthfile) .
+our [Dockerfile](deploy/Dockerfile) and [Earthfile](Earthfile).
 
 Our known dependencies are:
 - Runtime
@@ -29,6 +29,7 @@ Our known dependencies are:
   - Python 3
   - typescript
   - Redpanda or Kafka
+  - Earthly (https://earthly.dev/get-earthly)
 
 Additional dependencies are automatically installed by the Rust,
 maven, Python, and typescript build tools.

diff --git a/docs/contributors/compiler.md b/docs/contributors/compiler.md
@@ -56,6 +56,9 @@ Usage: sql-to-dbsp [options] Input file to compile
     --ignoreOrder
       Ignore ORDER BY clauses at the end
       Default: false
+    --udf
+      Specify a Rust file containing implementations of user-defined functions
+      Default: <empty string>
     -O
       Optimization level (0, 1, or 2)
       Default: 2

diff --git a/docs/sidebars.js b/docs/sidebars.js
@@ -108,6 +108,7 @@ const sidebars = {
         'sql/binary',
         'sql/array',
         'sql/datetime'
+        'sql/udf'
       ]
     },
     {

diff --git a/docs/sql/grammar.md b/docs/sql/grammar.md
@@ -1,6 +1,14 @@
 # SQL Grammar
 
-This is a short formal description of the grammar supported:
+This is a short formal description of the grammar supported in a BNF
+form.
+
+- Constructs enclosed between `[]` are optional.
+- `*` denotes zero or many repetitions.
+- Uppercase words (`FUNCTION`) and single-quoted text (`')'`) indicate
+  grammar terminals.
+- Parentheses `()` are used for grouping productions together.
+- The vertical bar `|` indicates alternation.
 
 ```
 statementList:
@@ -9,6 +17,13 @@ statementList:
 statement
   :   createTableStatement
   |   createViewStatement
+  |   createFunctionStatement
+
+generalType
+  :   type [NOT NULL]
+
+createFunctionStatement
+  :   CREATE FUNCTION name '(' [ columnDecl [, columnDecl ]* ] ')' RETURNS generalType
 
 createTableStatement
   :   CREATE TABLE name
@@ -20,7 +35,7 @@ createViewStatement
       AS query
 
 tableElement
-  :   columnName type [NOT [NULL]] ( columnConstraint )*
+  :   columnName generalType ( columnConstraint )*
   |   columnName
   |   tableConstraint
 
@@ -79,7 +94,7 @@ groupItem:
   |   '(' expression [, expression ]* ')'
 
 columnDecl
-  :   column type [ NOT NULL ]
+  :   column generalType
 
 selectWithoutFrom
   :   SELECT [ ALL | DISTINCT ]
@@ -105,7 +120,7 @@ tableExpression
 
 joinCondition
   :   ON booleanExpression
-  |   USING '(' column [, column ]* ')'
+  |   USING '(' column [, column ]* ')'
 
 tableReference
   :   tablePrimary [ pivot ] [ [ AS ] alias [ '(' columnAlias [, columnAlias ]* ')' ] ]
@@ -142,6 +157,9 @@ and is currently ignored.
 In `orderItem`, if expression is a positive integer n, it denotes the
 nth item in the `SELECT` clause.
 
+SQL `CREATE FUNCTION` can be used to declare [user-defined
+functions](udf.md).
+
 An aggregate query is a query that contains a `GROUP BY` or a `HAVING`
 clause, or aggregate functions in the `SELECT` clause. In the
 `SELECT`, `HAVING` and `ORDER` BY clauses of an aggregate query, all

diff --git a/docs/sql/udf.md b/docs/sql/udf.md
@@ -0,0 +1,126 @@
+# User-defined functions
+
+The SQL statement `CREATE FUNCTION` can be used to declare new
+functions whose implementation is provided separately in Rust.
+
+The implementation of these functions is provided to the compiler in a
+separate file, whose name is specified using the `--udf` flag.
+
+The following example shows a user-defined function:
+
+```sql
+CREATE FUNCTION contains_number(str VARCHAR NOT NULL, value INTEGER) RETURNS BOOLEAN NOT NULL
+CREATE VIEW V0 AS SELECT contains_number(CAST('YES: 10 NO:5' AS VARCHAR), 5)
+```
+
+The function name capitalization obeys the same rules as table and
+view names: the names are converted by default to all-capitals, but if
+the name is quoted capitalization is unchanged.  This is important,
+because the user must provide a rust implementation that matches the
+canonical function name.  Here is a possible implementation of the
+function `contains_number` above:
+
+```rs
+use sqllib::*;
+
+pub fn CONTAINS_NUMBER(pos: &SourcePositionRange, str: String, value: Option<i32>) ->
+   Result<bool, Box<dyn std::error::Error>> {
+   match value {
+      None => Err(format!(\"{}: null value\", pos).into()),
+      Some(value) => Ok(str.contains(&format!(\"{}\", value).to_string())),
+   }
+}
+```
+
+The `use sqllib::types::*` directive imports the definitions of the
+standard Rust types that the compiler uses to implement SQL datatypes.
+The next section explains what these types are.
+
+Notice the function implemented has an all-capitals name (which is not
+a standard convention for Rust), dictated by the default SQL
+capitalization rules.
+
+Currently there is no type casting or type inference performed for the
+function arguments in the SQL program.  For example, a call such as
+`CONTAINS_NUMBER('2010-10-20', '5')` will fail at SQL compilation time
+because the first argument has type `CHAR(8)` instead of `VARCHAR`,
+and the second argument has type `CHAR(1)` instead of `INTEGER`.
+
+Clearly, user-defined functions can pose security problems, since the
+Rust implementation is only verified by the Rust compiler.  Such
+functions are expected to have no side-effects, to be deterministic,
+and not to crash.
+
+## Type representation in Rust
+
+The following table shows the Rust representation of standard SQL data
+types.  A nullable SQL type is represented by the corresponding rust
+`Option<>` type.  Notice that some of these types are not standard
+Rust types, but are defined in the DBSP runtime library.
+
+SQL | Rust
+-----------
+`BOOLEAN` | `bool`
+`TINYINT` | `i8`
+`SMALLINT` | `i16`
+`INT`  | `i32`
+`BIGINT` | `i64`
+`DECIMAL`(p, s) | `Decimal`
+`REAL` | `F32`
+`DOUBLE` | `F64`
+`CHAR`(n) | `String`
+`VARCHAR`, `VARCHAR`(n) | `String`
+`BINARY`, `BINARY`(n) | `ByteArray`
+`NULL` | `()`
+`INTERVAL` | `ShortInterval`, `LongInterval`
+`TIME` | `Time`
+`TIMESTAMP` | `Timestamp`
+`DATE` | `Date`
+`ARRAY T` | `Vec<T>`
+
+Multiple SQL types may be represented by the same Rust type.  For
+example, `CHAR`, `CHAR(n)`, `VARCHAR(n)`, and `VARCHAR` are all
+represented by the standard Rust `String` type.
+
+The SQL family of `INTERVAL` types translates to one of two Rust
+types: `ShortInterval` (representing intervals from days to seconds),
+and `LongInterval` (representing intervals from years to months).
+(Our dialect of SQL does not allow mixing the two kinds of intervals
+in a single expression.)
+
+In the Rust implementation the function always has to return the type
+`Result<T, Box<dyn std::error::Error>>`, where `T` is the Rust
+equivalent of the expected return type of the SQL function.  The Rust
+function should return an `Err` only when the function fails at
+runtime; in this case the returned error can use the source position
+information to indicate where the error has originated in the code.
+The function should return an error only for fatal conditions, similar
+to other SQL functions (e.g., array index out of bounds, arithmetic
+overflows, etc.).
+
+## Source position information
+
+The first argument passed to the Rust function is always `pos:
+&SourcePositionRange`.  This argument indicates the position in the
+SQL source code of the call to this user-defined function.  This
+information can be used to generate better runtime error messages when
+the user-defined function encounters an error. (Note: currently
+Calcite does not provide any source position information, but we hope
+to remedy this state of affairs soon.)
+
+## Limitations
+
+There can be only one function with each name.
+
+Functions cannot have names identical to standard SQL library function
+names.
+
+Polymorphic functions are not supported.  For example, in SQL the
+addition operation operates on any numeric types; such an operation
+cannot be implemented as a single user-defined function.
+
+The current type checker is very strict, and it requires the function
+arguments to have exactly the specified types.  No casts are inserted
+by the compiler.  That is why the previous example requires casting
+the string `'YES: 10 NO: 5'` to `VARCHAR`.
+
diff --git a/sql-to-dbsp-compiler/SQL-compiler/out7670973118205954433.tmp b/sql-to-dbsp-compiler/SQL-compiler/out7670973118205954433.tmp
diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/main/codegen/config.fmpp b/sql-to-dbsp-compiler/SQL-compiler/src/main/codegen/config.fmpp
@@ -15,6 +15,7 @@ data: {
       "org.apache.calcite.sql.ddl.SqlDdlNodes"
       "org.apache.calcite.sql.ddl.SqlCreateType"
       "org.dbsp.sqlCompiler.compiler.frontend.calciteCompiler.SqlExtendedColumnDeclaration"
+      "org.dbsp.sqlCompiler.compiler.frontend.calciteCompiler.SqlCreateFunctionDeclaration"
     ]
 
     # List of new keywords. Example: "DATABASES", "TABLES". If the keyword is
@@ -28,7 +29,7 @@ data: {
       "SEMI"
       "SEQUENCES"
       "TEMP"
-      "VOLATILE"
+      #"VOLATILE"
     ]
 
     # List of keywords from "keywords" section that are not reserved.
@@ -883,13 +884,6 @@ data: {
     # List of methods for parsing custom SQL statements.
     # Return type of method implementation should be 'SqlNode'.
     statementParserMethods: [
-      # The following are not yet released
-      # "PostgresqlSqlShow()",
-      # "PostgresqlSqlSetOption()",
-      # "PostgresqlSqlBegin()",
-      # "PostgresqlSqlDiscard()",
-      # "PostgresqlSqlCommit()",
-      # "PostgresqlSqlRollback()"
     ]
 
     # List of methods for parsing custom literals.
@@ -918,6 +912,7 @@ data: {
       "SqlCreateView"
       "SqlCreateExtendedTable"
       "SqlCreateType"
+      "SqlCreateFunction"
     ]
 
     truncateStatementParserMethods: [

diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/main/codegen/includes/ddl.ftl b/sql-to-dbsp-compiler/SQL-compiler/src/main/codegen/includes/ddl.ftl
@@ -98,7 +98,7 @@ SqlExtendedColumnDeclaration ColumnAttribute(SqlExtendedColumnDeclaration column
         )
 }
 
-SqlNodeList AttributeDefList() :
+SqlNodeList NonEmptyAttributeDefList() :
 {
     final Span s;
     final List<SqlNode> list = new ArrayList<SqlNode>();
@@ -114,6 +114,23 @@ SqlNodeList AttributeDefList() :
     }
 }
 
+SqlNodeList AttributeDefList() :
+{
+    final Span s;
+    final List<SqlNode> list = new ArrayList<SqlNode>();
+}
+{
+    <LPAREN> { s = span(); }
+    ( AttributeDef(list)
+        (
+            <COMMA> AttributeDef(list)
+        )*
+    )?
+    <RPAREN> {
+        return new SqlNodeList(list, s.end(this));
+    }
+}
+
 void AttributeDef(List<SqlNode> list) :
 {
     final SqlIdentifier id;
@@ -135,6 +152,27 @@ void AttributeDef(List<SqlNode> list) :
     }
 }
 
+SqlCreateFunctionDeclaration SqlCreateFunction(Span s, boolean replace) :
+{
+    final boolean ifNotExists;
+    final SqlIdentifier id;
+    final SqlNodeList parameters;
+    final SqlDataTypeSpec type;
+    final boolean nullable;
+}
+{
+    <FUNCTION> ifNotExists = IfNotExistsOpt()
+    id = SimpleIdentifier()
+    parameters = AttributeDefList()
+    <RETURNS>
+    type = DataType()
+    nullable = NullableOptDefaultTrue()
+    {
+        return new SqlCreateFunctionDeclaration(s.end(this), replace, ifNotExists,
+            id, parameters, type.withNullable(nullable));
+    }
+}
+
 SqlCreate SqlCreateType(Span s, boolean replace) :
 {
     final SqlIdentifier id;
@@ -146,7 +184,7 @@ SqlCreate SqlCreateType(Span s, boolean replace) :
     id = CompoundIdentifier()
     <AS>
     (
-        attributeDefList = AttributeDefList()
+        attributeDefList = NonEmptyAttributeDefList()
     |
         type = DataType()
     )

diff --git a/sql-to-dbsp-compiler/SQL-compiler/src/main/codegen/includes/parserImpls.ftl b/sql-to-dbsp-compiler/SQL-compiler/src/main/codegen/includes/parserImpls.ftl
@@ -73,15 +73,6 @@ TableCollectionType TableCollectionTypeOpt() :
     { return TableCollectionType.UNSPECIFIED; }
 }
 
-boolean VolatileOpt() :
-{
-}
-{
-    <VOLATILE> { return true; }
-|
-    { return false; }
-}
-
 /* Extra operators */
 
 <DEFAULT, DQID, BTID> TOKEN :