Skip to content

Commit b41b54e

Browse files
committed
Initial DataFusion implementation
1 parent ad5d891 commit b41b54e

25 files changed

+2730
-8
lines changed

.github/workflows/main.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,34 @@ jobs:
199199
run: |
200200
DATABEND_AVAILABLE=true mvn -Dtest=TestDatabend test
201201
202+
datafusion:
203+
name: DBMS Tests (DataFusion)
204+
runs-on: ubuntu-latest
205+
steps:
206+
- uses: actions/checkout@v3
207+
with:
208+
fetch-depth: 0
209+
- name: Set up Rust
210+
uses: actions-rs/toolchain@v1
211+
with:
212+
toolchain: stable
213+
override: true
214+
- name: Run DataFusion Server
215+
run: |
216+
cd src/sqlancer/datafusion/server/datafusion_server
217+
cargo run --features "datafusion_stable" & sleep 300
218+
- name: Set up JDK 11
219+
uses: actions/setup-java@v3
220+
with:
221+
distribution: 'temurin'
222+
java-version: '11'
223+
cache: 'maven'
224+
- name: Build SQLancer
225+
run: mvn -B package -DskipTests=true
226+
- name: Run Tests
227+
run: |
228+
DATAFUSION_AVAILABLE=true mvn test -Pdatafusion-tests
229+
202230
duckdb:
203231
name: DBMS Tests (DuckDB)
204232
runs-on: ubuntu-latest

README.md

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,11 @@ Please find the `.bib` entries [here](docs/PAPERS.md).
5656
Since SQL dialects differ widely, each DBMS to be tested requires a separate implementation.
5757

5858
| DBMS | Status | Expression Generation | Description |
59-
|------------------------------|-------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
59+
| ---------------------------- | ----------- | ---------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
6060
| SQLite | Working | Untyped | This implementation is currently affected by a significant performance regression that still needs to be investigated |
6161
| MySQL | Working | Untyped | Running this implementation likely uncovers additional, unreported bugs. |
6262
| PostgreSQL | Working | Typed | |
63-
| Citus (PostgreSQL Extension) | Working | Typed | This implementation extends the PostgreSQL implementation of SQLancer, and was contributed by the Citus team. |
63+
| Citus (PostgreSQL Extension) | Working | Typed | This implementation extends the PostgreSQL implementation of SQLancer, and was contributed by the Citus team. |
6464
| MariaDB | Preliminary | Untyped | The implementation of this DBMS is very preliminary, since we stopped extending it after all but one of our bug reports were addressed. Running it likely uncovers additional, unreported bugs. |
6565
| CockroachDB | Working | Typed | |
6666
| TiDB | Working | Untyped | |
@@ -69,12 +69,13 @@ Since SQL dialects differ widely, each DBMS to be tested requires a separate imp
6969
| TDEngine | Removed | Untyped | We removed the TDEngine implementation since all but one of our bug reports were still unaddressed five months after we reported them. |
7070
| OceanBase | Working | Untyped | |
7171
| YugabyteDB | Working | Typed (YSQL), Untyped (YCQL) | YSQL implementation based on Postgres code. YCQL implementation is primitive for now and uses Cassandra JDBC driver as a proxy interface. |
72-
| Databend | Working | Typed | |
73-
| QuestDB | Working | Untyped, Generic | The implementation of QuestDB is still WIP, current version covers very basic data types, operations and SQL keywords. |
74-
| CnosDB |Working | Typed | The implementation of CnosDB currently uses Restful API. |
75-
| Materialize |Working | Typed |
76-
| Apache Doris | Preliminary | Typed | This is a preliminary implementation, which only contains the common logic of Doris. We have found some errors through it, and hope to improve it in the future.
77-
| Presto | Preliminary | Typed | This is a preliminary implementation, only basic types supported.
72+
| Databend | Working | Typed | |
73+
| QuestDB | Working | Untyped, Generic | The implementation of QuestDB is still WIP, current version covers very basic data types, operations and SQL keywords. |
74+
| CnosDB | Working | Typed | The implementation of CnosDB currently uses Restful API. |
75+
| Materialize | Working | Typed | |
76+
| Apache Doris | Preliminary | Typed | This is a preliminary implementation, which only contains the common logic of Doris. We have found some errors through it, and hope to improve it in the future. |
77+
| Presto | Preliminary | Typed | This is a preliminary implementation, only basic types supported. |
78+
| DataFusion | Preliminary | Typed | Only basic SQL features are supported. |
7879

7980

8081

pom.xml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,11 @@
373373
<artifactId>jsqlparser</artifactId>
374374
<version>4.6</version>
375375
</dependency>
376+
<dependency>
377+
<groupId>org.apache.arrow</groupId>
378+
<artifactId>flight-sql-jdbc-driver</artifactId>
379+
<version>16.1.0</version>
380+
</dependency>
376381
</dependencies>
377382
<reporting>
378383
<plugins>
@@ -471,5 +476,23 @@
471476
</plugins>
472477
</build>
473478
</profile>
479+
<profile>
480+
<id>datafusion-tests</id>
481+
<build>
482+
<plugins>
483+
<plugin>
484+
<groupId>org.apache.maven.plugins</groupId>
485+
<artifactId>maven-surefire-plugin</artifactId>
486+
<version>3.3.0</version>
487+
<configuration>
488+
<includes>
489+
<include>**/TestDataFusion.java</include>
490+
</includes>
491+
<argLine>--add-opens java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED</argLine>
492+
</configuration>
493+
</plugin>
494+
</plugins>
495+
</build>
496+
</profile>
474497
</profiles>
475498
</project>

src/check_names.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def verify_all_dbs(name_to_files: dict[str:List[str]]):
3838
name_to_files["CnosDB"] = get_java_files(os.path.join(cwd, "src", "sqlancer", "cnosdb"))
3939
name_to_files["CockroachDB"] = get_java_files(os.path.join(cwd, "src", "sqlancer", "cockroachdb"))
4040
name_to_files["Databend"] = get_java_files(os.path.join(cwd, "src", "sqlancer", "databend"))
41+
name_to_files["DataFusion"] = get_java_files(os.path.join(cwd, "src", "sqlancer", "datafusion"))
4142
name_to_files["DuckDB"] = get_java_files(os.path.join(cwd, "src", "sqlancer", "duckdb"))
4243
name_to_files["H2"] = get_java_files(os.path.join(cwd, "src", "sqlancer", "h2"))
4344
name_to_files["HSQLDB"] = get_java_files(os.path.join(cwd, "src", "sqlancer", "hsqldb"))
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package sqlancer.datafusion;
2+
3+
import static sqlancer.datafusion.DataFusionUtil.dfAssert;
4+
5+
import sqlancer.common.query.ExpectedErrors;
6+
7+
public final class DataFusionErrors {
8+
private DataFusionErrors() {
9+
dfAssert(false, "Utility class cannot be instantiated");
10+
}
11+
12+
/*
13+
* During Oracle Checks, if ANY query returns one of the following error Then the current oracle check will be
14+
* skipped. e.g.: NoREC Q1 -> throw an expected error NoREC Q2 -> succeed Since it's a known error, `SQLancer` will
15+
* skip this check and don't report bug.
16+
*
17+
* Note now it's implemented this way for simplicity This way might cause false negative, because Q1 and Q2 should
18+
* both succeed or both fail TODO(datafusion): ensure both succeed or both fail
19+
*/
20+
public static void registerExpectedExecutionErrors(ExpectedErrors errors) {
21+
/*
22+
* Expected
23+
*/
24+
errors.add("Error building plan"); // Randomly generated SQL is not valid and caused palning error
25+
errors.add("Error during planning");
26+
errors.add("Execution error");
27+
errors.add("Overflow happened");
28+
errors.add("overflow");
29+
errors.add("Unsupported data type");
30+
errors.add("Divide by zero");
31+
/*
32+
* Known bugs
33+
*/
34+
errors.add("to type Int64"); // https://github.com/apache/datafusion/issues/11252
35+
errors.add("bitwise"); // https://github.com/apache/datafusion/issues/11260
36+
errors.add("NestedLoopJoinExec"); // https://github.com/apache/datafusion/issues/11269
37+
/*
38+
* False positives
39+
*/
40+
errors.add("Physical plan does not support logical expression AggregateFunction"); // False positive: when aggr
41+
// is generated in where
42+
// clause
43+
}
44+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
package sqlancer.datafusion;
2+
3+
import java.sql.SQLException;
4+
import java.util.Arrays;
5+
import java.util.List;
6+
7+
import com.beust.jcommander.Parameter;
8+
import com.beust.jcommander.Parameters;
9+
10+
import sqlancer.DBMSSpecificOptions;
11+
import sqlancer.OracleFactory;
12+
import sqlancer.common.oracle.TestOracle;
13+
import sqlancer.datafusion.DataFusionOptions.DataFusionOracleFactory;
14+
import sqlancer.datafusion.DataFusionProvider.DataFusionGlobalState;
15+
import sqlancer.datafusion.test.DataFusionNoRECOracle;
16+
import sqlancer.datafusion.test.DataFusionQueryPartitioningWhereTester;
17+
18+
@Parameters(commandDescription = "DataFusion")
19+
public class DataFusionOptions implements DBMSSpecificOptions<DataFusionOracleFactory> {
20+
@Parameter(names = "--debug-info", description = "Show debug messages related to DataFusion", arity = 0)
21+
public boolean showDebugInfo;
22+
23+
@Override
24+
public List<DataFusionOracleFactory> getTestOracleFactory() {
25+
return Arrays.asList(DataFusionOracleFactory.NOREC, DataFusionOracleFactory.QUERY_PARTITIONING_WHERE);
26+
}
27+
28+
public enum DataFusionOracleFactory implements OracleFactory<DataFusionGlobalState> {
29+
NOREC {
30+
@Override
31+
public TestOracle<DataFusionGlobalState> create(DataFusionGlobalState globalState) throws SQLException {
32+
return new DataFusionNoRECOracle(globalState);
33+
}
34+
},
35+
QUERY_PARTITIONING_WHERE {
36+
@Override
37+
public TestOracle<DataFusionGlobalState> create(DataFusionGlobalState globalState) throws SQLException {
38+
return new DataFusionQueryPartitioningWhereTester(globalState);
39+
}
40+
}
41+
}
42+
43+
}
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
package sqlancer.datafusion;
2+
3+
import static java.lang.System.exit;
4+
import static sqlancer.datafusion.DataFusionUtil.DataFusionLogger.DataFusionLogType.DML;
5+
import static sqlancer.datafusion.DataFusionUtil.displayTables;
6+
7+
import java.sql.Connection;
8+
import java.sql.DriverManager;
9+
import java.sql.SQLException;
10+
import java.util.List;
11+
import java.util.Properties;
12+
import java.util.stream.Collectors;
13+
14+
import com.google.auto.service.AutoService;
15+
16+
import sqlancer.DatabaseProvider;
17+
import sqlancer.IgnoreMeException;
18+
import sqlancer.Randomly;
19+
import sqlancer.SQLConnection;
20+
import sqlancer.SQLGlobalState;
21+
import sqlancer.SQLProviderAdapter;
22+
import sqlancer.common.query.SQLQueryAdapter;
23+
import sqlancer.datafusion.DataFusionProvider.DataFusionGlobalState;
24+
import sqlancer.datafusion.DataFusionSchema.DataFusionTable;
25+
import sqlancer.datafusion.DataFusionUtil.DataFusionInstanceID;
26+
import sqlancer.datafusion.DataFusionUtil.DataFusionLogger;
27+
import sqlancer.datafusion.gen.DataFusionInsertGenerator;
28+
import sqlancer.datafusion.gen.DataFusionTableGenerator;
29+
30+
@AutoService(DatabaseProvider.class)
31+
public class DataFusionProvider extends SQLProviderAdapter<DataFusionGlobalState, DataFusionOptions> {
32+
33+
public DataFusionProvider() {
34+
super(DataFusionGlobalState.class, DataFusionOptions.class);
35+
}
36+
37+
@Override
38+
public void generateDatabase(DataFusionGlobalState globalState) throws Exception {
39+
int tableCount = Randomly.fromOptions(1, 2, 3, 4, 5, 6, 7);
40+
for (int i = 0; i < tableCount; i++) {
41+
SQLQueryAdapter queryCreateRandomTable = new DataFusionTableGenerator().getQuery(globalState);
42+
queryCreateRandomTable.execute(globalState);
43+
globalState.updateSchema();
44+
globalState.dfLogger.appendToLog(DML, queryCreateRandomTable.toString() + "\n");
45+
}
46+
47+
// Now only `INSERT` DML is supported
48+
// If more DMLs are added later, should use`StatementExecutor` instead
49+
// (see DuckDB's implementation for reference)
50+
51+
globalState.updateSchema();
52+
List<DataFusionTable> allTables = globalState.getSchema().getDatabaseTables();
53+
List<String> allTablesName = allTables.stream().map(t -> t.getName()).collect(Collectors.toList());
54+
if (allTablesName.isEmpty()) {
55+
System.out.println("Generate database failed");
56+
exit(1);
57+
}
58+
59+
// Randomly insert some data into existing tables
60+
for (DataFusionTable table : allTables) {
61+
int nInsertQuery = globalState.getRandomly().getInteger(0, 8); // [0, 10)
62+
63+
for (int i = 0; i < nInsertQuery; i++) {
64+
SQLQueryAdapter insertQuery = null;
65+
try {
66+
insertQuery = DataFusionInsertGenerator.getQuery(globalState, table);
67+
} catch (IgnoreMeException e) {
68+
// Only for special case: table has 0 column
69+
continue;
70+
}
71+
72+
insertQuery.execute(globalState);
73+
globalState.dfLogger.appendToLog(DML, insertQuery.toString() + "\n");
74+
}
75+
}
76+
77+
// TODO(datafusion) add `DataFUsionLogType.STATE` for this whole db state log
78+
if (globalState.getDbmsSpecificOptions().showDebugInfo) {
79+
System.out.println(displayTables(globalState, allTablesName));
80+
}
81+
}
82+
83+
@Override
84+
public SQLConnection createDatabase(DataFusionGlobalState globalState) throws SQLException {
85+
if (globalState.getDbmsSpecificOptions().showDebugInfo) {
86+
System.out.println("A new database get created!\n");
87+
}
88+
Properties props = new Properties();
89+
props.setProperty("UseEncryption", "false");
90+
// must set 'user' and 'password' to trigger server 'do_handshake()'
91+
props.setProperty("user", "foo");
92+
props.setProperty("password", "bar");
93+
props.setProperty("create", globalState.getDatabaseName()); // Hack: use this property to let DataFusion server
94+
// clear the current context
95+
String url = "jdbc:arrow-flight-sql://127.0.0.1:50051";
96+
Connection connection = DriverManager.getConnection(url, props);
97+
98+
return new SQLConnection(connection);
99+
}
100+
101+
@Override
102+
public String getDBMSName() {
103+
return "datafusion";
104+
}
105+
106+
// If run SQLancer with multiple thread
107+
// Each thread's instance will have its own `DataFusionGlobalState`
108+
// It will store global states including:
109+
// JDBC connection to DataFusion server
110+
// Logger for this thread
111+
public static class DataFusionGlobalState extends SQLGlobalState<DataFusionOptions, DataFusionSchema> {
112+
public DataFusionLogger dfLogger;
113+
DataFusionInstanceID id;
114+
115+
public DataFusionGlobalState() {
116+
// HACK: test will only run in spawned thread, not main thread
117+
// this way redundant logger files won't be created
118+
if (Thread.currentThread().getName().equals("main")) {
119+
return;
120+
}
121+
122+
id = new DataFusionInstanceID(Thread.currentThread().getName());
123+
try {
124+
dfLogger = new DataFusionLogger(this, id);
125+
} catch (Exception e) {
126+
throw new IgnoreMeException();
127+
}
128+
}
129+
130+
@Override
131+
protected DataFusionSchema readSchema() throws SQLException {
132+
return DataFusionSchema.fromConnection(getConnection(), getDatabaseName());
133+
}
134+
}
135+
}

0 commit comments

Comments
 (0)