Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
89cd7dd
feat: JGraphT as dependency
SemyonSinchenko Jun 1, 2026
b339c90
wip
SemyonSinchenko Jun 7, 2026
bfbd4a2
Merge remote-tracking branch 'graphframes/main' into 829-schema-and-e…
SemyonSinchenko Jun 7, 2026
359c092
Merge remote-tracking branch 'graphframes/main' into 829-schema-and-e…
SemyonSinchenko Jun 9, 2026
ee1da9b
feat: public API for schema and schemaDOT
SemyonSinchenko Jun 9, 2026
4805369
Merge remote-tracking branch 'graphframes/main' into 829-schema-and-e…
SemyonSinchenko Jun 18, 2026
a36ef33
feat: ANTL4 grammar and custom plugin
SemyonSinchenko Jun 18, 2026
3d2aff6
fix: docs and switch to visitor
SemyonSinchenko Jun 18, 2026
918d8c6
feat: antl4 + resolver + analysis
SemyonSinchenko Jun 18, 2026
ba5562a
fix: fix warnings
SemyonSinchenko Jun 19, 2026
198c462
fix: scalastyle
SemyonSinchenko Jun 19, 2026
16370d5
feat: engine itself
SemyonSinchenko Jun 19, 2026
c6a65b5
fix: docs build
SemyonSinchenko Jun 20, 2026
f6188b3
feat: reusing scans
SemyonSinchenko Jun 20, 2026
158e098
feat: edge filters pushdown and shared edge scans
SemyonSinchenko Jun 22, 2026
f256d30
fix: codespell
SemyonSinchenko Jun 22, 2026
1ae2704
feat: undirected edges
SemyonSinchenko Jun 22, 2026
619f0c9
fix: tests & codespell
SemyonSinchenko Jun 22, 2026
0405746
feat: re-generate lock-file
SemyonSinchenko Jun 22, 2026
861eb27
fix: fix python build
SemyonSinchenko Jun 22, 2026
bc44b3f
feat: functions support
SemyonSinchenko Jun 23, 2026
349a6bc
feat: multi-hop patterns
SemyonSinchenko Jun 24, 2026
5545b1a
chore: sanytize
SemyonSinchenko Jun 24, 2026
18ffb3d
fix: remove old test
SemyonSinchenko Jun 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ lazy val scalaVersions = sparkMajorVer match {
case "3" => Seq("2.12.21", "2.13.18")
case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
}
// ANTLR4 tool version used to generate the GQL lexer/parser. ANTLR is a "provided"
// dependency that arrives transitively from Spark, so the version is pinned per Spark
// major line. The major version is taken from sparkMajorVer (the same value that drives
// scalaVersions and moduleName) rather than re-reading the "spark.version" system
// property with a separate default, so the two can never disagree.
lazy val antlr4ToolVersion = sparkMajorVer match {
  case "4" => "4.13.1"
  case "3" => "4.9.3"
  case v => throw new IllegalArgumentException(s"Unsupported Spark version major: $v")
}
lazy val scalaVer = sys.props.getOrElse("scala.version", scalaVersions.head)
lazy val defaultScalaTestVer = "3.2.19"
lazy val jmhVersion = "1.37"
Expand Down Expand Up @@ -75,14 +80,20 @@ lazy val commonSetting = Seq(
"org.apache.spark" %% "spark-sql" % sparkVer % "provided" cross CrossVersion.for3Use2_13,
"org.apache.spark" %% "spark-mllib" % sparkVer % "provided" cross CrossVersion.for3Use2_13,
"org.slf4j" % "slf4j-api" % "2.0.17" % "provided",
"org.apache.datasketches" % "datasketches-java" % "6.2.0", // transitive dependency from Spark
"org.apache.datasketches" % "datasketches-java" % "6.2.0" % "provided", // transitive from Spark
"org.antlr" % "antlr4" % antlr4ToolVersion % "provided", // transitive from Spark
"org.scalatest" %% "scalatest" % defaultScalaTestVer % Test,
"com.github.zafarkhaja" % "java-semver" % "0.10.2" % Test),
Compile / doc / scalacOptions ++= Seq(
"-groups",
"-implicits",
"-skip-packages",
Seq("org.apache.spark").mkString(":")),
// org.apache.spark is skipped to avoid rendering transitive Spark types; the GQL query engine
// under org.graphframes.propertygraph.internal is entirely private[propertygraph] (and
// AstBuilder references generated ANTLR Java types), so it is skipped from rendered output too.
// The internal package is still type-checked so that the public PropertyGraphFrame, which calls
// into it, compiles in doc.
Seq("org.apache.spark", "org.graphframes.propertygraph.internal").mkString(":")),
Test / doc / scalacOptions ++= Seq("-groups", "-implicits"),

// Test settings
Expand Down Expand Up @@ -158,13 +169,18 @@ lazy val graphx = (project in file("graphx"))

lazy val core = (project in file("core"))
.dependsOn(graphx)
.enablePlugins(GraphFramesAntlr4Plugin)
.settings(
commonSetting,
name := "graphframes",
moduleName := s"${name.value}-spark$sparkMajorVer",
// Export the JAR so that this can be excluded from shading in connect
exportJars := true,

// Emit the generated GQL parser/lexer into the internal package so that
// AstBuilder can import them.
antlr4GenPackage := Some("org.graphframes.propertygraph.internal"),

// Global settings
Global / concurrentRestrictions := Seq(Tags.limitAll(1)),
autoAPIMappings := true,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

lexer grammar GqlLexer;

// ---------------------------------------------------------------------------
// Keywords (case-insensitive). These MUST precede IDENTIFIER so that, on a
// tie, the keyword token wins over the identifier rule.
//
// Each keyword is spelled with the single-letter fragments declared in this
// grammar (A, B, C, ...), each of which accepts either case of its letter.
// A longer word that merely starts with a keyword (e.g. "matcher") still lexes
// as IDENTIFIER, because the longest match wins before rule order is consulted.
// Consequence: none of these words can be used where the parser expects an
// IDENTIFIER (variable, label, property, alias or function name).
// ---------------------------------------------------------------------------
MATCH: M A T C H;
WHERE: W H E R E;
RETURN: R E T U R N;
AND: A N D;
OR: O R;
NOT: N O T;
AS: A S;
TRUE: T R U E;
FALSE: F A L S E;
NULL: N U L L;
// IS and IN are reserved here but are not referenced by any GqlParser rule
// as of this revision.
IS: I S;
IN: I N;

// ---------------------------------------------------------------------------
// Punctuation and operators.
//
// ANTLR4 uses maximal munch: the longest match always wins, and on ties the
// rule listed first wins. Multi-character tokens are therefore safe even when
// they share a prefix with a shorter one (e.g. '->' beats '-' regardless of
// ordering), but equal-length alternatives must be ordered with care. None of
// the tokens below collide on (length, prefix), so ordering within a group is
// not load-bearing; groups are kept before IDENTIFIER for clarity only.
// ---------------------------------------------------------------------------
ARROW_RIGHT: '->';
// Because of maximal munch, the characters "<-" always form ARROW_LEFT, even
// inside an expression: "a<-1" lexes as IDENTIFIER ARROW_LEFT INTEGER_LITERAL,
// never as LT DASH.
ARROW_LEFT: '<-';
LTE: '<=';
GTE: '>=';
// Two spellings of "not equal"; both are accepted by the parser's compOp rule.
NEQ: '<>';
NEQ_BANG: '!=';
LT: '<';
GT: '>';
EQ: '=';
DASH: '-';
PLUS: '+';
STAR: '*';
// '//' and '/*' are longer matches, so the comment rules win over SLASH.
SLASH: '/';
PERCENT: '%';
DOT: '.';
// Listed after DOT, but '..' is the longer match and therefore still wins.
DOTDOT: '..';
COMMA: ',';
COLON: ':';
LPAREN: '(';
RPAREN: ')';
LBRACK: '[';
RBRACK: ']';
// LBRACE / RBRACE are tokenized but not referenced by any GqlParser rule as of
// this revision.
LBRACE: '{';
RBRACE: '}';

// ---------------------------------------------------------------------------
// Literals
// ---------------------------------------------------------------------------

// Single-quoted string with '' escape, per SQL/GQL convention.
// Any character other than a single quote is accepted inside the string
// (including backslashes and newlines); there are no backslash escapes.
// The token text includes the surrounding quotes and the doubled '' pairs;
// nothing in this rule strips or unescapes them.
STRING_LITERAL: '\'' ( ~'\'' | '\'\'' )* '\'';

// Unsigned decimal. Digits are required on BOTH sides of the dot, so "1." and
// ".5" are not decimals. This is also what lets "1..3" lex as
// INTEGER_LITERAL DOTDOT INTEGER_LITERAL: DECIMAL_LITERAL cannot complete
// after "1." and the lexer falls back to the integer match. No sign or
// exponent is part of either numeric token.
DECIMAL_LITERAL: DIGIT+ '.' DIGIT+;
INTEGER_LITERAL: DIGIT+;

// ---------------------------------------------------------------------------
// Identifiers. Must follow all keyword rules so keywords win the tie.
//
// ASCII only: a letter or underscore followed by letters, digits or
// underscores. There is no quoted/escaped identifier form, so names containing
// other characters (or names equal to a keyword) cannot be written.
// ---------------------------------------------------------------------------
IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*;

// ---------------------------------------------------------------------------
// Whitespace and comments -> skipped.
//
// "skip" discards the matched text entirely, so the parser never sees these
// tokens.
// ---------------------------------------------------------------------------
// Space, tab, CR, LF and form feed (U+000C).
WS: [ \t\r\n\u000C]+ -> skip;
// Runs to the end of the line; the line terminator itself is left for WS.
LINE_COMMENT: '//' ~[\r\n]* -> skip;
// Non-greedy body: the comment ends at the FIRST "*/", so block comments do
// not nest.
BLOCK_COMMENT: '/*' .*? '*/' -> skip;

// ---------------------------------------------------------------------------
// Fragments: helper rules that can only be used inside other lexer rules and
// never produce a token on their own.
// ---------------------------------------------------------------------------
fragment DIGIT: '0'..'9';

// One fragment per Latin letter, each matching that letter in either case.
// Keyword rules are spelled out of these to make them case-insensitive.
fragment A: [aA];
fragment B: [bB];
fragment C: [cC];
fragment D: [dD];
fragment E: [eE];
fragment F: [fF];
fragment G: [gG];
fragment H: [hH];
fragment I: [iI];
fragment J: [jJ];
fragment K: [kK];
fragment L: [lL];
fragment M: [mM];
fragment N: [nN];
fragment O: [oO];
fragment P: [pP];
fragment Q: [qQ];
fragment R: [rR];
fragment S: [sS];
fragment T: [tT];
fragment U: [uU];
fragment V: [vV];
fragment W: [wW];
fragment X: [xX];
fragment Y: [yY];
fragment Z: [zZ];
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

parser grammar GqlParser;

options {
tokenVocab = GqlLexer;
}

// ---------------------------------------------------------------------------
// Top-level statement.
//
// RETURN is optional in the grammar so the engine can default to returning
// matched IDs when the user omits it (forward-compatible; trivially tightened
// later by making RETURN mandatory).
//
// Shape: exactly one MATCH, then an optional WHERE, then an optional RETURN,
// in that fixed order. The trailing EOF forces the rule to consume the whole
// input, so anything left over after the last clause is a parse error rather
// than being silently ignored.
// ---------------------------------------------------------------------------
gqlStatement
: MATCH matchPattern (WHERE whereClause)? (RETURN returnClause)? EOF
;

// A match pattern is a chain of alternating nodes and edges (directed or
// undirected, see edgePattern), e.g.
// (a:Person)-[:KNOWS]->(b:Person)-[:WORKS_AT]->(c:Company).
// The chain always starts and ends with a node; a single node with no edges
// is also a valid pattern.
matchPattern
: nodePattern (edgePattern nodePattern)*
;

// Node pattern: typed (a:Person), untyped (x), or anonymous ().
// Variable and label are independently optional, so an anonymous typed node
// (:Person) is accepted as well. At most one label can be given.
nodePattern
: LPAREN (variable=IDENTIFIER)? (COLON label=IDENTIFIER)? RPAREN
;

// Edge pattern.
//
// Three forms:
// -[e:KNOWS]-> (left-to-right)
// <-[e:KNOWS]- (right-to-left)
// -[:KNOWS]- (undirected)
// The bracketed part (optional variable, optional :label, optional
// quantifier) is shared via edgeBody. The brackets are mandatory: bare
// "->", "<-" or "-" between two nodes is not accepted, and neither is the
// double-headed form <-[e]->.
edgePattern
: DASH edgeBody ARROW_RIGHT // a -[e]-> b
| ARROW_LEFT edgeBody DASH // a <-[e]- b
| DASH edgeBody DASH // a -[e]- b
;

// Bracketed edge description shared by all three edgePattern forms.
// Every part is optional, so [] is valid. A label must be introduced by a
// colon: in [KNOWS] the identifier is bound as the edge VARIABLE, whereas
// [:KNOWS] sets the label. The quantifier, when present, comes last.
edgeBody
: LBRACK (variable=IDENTIFIER)? (COLON label=IDENTIFIER)? quantifier? RBRACK
;

// Variable-length pattern: [:KNOWS*1..3] or [:KNOWS*3]
// (note the colon: without it the identifier is the edge variable, not the
// label). Both bounds are mandatory in the range form; open-ended ranges
// ("*", "*1..", "*..3") are not accepted. The grammar does not check that
// lo <= hi.
quantifier
: STAR lo=INTEGER_LITERAL DOTDOT hi=INTEGER_LITERAL // *1..3 (bounded range)
| STAR exact=INTEGER_LITERAL // *3 (exactly 3 hops)
;

// ---------------------------------------------------------------------------
// WHERE clause: a single boolean expression.
//
// The grammar itself accepts any expression here (e.g. "WHERE 1 + 2"); it
// does not enforce that the expression is boolean.
// ---------------------------------------------------------------------------
whereClause
: expression
;

// ---------------------------------------------------------------------------
// RETURN clause: either RETURN * or a comma-separated list of items.
//
// The two alternatives cannot be confused: no expression can start with STAR,
// so a leading '*' always selects the first alternative. '*' cannot be mixed
// with explicit items.
// ---------------------------------------------------------------------------
returnClause
: STAR
| returnItem (COMMA returnItem)*
;

// A single projected expression with an optional alias, e.g. a.name AS name.
// The AS keyword is required whenever an alias is given.
returnItem
: expression (AS alias=IDENTIFIER)?
;

// ---------------------------------------------------------------------------
// Expression grammar.
//
// Precedence (lowest -> highest): OR < AND < NOT < comparison < additive <
// multiplicative < primary. Precedence is encoded by stratification: each
// level is its own rule that delegates to the next-tighter level, and no rule
// here is left-recursive. Repeated operators at one level are collected by
// that rule's ( op operand )* loop.
// ---------------------------------------------------------------------------
expression
: orExpr
;

orExpr
: andExpr (OR andExpr)*
;

andExpr
: notExpr (AND notExpr)*
;

// NOT is a prefix operator and may be stacked (NOT NOT x).
notExpr
: NOT notExpr
| comparison
;

// At most one comparison operator per comparison, so a chain such as
// a < b < c is rejected unless parenthesized.
comparison
: additive (compOp additive)?
;

additive
: multiplicative ((PLUS | DASH) multiplicative)*
;

// Operands are `primary` directly: there is no unary +/- level, and numeric
// literal tokens are unsigned, so a leading sign (e.g. -1) is not accepted
// anywhere in an expression.
multiplicative
: primary ((STAR | SLASH | PERCENT) primary)*
;

// Atoms of the expression grammar. Three alternatives begin with IDENTIFIER;
// they are told apart by the token that follows: LPAREN -> functionCall,
// DOT -> propertyAccess, anything else -> bare variable reference.
primary
: LPAREN expression RPAREN
| literal
| functionCall
| propertyAccess
| variable=IDENTIFIER
;

// Function call with zero or more comma-separated argument expressions,
// e.g. f(), f(a.x), f(a.x, 1 + 2). The function name is a plain IDENTIFIER,
// so it cannot be one of the reserved keywords.
functionCall
: name=IDENTIFIER LPAREN ( expression ( COMMA expression )* )? RPAREN
;

// Single-level property access on a pattern variable, e.g. a.name.
// Nested access (a.b.c) is not accepted by this rule.
propertyAccess
: variable=IDENTIFIER DOT property=IDENTIFIER
;

// Binary comparison operators. NEQ ('<>') and NEQ_BANG ('!=') are two
// spellings of "not equal" that arrive as distinct tokens.
compOp
: EQ
| NEQ
| NEQ_BANG
| LT
| LTE
| GT
| GTE
;

// Literal values: unsigned integer, unsigned decimal, single-quoted string,
// and the case-insensitive keywords TRUE, FALSE and NULL.
literal
: INTEGER_LITERAL
| DECIMAL_LITERAL
| STRING_LITERAL
| TRUE
| FALSE
| NULL
;
Loading
Loading