-
Notifications
You must be signed in to change notification settings - Fork 268
feat: add Rocha–Thatte cycle detection algorithm #700
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
SemyonSinchenko
merged 6 commits into
graphframes:master
from
SemyonSinchenko:607-cycles-detection
Sep 22, 2025
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
870997f
Add Rocha–Thatte cycle detection algorithm
SemyonSinchenko ae227c1
fix 2.12 failing tests
SemyonSinchenko 233b463
Merge remote-tracking branch 'graphframes/master' into 607-cycles-det…
SemyonSinchenko 10fb9ae
merge main
SemyonSinchenko 8cdac0d
fix bad merge
SemyonSinchenko ad3b2ae
fix bad merge
SemyonSinchenko File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
98 changes: 98 additions & 0 deletions
98
core/src/main/scala/org/graphframes/lib/DetectingCycles.scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| package org.graphframes.lib | ||
|
|
||
| import org.apache.spark.sql.DataFrame | ||
| import org.apache.spark.sql.functions._ | ||
| import org.apache.spark.sql.types.ArrayType | ||
| import org.apache.spark.storage.StorageLevel | ||
| import org.graphframes.GraphFrame | ||
| import org.graphframes.Logging | ||
| import org.graphframes.WithCheckpointInterval | ||
| import org.graphframes.WithIntermediateStorageLevel | ||
| import org.graphframes.WithLocalCheckpoints | ||
|
|
||
| class DetectingCycles private[graphframes] (private val graph: GraphFrame) | ||
| extends Arguments | ||
| with Serializable | ||
| with Logging | ||
| with WithIntermediateStorageLevel | ||
| with WithLocalCheckpoints | ||
| with WithCheckpointInterval { | ||
| import DetectingCycles._ | ||
| def run(): DataFrame = { | ||
| val rawRes = DetectingCycles.run( | ||
| graph, | ||
| useLocalCheckpoints, | ||
| checkpointInterval, | ||
| intermediateStorageLevel) | ||
| val explodedRes = rawRes | ||
| .select( | ||
| col(GraphFrame.ID), | ||
| filter(col(foundSeqCol), x => size(x) > lit(0)).alias(foundSeqCol)) | ||
| .filter(size(col(foundSeqCol)) > lit(0)) | ||
| .select( | ||
| col(GraphFrame.ID), | ||
| // from vid -> [[cycle1, cycle2, ...]] | ||
| // to vid -> [cycle1], vid -> [cycle2], ... | ||
| explode(col(foundSeqCol)).alias(foundSeqCol)) | ||
| .persist(intermediateStorageLevel) | ||
| explodedRes.count() | ||
| resultIsPersistent() | ||
| rawRes.unpersist() | ||
| explodedRes | ||
| } | ||
| } | ||
|
|
||
| object DetectingCycles { | ||
| private val storedSeqCol: String = "sequences" | ||
| val foundSeqCol: String = "found_cycles" | ||
|
|
||
| def run( | ||
| graph: GraphFrame, | ||
| useLocalCheckpoints: Boolean, | ||
| checkpointInterval: Int, | ||
| intermediateStorageLevel: StorageLevel): DataFrame = { | ||
| val preparedGraph = GraphFrame( | ||
| graph.vertices.select(GraphFrame.ID), | ||
| graph.edges.select(GraphFrame.SRC, GraphFrame.DST)) | ||
|
|
||
| val vertexDT = preparedGraph.vertices.schema(GraphFrame.ID).dataType | ||
|
|
||
| // Each vertex stores sequences from the previous iteration, initial is just Array(Array(ID)) | ||
| val initSequences = array(array(col(GraphFrame.ID))) | ||
| // Each vertex stores all the found cycles | ||
| val foundSequences = array().cast(ArrayType(ArrayType(vertexDT))) | ||
| // Message is simply stored sequences | ||
| val sentMessages = when(size(Pregel.src(storedSeqCol)) =!= lit(0), Pregel.src(storedSeqCol)) | ||
| .otherwise(lit(null).cast(ArrayType(ArrayType(vertexDT)))) | ||
| // If the sequence contains the current vertex ID somewhere in the middle, it is | ||
| // a previously detected cycle and a sequence should be discarded. | ||
| val filterOutSequences = flatten(collect_list(Pregel.msg)) | ||
| when(Pregel.msg.isNull, array(array()).cast(ArrayType(ArrayType(vertexDT)))) | ||
| .otherwise(filter(Pregel.msg, x => !(array_position(x, col(GraphFrame.ID)) > lit(1)))) | ||
| // update found sequences by appending all from messages that start from the current vertex ID | ||
| val updateFound = when(Pregel.msg.isNull, col(foundSeqCol)).otherwise( | ||
| array_union( | ||
| col(foundSeqCol), | ||
| transform( | ||
| filter(Pregel.msg, x => try_element_at(x, lit(1)) === col(GraphFrame.ID)), | ||
| x => array_append(x, col(GraphFrame.ID))))) | ||
| // update stored sequences by filtering out already added sequences | ||
| val updateSequences = transform( | ||
| filter(Pregel.msg, x => !array_contains(x, col(GraphFrame.ID))), | ||
| x => array_append(x, col(GraphFrame.ID))) | ||
|
|
||
| preparedGraph.pregel | ||
| .setCheckpointInterval(checkpointInterval) | ||
| .setUseLocalCheckpoints(useLocalCheckpoints) | ||
| .setIntermediateStorageLevel(intermediateStorageLevel) | ||
| .setEarlyStopping(false) | ||
| .setSkipMessagesFromNonActiveVertices(true) | ||
| .setInitialActiveVertexExpression(lit(true)) | ||
| .sendMsgToDst(sentMessages) | ||
| .setUpdateActiveVertexExpression(Pregel.msg.isNotNull && (size(updateSequences) > lit(0))) | ||
| .withVertexColumn(storedSeqCol, initSequences, updateSequences) | ||
| .withVertexColumn(foundSeqCol, foundSequences, updateFound) | ||
| .aggMsgs(filterOutSequences) | ||
| .run() | ||
| } | ||
| } |
69 changes: 69 additions & 0 deletions
69
core/src/test/scala/org/graphframes/lib/DetectingCyclesSuite.scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| package org.graphframes.lib | ||
|
|
||
| import org.graphframes.GraphFrame | ||
| import org.graphframes.GraphFrameTestSparkContext | ||
| import org.graphframes.SparkFunSuite | ||
|
|
||
| import scala.annotation.nowarn | ||
| import scala.collection.mutable | ||
|
|
||
| class DetectingCyclesSuite extends SparkFunSuite with GraphFrameTestSparkContext { | ||
| test("test detecting cycles") { | ||
| val graph = GraphFrame( | ||
| spark | ||
| .createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"), (4L, "d"), (5L, "e"))) | ||
| .toDF("id", "attr"), | ||
| spark | ||
| .createDataFrame(Seq((1L, 2L), (2L, 3L), (3L, 1L), (1L, 4L), (2L, 5L))) | ||
| .toDF("src", "dst")) | ||
| val res = graph.detectingCycles.setUseLocalCheckpoints(true).run() | ||
| assert(res.count() == 3) | ||
| @nowarn val collected = | ||
| res | ||
| .sort(GraphFrame.ID) | ||
| .select(DetectingCycles.foundSeqCol) | ||
| .collect() | ||
| .map(r => r.getAs[mutable.WrappedArray[Long]](0)) | ||
|
|
||
| assert(collected(0) == Seq(1, 2, 3, 1)) | ||
| assert(collected(1) == Seq(2, 3, 1, 2)) | ||
| assert(collected(2) == Seq(3, 1, 2, 3)) | ||
| } | ||
|
|
||
| test("test no cycles") { | ||
| val graph = GraphFrame( | ||
| spark | ||
| .createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"), (4L, "d"), (5L, "e"))) | ||
| .toDF("id", "attr"), | ||
| spark | ||
| .createDataFrame(Seq((1L, 2L), (2L, 3L), (3L, 4L), (4L, 5L))) | ||
| .toDF("src", "dst")) | ||
| val res = graph.detectingCycles.setUseLocalCheckpoints(true).run() | ||
| assert(res.count() == 0) | ||
| } | ||
|
|
||
| test("test multiple cycles from one source") { | ||
| val graph = GraphFrame( | ||
| spark | ||
| .createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"), (4L, "d"), (5L, "e"))) | ||
| .toDF("id", "attr"), | ||
| spark | ||
| .createDataFrame(Seq((1L, 2L), (2L, 1L), (1L, 3L), (3L, 1L), (2L, 5L), (5L, 1L))) | ||
| .toDF("src", "dst")) | ||
| val res = graph.detectingCycles.setUseLocalCheckpoints(true).run() | ||
| assert(res.count() == 7) | ||
| @nowarn val collected = | ||
| res | ||
| .sort(GraphFrame.ID, DetectingCycles.foundSeqCol) | ||
| .select(DetectingCycles.foundSeqCol) | ||
| .collect() | ||
| .map(r => r.getAs[mutable.WrappedArray[Long]](0)) | ||
| assert(collected(0) == Seq(1, 2, 1)) | ||
| assert(collected(1) == Seq(1, 2, 5, 1)) | ||
| assert(collected(2) == Seq(1, 3, 1)) | ||
| assert(collected(3) == Seq(2, 1, 2)) | ||
| assert(collected(4) == Seq(2, 5, 1, 2)) | ||
| assert(collected(5) == Seq(3, 1, 3)) | ||
| assert(collected(6) == Seq(5, 1, 2, 5)) | ||
| } | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.