From 498626de64450f00e6afe4be63c5eb3833a83cc9 Mon Sep 17 00:00:00 2001 From: semyonsinchenko Date: Fri, 19 Sep 2025 13:39:01 +0200 Subject: [PATCH] update docs --- docs/src/04-user-guide/02-basic-operations.md | 34 +++++++++++++++++-- docs/src/04-user-guide/14-special-columns.md | 28 +++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 docs/src/04-user-guide/14-special-columns.md diff --git a/docs/src/04-user-guide/02-basic-operations.md b/docs/src/04-user-guide/02-basic-operations.md index cb808fac1..36cf3223e 100644 --- a/docs/src/04-user-guide/02-basic-operations.md +++ b/docs/src/04-user-guide/02-basic-operations.md @@ -1,8 +1,10 @@ # Basic Graph Operations +## Basics + GraphFrames provide several simple graph queries, such as node degree. Also, since GraphFrames represent graphs as pairs of vertex and edge DataFrames, it is easy to make powerful queries directly on the vertex and edge DataFrames. Those DataFrames are made available as `vertices` and `edges` fields in the GraphFrame. -## Python API +### Python API ```python from graphframes.examples import Graphs @@ -52,7 +54,7 @@ g.vertices.groupBy().min("age").show() numFollows = g.edges.filter("relationship = 'follow'").count() ``` -## Scala API +### Scala API ```scala import org.graphframes.{examples,GraphFrame} @@ -102,3 +104,31 @@ g.vertices.groupBy().min("age").show() // This queries the edge DataFrame. val numFollows = g.edges.filter("relationship = 'follow'").count() ``` + +## Filtering edges or vertices + +GraphFrames provide an API for filtering edges and vertices based on their attributes. + +**NOTE:** *This API is for simple filtering. For more complex use cases, it is recommended to use the [`PropertyGraphFrame` model](/04-user-guide/11-property-graphs.md). 
`PropertyGraphFrame` handles the logical schema of the whole graph and provides a more powerful API for selecting any subgraph based on required properties and filters.* + +### Python API + +```python +from pyspark.sql import functions as F +from graphframes.examples import Graphs + +g = Graphs(spark).friends() # Get example graph +g.filterVertices(F.col("name") == F.lit("Alice")) +g.filterEdges(F.col("relationship") == F.lit("follow")) +``` + +### Scala API + +```scala +import org.apache.spark.sql.functions._ +import org.graphframes.{examples,GraphFrame} + +val g: GraphFrame = examples.Graphs.friends +g.filterVertices(col("name") === lit("Alice")) +g.filterEdges(col("relationship") === lit("follow")) +``` diff --git a/docs/src/04-user-guide/14-special-columns.md b/docs/src/04-user-guide/14-special-columns.md new file mode 100644 index 000000000..2ad0f3e39 --- /dev/null +++ b/docs/src/04-user-guide/14-special-columns.md @@ -0,0 +1,28 @@ +# Reserved Columns + +GraphFrames internally use the following reserved columns: + +- `id` for vertex IDs +- `src` for edge source IDs +- `dst` for edge destination IDs +- `attr` for vertex attributes during the GraphX conversion +- `new_id` for indexed Long IDs for vertices +- `new_src` for indexed Long IDs for edge sources +- `new_dst` for indexed Long IDs for edge destinations +- `graphx_attr` for vertex attributes during the GraphX conversion +- `weight` for edge weights +- `MSG` for messages in AggregateMessages +- `_pregel_msg` for Pregel messages +- `_pregel_is_active` for Pregel vertex active status + +## Algorithm-Specific Columns + +- `component` for the result of connected components +- `label` for the result of label propagation +- `distances` for the result of shortest paths +- `pagerank` for the result of PageRank +- `count` for the result of triangle count +- `column{1-4}` for SVD++ reserved columns +- `outDegree` for the result of out-degree +- `inDegree` for the result of in-degree +- `degree` for the result of degree \ No newline at end of file