diff --git a/docs/img/Directed-Graphlet-G30.png b/docs/img/Directed-Graphlet-G30.png new file mode 100644 index 000000000..065452153 Binary files /dev/null and b/docs/img/Directed-Graphlet-G30.png differ diff --git a/docs/motif-tutorial.md b/docs/motif-tutorial.md index 4d512f656..aabe07ccd 100644 --- a/docs/motif-tutorial.md +++ b/docs/motif-tutorial.md @@ -92,7 +92,6 @@ from pyspark import SparkContext from pyspark.sql import DataFrame, SparkSession # Initialize a SparkSession - spark: SparkSession = ( SparkSession.builder.appName("Stack Overflow Motif Analysis") # Lets the Id:(Stack Overflow int) and id:(GraphFrames ULID) coexist @@ -103,7 +102,6 @@ sc: SparkContext = spark.sparkContext sc.setCheckpointDir("/tmp/graphframes-checkpoints") # Change me if you download a different stackexchange site - STACKEXCHANGE_SITE = "stats.meta.stackexchange.com" BASE_PATH = f"python/graphframes/tutorials/data/{STACKEXCHANGE_SITE}" {% endhighlight %} @@ -118,21 +116,17 @@ Load the nodes and edges of the graph from the `data` folder and count the types # # We created these in stackexchange.py from Stack Exchange data dump XML files - NODES_PATH: str = f"{BASE_PATH}/Nodes.parquet" nodes_df: DataFrame = spark.read.parquet(NODES_PATH) # Repartition the nodes to give our motif searches parallelism - nodes_df = nodes_df.repartition(50).checkpoint().cache() # We created these in stackexchange.py from Stack Exchange data dump XML files - EDGES_PATH: str = f"{BASE_PATH}/Edges.parquet" edges_df: DataFrame = spark.read.parquet(EDGES_PATH) # Repartition the edges to give our motif searches parallelism - edges_df = edges_df.repartition(50).checkpoint().cache() {% endhighlight %} @@ -243,7 +237,6 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] return df # Now apply this function to each of your DataFrames to get a consistent schema - posts_df = add_missing_columns(posts_df, all_cols).select(all_column_names) post_links_df = add_missing_columns(post_links_df, all_cols).select(all_column_names) users_df = add_missing_columns(users_df, all_cols).select(all_column_names) @@ -322,7 +315,6 @@ valid_edge_count = ( ) # Just up and die if we have edges that point to non-existent nodes - assert ( edge_count == valid_edge_count ), f"Edge count {edge_count} != valid edge count {valid_edge_count}" @@ -359,7 +351,6 @@ A complete description of the graph query language is in the [GraphFrames User G paths = g.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)") # Show the first path - paths.show(3) {% endhighlight %} diff --git a/docs/quick-start.md b/docs/quick-start.md index f7e16c8ce..ae28d9c94 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -66,6 +66,7 @@ val v = spark.createDataFrame(List( ("b", "Bob", 36), ("c", "Charlie", 30) )).toDF("id", "name", "age") + // Create an Edge DataFrame with "src" and "dst" columns val e = spark.createDataFrame(List( ("a", "b", "friend"), @@ -96,6 +97,7 @@ v = spark.createDataFrame([ ("b", "Bob", 36), ("c", "Charlie", 30), ], ["id", "name", "age"]) + # Create an Edge DataFrame with "src" and "dst" columns e = spark.createDataFrame([ ("a", "b", "friend"), diff --git a/docs/user-guide.md b/docs/user-guide.md index 5d2a112a9..2ec964da3 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -45,6 +45,7 @@ val v = spark.createDataFrame(List( ("f", "Fanny", 36), ("g", "Gabby", 60) )).toDF("id", "name", "age") + // Edge DataFrame val e = spark.createDataFrame(List( ("a", "b", "friend"), @@ -80,6 +81,7 @@ v = spark.createDataFrame([ ("f", "Fanny", 36), ("g", "Gabby", 60) ], ["id", "name", "age"]) + # Edge DataFrame e = spark.createDataFrame([ ("a", "b", "friend"), @@ -172,8 +174,7 @@ from graphframes.examples import Graphs g = Graphs(spark).friends() # Get example graph -# Display the vertex and edge DataFrames - +# Display the vertex DataFrame g.vertices.show() # +--+-------+---+ @@ -188,6 +189,7 @@ g.vertices.show() # | g| Gabby| 60| # +--+-------+---+ +# Display the edge DataFrame g.edges.show() # +---+---+------------+ @@ -368,6 +370,7 @@ from pyspark.sql.functions import col, lit, when from pyspark.sql.types import IntegerType from graphframes.examples import Graphs + g = Graphs(spark).friends() # Get example graph chain4 = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)") @@ -476,7 +479,6 @@ paths = g.find("(a)-[e]->(b)")\ .filter("a.age < b.age") # "paths" contains vertex info. Extract the edges - e2 = paths.select("e.src", "e.dst", "e.relationship") # In Spark 1.5+, the user may simplify this call @@ -539,6 +541,7 @@ For API details, refer to the [API docs](api/python/graphframes.html#graphframes {% highlight python %} from graphframes.examples import Graphs + g = Graphs(spark).friends() # Get example graph # Search from "Esther" for users of age < 32 @@ -630,6 +633,7 @@ For API details, refer to the [API docs](api/python/graphframes.html#graphframes {% highlight python %} from graphframes.examples import Graphs + sc.setCheckpointDir("/tmp/spark-checkpoints") g = Graphs(spark).friends() # Get example graph @@ -678,6 +682,7 @@ For API details, refer to the [API docs](api/python/graphframes.html#graphframes {% highlight python %} from graphframes.examples import Graphs + g = Graphs(spark).friends() # Get example graph result = g.labelPropagation(maxIter=5) @@ -741,6 +746,7 @@ For API details, refer to the [API docs](api/python/graphframes.html#graphframes {% highlight python %} from graphframes.examples import Graphs + g = Graphs(spark).friends() # Get example graph # Run PageRank until convergence to tolerance "tol" @@ -796,6 +802,7 @@ For API details, refer to the [API docs](api/python/graphframes.html#graphframes {% highlight python %} from graphframes.examples import Graphs + g = Graphs(spark).friends() # Get example graph results = g.shortestPaths(landmarks=["a", "d"]) @@ -832,6 +839,7 @@ For API details, refer to the [API docs](api/python/graphframes.html#graphframes {% highlight python %} from graphframes.examples import Graphs + g = Graphs(spark).friends() # Get example graph results = g.triangleCount() @@ -875,6 +883,7 @@ val sameG = GraphFrame(sameV, sameE) {% highlight python %} from graphframes.examples import Graphs + g = Graphs(spark).friends() # Get example graph # Save vertices and edges as Parquet to some location @@ -946,6 +955,7 @@ from graphframes.lib import AggregateMessages as AM from graphframes.examples import Graphs from pyspark.sql.functions import sum as sqlsum + g = Graphs(spark).friends() # Get example graph # For each user, sum the ages of the adjacent users