From f4e9cdbb9c1d56195c04e3142e01b7b77198085d Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sat, 15 Feb 2025 20:58:48 -0800 Subject: [PATCH 01/53] Converted tests to pytest. Build a Python package. Update requirements.txt and split out requirements-dev.txt. Version bumps. --- .github/workflows/python-ci.yml | 7 +- .gitignore | 10 + Dockerfile | 10 +- VERSION | 0 build.sbt | 2 +- docs/_config.yml | 2 +- python/.gitignore | 4 - python/MANIFEST.in | 4 + python/graphframes/tests.py | 405 ++++++++++++++++++-------------- python/requirements-dev.txt | 6 + python/requirements.txt | 5 +- python/run-tests.sh | 17 +- python/setup.cfg | 44 +++- python/setup.py | 37 ++- version.sbt | 2 +- 15 files changed, 342 insertions(+), 213 deletions(-) delete mode 100644 VERSION delete mode 100644 python/.gitignore create mode 100644 python/requirements-dev.txt diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 8b84d6d82..36b6b97e7 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -7,8 +7,8 @@ jobs: matrix: include: - spark-version: 3.5.4 - scala-version: 2.12.18 - python-version: 3.9.19 + scala-version: 2.12.20 + python-version: 3.11.11 runs-on: ubuntu-22.04 env: # define Java options for both official sbt and sbt-extras @@ -35,8 +35,11 @@ jobs: run: | python -m pip install --upgrade pip wheel pip install -r ./python/requirements.txt + pip install -r ./python/requirements-dev.txt pip install pyspark==${{ matrix.spark-version }} - name: Test run: | + python python/setup.py install + python python/setup.py bdist_wheel export SPARK_HOME=$(python -c "import os; from importlib.util import find_spec; print(os.path.join(os.path.dirname(find_spec('pyspark').origin)))") ./python/run-tests.sh diff --git a/.gitignore b/.gitignore index a07973c1e..dcbde8186 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,13 @@ project/plugins/project/ # Mac *.DS_Store +.vscode + +# Python specific +python/build +python/dist +build/lib 
+python/graphframes.egg-info +python/graphframes/tutorials/data +python/docs/_build +python/docs/_site diff --git a/Dockerfile b/Dockerfile index 1c4430912..b9fe8c528 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,16 @@ FROM ubuntu:22.04 -ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.9 ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y wget bzip2 build-essential openjdk-8-jdk ssh sudo && \ + apt-get install -y wget bzip2 build-essential openjdk-11-jdk ssh sudo && \ apt-get clean # Install Spark and update env variables. -ENV SCALA_VERSION 2.12.17 -ENV SPARK_VERSION "3.4.1" -ENV SPARK_BUILD "spark-${SPARK_VERSION}-bin-hadoop3.2" +ENV SCALA_VERSION 2.12.20 +ENV SPARK_VERSION "3.5.4" +ENV SPARK_BUILD "spark-${SPARK_VERSION}-bin-hadoop3" ENV SPARK_BUILD_URL "https://dist.apache.org/repos/dist/release/spark/spark-${SPARK_VERSION}/${SPARK_BUILD}.tgz" RUN wget --quiet "$SPARK_BUILD_URL" -O /tmp/spark.tgz && \ tar -C /opt -xf /tmp/spark.tgz && \ diff --git a/VERSION b/VERSION deleted file mode 100644 index e69de29bb..000000000 diff --git a/build.sbt b/build.sbt index 061901717..63168c57d 100644 --- a/build.sbt +++ b/build.sbt @@ -3,7 +3,7 @@ import ReleaseTransformations._ lazy val sparkVer = sys.props.getOrElse("spark.version", "3.5.4") lazy val sparkBranch = sparkVer.substring(0, 3) lazy val defaultScalaVer = sparkBranch match { - case "3.5" => "2.12.18" + case "3.5" => "2.12.20" case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.") } lazy val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer) diff --git a/docs/_config.yml b/docs/_config.yml index 4c1ab075c..379fc242f 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -13,7 +13,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-GRAPHFRAMES_VERSION: 0.8.4 +GRAPHFRAMES_VERSION: 0.8.5 #SCALA_BINARY_VERSION: "2.10" #SCALA_VERSION: "2.10.4" #MESOS_VERSION: 0.21.0 diff --git a/python/.gitignore b/python/.gitignore deleted file mode 100644 index 81410ca55..000000000 --- a/python/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.pyc -docs/_build/ -build/ -dist/ diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 73eaf8ba2..4eb0ee5af 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -2,3 +2,7 @@ # https://github.com/pypa/sampleproject/blob/master/MANIFEST.in # For more details about the MANIFEST file, you may read the docs at # https://docs.python.org/2/distutils/sourcedist.html#the-manifest-in-template +recursive-include python/graphframes *.py +recursive-exclude * __pycache__ +recursive-exclude * *.pyc +include graphframes/tutorials/data/.exists diff --git a/python/graphframes/tests.py b/python/graphframes/tests.py index 9a7ad1371..259435759 100644 --- a/python/graphframes/tests.py +++ b/python/graphframes/tests.py @@ -15,63 +15,72 @@ # limitations under the License. 
# -import sys +import os import tempfile import shutil import re -if sys.version_info[:2] <= (2, 6): - try: - import unittest2 as unittest - except ImportError: - sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') - sys.exit(1) -else: - import unittest - -from pyspark import SparkContext -from pyspark.sql import functions as sqlfunctions, SparkSession +import pytest +from pyspark import SparkConf, SparkContext +from pyspark.sql import functions as F, SparkSession from .graphframe import GraphFrame, Pregel, _java_api, _from_java_gf from .lib import AggregateMessages as AM from .examples import Graphs, BeliefPropagation + +VERSION = open("version.sbt").read().strip() + + +@pytest.fixture(scope="class", autouse=True) +def set_spark(request, spark_session): + request.cls.spark = spark_session + + +@pytest.mark.usefixtures("set_spark") class GraphFrameTestUtils(object): @classmethod def parse_spark_version(cls, version_str): - """ take an input version string - return version items in a dictionary + """take an input version string + return version items in a dictionary """ - _sc_ver_patt = r'(\d+)\.(\d+)(\.(\d+)(-(.+))?)?' + _sc_ver_patt = r"(\d+)\.(\d+)(\.(\d+)(-(.+))?)?" 
m = re.match(_sc_ver_patt, version_str) if not m: - raise TypeError("version {} shoud be in ..".format(version_str)) + raise TypeError( + "version {} shoud be in ..".format(version_str) + ) version_info = {} try: - version_info['major'] = int(m.group(1)) + version_info["major"] = int(m.group(1)) except: raise TypeError("invalid minor version") try: - version_info['minor'] = int(m.group(2)) + version_info["minor"] = int(m.group(2)) except: raise TypeError("invalid major version") try: - version_info['maintenance'] = int(m.group(4)) + version_info["maintenance"] = int(m.group(4)) except: - version_info['maintenance'] = 0 + version_info["maintenance"] = 0 try: - version_info['special'] = m.group(6) + version_info["special"] = m.group(6) except: pass return version_info @classmethod def createSparkContext(cls): - cls.sc = sc = SparkContext('local[4]', "GraphFramesTests") + cls.conf = SparkConf().setAppName("GraphFramesTests") + cls.conf.set( + "spark.submit.pyFiles", + os.path.abspath("python/dist/graphframes-{VERSION}-py3-none-any.whl"), + ) + cls.sc = SparkContext(master="local[4]", appName="GraphFramesTests", conf=cls.conf) cls.checkpointDir = tempfile.mkdtemp() cls.sc.setCheckpointDir(cls.checkpointDir) - cls.spark_version = cls.parse_spark_version(sc.version) + cls.spark_version = cls.parse_spark_version(cls.sc.version) @classmethod def stopSparkContext(cls): @@ -81,10 +90,10 @@ def stopSparkContext(cls): @classmethod def spark_at_least_of_version(cls, version_str): - assert hasattr(cls, 'spark_version') + assert hasattr(cls, "spark_version") required_version = cls.parse_spark_version(version_str) spark_version = cls.spark_version - for _name in ['major', 'minor', 'maintenance']: + for _name in ["major", "minor", "maintenance"]: sc_ver = spark_version[_name] req_ver = required_version[_name] if sc_ver != req_ver: @@ -92,28 +101,31 @@ def spark_at_least_of_version(cls, version_str): # All major.minor.maintenance equal return True -def setUpModule(): - 
GraphFrameTestUtils.createSparkContext() -def tearDownModule(): +@pytest.fixture(scope="module", autouse=True) +def spark_context(): + GraphFrameTestUtils.createSparkContext() + yield GraphFrameTestUtils.stopSparkContext() -class GraphFrameTestCase(unittest.TestCase): +@pytest.fixture(scope="class") +def spark_session(): + # Create a SparkSession with a smaller number of shuffle partitions. + spark = ( + SparkSession(GraphFrameTestUtils.sc) + .builder.config("spark.sql.shuffle.partitions", 4) + .getOrCreate() + ) + yield spark + # No explicit stop; SparkContext shutdown will clean up. - @classmethod - def setUpClass(cls): - # Small tests run much faster with spark.sql.shuffle.partitions = 4 - cls.spark = SparkSession(GraphFrameTestUtils.sc).builder.config('spark.sql.shuffle.partitions', 4).getOrCreate() - - @classmethod - def tearDownClass(cls): - cls.spark = None +@pytest.mark.usefixtures("set_spark") +class GraphFrameTest: -class GraphFrameTest(GraphFrameTestCase): - def setUp(self): - super(GraphFrameTest, self).setUp() + def setup_method(self, method): + # Mimic setUp: create a simple GraphFrame instance for each test. 
localVertices = [(1, "A"), (2, "B"), (3, "C")] localEdges = [(1, 2, "love"), (2, 1, "hate"), (2, 3, "follow")] v = self.spark.createDataFrame(localVertices, ["id", "name"]) @@ -123,28 +135,38 @@ def setUp(self): def test_spark_version_check(self): gtu = GraphFrameTestUtils gtu.spark_version = gtu.parse_spark_version("2.0.2") - self.assertTrue(gtu.spark_at_least_of_version("1.7")) - self.assertTrue(gtu.spark_at_least_of_version("2.0")) - self.assertTrue(gtu.spark_at_least_of_version("2.0.1")) - self.assertTrue(gtu.spark_at_least_of_version("2.0.2")) - self.assertFalse(gtu.spark_at_least_of_version("2.0.3")) - self.assertFalse(gtu.spark_at_least_of_version("2.1")) + + assert gtu.spark_at_least_of_version("1.7") + assert gtu.spark_at_least_of_version("2.0") + assert gtu.spark_at_least_of_version("2.0.1") + assert gtu.spark_at_least_of_version("2.0.2") + assert not gtu.spark_at_least_of_version("2.0.3") + assert not gtu.spark_at_least_of_version("2.1") def test_construction(self): g = self.g - vertexIDs = map(lambda x: x[0], g.vertices.select("id").collect()) + vertexIDs = [row[0] for row in g.vertices.select("id").collect()] assert sorted(vertexIDs) == [1, 2, 3] - edgeActions = map(lambda x: x[0], g.edges.select("action").collect()) + + edgeActions = [row[0] for row in g.edges.select("action").collect()] assert sorted(edgeActions) == ["follow", "hate", "love"] - tripletsFirst = list(map(lambda x: (x[0][1], x[1][1], x[2][2]), - g.triplets.sort("src.id").select("src", "dst", "edge").take(1))) + + tripletsFirst = list( + map( + lambda x: (x[0][1], x[1][1], x[2][2]), + g.triplets.sort("src.id").select("src", "dst", "edge").take(1), + ) + ) assert tripletsFirst == [("A", "B", "love")], tripletsFirst + # Try with invalid vertices and edges DataFrames v_invalid = self.spark.createDataFrame( - [(1, "A"), (2, "B"), (3, "C")], ["invalid_colname_1", "invalid_colname_2"]) + [(1, "A"), (2, "B"), (3, "C")], ["invalid_colname_1", "invalid_colname_2"] + ) e_invalid = 
self.spark.createDataFrame( - [(1, 2), (2, 3), (3, 1)], ["invalid_colname_3", "invalid_colname_4"]) - with self.assertRaises(ValueError): + [(1, 2), (2, 3), (3, 1)], ["invalid_colname_3", "invalid_colname_4"] + ) + with pytest.raises(ValueError): GraphFrame(v_invalid, e_invalid) def test_cache(self): @@ -155,17 +177,17 @@ def test_cache(self): def test_degrees(self): g = self.g outDeg = g.outDegrees - self.assertSetEqual(set(outDeg.columns), {"id", "outDegree"}) + assert set(outDeg.columns) == {"id", "outDegree"} inDeg = g.inDegrees - self.assertSetEqual(set(inDeg.columns), {"id", "inDegree"}) + assert set(inDeg.columns) == {"id", "inDegree"} deg = g.degrees - self.assertSetEqual(set(deg.columns), {"id", "degree"}) + assert set(deg.columns) == {"id", "degree"} def test_motif_finding(self): g = self.g motifs = g.find("(a)-[e]->(b)") assert motifs.count() == 3 - self.assertSetEqual(set(motifs.columns), {"a", "e", "b"}) + assert set(motifs.columns) == {"a", "e", "b"} def test_filterVertices(self): g = self.g @@ -178,8 +200,8 @@ def test_filterVertices(self): e2 = g2.edges.select("src", "dst", "action").collect() assert len(v2) == len(expected_v) assert len(e2) == len(expected_e) - self.assertSetEqual(set(v2), set(expected_v)) - self.assertSetEqual(set(e2), set(expected_e)) + assert set(v2) == set(expected_v) + assert set(e2) == set(expected_e) def test_filterEdges(self): g = self.g @@ -192,8 +214,8 @@ def test_filterEdges(self): e2 = g2.edges.select("src", "dst", "action").collect() assert len(v2) == len(expected_v) assert len(e2) == len(expected_e) - self.assertSetEqual(set(v2), set(expected_v)) - self.assertSetEqual(set(e2), set(expected_e)) + assert set(v2) == set(expected_v) + assert set(e2) == set(expected_e) def test_dropIsolatedVertices(self): g = self.g @@ -204,74 +226,93 @@ def test_dropIsolatedVertices(self): expected_e = [(2, 3, "follow")] assert len(v2) == len(expected_v) assert len(e2) == len(expected_e) - self.assertSetEqual(set(v2), set(expected_v)) - 
self.assertSetEqual(set(e2), set(expected_e)) + assert set(v2) == set(expected_v) + assert set(e2) == set(expected_e) def test_bfs(self): g = self.g paths = g.bfs("name='A'", "name='C'") - self.assertEqual(paths.count(), 1) - self.assertEqual(paths.select("v1.name").head()[0], "B") + assert paths.count() == 1 + # Expecting that the first intermediary vertex in the BFS is "B" + assert paths.select("v1.name").head()[0] == "B" + paths2 = g.bfs("name='A'", "name='C'", edgeFilter="action!='follow'") - self.assertEqual(paths2.count(), 0) + assert paths2.count() == 0 + paths3 = g.bfs("name='A'", "name='C'", maxPathLength=1) - self.assertEqual(paths3.count(), 0) + assert paths3.count() == 0 -class PregelTest(GraphFrameTestCase): - def setUp(self): - super(PregelTest, self).setUp() +@pytest.mark.usefixtures("set_spark") +class TestPregel: def test_page_rank(self): - from pyspark.sql.functions import coalesce, col, lit, sum, when - edges = self.spark.createDataFrame([[0, 1], - [1, 2], - [2, 4], - [2, 0], - [3, 4], # 3 has no in-links - [4, 0], - [4, 2]], ["src", "dst"]) + # Create an edge DataFrame; note that vertex 3 has no in-links. + edges = self.spark.createDataFrame( + [[0, 1], [1, 2], [2, 4], [2, 0], [3, 4], [4, 0], [4, 2]], + ["src", "dst"], + ) edges.cache() + + # Create a vertex DataFrame and count vertices. vertices = self.spark.createDataFrame([[0], [1], [2], [3], [4]], ["id"]) numVertices = vertices.count() + + # Get the outDegrees DataFrame from a GraphFrame built on the original vertices and edges. vertices = GraphFrame(vertices, edges).outDegrees vertices.cache() + + # Construct a new GraphFrame with the updated vertices DataFrame. 
graph = GraphFrame(vertices, edges) alpha = 0.15 - ranks = graph.pregel \ - .setMaxIter(5) \ - .withVertexColumn("rank", lit(1.0 / numVertices), - coalesce(Pregel.msg(), - lit(0.0)) * lit(1.0 - alpha) + lit(alpha / numVertices)) \ - .sendMsgToDst(Pregel.src("rank") / Pregel.src("outDegree")) \ - .aggMsgs(sum(Pregel.msg())) \ + + # Run PageRank via Pregel. + ranks = ( + graph.pregel.setMaxIter(5) + .withVertexColumn( + "rank", + F.lit(1.0 / numVertices), + F.coalesce(Pregel.msg(), F.lit(0.0)) * F.lit(1.0 - alpha) + + F.lit(alpha / numVertices), + ) + .sendMsgToDst(Pregel.src("rank") / Pregel.src("outDegree")) + .aggMsgs(F.sum(Pregel.msg())) .run() + ) + + # Collect and sort results. resultRows = ranks.sort(ranks.id).collect() - result = map(lambda x: x.rank, resultRows) + result = list(map(lambda x: x.rank, resultRows)) expected = [0.245, 0.224, 0.303, 0.03, 0.197] + + # Compare each result with its expected value using a tolerance of 1e-3. for a, b in zip(result, expected): - self.assertAlmostEqual(a, b, delta = 1e-3) + assert a == pytest.approx(b, abs=1e-3) + +@pytest.mark.usefixtures("set_spark") +class TestGraphFrameLib: -class GraphFrameLibTest(GraphFrameTestCase): - def setUp(self): - super(GraphFrameLibTest, self).setUp() + def setup_method(self, method): + # Set up the Java API instance for each test. 
self.japi = _java_api(self.spark._sc) - def _hasCols(self, graph, vcols = [], ecols = []): - map(lambda c: self.assertIn(c, graph.vertices.columns), vcols) - map(lambda c: self.assertIn(c, graph.edges.columns), ecols) + def _hasCols(self, graph, vcols=[], ecols=[]): + for c in vcols: + assert c in graph.vertices.columns, f"Vertex DataFrame missing column: {c}" + for c in ecols: + assert c in graph.edges.columns, f"Edge DataFrame missing column: {c}" - def _df_hasCols(self, vertices, vcols = []): - map(lambda c: self.assertIn(c, vertices.columns), vcols) + def _df_hasCols(self, df, vcols=[]): + for c in vcols: + assert c in df.columns, f"DataFrame missing column: {c}" def _graph(self, name, *args): """ - Convenience to call one of the example graphs, passing the arguments and wrapping the result back - as a python object. - :param name: the name of the example graph - :param args: all the required arguments, without the initial spark session - :return: + Convenience to call one of the example graphs, passing the arguments and wrapping the result as a Python object. + :param name: the name of the example graph. + :param args: all the required arguments (excluding the initial SparkSession). + :return: a GraphFrame object. """ examples = self.japi.examples() jgraph = getattr(examples, name)(*args) @@ -281,83 +322,79 @@ def test_aggregate_messages(self): g = self._graph("friends") # For each user, sum the ages of the adjacent users, # plus 1 for the src's sum if the edge is "friend". - sendToSrc = ( - AM.dst['age'] + - sqlfunctions.when( - AM.edge['relationship'] == 'friend', - sqlfunctions.lit(1) - ).otherwise(0)) - sendToDst = AM.src['age'] + sendToSrc = AM.dst["age"] + F.when(AM.edge["relationship"] == "friend", F.lit(1)).otherwise( + 0 + ) + sendToDst = AM.src["age"] agg = g.aggregateMessages( - sqlfunctions.sum(AM.msg).alias('summedAges'), - sendToSrc=sendToSrc, - sendToDst=sendToDst) - # Run the aggregation again providing SQL expressions as String instead. 
+ F.sum(AM.msg).alias("summedAges"), sendToSrc=sendToSrc, sendToDst=sendToDst + ) + # Run the aggregation again using SQL expressions as Strings. agg2 = g.aggregateMessages( "sum(MSG) AS `summedAges`", sendToSrc="(dst['age'] + CASE WHEN (edge['relationship'] = 'friend') THEN 1 ELSE 0 END)", - sendToDst="src['age']") - # Convert agg and agg2 to a mapping from id to the aggregated message. - aggMap = {id_: s for id_, s in agg.select('id', 'summedAges').collect()} - agg2Map = {id_: s for id_, s in agg2.select('id', 'summedAges').collect()} - # Compute the truth via brute force. - user2age = {id_: age for id_, age in g.vertices.select('id', 'age').collect()} + sendToDst="src['age']", + ) + # Build mappings from id to the aggregated message. + aggMap = {row.id: row.summedAges for row in agg.select("id", "summedAges").collect()} + agg2Map = {row.id: row.summedAges for row in agg2.select("id", "summedAges").collect()} + # Compute the expected aggregation via brute force. + user2age = {row.id: row.age for row in g.vertices.select("id", "age").collect()} trueAgg = {} - for src, dst, rel in g.edges.select("src", "dst", "relationship").collect(): - trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == 'friend' else 0) + for row in g.edges.select("src", "dst", "relationship").collect(): + src, dst, rel = row.src, row.dst, row.relationship + trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == "friend" else 0) trueAgg[dst] = trueAgg.get(dst, 0) + user2age[src] - # Compare if the agg mappings match the brute force mapping - self.assertEqual(aggMap, trueAgg) - self.assertEqual(agg2Map, trueAgg) - # Check that TypeError is raises with messages of wrong type - with self.assertRaises(TypeError): + # Verify both aggregations match the expected results. 
+ assert aggMap == trueAgg, f"aggMap {aggMap} does not equal expected {trueAgg}" + assert agg2Map == trueAgg, f"agg2Map {agg2Map} does not equal expected {trueAgg}" + # Check that passing a wrong type for messages raises a TypeError. + with pytest.raises(TypeError): g.aggregateMessages( - "sum(MSG) AS `summedAges`", - sendToSrc=object(), - sendToDst="src['age']") - with self.assertRaises(TypeError): + "sum(MSG) AS `summedAges`", sendToSrc=object(), sendToDst="src['age']" + ) + with pytest.raises(TypeError): g.aggregateMessages( - "sum(MSG) AS `summedAges`", - sendToSrc=dst['age'], - sendToDst=object()) + "sum(MSG) AS `summedAges`", sendToSrc=F.col("dst")["age"], sendToDst=object() + ) def test_connected_components(self): - v = self.spark.createDataFrame([ - (0, "a", "b")], ["id", "vattr", "gender"]) + v = self.spark.createDataFrame([(0, "a", "b")], ["id", "vattr", "gender"]) e = self.spark.createDataFrame([(0, 0, 1)], ["src", "dst", "test"]).filter("src > 10") g = GraphFrame(v, e) comps = g.connectedComponents() - self._df_hasCols(comps, vcols=['id', 'component', 'vattr', 'gender']) - self.assertEqual(comps.count(), 1) + self._df_hasCols(comps, vcols=["id", "component", "vattr", "gender"]) + assert comps.count() == 1 def test_connected_components2(self): v = self.spark.createDataFrame([(0, "a0", "b0"), (1, "a1", "b1")], ["id", "A", "B"]) e = self.spark.createDataFrame([(0, 1, "a01", "b01")], ["src", "dst", "A", "B"]) g = GraphFrame(v, e) comps = g.connectedComponents() - self._df_hasCols(comps, vcols=['id', 'component', 'A', 'B']) - self.assertEqual(comps.count(), 2) + self._df_hasCols(comps, vcols=["id", "component", "A", "B"]) + assert comps.count() == 2 def test_connected_components_friends(self): g = self._graph("friends") - comps_tests = [] - comps_tests += [g.connectedComponents()] - comps_tests += [g.connectedComponents(broadcastThreshold=1)] - comps_tests += [g.connectedComponents(checkpointInterval=0)] - comps_tests += 
[g.connectedComponents(checkpointInterval=10)] - comps_tests += [g.connectedComponents(algorithm="graphx")] + comps_tests = [ + g.connectedComponents(), + g.connectedComponents(broadcastThreshold=1), + g.connectedComponents(checkpointInterval=0), + g.connectedComponents(checkpointInterval=10), + g.connectedComponents(algorithm="graphx"), + ] for c in comps_tests: - self.assertEqual(c.groupBy("component").count().count(), 2) + assert c.groupBy("component").count().count() == 2 def test_label_progagation(self): n = 5 g = self._graph("twoBlobs", n) labels = g.labelPropagation(maxIter=4 * n) labels1 = labels.filter("id < 5").select("label").collect() - all1 = set([x.label for x in labels1]) + all1 = {row.label for row in labels1} assert len(all1) == 1 labels2 = labels.filter("id >= 5").select("label").collect() - all2 = set([x.label for x in labels2]) + all2 = {row.label for row in labels2} assert len(all2) == 1 assert all1 != all2 @@ -367,7 +404,7 @@ def test_page_rank(self): resetProb = 0.15 errorTol = 1.0e-5 pr = g.pageRank(resetProb, tol=errorTol) - self._hasCols(pr, vcols=['id', 'pagerank'], ecols=['src', 'dst', 'weight']) + self._hasCols(pr, vcols=["id", "pagerank"], ecols=["src", "dst", "weight"]) def test_parallel_personalized_page_rank(self): n = 100 @@ -376,31 +413,34 @@ def test_parallel_personalized_page_rank(self): maxIter = 15 sourceIds = [1, 2, 3, 4] pr = g.parallelPersonalizedPageRank(resetProb, sourceIds=sourceIds, maxIter=maxIter) - self._hasCols(pr, vcols=['id', 'pageranks'], ecols=['src', 'dst', 'weight']) + self._hasCols(pr, vcols=["id", "pageranks"], ecols=["src", "dst", "weight"]) def test_shortest_paths(self): edges = [(1, 2), (1, 5), (2, 3), (2, 5), (3, 4), (4, 5), (4, 6)] + # Create bidirectional edges. 
all_edges = [z for (a, b) in edges for z in [(a, b), (b, a)]] - edges = self.spark.createDataFrame(all_edges, ["src", "dst"]) + edgesDF = self.spark.createDataFrame(all_edges, ["src", "dst"]) vertices = self.spark.createDataFrame([(i,) for i in range(1, 7)], ["id"]) - g = GraphFrame(vertices, edges) + g = GraphFrame(vertices, edgesDF) landmarks = [1, 4] v2 = g.shortestPaths(landmarks) self._df_hasCols(v2, vcols=["id", "distances"]) def test_svd_plus_plus(self): g = self._graph("ALSSyntheticData") - (v2, cost) = g.svdPlusPlus() - self._df_hasCols(v2, vcols=['id', 'column1', 'column2', 'column3', 'column4']) + v2, cost = g.svdPlusPlus() + self._df_hasCols(v2, vcols=["id", "column1", "column2", "column3", "column4"]) def test_strongly_connected_components(self): - # Simple island test + # Simple island test. vertices = self.spark.createDataFrame([(i,) for i in range(1, 6)], ["id"]) edges = self.spark.createDataFrame([(7, 8)], ["src", "dst"]) g = GraphFrame(vertices, edges) c = g.stronglyConnectedComponents(5) for row in c.collect(): - self.assertEqual(row.id, row.component) + assert ( + row.id == row.component + ), f"Vertex {row.id} not equal to its component {row.component}" def test_triangle_counts(self): edges = self.spark.createDataFrame([(0, 1), (1, 2), (2, 0)], ["src", "dst"]) @@ -408,61 +448,66 @@ def test_triangle_counts(self): g = GraphFrame(vertices, edges) c = g.triangleCount() for row in c.select("id", "count").collect(): - self.assertEqual(row.asDict()['count'], 1) - + assert row.asDict()["count"] == 1, f"Triangle count for vertex {row.id} is not 1" + def test_mutithreaded_sparksession_usage(self): - # Test that we can use the GraphFrame API from multiple threads + # Test that the GraphFrame API works correctly from multiple threads. 
localVertices = [(1, "A"), (2, "B"), (3, "C")] localEdges = [(1, 2, "love"), (2, 1, "hate"), (2, 3, "follow")] v = self.spark.createDataFrame(localVertices, ["id", "name"]) e = self.spark.createDataFrame(localEdges, ["src", "dst", "action"]) - - + exc = None + def run_graphframe() -> None: + nonlocal exc try: GraphFrame(v, e) except Exception as _e: - nonlocal exc exc = _e - + import threading + thread = threading.Thread(target=run_graphframe) thread.start() thread.join() - self.assertIsNone(exc, f"Exception was raised in thread: {exc}") + assert exc is None, f"Exception was raised in thread: {exc}" + +@pytest.mark.usefixtures("set_spark") +class TestGraphFrameExamples: -class GraphFrameExamplesTest(GraphFrameTestCase): - def setUp(self): - super(GraphFrameExamplesTest, self).setUp() + def setup_method(self, method): + # Set up the Java API instance for use in the tests. self.japi = _java_api(self.spark._sc) def test_belief_propagation(self): - # create graphical model g of size 3 x 3 + # Create a graphical model g of size 3x3. g = Graphs(self.spark).gridIsingModel(3) - # run BP for 5 iterations + # Run Belief Propagation (BP) for 5 iterations. numIter = 5 results = BeliefPropagation.runBPwithGraphFrames(g, numIter) - # check beliefs are valid - for row in results.vertices.select('belief').collect(): - belief = row['belief'] - self.assertTrue( - 0 <= belief <= 1, - msg="Expected belief to be probability in [0,1], but found {}".format(belief)) + # Check that each belief is a valid probability in [0, 1]. + for row in results.vertices.select("belief").collect(): + belief = row["belief"] + assert ( + 0 <= belief <= 1 + ), f"Expected belief to be probability in [0,1], but found {belief}" def test_graph_friends(self): - # construct graph + # Construct the graph. g = Graphs(self.spark).friends() - # check that a GraphFrame instance was returned - self.assertIsInstance(g, GraphFrame) + # Check that the result is an instance of GraphFrame. 
+ assert isinstance(g, GraphFrame) def test_graph_grid_ising_model(self): - # construct graph + # Construct a grid Ising model graph. n = 3 g = Graphs(self.spark).gridIsingModel(n) - # check that all the vertices exist - ids = [v['id'] for v in g.vertices.collect()] + # Collect the vertex ids. + ids = [v["id"] for v in g.vertices.collect()] + # Verify that every expected vertex id appears. for i in range(n): for j in range(n): - self.assertIn('{},{}'.format(i, j), ids) + expected_id = f"{i},{j}" + assert expected_id in ids, f"Vertex {expected_id} not found in {ids}" diff --git a/python/requirements-dev.txt b/python/requirements-dev.txt new file mode 100644 index 000000000..b27da4d73 --- /dev/null +++ b/python/requirements-dev.txt @@ -0,0 +1,6 @@ +pytest==8.3.4 +Sphinx==8.1.3 +flake8==7.1.1 +isort==6.0.0 +mypy==1.14.1 +pre-commit==4.0.1 diff --git a/python/requirements.txt b/python/requirements.txt index efb5ec378..fb73319f2 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,3 +1,6 @@ # This file should list any python package dependencies. -nose==1.3.7 +pyspark>=2.0.0 +click==8.1.8 numpy>=1.7 +py7zr==0.22.0 +requests==2.32.3 diff --git a/python/run-tests.sh b/python/run-tests.sh index af4e0a139..dc496e8b0 100755 --- a/python/run-tests.sh +++ b/python/run-tests.sh @@ -38,7 +38,7 @@ echo $pyver LIBS="" for lib in "$SPARK_HOME/python/lib"/*zip ; do - LIBS=$LIBS:$lib + LIBS=$LIBS:$lib done # The current directory of the script. 
@@ -51,7 +51,7 @@ assembly_path="$DIR/../target/scala-$scala_version_major_minor" echo `ls $assembly_path/graphframes-assembly*.jar` JAR_PATH="" for assembly in $assembly_path/graphframes-assembly*.jar ; do - JAR_PATH=$assembly + JAR_PATH=$assembly done export PYSPARK_SUBMIT_ARGS="--driver-memory 2g --executor-memory 2g --jars $JAR_PATH pyspark-shell " @@ -62,17 +62,7 @@ export PYTHONPATH=$PYTHONPATH:graphframes # Run test suites - -if [[ "$python_major" == "2" ]]; then - - # Horrible hack for spark 1.x: we manually remove some log lines to stay below the 4MB log limit on Travis. - $PYSPARK_DRIVER_PYTHON `which nosetests` -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; - -else - - $PYSPARK_DRIVER_PYTHON -m "nose" -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; - -fi +$PYSPARK_DRIVER_PYTHON -m "pytest" -v $DIR/graphframes/tests.py 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; # Exit immediately if the tests fail. 
# Since we pipe to remove the output, we need to use some horrible BASH features: @@ -80,7 +70,6 @@ fi test ${PIPESTATUS[0]} -eq 0 || exit 1; # Run doc tests - cd "$DIR" $PYSPARK_PYTHON -u ./graphframes/graphframe.py "$@" diff --git a/python/setup.cfg b/python/setup.cfg index f127b08af..02a0d5136 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -1,2 +1,42 @@ -# This file contains the default option values to be used during setup. An -# example can be found at https://github.com/pypa/sampleproject/blob/master/setup.cfg +[metadata] +name = graphframes +version = 0.8.5 +description = GraphFrames: Graph Processing Framework for Apache Spark +long_description = file: ../README.md +long_description_content_type = text/markdown +author = GraphFrames Contributors +author_email = graphframes@googlegroups.com +url = https://pypi.org/project/graphframes-py/ +license = Apache License 2.0 +classifiers = + Development Status :: 4 - Beta + Programming Language :: Python :: 3 + Operating System :: OS Independent + +[options] +packages = find: +package_dir = + = python +include_package_data = True +install_requires = + pyspark>=2.0.0 + click==8.1.8 + numpy>=1.7 + py7zr==0.22.0 + requests==2.32.3 + +[options.packages.find] +where = python + exclude = + tests.py + docs + +[options.extras_require] +dev = + pytest==8.3.4 + Sphinx==8.1.3 + black==25.1.0 + flake8==7.1.1 + isort==6.0.0 + mypy==1.14.1 + pre-commit==3.5.1 diff --git a/python/setup.py b/python/setup.py index 9dad5462e..a91fb629a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -1,2 +1,35 @@ -# Your python setup file. An example can be found at: -# https://github.com/pypa/sampleproject/blob/master/setup.py +from setuptools import setup, find_packages # type: ignore +import os + + +def parse_requirements(filename): + """Load requirements from a pip requirements file.""" + with open(filename, encoding="utf-8") as f: + # Filter out comments and empty lines. 
+ return [line.strip() for line in f if line.strip() and not line.startswith("#")] + + +# Read the long description from the README file. +here = os.path.abspath(os.path.dirname(__file__)) + +# Use requirements.txt to get the list of dependencies. +requirements = parse_requirements(os.path.join(here, "requirements.txt")) + +setup( + name="graphframes", + version=open("version.sbt").read().strip(), # Update this version as needed + description="GraphFrames: Graph Processing Framework for Apache Spark", + long_description=open(os.path.join(f"{here}/..", "README.md"), encoding="utf-8").read(), + long_description_content_type="text/markdown", + author="GraphFrames Contributors", + author_email="graphframes@googlegroups.com", + url="https://pypi.org/project/graphframes-py", + packages=find_packages(where="python"), + package_dir={"": "python"}, + include_package_data=True, # Include non-code files specified in MANIFEST.in + install_requires=requirements, + classifiers=[ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], +) diff --git a/version.sbt b/version.sbt index f72bdcc0e..6fbb590a4 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -ThisBuild / version := "0.8.4" +ThisBuild / version := "0.8.5" From c25624474e261d73c1eeb12d35b2604ef6e977cf Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sat, 15 Feb 2025 21:07:24 -0800 Subject: [PATCH 02/53] Restore Python .gitignore --- python/.gitignore | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 python/.gitignore diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 000000000..2130ff922 --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,5 @@ +*.pyc +docs/_build/ +build/ +dist/ + From 6c3df0b1cdf606ddf8e1aed00edd1d93ffb11220 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sat, 15 Feb 2025 21:08:05 -0800 Subject: [PATCH 03/53] Extra newline removed --- python/.gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/python/.gitignore 
b/python/.gitignore index 2130ff922..81410ca55 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -2,4 +2,3 @@ docs/_build/ build/ dist/ - From caf50911ed5da315ca66134798a28ea240b71a81 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 11:58:23 -0800 Subject: [PATCH 04/53] Added VERSION file set to 0.8.5 --- VERSION | 1 + 1 file changed, 1 insertion(+) create mode 100644 VERSION diff --git a/VERSION b/VERSION new file mode 100644 index 000000000..7ada0d303 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.8.5 From 7cfa2d18152e566f39dc57ea5c8e1c6075648542 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 12:40:44 -0800 Subject: [PATCH 05/53] isort; fiex edgesDF variable name. --- python/graphframes/tests.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/graphframes/tests.py b/python/graphframes/tests.py index 259435759..d4269f449 100644 --- a/python/graphframes/tests.py +++ b/python/graphframes/tests.py @@ -16,18 +16,18 @@ # import os -import tempfile -import shutil import re +import shutil +import tempfile import pytest from pyspark import SparkConf, SparkContext -from pyspark.sql import functions as F, SparkSession +from pyspark.sql import SparkSession +from pyspark.sql import functions as F -from .graphframe import GraphFrame, Pregel, _java_api, _from_java_gf +from .examples import BeliefPropagation, Graphs +from .graphframe import GraphFrame, Pregel, _from_java_gf, _java_api from .lib import AggregateMessages as AM -from .examples import Graphs, BeliefPropagation - VERSION = open("version.sbt").read().strip() @@ -419,9 +419,9 @@ def test_shortest_paths(self): edges = [(1, 2), (1, 5), (2, 3), (2, 5), (3, 4), (4, 5), (4, 6)] # Create bidirectional edges. 
all_edges = [z for (a, b) in edges for z in [(a, b), (b, a)]] - edgesDF = self.spark.createDataFrame(all_edges, ["src", "dst"]) + edges = self.spark.createDataFrame(all_edges, ["src", "dst"]) vertices = self.spark.createDataFrame([(i,) for i in range(1, 7)], ["id"]) - g = GraphFrame(vertices, edgesDF) + g = GraphFrame(vertices, edges) landmarks = [1, 4] v2 = g.shortestPaths(landmarks) self._df_hasCols(v2, vcols=["id", "distances"]) From a8bf0be4523bc41dd01966f7b525a9ec7c918ede Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:08:48 -0800 Subject: [PATCH 06/53] Back out Dockerfile changes --- Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index b9fe8c528..1c4430912 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,16 @@ FROM ubuntu:22.04 -ARG PYTHON_VERSION=3.9 +ARG PYTHON_VERSION=3.8 ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ - apt-get install -y wget bzip2 build-essential openjdk-11-jdk ssh sudo && \ + apt-get install -y wget bzip2 build-essential openjdk-8-jdk ssh sudo && \ apt-get clean # Install Spark and update env variables. 
-ENV SCALA_VERSION 2.12.20 -ENV SPARK_VERSION "3.5.4" -ENV SPARK_BUILD "spark-${SPARK_VERSION}-bin-hadoop3" +ENV SCALA_VERSION 2.12.17 +ENV SPARK_VERSION "3.4.1" +ENV SPARK_BUILD "spark-${SPARK_VERSION}-bin-hadoop3.2" ENV SPARK_BUILD_URL "https://dist.apache.org/repos/dist/release/spark/spark-${SPARK_VERSION}/${SPARK_BUILD}.tgz" RUN wget --quiet "$SPARK_BUILD_URL" -O /tmp/spark.tgz && \ tar -C /opt -xf /tmp/spark.tgz && \ From 54a942da4a572471eead3e5df4f721096d638ca6 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:21:47 -0800 Subject: [PATCH 07/53] Back out version change in build.sbt --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index c6b503989..4ee4d9bd5 100644 --- a/build.sbt +++ b/build.sbt @@ -3,7 +3,7 @@ import ReleaseTransformations.* lazy val sparkVer = sys.props.getOrElse("spark.version", "3.5.4") lazy val sparkBranch = sparkVer.substring(0, 3) lazy val defaultScalaVer = sparkBranch match { - case "3.5" => "2.12.20" + case "3.5" => "2.12.18" case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.") } lazy val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer) From 8b0e34697928ad0cb1b07cba96c0d0657a0d771b Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:23:35 -0800 Subject: [PATCH 08/53] Backout changes to config and run-tests --- docs/_config.yml | 2 +- python/run-tests.sh | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/_config.yml b/docs/_config.yml index 379fc242f..4c1ab075c 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -13,7 +13,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-GRAPHFRAMES_VERSION: 0.8.5 +GRAPHFRAMES_VERSION: 0.8.4 #SCALA_BINARY_VERSION: "2.10" #SCALA_VERSION: "2.10.4" #MESOS_VERSION: 0.21.0 diff --git a/python/run-tests.sh b/python/run-tests.sh index dc496e8b0..af4e0a139 100755 --- a/python/run-tests.sh +++ b/python/run-tests.sh @@ -38,7 +38,7 @@ echo $pyver LIBS="" for lib in "$SPARK_HOME/python/lib"/*zip ; do - LIBS=$LIBS:$lib + LIBS=$LIBS:$lib done # The current directory of the script. @@ -51,7 +51,7 @@ assembly_path="$DIR/../target/scala-$scala_version_major_minor" echo `ls $assembly_path/graphframes-assembly*.jar` JAR_PATH="" for assembly in $assembly_path/graphframes-assembly*.jar ; do - JAR_PATH=$assembly + JAR_PATH=$assembly done export PYSPARK_SUBMIT_ARGS="--driver-memory 2g --executor-memory 2g --jars $JAR_PATH pyspark-shell " @@ -62,7 +62,17 @@ export PYTHONPATH=$PYTHONPATH:graphframes # Run test suites -$PYSPARK_DRIVER_PYTHON -m "pytest" -v $DIR/graphframes/tests.py 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; + +if [[ "$python_major" == "2" ]]; then + + # Horrible hack for spark 1.x: we manually remove some log lines to stay below the 4MB log limit on Travis. 
+ $PYSPARK_DRIVER_PYTHON `which nosetests` -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; + +else + + $PYSPARK_DRIVER_PYTHON -m "nose" -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; + +fi # Exit immediately if the tests fail. # Since we pipe to remove the output, we need to use some horrible BASH features: @@ -70,6 +80,7 @@ $PYSPARK_DRIVER_PYTHON -m "pytest" -v $DIR/graphframes/tests.py 2>&1 | grep -vE test ${PIPESTATUS[0]} -eq 0 || exit 1; # Run doc tests + cd "$DIR" $PYSPARK_PYTHON -u ./graphframes/graphframe.py "$@" From 46c2b9300ace8c95ba482e6582631b79a348705c Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:24:19 -0800 Subject: [PATCH 09/53] Back out pytest conversion --- python/graphframes/tests.py | 409 ++++++++++++++++-------------------- 1 file changed, 182 insertions(+), 227 deletions(-) diff --git a/python/graphframes/tests.py b/python/graphframes/tests.py index d4269f449..9a7ad1371 100644 --- a/python/graphframes/tests.py +++ b/python/graphframes/tests.py @@ -15,72 +15,63 @@ # limitations under the License. 
# -import os -import re -import shutil +import sys import tempfile +import shutil +import re -import pytest -from pyspark import SparkConf, SparkContext -from pyspark.sql import SparkSession -from pyspark.sql import functions as F - -from .examples import BeliefPropagation, Graphs -from .graphframe import GraphFrame, Pregel, _from_java_gf, _java_api -from .lib import AggregateMessages as AM - -VERSION = open("version.sbt").read().strip() - +if sys.version_info[:2] <= (2, 6): + try: + import unittest2 as unittest + except ImportError: + sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') + sys.exit(1) +else: + import unittest -@pytest.fixture(scope="class", autouse=True) -def set_spark(request, spark_session): - request.cls.spark = spark_session +from pyspark import SparkContext +from pyspark.sql import functions as sqlfunctions, SparkSession +from .graphframe import GraphFrame, Pregel, _java_api, _from_java_gf +from .lib import AggregateMessages as AM +from .examples import Graphs, BeliefPropagation -@pytest.mark.usefixtures("set_spark") class GraphFrameTestUtils(object): @classmethod def parse_spark_version(cls, version_str): - """take an input version string - return version items in a dictionary + """ take an input version string + return version items in a dictionary """ - _sc_ver_patt = r"(\d+)\.(\d+)(\.(\d+)(-(.+))?)?" + _sc_ver_patt = r'(\d+)\.(\d+)(\.(\d+)(-(.+))?)?' 
m = re.match(_sc_ver_patt, version_str) if not m: - raise TypeError( - "version {} shoud be in ..".format(version_str) - ) + raise TypeError("version {} shoud be in ..".format(version_str)) version_info = {} try: - version_info["major"] = int(m.group(1)) + version_info['major'] = int(m.group(1)) except: raise TypeError("invalid minor version") try: - version_info["minor"] = int(m.group(2)) + version_info['minor'] = int(m.group(2)) except: raise TypeError("invalid major version") try: - version_info["maintenance"] = int(m.group(4)) + version_info['maintenance'] = int(m.group(4)) except: - version_info["maintenance"] = 0 + version_info['maintenance'] = 0 try: - version_info["special"] = m.group(6) + version_info['special'] = m.group(6) except: pass return version_info @classmethod def createSparkContext(cls): - cls.conf = SparkConf().setAppName("GraphFramesTests") - cls.conf.set( - "spark.submit.pyFiles", - os.path.abspath("python/dist/graphframes-{VERSION}-py3-none-any.whl"), - ) - cls.sc = SparkContext(master="local[4]", appName="GraphFramesTests", conf=cls.conf) + cls.sc = sc = SparkContext('local[4]', "GraphFramesTests") cls.checkpointDir = tempfile.mkdtemp() cls.sc.setCheckpointDir(cls.checkpointDir) - cls.spark_version = cls.parse_spark_version(cls.sc.version) + cls.spark_version = cls.parse_spark_version(sc.version) @classmethod def stopSparkContext(cls): @@ -90,10 +81,10 @@ def stopSparkContext(cls): @classmethod def spark_at_least_of_version(cls, version_str): - assert hasattr(cls, "spark_version") + assert hasattr(cls, 'spark_version') required_version = cls.parse_spark_version(version_str) spark_version = cls.spark_version - for _name in ["major", "minor", "maintenance"]: + for _name in ['major', 'minor', 'maintenance']: sc_ver = spark_version[_name] req_ver = required_version[_name] if sc_ver != req_ver: @@ -101,31 +92,28 @@ def spark_at_least_of_version(cls, version_str): # All major.minor.maintenance equal return True - -@pytest.fixture(scope="module", 
autouse=True) -def spark_context(): +def setUpModule(): GraphFrameTestUtils.createSparkContext() - yield + +def tearDownModule(): GraphFrameTestUtils.stopSparkContext() -@pytest.fixture(scope="class") -def spark_session(): - # Create a SparkSession with a smaller number of shuffle partitions. - spark = ( - SparkSession(GraphFrameTestUtils.sc) - .builder.config("spark.sql.shuffle.partitions", 4) - .getOrCreate() - ) - yield spark - # No explicit stop; SparkContext shutdown will clean up. +class GraphFrameTestCase(unittest.TestCase): + + @classmethod + def setUpClass(cls): + # Small tests run much faster with spark.sql.shuffle.partitions = 4 + cls.spark = SparkSession(GraphFrameTestUtils.sc).builder.config('spark.sql.shuffle.partitions', 4).getOrCreate() + @classmethod + def tearDownClass(cls): + cls.spark = None -@pytest.mark.usefixtures("set_spark") -class GraphFrameTest: - def setup_method(self, method): - # Mimic setUp: create a simple GraphFrame instance for each test. +class GraphFrameTest(GraphFrameTestCase): + def setUp(self): + super(GraphFrameTest, self).setUp() localVertices = [(1, "A"), (2, "B"), (3, "C")] localEdges = [(1, 2, "love"), (2, 1, "hate"), (2, 3, "follow")] v = self.spark.createDataFrame(localVertices, ["id", "name"]) @@ -135,38 +123,28 @@ def setup_method(self, method): def test_spark_version_check(self): gtu = GraphFrameTestUtils gtu.spark_version = gtu.parse_spark_version("2.0.2") - - assert gtu.spark_at_least_of_version("1.7") - assert gtu.spark_at_least_of_version("2.0") - assert gtu.spark_at_least_of_version("2.0.1") - assert gtu.spark_at_least_of_version("2.0.2") - assert not gtu.spark_at_least_of_version("2.0.3") - assert not gtu.spark_at_least_of_version("2.1") + self.assertTrue(gtu.spark_at_least_of_version("1.7")) + self.assertTrue(gtu.spark_at_least_of_version("2.0")) + self.assertTrue(gtu.spark_at_least_of_version("2.0.1")) + self.assertTrue(gtu.spark_at_least_of_version("2.0.2")) + 
self.assertFalse(gtu.spark_at_least_of_version("2.0.3")) + self.assertFalse(gtu.spark_at_least_of_version("2.1")) def test_construction(self): g = self.g - vertexIDs = [row[0] for row in g.vertices.select("id").collect()] + vertexIDs = map(lambda x: x[0], g.vertices.select("id").collect()) assert sorted(vertexIDs) == [1, 2, 3] - - edgeActions = [row[0] for row in g.edges.select("action").collect()] + edgeActions = map(lambda x: x[0], g.edges.select("action").collect()) assert sorted(edgeActions) == ["follow", "hate", "love"] - - tripletsFirst = list( - map( - lambda x: (x[0][1], x[1][1], x[2][2]), - g.triplets.sort("src.id").select("src", "dst", "edge").take(1), - ) - ) + tripletsFirst = list(map(lambda x: (x[0][1], x[1][1], x[2][2]), + g.triplets.sort("src.id").select("src", "dst", "edge").take(1))) assert tripletsFirst == [("A", "B", "love")], tripletsFirst - # Try with invalid vertices and edges DataFrames v_invalid = self.spark.createDataFrame( - [(1, "A"), (2, "B"), (3, "C")], ["invalid_colname_1", "invalid_colname_2"] - ) + [(1, "A"), (2, "B"), (3, "C")], ["invalid_colname_1", "invalid_colname_2"]) e_invalid = self.spark.createDataFrame( - [(1, 2), (2, 3), (3, 1)], ["invalid_colname_3", "invalid_colname_4"] - ) - with pytest.raises(ValueError): + [(1, 2), (2, 3), (3, 1)], ["invalid_colname_3", "invalid_colname_4"]) + with self.assertRaises(ValueError): GraphFrame(v_invalid, e_invalid) def test_cache(self): @@ -177,17 +155,17 @@ def test_cache(self): def test_degrees(self): g = self.g outDeg = g.outDegrees - assert set(outDeg.columns) == {"id", "outDegree"} + self.assertSetEqual(set(outDeg.columns), {"id", "outDegree"}) inDeg = g.inDegrees - assert set(inDeg.columns) == {"id", "inDegree"} + self.assertSetEqual(set(inDeg.columns), {"id", "inDegree"}) deg = g.degrees - assert set(deg.columns) == {"id", "degree"} + self.assertSetEqual(set(deg.columns), {"id", "degree"}) def test_motif_finding(self): g = self.g motifs = g.find("(a)-[e]->(b)") assert motifs.count() 
== 3 - assert set(motifs.columns) == {"a", "e", "b"} + self.assertSetEqual(set(motifs.columns), {"a", "e", "b"}) def test_filterVertices(self): g = self.g @@ -200,8 +178,8 @@ def test_filterVertices(self): e2 = g2.edges.select("src", "dst", "action").collect() assert len(v2) == len(expected_v) assert len(e2) == len(expected_e) - assert set(v2) == set(expected_v) - assert set(e2) == set(expected_e) + self.assertSetEqual(set(v2), set(expected_v)) + self.assertSetEqual(set(e2), set(expected_e)) def test_filterEdges(self): g = self.g @@ -214,8 +192,8 @@ def test_filterEdges(self): e2 = g2.edges.select("src", "dst", "action").collect() assert len(v2) == len(expected_v) assert len(e2) == len(expected_e) - assert set(v2) == set(expected_v) - assert set(e2) == set(expected_e) + self.assertSetEqual(set(v2), set(expected_v)) + self.assertSetEqual(set(e2), set(expected_e)) def test_dropIsolatedVertices(self): g = self.g @@ -226,93 +204,74 @@ def test_dropIsolatedVertices(self): expected_e = [(2, 3, "follow")] assert len(v2) == len(expected_v) assert len(e2) == len(expected_e) - assert set(v2) == set(expected_v) - assert set(e2) == set(expected_e) + self.assertSetEqual(set(v2), set(expected_v)) + self.assertSetEqual(set(e2), set(expected_e)) def test_bfs(self): g = self.g paths = g.bfs("name='A'", "name='C'") - assert paths.count() == 1 - # Expecting that the first intermediary vertex in the BFS is "B" - assert paths.select("v1.name").head()[0] == "B" - + self.assertEqual(paths.count(), 1) + self.assertEqual(paths.select("v1.name").head()[0], "B") paths2 = g.bfs("name='A'", "name='C'", edgeFilter="action!='follow'") - assert paths2.count() == 0 - + self.assertEqual(paths2.count(), 0) paths3 = g.bfs("name='A'", "name='C'", maxPathLength=1) - assert paths3.count() == 0 + self.assertEqual(paths3.count(), 0) -@pytest.mark.usefixtures("set_spark") -class TestPregel: +class PregelTest(GraphFrameTestCase): + def setUp(self): + super(PregelTest, self).setUp() def test_page_rank(self): 
- # Create an edge DataFrame; note that vertex 3 has no in-links. - edges = self.spark.createDataFrame( - [[0, 1], [1, 2], [2, 4], [2, 0], [3, 4], [4, 0], [4, 2]], - ["src", "dst"], - ) + from pyspark.sql.functions import coalesce, col, lit, sum, when + edges = self.spark.createDataFrame([[0, 1], + [1, 2], + [2, 4], + [2, 0], + [3, 4], # 3 has no in-links + [4, 0], + [4, 2]], ["src", "dst"]) edges.cache() - - # Create a vertex DataFrame and count vertices. vertices = self.spark.createDataFrame([[0], [1], [2], [3], [4]], ["id"]) numVertices = vertices.count() - - # Get the outDegrees DataFrame from a GraphFrame built on the original vertices and edges. vertices = GraphFrame(vertices, edges).outDegrees vertices.cache() - - # Construct a new GraphFrame with the updated vertices DataFrame. graph = GraphFrame(vertices, edges) alpha = 0.15 - - # Run PageRank via Pregel. - ranks = ( - graph.pregel.setMaxIter(5) - .withVertexColumn( - "rank", - F.lit(1.0 / numVertices), - F.coalesce(Pregel.msg(), F.lit(0.0)) * F.lit(1.0 - alpha) - + F.lit(alpha / numVertices), - ) - .sendMsgToDst(Pregel.src("rank") / Pregel.src("outDegree")) - .aggMsgs(F.sum(Pregel.msg())) + ranks = graph.pregel \ + .setMaxIter(5) \ + .withVertexColumn("rank", lit(1.0 / numVertices), + coalesce(Pregel.msg(), + lit(0.0)) * lit(1.0 - alpha) + lit(alpha / numVertices)) \ + .sendMsgToDst(Pregel.src("rank") / Pregel.src("outDegree")) \ + .aggMsgs(sum(Pregel.msg())) \ .run() - ) - - # Collect and sort results. resultRows = ranks.sort(ranks.id).collect() - result = list(map(lambda x: x.rank, resultRows)) + result = map(lambda x: x.rank, resultRows) expected = [0.245, 0.224, 0.303, 0.03, 0.197] - - # Compare each result with its expected value using a tolerance of 1e-3. 
for a, b in zip(result, expected): - assert a == pytest.approx(b, abs=1e-3) - + self.assertAlmostEqual(a, b, delta = 1e-3) -@pytest.mark.usefixtures("set_spark") -class TestGraphFrameLib: - def setup_method(self, method): - # Set up the Java API instance for each test. +class GraphFrameLibTest(GraphFrameTestCase): + def setUp(self): + super(GraphFrameLibTest, self).setUp() self.japi = _java_api(self.spark._sc) - def _hasCols(self, graph, vcols=[], ecols=[]): - for c in vcols: - assert c in graph.vertices.columns, f"Vertex DataFrame missing column: {c}" - for c in ecols: - assert c in graph.edges.columns, f"Edge DataFrame missing column: {c}" + def _hasCols(self, graph, vcols = [], ecols = []): + map(lambda c: self.assertIn(c, graph.vertices.columns), vcols) + map(lambda c: self.assertIn(c, graph.edges.columns), ecols) - def _df_hasCols(self, df, vcols=[]): - for c in vcols: - assert c in df.columns, f"DataFrame missing column: {c}" + def _df_hasCols(self, vertices, vcols = []): + map(lambda c: self.assertIn(c, vertices.columns), vcols) def _graph(self, name, *args): """ - Convenience to call one of the example graphs, passing the arguments and wrapping the result as a Python object. - :param name: the name of the example graph. - :param args: all the required arguments (excluding the initial SparkSession). - :return: a GraphFrame object. + Convenience to call one of the example graphs, passing the arguments and wrapping the result back + as a python object. + :param name: the name of the example graph + :param args: all the required arguments, without the initial spark session + :return: """ examples = self.japi.examples() jgraph = getattr(examples, name)(*args) @@ -322,79 +281,83 @@ def test_aggregate_messages(self): g = self._graph("friends") # For each user, sum the ages of the adjacent users, # plus 1 for the src's sum if the edge is "friend". 
- sendToSrc = AM.dst["age"] + F.when(AM.edge["relationship"] == "friend", F.lit(1)).otherwise( - 0 - ) - sendToDst = AM.src["age"] + sendToSrc = ( + AM.dst['age'] + + sqlfunctions.when( + AM.edge['relationship'] == 'friend', + sqlfunctions.lit(1) + ).otherwise(0)) + sendToDst = AM.src['age'] agg = g.aggregateMessages( - F.sum(AM.msg).alias("summedAges"), sendToSrc=sendToSrc, sendToDst=sendToDst - ) - # Run the aggregation again using SQL expressions as Strings. + sqlfunctions.sum(AM.msg).alias('summedAges'), + sendToSrc=sendToSrc, + sendToDst=sendToDst) + # Run the aggregation again providing SQL expressions as String instead. agg2 = g.aggregateMessages( "sum(MSG) AS `summedAges`", sendToSrc="(dst['age'] + CASE WHEN (edge['relationship'] = 'friend') THEN 1 ELSE 0 END)", - sendToDst="src['age']", - ) - # Build mappings from id to the aggregated message. - aggMap = {row.id: row.summedAges for row in agg.select("id", "summedAges").collect()} - agg2Map = {row.id: row.summedAges for row in agg2.select("id", "summedAges").collect()} - # Compute the expected aggregation via brute force. - user2age = {row.id: row.age for row in g.vertices.select("id", "age").collect()} + sendToDst="src['age']") + # Convert agg and agg2 to a mapping from id to the aggregated message. + aggMap = {id_: s for id_, s in agg.select('id', 'summedAges').collect()} + agg2Map = {id_: s for id_, s in agg2.select('id', 'summedAges').collect()} + # Compute the truth via brute force. 
+ user2age = {id_: age for id_, age in g.vertices.select('id', 'age').collect()} trueAgg = {} - for row in g.edges.select("src", "dst", "relationship").collect(): - src, dst, rel = row.src, row.dst, row.relationship - trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == "friend" else 0) + for src, dst, rel in g.edges.select("src", "dst", "relationship").collect(): + trueAgg[src] = trueAgg.get(src, 0) + user2age[dst] + (1 if rel == 'friend' else 0) trueAgg[dst] = trueAgg.get(dst, 0) + user2age[src] - # Verify both aggregations match the expected results. - assert aggMap == trueAgg, f"aggMap {aggMap} does not equal expected {trueAgg}" - assert agg2Map == trueAgg, f"agg2Map {agg2Map} does not equal expected {trueAgg}" - # Check that passing a wrong type for messages raises a TypeError. - with pytest.raises(TypeError): + # Compare if the agg mappings match the brute force mapping + self.assertEqual(aggMap, trueAgg) + self.assertEqual(agg2Map, trueAgg) + # Check that TypeError is raises with messages of wrong type + with self.assertRaises(TypeError): g.aggregateMessages( - "sum(MSG) AS `summedAges`", sendToSrc=object(), sendToDst="src['age']" - ) - with pytest.raises(TypeError): + "sum(MSG) AS `summedAges`", + sendToSrc=object(), + sendToDst="src['age']") + with self.assertRaises(TypeError): g.aggregateMessages( - "sum(MSG) AS `summedAges`", sendToSrc=F.col("dst")["age"], sendToDst=object() - ) + "sum(MSG) AS `summedAges`", + sendToSrc=dst['age'], + sendToDst=object()) def test_connected_components(self): - v = self.spark.createDataFrame([(0, "a", "b")], ["id", "vattr", "gender"]) + v = self.spark.createDataFrame([ + (0, "a", "b")], ["id", "vattr", "gender"]) e = self.spark.createDataFrame([(0, 0, 1)], ["src", "dst", "test"]).filter("src > 10") g = GraphFrame(v, e) comps = g.connectedComponents() - self._df_hasCols(comps, vcols=["id", "component", "vattr", "gender"]) - assert comps.count() == 1 + self._df_hasCols(comps, vcols=['id', 'component', 'vattr', 
'gender']) + self.assertEqual(comps.count(), 1) def test_connected_components2(self): v = self.spark.createDataFrame([(0, "a0", "b0"), (1, "a1", "b1")], ["id", "A", "B"]) e = self.spark.createDataFrame([(0, 1, "a01", "b01")], ["src", "dst", "A", "B"]) g = GraphFrame(v, e) comps = g.connectedComponents() - self._df_hasCols(comps, vcols=["id", "component", "A", "B"]) - assert comps.count() == 2 + self._df_hasCols(comps, vcols=['id', 'component', 'A', 'B']) + self.assertEqual(comps.count(), 2) def test_connected_components_friends(self): g = self._graph("friends") - comps_tests = [ - g.connectedComponents(), - g.connectedComponents(broadcastThreshold=1), - g.connectedComponents(checkpointInterval=0), - g.connectedComponents(checkpointInterval=10), - g.connectedComponents(algorithm="graphx"), - ] + comps_tests = [] + comps_tests += [g.connectedComponents()] + comps_tests += [g.connectedComponents(broadcastThreshold=1)] + comps_tests += [g.connectedComponents(checkpointInterval=0)] + comps_tests += [g.connectedComponents(checkpointInterval=10)] + comps_tests += [g.connectedComponents(algorithm="graphx")] for c in comps_tests: - assert c.groupBy("component").count().count() == 2 + self.assertEqual(c.groupBy("component").count().count(), 2) def test_label_progagation(self): n = 5 g = self._graph("twoBlobs", n) labels = g.labelPropagation(maxIter=4 * n) labels1 = labels.filter("id < 5").select("label").collect() - all1 = {row.label for row in labels1} + all1 = set([x.label for x in labels1]) assert len(all1) == 1 labels2 = labels.filter("id >= 5").select("label").collect() - all2 = {row.label for row in labels2} + all2 = set([x.label for x in labels2]) assert len(all2) == 1 assert all1 != all2 @@ -404,7 +367,7 @@ def test_page_rank(self): resetProb = 0.15 errorTol = 1.0e-5 pr = g.pageRank(resetProb, tol=errorTol) - self._hasCols(pr, vcols=["id", "pagerank"], ecols=["src", "dst", "weight"]) + self._hasCols(pr, vcols=['id', 'pagerank'], ecols=['src', 'dst', 'weight']) def 
test_parallel_personalized_page_rank(self): n = 100 @@ -413,11 +376,10 @@ def test_parallel_personalized_page_rank(self): maxIter = 15 sourceIds = [1, 2, 3, 4] pr = g.parallelPersonalizedPageRank(resetProb, sourceIds=sourceIds, maxIter=maxIter) - self._hasCols(pr, vcols=["id", "pageranks"], ecols=["src", "dst", "weight"]) + self._hasCols(pr, vcols=['id', 'pageranks'], ecols=['src', 'dst', 'weight']) def test_shortest_paths(self): edges = [(1, 2), (1, 5), (2, 3), (2, 5), (3, 4), (4, 5), (4, 6)] - # Create bidirectional edges. all_edges = [z for (a, b) in edges for z in [(a, b), (b, a)]] edges = self.spark.createDataFrame(all_edges, ["src", "dst"]) vertices = self.spark.createDataFrame([(i,) for i in range(1, 7)], ["id"]) @@ -428,19 +390,17 @@ def test_shortest_paths(self): def test_svd_plus_plus(self): g = self._graph("ALSSyntheticData") - v2, cost = g.svdPlusPlus() - self._df_hasCols(v2, vcols=["id", "column1", "column2", "column3", "column4"]) + (v2, cost) = g.svdPlusPlus() + self._df_hasCols(v2, vcols=['id', 'column1', 'column2', 'column3', 'column4']) def test_strongly_connected_components(self): - # Simple island test. 
+ # Simple island test vertices = self.spark.createDataFrame([(i,) for i in range(1, 6)], ["id"]) edges = self.spark.createDataFrame([(7, 8)], ["src", "dst"]) g = GraphFrame(vertices, edges) c = g.stronglyConnectedComponents(5) for row in c.collect(): - assert ( - row.id == row.component - ), f"Vertex {row.id} not equal to its component {row.component}" + self.assertEqual(row.id, row.component) def test_triangle_counts(self): edges = self.spark.createDataFrame([(0, 1), (1, 2), (2, 0)], ["src", "dst"]) @@ -448,66 +408,61 @@ def test_triangle_counts(self): g = GraphFrame(vertices, edges) c = g.triangleCount() for row in c.select("id", "count").collect(): - assert row.asDict()["count"] == 1, f"Triangle count for vertex {row.id} is not 1" - + self.assertEqual(row.asDict()['count'], 1) + def test_mutithreaded_sparksession_usage(self): - # Test that the GraphFrame API works correctly from multiple threads. + # Test that we can use the GraphFrame API from multiple threads localVertices = [(1, "A"), (2, "B"), (3, "C")] localEdges = [(1, 2, "love"), (2, 1, "hate"), (2, 3, "follow")] v = self.spark.createDataFrame(localVertices, ["id", "name"]) e = self.spark.createDataFrame(localEdges, ["src", "dst", "action"]) - + + exc = None - def run_graphframe() -> None: - nonlocal exc try: GraphFrame(v, e) except Exception as _e: + nonlocal exc exc = _e - + import threading - thread = threading.Thread(target=run_graphframe) thread.start() thread.join() - assert exc is None, f"Exception was raised in thread: {exc}" - + self.assertIsNone(exc, f"Exception was raised in thread: {exc}") -@pytest.mark.usefixtures("set_spark") -class TestGraphFrameExamples: - def setup_method(self, method): - # Set up the Java API instance for use in the tests. +class GraphFrameExamplesTest(GraphFrameTestCase): + def setUp(self): + super(GraphFrameExamplesTest, self).setUp() self.japi = _java_api(self.spark._sc) def test_belief_propagation(self): - # Create a graphical model g of size 3x3. 
+ # create graphical model g of size 3 x 3 g = Graphs(self.spark).gridIsingModel(3) - # Run Belief Propagation (BP) for 5 iterations. + # run BP for 5 iterations numIter = 5 results = BeliefPropagation.runBPwithGraphFrames(g, numIter) - # Check that each belief is a valid probability in [0, 1]. - for row in results.vertices.select("belief").collect(): - belief = row["belief"] - assert ( - 0 <= belief <= 1 - ), f"Expected belief to be probability in [0,1], but found {belief}" + # check beliefs are valid + for row in results.vertices.select('belief').collect(): + belief = row['belief'] + self.assertTrue( + 0 <= belief <= 1, + msg="Expected belief to be probability in [0,1], but found {}".format(belief)) def test_graph_friends(self): - # Construct the graph. + # construct graph g = Graphs(self.spark).friends() - # Check that the result is an instance of GraphFrame. - assert isinstance(g, GraphFrame) + # check that a GraphFrame instance was returned + self.assertIsInstance(g, GraphFrame) def test_graph_grid_ising_model(self): - # Construct a grid Ising model graph. + # construct graph n = 3 g = Graphs(self.spark).gridIsingModel(n) - # Collect the vertex ids. - ids = [v["id"] for v in g.vertices.collect()] - # Verify that every expected vertex id appears. 
+ # check that all the vertices exist + ids = [v['id'] for v in g.vertices.collect()] for i in range(n): for j in range(n): - expected_id = f"{i},{j}" - assert expected_id in ids, f"Vertex {expected_id} not found in {ids}" + self.assertIn('{},{}'.format(i, j), ids) From 18b5da033042e328c08062c03002fba1c7ab7a75 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:27:42 -0800 Subject: [PATCH 10/53] Back out version changes to make nose tests pass --- .github/workflows/python-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 36b6b97e7..157b328f1 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -7,8 +7,8 @@ jobs: matrix: include: - spark-version: 3.5.4 - scala-version: 2.12.20 - python-version: 3.11.11 + scala-version: 2.12.18 + python-version: 3.9.19 runs-on: ubuntu-22.04 env: # define Java options for both official sbt and sbt-extras From 8eca097f11c75c82dd72b2d5de596c935192f400 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:30:24 -0800 Subject: [PATCH 11/53] Remove changes to requirements --- python/requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/requirements.txt b/python/requirements.txt index fb73319f2..3db67f231 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,6 +1,4 @@ # This file should list any python package dependencies. 
pyspark>=2.0.0 -click==8.1.8 numpy>=1.7 py7zr==0.22.0 -requests==2.32.3 From 277c06fe75fe288657f64778e78d9b0a9712a2ae Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:31:16 -0800 Subject: [PATCH 12/53] Put nose back in requirements.txt --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index 3db67f231..9893b3cb1 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,4 +1,4 @@ # This file should list any python package dependencies. +nose==1.3.7 pyspark>=2.0.0 numpy>=1.7 -py7zr==0.22.0 From b55ee4881849a882717bc0035902a855817c7113 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:31:51 -0800 Subject: [PATCH 13/53] Remove version bump to version.sbt --- version.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sbt b/version.sbt index 6fbb590a4..f72bdcc0e 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -ThisBuild / version := "0.8.5" +ThisBuild / version := "0.8.4" From f8a8fd9ea062ad2e0504ddb7e7e22b70d5c7b013 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 13:39:29 -0800 Subject: [PATCH 14/53] Remove packages related to testing --- python/requirements-dev.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/requirements-dev.txt b/python/requirements-dev.txt index b27da4d73..6e596dc62 100644 --- a/python/requirements-dev.txt +++ b/python/requirements-dev.txt @@ -1,5 +1,3 @@ -pytest==8.3.4 -Sphinx==8.1.3 flake8==7.1.1 isort==6.0.0 mypy==1.14.1 From bc2cb36e7f1012c10b12907ff3b6b284e9c9b6c3 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 14:15:36 -0800 Subject: [PATCH 15/53] Remove old setup.py / setup.cfg --- python/setup.cfg | 42 ------------------------------------------ python/setup.py | 35 ----------------------------------- 2 files changed, 77 deletions(-) delete mode 100644 python/setup.cfg delete mode 100644 python/setup.py 
diff --git a/python/setup.cfg b/python/setup.cfg deleted file mode 100644 index 02a0d5136..000000000 --- a/python/setup.cfg +++ /dev/null @@ -1,42 +0,0 @@ -[metadata] -name = graphframes -version = 0.8.5 -description = GraphFrames: Graph Processing Framework for Apache Spark -long_description = file: ../README.md -long_description_content_type = text/markdown -author = GraphFrames Contributors -author_email = graphframes@googlegroups.com -url = https://pypi.org/project/graphframes-py/ -license = Apache License 2.0 -classifiers = - Development Status :: 4 - Beta - Programming Language :: Python :: 3 - Operating System :: OS Independent - -[options] -packages = find: -package_dir = - = python -include_package_data = True -install_requires = - pyspark>=2.0.0 - click==8.1.8 - numpy>=1.7 - py7zr==0.22.0 - requests==2.32.3 - -[options.packages.find] -where = python - exclude = - tests.py - docs - -[options.extras_require] -dev = - pytest==8.3.4 - Sphinx==8.1.3 - black==25.1.0 - flake8==7.1.1 - isort==6.0.0 - mypy==1.14.1 - pre-commit==3.5.1 diff --git a/python/setup.py b/python/setup.py deleted file mode 100644 index a91fb629a..000000000 --- a/python/setup.py +++ /dev/null @@ -1,35 +0,0 @@ -from setuptools import setup, find_packages # type: ignore -import os - - -def parse_requirements(filename): - """Load requirements from a pip requirements file.""" - with open(filename, encoding="utf-8") as f: - # Filter out comments and empty lines. - return [line.strip() for line in f if line.strip() and not line.startswith("#")] - - -# Read the long description from the README file. -here = os.path.abspath(os.path.dirname(__file__)) - -# Use requirements.txt to get the list of dependencies. 
-requirements = parse_requirements(os.path.join(here, "requirements.txt")) - -setup( - name="graphframes", - version=open("version.sbt").read().strip(), # Update this version as needed - description="GraphFrames: Graph Processing Framework for Apache Spark", - long_description=open(os.path.join(f"{here}/..", "README.md"), encoding="utf-8").read(), - long_description_content_type="text/markdown", - author="GraphFrames Contributors", - author_email="graphframes@googlegroups.com", - url="https://pypi.org/project/graphframes-py", - packages=find_packages(where="python"), - package_dir={"": "python"}, - include_package_data=True, # Include non-code files specified in MANIFEST.in - install_requires=requirements, - classifiers=[ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", - ], -) From 728be33b6dfd2ee7135711df752289966afdbc2a Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 14:15:46 -0800 Subject: [PATCH 16/53] New pyproject.toml and poetry.lock --- python/poetry.lock | 360 ++++++++++++++++++++++++++++++++++++++++++ python/pyproject.toml | 38 +++++ 2 files changed, 398 insertions(+) create mode 100644 python/poetry.lock create mode 100644 python/pyproject.toml diff --git a/python/poetry.lock b/python/poetry.lock new file mode 100644 index 000000000..6eb61618d --- /dev/null +++ b/python/poetry.lock @@ -0,0 +1,360 @@ +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. + +[[package]] +name = "black" +version = "25.1.0" +description = "The uncompromising code formatter." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "black-25.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:759e7ec1e050a15f89b770cefbf91ebee8917aac5c20483bc2d80a6c3a04df32"}, + {file = "black-25.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e519ecf93120f34243e6b0054db49c00a35f84f195d5bce7e9f5cfc578fc2da"}, + {file = "black-25.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:055e59b198df7ac0b7efca5ad7ff2516bca343276c466be72eb04a3bcc1f82d7"}, + {file = "black-25.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:db8ea9917d6f8fc62abd90d944920d95e73c83a5ee3383493e35d271aca872e9"}, + {file = "black-25.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a39337598244de4bae26475f77dda852ea00a93bd4c728e09eacd827ec929df0"}, + {file = "black-25.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96c1c7cd856bba8e20094e36e0f948718dc688dba4a9d78c3adde52b9e6c2299"}, + {file = "black-25.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce2e264d59c91e52d8000d507eb20a9aca4a778731a08cfff7e5ac4a4bb7096"}, + {file = "black-25.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:172b1dbff09f86ce6f4eb8edf9dede08b1fce58ba194c87d7a4f1a5aa2f5b3c2"}, + {file = "black-25.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4b60580e829091e6f9238c848ea6750efed72140b91b048770b64e74fe04908b"}, + {file = "black-25.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e2978f6df243b155ef5fa7e558a43037c3079093ed5d10fd84c43900f2d8ecc"}, + {file = "black-25.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b48735872ec535027d979e8dcb20bf4f70b5ac75a8ea99f127c106a7d7aba9f"}, + {file = "black-25.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:ea0213189960bda9cf99be5b8c8ce66bb054af5e9e861249cd23471bd7b0b3ba"}, + {file = "black-25.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:8f0b18a02996a836cc9c9c78e5babec10930862827b1b724ddfe98ccf2f2fe4f"}, + {file = "black-25.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:afebb7098bfbc70037a053b91ae8437c3857482d3a690fefc03e9ff7aa9a5fd3"}, + {file = "black-25.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:030b9759066a4ee5e5aca28c3c77f9c64789cdd4de8ac1df642c40b708be6171"}, + {file = "black-25.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:a22f402b410566e2d1c950708c77ebf5ebd5d0d88a6a2e87c86d9fb48afa0d18"}, + {file = "black-25.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1ee0a0c330f7b5130ce0caed9936a904793576ef4d2b98c40835d6a65afa6a0"}, + {file = "black-25.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3df5f1bf91d36002b0a75389ca8663510cf0531cca8aa5c1ef695b46d98655f"}, + {file = "black-25.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9e6827d563a2c820772b32ce8a42828dc6790f095f441beef18f96aa6f8294e"}, + {file = "black-25.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:bacabb307dca5ebaf9c118d2d2f6903da0d62c9faa82bd21a33eecc319559355"}, + {file = "black-25.1.0-py3-none-any.whl", hash = "sha256:95e8176dae143ba9097f351d174fdaf0ccd29efb414b362ae3fd72bf0f710717"}, + {file = "black-25.1.0.tar.gz", hash = "sha256:33496d5cd1222ad73391352b4ae8da15253c5de89b93a80b3e2c8d9a19ec2666"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.10)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "click" +version = "8.1.8" +description = "Composable command line interface toolkit" +optional = false +python-versions 
= ">=3.7" +groups = ["dev"] +files = [ + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +markers = "platform_system == \"Windows\"" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "flake8" +version = "7.1.2" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.8.1" +groups = ["dev"] +files = [ + {file = "flake8-7.1.2-py2.py3-none-any.whl", hash = "sha256:1cbc62e65536f65e6d754dfe6f1bada7f5cf392d6f5db3c2b85892466c3e7c1a"}, + {file = "flake8-7.1.2.tar.gz", hash = "sha256:c586ffd0b41540951ae41af572e6790dbd49fc12b3aa2541685d253d9bd504bd"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.12.0,<2.13.0" +pyflakes = ">=3.2.0,<3.3.0" + +[[package]] +name = "isort" +version = "6.0.0" +description = "A Python utility / library to sort Python imports." 
+optional = false +python-versions = ">=3.9.0" +groups = ["dev"] +files = [ + {file = "isort-6.0.0-py3-none-any.whl", hash = "sha256:567954102bb47bb12e0fae62606570faacddd441e45683968c8d1734fb1af892"}, + {file = "isort-6.0.0.tar.gz", hash = "sha256:75d9d8a1438a9432a7d7b54f2d3b45cad9a4a0fdba43617d9873379704a8bdf1"}, +] + +[package.extras] +colors = ["colorama"] +plugins = ["setuptools"] + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +groups = ["dev"] +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + +[[package]] +name = "nose" +version = "1.3.7" +description = "nose extends unittest to make testing easier" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "nose-1.3.7-py2-none-any.whl", hash = "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a"}, + {file = "nose-1.3.7-py3-none-any.whl", hash = "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac"}, + {file = "nose-1.3.7.tar.gz", hash = "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98"}, +] + +[[package]] +name = "numpy" +version = "2.0.2" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = 
">=3.9" +groups = ["main"] +files = [ + {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"}, + {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"}, + {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"}, + {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"}, + {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"}, + {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"}, + {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = 
"sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"}, + {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"}, + {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"}, + {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"}, + {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"}, + {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"}, + {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"}, + {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"}, + {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"}, + {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"}, + {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"}, + {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"}, + {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"}, + {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"}, + {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"}, + {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"}, + {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"}, + {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"}, + {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"}, + {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"}, +] + +[[package]] +name = "packaging" +version = "24.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, +] + +[[package]] +name = "pathspec" +version = "0.12.1" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, + {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, +] + +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + +[[package]] +name = "py4j" +version = "0.10.9.7" +description = "Enables Python programs to dynamically access arbitrary Java objects" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, + {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, +] + +[[package]] +name = "pycodestyle" +version = "2.12.1" +description = "Python style guide checker" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"}, + {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"}, +] + +[[package]] +name = "pyflakes" +version = "3.2.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a"}, + {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"}, +] + +[[package]] +name = "pyspark" +version = "3.5.4" +description = 
"Apache Spark Python API" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pyspark-3.5.4.tar.gz", hash = "sha256:1c2926d63020902163f58222466adf6f8016f6c43c1f319b8e7a71dbaa05fc51"}, +] + +[package.dependencies] +py4j = "0.10.9.7" + +[package.extras] +connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +ml = ["numpy (>=1.15,<2)"] +mllib = ["numpy (>=1.15,<2)"] +pandas-on-spark = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +sql = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] + +[[package]] +name = "tomli" +version = "2.2.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.11\"" +files = [ + {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106"}, + {file = 
"tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8"}, + {file = "tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff"}, + {file = "tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e"}, + {file = "tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98"}, + {file = "tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744"}, + {file = "tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec"}, + {file = "tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69"}, + {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"}, + {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version < \"3.11\"" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", 
hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[metadata] +lock-version = "2.1" +python-versions = ">=3.9 <3.13" +content-hash = "430f562a040c0eabc2fe5c93757801dd9d7ed4c5173be37c7fc3808e04668ccd" diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 000000000..076cc3ce4 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,38 @@ +[tool.poetry] +name = "graphframes-py" +version = "0.8.4" +description = "GraphFrames: Graph Processing Framework for Apache Spark" +authors = ["GraphFrames Contributors "] +license = "Apache 2.0" +readme = "../README.md" +packages = [{include = "graphframes"}] + +[tool.poetry.urls] +"Project Homepage" = "https://graphframes.github.io/graphframes" +"PyPi Homepage" = "https://pypi.org/project/graphframes-py" +"Code Repository" = "https://github.com/graphframes/graphframes" +"Bug Tracker" = "https://github.com/graphframes/graphframes/issues" + +[tool.poetry.dependencies] +python = ">=3.9 <3.13" +nose = "1.3.7" +pyspark = ">= 2.0.0" +numpy = ">= 1.7" + +[tool.poetry.group.dev.dependencies] +black = "^25.1.0" +flake8 = "^7.1.1" +isort = "^6.0.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.black] +line-length = 100 +target-version = ["py39"] +include = ["graphframes", "test"] + +[tool.isort] +profile = "black" +src_paths = ["graphframes", "test"] From 3cea1a88e85b54662a9126dc56ab52c625ca7b3a Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 14:24:58 -0800 Subject: [PATCH 17/53] Short README for Python package, poetry won't allow a ../README.md path --- python/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 python/README.md diff --git a/python/README.md b/python/README.md new file mode 100644 index 000000000..0ab7cd4ba --- /dev/null +++ 
b/python/README.md @@ -0,0 +1,17 @@ +# GraphFrames `graphframes-py` Python Package + +This is the official [graphframes-py PyPI package](https://pypi.org/project/graphframes-py/), which is a Python wrapper for the Scala GraphFrames library. This package is maintained by the GraphFrames project and is available on PyPI. + +For instructions on GraphFrames, check the project [../README.md](../README.md). See [Installation and Quick-Start](#installation-and-quick-start) for the best way to install and use GraphFrames. + +## Running `graphframes-py` + +You should use GraphFrames via the `--packages` argument to `pyspark` or `spark-submit`, but this package is helpful in development environments. + +```bash +# Interactive Python +$ pyspark --packages graphframes:graphframes:0.8.4-spark3.5-s_2.12 + +# Submit a script in Scala/Java/Python +$ spark-submit --packages graphframes:graphframes:0.8.4-spark3.5-s_2.12 script.py +``` From 87cc97514c4aa7e1d76b5a3bb80fd5ee4e2abf50 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 14:25:41 -0800 Subject: [PATCH 18/53] Remove requirements files in favor of pyproject.toml --- python/requirements-dev.txt | 4 ---- python/requirements.txt | 4 ---- 2 files changed, 8 deletions(-) delete mode 100644 python/requirements-dev.txt delete mode 100644 python/requirements.txt diff --git a/python/requirements-dev.txt b/python/requirements-dev.txt deleted file mode 100644 index 6e596dc62..000000000 --- a/python/requirements-dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -flake8==7.1.1 -isort==6.0.0 -mypy==1.14.1 -pre-commit==4.0.1 diff --git a/python/requirements.txt b/python/requirements.txt deleted file mode 100644 index 9893b3cb1..000000000 --- a/python/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -# This file should list any python package dependencies. 
-nose==1.3.7 -pyspark>=2.0.0 -numpy>=1.7 From 6f84a5a634bcdf731c469644a3509074c3ce58d7 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 14:33:18 -0800 Subject: [PATCH 19/53] Try to poetrize CI build --- .github/workflows/python-ci.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 157b328f1..47a484c1e 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -31,15 +31,13 @@ jobs: - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install python dependencies + - name: Build Python package and its dependencies + working-directory: ./python run: | - python -m pip install --upgrade pip wheel - pip install -r ./python/requirements.txt - pip install -r ./python/requirements-dev.txt - pip install pyspark==${{ matrix.spark-version }} + python -m pip install --upgrade poetry + poetry build + poetry install - name: Test run: | - python python/setup.py install - python python/setup.py bdist_wheel export SPARK_HOME=$(python -c "import os; from importlib.util import find_spec; print(os.path.join(os.path.dirname(find_spec('pyspark').origin)))") ./python/run-tests.sh From 9a8eef0d29e2d36129427ae9efa13fc8bb044021 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 15:01:09 -0800 Subject: [PATCH 20/53] pyspark min 3.4 --- python/poetry.lock | 2 +- python/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/poetry.lock b/python/poetry.lock index 6eb61618d..0fb5fb139 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -357,4 +357,4 @@ files = [ [metadata] lock-version = "2.1" python-versions = ">=3.9 <3.13" -content-hash = "430f562a040c0eabc2fe5c93757801dd9d7ed4c5173be37c7fc3808e04668ccd" +content-hash = "52c129fee3e94e69edf727f219bc7582ddbfcedf6c43547a7f67a876051bd7c4" diff --git a/python/pyproject.toml b/python/pyproject.toml 
index 076cc3ce4..0cff88d08 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -16,7 +16,7 @@ packages = [{include = "graphframes"}] [tool.poetry.dependencies] python = ">=3.9 <3.13" nose = "1.3.7" -pyspark = ">= 2.0.0" +pyspark = "^3.4" numpy = ">= 1.7" [tool.poetry.group.dev.dependencies] From 75ecd997d2cfbf0e52799b86b3f9f261e63e375e Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 15:02:53 -0800 Subject: [PATCH 21/53] Local python README in pyproject.toml --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 0cff88d08..84050dcc2 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ version = "0.8.4" description = "GraphFrames: Graph Processing Framework for Apache Spark" authors = ["GraphFrames Contributors "] license = "Apache 2.0" -readme = "../README.md" +readme = "README.md" packages = [{include = "graphframes"}] [tool.poetry.urls] From 80231d0e2262eb1044a619dbd2792f6cdcc41d35 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 15:23:20 -0800 Subject: [PATCH 22/53] Trying to remove he working folder to debug scala issue --- .github/workflows/python-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 47a484c1e..3af7339b0 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -32,7 +32,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Build Python package and its dependencies - working-directory: ./python + # working-directory: ./python run: | python -m pip install --upgrade poetry poetry build From 2a9170baad6e6d2791f258841b7db54cecec251d Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 15:50:44 -0800 Subject: [PATCH 23/53] Set Python working directory again --- .github/workflows/python-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 3af7339b0..f863785b0 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -10,6 +10,7 @@ jobs: scala-version: 2.12.18 python-version: 3.9.19 runs-on: ubuntu-22.04 + env: # define Java options for both official sbt and sbt-extras JAVA_OPTS: -Xms2048M -Xmx2048M -Xss6M -XX:ReservedCodeCacheSize=256M -Dfile.encoding=UTF-8 @@ -32,7 +33,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Build Python package and its dependencies - # working-directory: ./python + working-directory: ./python run: | python -m pip install --upgrade poetry poetry build From 3de22636760c3059361efd5c6135c99236c88949 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 15:51:43 -0800 Subject: [PATCH 24/53] Accidental newline --- .github/workflows/python-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index f863785b0..47a484c1e 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -10,7 +10,6 @@ jobs: scala-version: 2.12.18 python-version: 3.9.19 runs-on: ubuntu-22.04 - env: # define Java options for both official sbt and sbt-extras JAVA_OPTS: -Xms2048M -Xmx2048M -Xss6M -XX:ReservedCodeCacheSize=256M -Dfile.encoding=UTF-8 From 4662717935fd3629a237d1ab454ba6fc6b42327f Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:10:08 -0800 Subject: [PATCH 25/53] Install Python for test... 
--- .github/workflows/python-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 47a484c1e..519c5fb4a 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -39,5 +39,8 @@ jobs: poetry install - name: Test run: | + python -m pip install --upgrade poetry + poetry build + poetry install export SPARK_HOME=$(python -c "import os; from importlib.util import find_spec; print(os.path.join(os.path.dirname(find_spec('pyspark').origin)))") ./python/run-tests.sh From 1b7b9f83a82cb120cd831cbeb38a71065d9030fd Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:19:15 -0800 Subject: [PATCH 26/53] Run tests from python/ folder --- .github/workflows/python-ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 519c5fb4a..72ffe6e22 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -38,9 +38,7 @@ jobs: poetry build poetry install - name: Test + working-directory: ./python run: | - python -m pip install --upgrade poetry - poetry build - poetry install export SPARK_HOME=$(python -c "import os; from importlib.util import find_spec; print(os.path.join(os.path.dirname(find_spec('pyspark').origin)))") ./python/run-tests.sh From 58da4932cb3997b797a8ec9f98e6bd95e49f543e Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:37:58 -0800 Subject: [PATCH 27/53] Try running tests from python/ --- .github/workflows/python-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 72ffe6e22..2e3e44311 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -41,4 +41,4 @@ jobs: working-directory: ./python run: | export SPARK_HOME=$(python -c "import os; from importlib.util import find_spec; 
print(os.path.join(os.path.dirname(find_spec('pyspark').origin)))") - ./python/run-tests.sh + ./run-tests.sh From 9f4aa24e6d77ccb45c42b4ea8bf02b1905e826a1 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:45:10 -0800 Subject: [PATCH 28/53] poetry run the unit tests --- .github/workflows/python-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 2e3e44311..3d939db65 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -40,5 +40,5 @@ jobs: - name: Test working-directory: ./python run: | - export SPARK_HOME=$(python -c "import os; from importlib.util import find_spec; print(os.path.join(os.path.dirname(find_spec('pyspark').origin)))") + export SPARK_HOME=$(poetry run python -c "import os; from importlib.util import find_spec; spec = find_spec('pyspark'); print(os.path.join(os.path.dirname(spec.origin)))") ./run-tests.sh From 11b2782e519a518287b62b8e6969bc7b2f2f0947 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:49:18 -0800 Subject: [PATCH 29/53] poetry run the tests --- python/run-tests.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/run-tests.sh b/python/run-tests.sh index af4e0a139..4c558dfd3 100755 --- a/python/run-tests.sh +++ b/python/run-tests.sh @@ -38,7 +38,7 @@ echo $pyver LIBS="" for lib in "$SPARK_HOME/python/lib"/*zip ; do - LIBS=$LIBS:$lib + LIBS=$LIBS:$lib done # The current directory of the script. 
@@ -51,7 +51,7 @@ assembly_path="$DIR/../target/scala-$scala_version_major_minor" echo `ls $assembly_path/graphframes-assembly*.jar` JAR_PATH="" for assembly in $assembly_path/graphframes-assembly*.jar ; do - JAR_PATH=$assembly + JAR_PATH=$assembly done export PYSPARK_SUBMIT_ARGS="--driver-memory 2g --executor-memory 2g --jars $JAR_PATH pyspark-shell " @@ -64,14 +64,14 @@ export PYTHONPATH=$PYTHONPATH:graphframes # Run test suites if [[ "$python_major" == "2" ]]; then - - # Horrible hack for spark 1.x: we manually remove some log lines to stay below the 4MB log limit on Travis. - $PYSPARK_DRIVER_PYTHON `which nosetests` -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; - + + # Horrible hack for spark 1.x: we manually remove some log lines to stay below the 4MB log limit on Travis. 
+ poetry run $PYSPARK_DRIVER_PYTHON `which nosetests` -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; + else - - $PYSPARK_DRIVER_PYTHON -m "nose" -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; - + + poetry run $PYSPARK_DRIVER_PYTHON -m "nose" -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; + fi # Exit immediately if the tests fail. From 9772344b96fb353a3f7ad17f9198a28ee0aef568 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:52:30 -0800 Subject: [PATCH 30/53] Try just using 'python' instead of a path --- python/run-tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/run-tests.sh b/python/run-tests.sh index 4c558dfd3..6527a60a7 100755 --- a/python/run-tests.sh +++ b/python/run-tests.sh @@ -66,11 +66,11 @@ export PYTHONPATH=$PYTHONPATH:graphframes if [[ "$python_major" == "2" ]]; then # Horrible hack for spark 1.x: we manually remove some log lines to stay below the 4MB log limit on Travis. 
- poetry run $PYSPARK_DRIVER_PYTHON `which nosetests` -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; + poetry run python `which nosetests` -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; else - poetry run $PYSPARK_DRIVER_PYTHON -m "nose" -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; + poetry run python -m "nose" -v --all-modules -w $DIR 2>&1 | grep -vE "INFO (ParquetOutputFormat|SparkContext|ContextCleaner|ShuffleBlockFetcherIterator|MapOutputTrackerMaster|TaskSetManager|Executor|MemoryStore|CacheManager|BlockManager|DAGScheduler|PythonRDD|TaskSchedulerImpl|ZippedPartitionsRDD2)"; fi From d55dbfe4815c2f0b0870cdc53b65fb9d9a075b42 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:58:08 -0800 Subject: [PATCH 31/53] poetry run the last line, graphframes.main --- python/run-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.sh b/python/run-tests.sh index 6527a60a7..0382efbd0 100755 --- a/python/run-tests.sh +++ b/python/run-tests.sh @@ -83,4 +83,4 @@ test ${PIPESTATUS[0]} -eq 0 || exit 1; cd "$DIR" -$PYSPARK_PYTHON -u ./graphframes/graphframe.py "$@" +poetry run python -u ./graphframes/graphframe.py "$@" From 2fc4d0818f35a86874b67f86893f48b5f83d7285 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 16:59:23 -0800 Subject: 
[PATCH 32/53] Remove test/ folder from style paths, it doesn't exist --- python/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index 84050dcc2..e21c4cc80 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -31,8 +31,8 @@ build-backend = "poetry.core.masonry.api" [tool.black] line-length = 100 target-version = ["py39"] -include = ["graphframes", "test"] +include = ["graphframes"] [tool.isort] profile = "black" -src_paths = ["graphframes", "test"] +src_paths = ["graphframes"] From 8297a13232f29f9466f8c0ac3bd577e2cbb066ea Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 17:18:13 -0800 Subject: [PATCH 33/53] Remove .vscode --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7036d69e3..93246acbe 100644 --- a/.gitignore +++ b/.gitignore @@ -27,7 +27,6 @@ project/plugins/project/ # Mac *.DS_Store -.vscode # Python specific python/build From 2035d9854344a53ce4ba77c1d6a4f7478763f963 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 17:18:42 -0800 Subject: [PATCH 34/53] VERSION back to 0.8.4 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 7ada0d303..b60d71966 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.8.5 +0.8.4 From f9f4bd7b9dbdf0bf18e1dde83090bdc59d5fc23d Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 17:19:55 -0800 Subject: [PATCH 35/53] Remove tutorials reference --- python/MANIFEST.in | 1 - 1 file changed, 1 deletion(-) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 4eb0ee5af..8e453d713 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -5,4 +5,3 @@ recursive-include python/graphframes *.py recursive-exclude * __pycache__ recursive-exclude * *.pyc -include graphframes/tutorials/data/.exists From 9ddd6b24cefc4528a9bfa75e8d8ddf3d365b8eaf Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 
16 Feb 2025 17:23:35 -0800 Subject: [PATCH 36/53] VERSION is a Python thing, it belongs in python/ --- VERSION => python/VERSION | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename VERSION => python/VERSION (100%) diff --git a/VERSION b/python/VERSION similarity index 100% rename from VERSION rename to python/VERSION From 7065647d6cf0be0513af34bf355aea21ff5a2090 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 17:33:27 -0800 Subject: [PATCH 37/53] Include the README.md and LICENSE in the Python package --- python/MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 8e453d713..f883d48c1 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -5,3 +5,5 @@ recursive-include python/graphframes *.py recursive-exclude * __pycache__ recursive-exclude * *.pyc +include README.md +include LICENSE From a6c7e91f151ae7f04268f98c52aed995855e881f Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Sun, 16 Feb 2025 17:34:21 -0800 Subject: [PATCH 38/53] Some classifiers for pyproject.toml --- python/pyproject.toml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/pyproject.toml b/python/pyproject.toml index e21c4cc80..8c0c1ba05 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -6,6 +6,16 @@ authors = ["GraphFrames Contributors "] license = "Apache 2.0" readme = "README.md" packages = [{include = "graphframes"}] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12" +] [tool.poetry.urls] "Project Homepage" = "https://graphframes.github.io/graphframes" From 51e3e6d95d312d83e01d91151bcc90e5e9a63edf Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 
2025 08:49:21 -0800 Subject: [PATCH 39/53] Trying poetry install action instead of manual install --- .github/workflows/python-ci.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 3d939db65..1095ce49e 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -31,10 +31,16 @@ jobs: - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + - name: Install and configure Poetry + uses: snok/install-poetry@v1 + with: + version: 2.1.1 + virtualenvs-create: true + virtualenvs-in-project: false + installer-parallel: true - name: Build Python package and its dependencies working-directory: ./python run: | - python -m pip install --upgrade poetry poetry build poetry install - name: Test From 272be064e60f3c07817533a1e02b5a0eec2b89cf Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 08:53:55 -0800 Subject: [PATCH 40/53] Removing SPARK_HOME --- .github/workflows/python-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 1095ce49e..7f201a049 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -46,5 +46,4 @@ jobs: - name: Test working-directory: ./python run: | - export SPARK_HOME=$(poetry run python -c "import os; from importlib.util import find_spec; spec = find_spec('pyspark'); print(os.path.join(os.path.dirname(spec.origin)))") ./run-tests.sh From 45879995d1c6c6bc22a9f82b59290f7912b5ba3b Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 09:46:00 -0800 Subject: [PATCH 41/53] Returned SPARK_HOME settings --- .github/workflows/python-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 7f201a049..1095ce49e 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -46,4 +46,5 @@ 
jobs: - name: Test working-directory: ./python run: | + export SPARK_HOME=$(poetry run python -c "import os; from importlib.util import find_spec; spec = find_spec('pyspark'); print(os.path.join(os.path.dirname(spec.origin)))") ./run-tests.sh From 2422b226b341cdf728fdaaf9ca109833d6ad11fe Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 10:06:54 -0800 Subject: [PATCH 42/53] Minimized the PR to just these files --- python/MANIFEST.in | 1 + python/graphframes/tutorials/download.py | 64 ++ python/graphframes/tutorials/motif.py | 207 +++++++ python/graphframes/tutorials/stackexchange.py | 579 ++++++++++++++++++ python/graphframes/tutorials/utils.py | 122 ++++ 5 files changed, 973 insertions(+) create mode 100755 python/graphframes/tutorials/download.py create mode 100644 python/graphframes/tutorials/motif.py create mode 100644 python/graphframes/tutorials/stackexchange.py create mode 100644 python/graphframes/tutorials/utils.py diff --git a/python/MANIFEST.in b/python/MANIFEST.in index 73eaf8ba2..22100a328 100644 --- a/python/MANIFEST.in +++ b/python/MANIFEST.in @@ -2,3 +2,4 @@ # https://github.com/pypa/sampleproject/blob/master/MANIFEST.in # For more details about the MANIFEST file, you may read the docs at # https://docs.python.org/2/distutils/sourcedist.html#the-manifest-in-template +include graphframes/tutorials/data/.exists diff --git a/python/graphframes/tutorials/download.py b/python/graphframes/tutorials/download.py new file mode 100755 index 000000000..154d84c14 --- /dev/null +++ b/python/graphframes/tutorials/download.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import os +import click +import requests +import py7zr + + +@click.command() +@click.argument("subdomain") +@click.option("--data-dir", default="python/graphframes/tutorials/data", help="Directory to store downloaded files") +@click.option( + "--extract/--no-extract", default=True, help="Whether to extract the archive after download" +) +def download_stackexchange(subdomain: str, 
data_dir: str, extract: bool) -> None: + """Download Stack Exchange archive for a given SUBDOMAIN. + + Example: python/graphframes/tutorials/download.py stats.meta + + Note: This won't work for stackoverflow.com archives due to size. + """ + # Create data directory if it doesn't exist + os.makedirs(data_dir, exist_ok=True) + + # Construct archive URL and filename + archive_url = f"https://archive.org/download/stackexchange/{subdomain}.stackexchange.com.7z" + archive_path = os.path.join(data_dir, f"{subdomain}.stackexchange.com.7z") + + click.echo(f"Downloading archive from {archive_url}") + + try: + # Download the file + response = requests.get(archive_url, stream=True) + response.raise_for_status() # Raise exception for bad status codes + + total_size = int(response.headers.get("content-length", 0)) + + with click.progressbar(length=total_size, label="Downloading") as bar: + with open(archive_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + bar.update(len(chunk)) + + click.echo(f"Download complete: {archive_path}") + + # Extract if requested + if extract: + click.echo("Extracting archive...") + output_dir = f"{subdomain}.stackexchange.com" + with py7zr.SevenZipFile(archive_path, mode="r") as z: + z.extractall(path=os.path.join(data_dir, output_dir)) + click.echo(f"Extraction complete: {output_dir}") + + except requests.exceptions.RequestException as e: + click.echo(f"Error downloading archive: {e}", err=True) + raise click.Abort() + except py7zr.Bad7zFile as e: + click.echo(f"Error extracting archive: {e}", err=True) + raise click.Abort() + + +if __name__ == "__main__": + download_stackexchange() diff --git a/python/graphframes/tutorials/motif.py b/python/graphframes/tutorials/motif.py new file mode 100644 index 000000000..4a2189c56 --- /dev/null +++ b/python/graphframes/tutorials/motif.py @@ -0,0 +1,207 @@ +# Demonstrate GraphFrames network motif finding capabilities + +# +# Interactive Usage: pyspark 
--packages graphframes:graphframes:0.8.4-spark3.5-s_2.12 +# +# Batch Usage: spark-submit --packages graphframes:graphframes:0.8.4-spark3.5-s_2.12 python/graphframes/tutorials/motif.py +# + +import pyspark.sql.functions as F +from pyspark import SparkContext +from pyspark.sql import DataFrame, SparkSession + +from graphframes import GraphFrame + +# Initialize a SparkSession +spark: SparkSession = ( + SparkSession.builder.appName("Stack Overflow Motif Analysis") + # Lets the Id:(Stack Overflow int) and id:(GraphFrames ULID) coexist + .config("spark.sql.caseSensitive", True).getOrCreate() +) +sc: SparkContext = spark.sparkContext +sc.setCheckpointDir("/tmp/graphframes-checkpoints") + +# Change me if you download a different stackexchange site +STACKEXCHANGE_SITE = "stats.meta.stackexchange.com" +BASE_PATH = f"python/graphframes/tutorials/data/{STACKEXCHANGE_SITE}" + +# +# Load the nodes and edges from disk, repartition, checkpoint [plan got long for some reason] and cache. +# + +# We created these in stackexchange.py from Stack Exchange data dump XML files +NODES_PATH: str = f"{BASE_PATH}/Nodes.parquet" +nodes_df: DataFrame = spark.read.parquet(NODES_PATH) + +# Repartition the nodes to give our motif searches parallelism +nodes_df = nodes_df.repartition(50).checkpoint().cache() + +# We created these in stackexchange.py from Stack Exchange data dump XML files +EDGES_PATH: str = f"{BASE_PATH}/Edges.parquet" +edges_df: DataFrame = spark.read.parquet(EDGES_PATH) + +# Repartition the edges to give our motif searches parallelism +edges_df = edges_df.repartition(50).checkpoint().cache() + +# What kind of nodes do we have to work with? +node_counts = ( + nodes_df.select("id", F.col("Type").alias("Node Type")) + .groupBy("Node Type") + .count() + .orderBy(F.col("count").desc()) + # Add a comma formatted column for display + .withColumn("count", F.format_number(F.col("count"), 0)) +) +node_counts.show() + +# What kind of edges do we have to work with? 
+edge_counts = ( + edges_df.select("src", "dst", F.col("relationship").alias("Edge Type")) + .groupBy("Edge Type") + .count() + .orderBy(F.col("count").desc()) + # Add a comma formatted column for display + .withColumn("count", F.format_number(F.col("count"), 0)) +) +edge_counts.show() + +g = GraphFrame(nodes_df, edges_df) + +g.vertices.show(10) +print(f"Node columns: {g.vertices.columns}") + +g.edges.sample(0.0001).show(10) + +# Sanity test that all edges have valid ids +edge_count = g.edges.count() +valid_edge_count = ( + g.edges.join(g.vertices, on=g.edges.src == g.vertices.id) + .select("src", "dst", "relationship") + .join(g.vertices, on=g.edges.dst == g.vertices.id) + .count() +) + +# Just up and die if we have edges that point to non-existent nodes +assert ( + edge_count == valid_edge_count +), f"Edge count {edge_count} != valid edge count {valid_edge_count}" +print(f"Edge count: {edge_count:,} == Valid edge count: {valid_edge_count:,}") + +# G4: Continuous Triangles +paths = g.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)") + +# Show the first path +paths.show(3) + +graphlet_type_df = paths.select( + F.col("a.Type").alias("A_Type"), + F.col("e1.relationship").alias("(a)-[e1]->(b)"), + F.col("b.Type").alias("B_Type"), + F.col("e2.relationship").alias("(b)-[e2]->(c)"), + F.col("c.Type").alias("C_Type"), + F.col("e3.relationship").alias("(c)-[e3]->(a)"), +) + +graphlet_count_df = ( + graphlet_type_df.groupby( + "A_Type", "(a)-[e1]->(b)", "B_Type", "(b)-[e2]->(c)", "C_Type", "(c)-[e3]->(a)" + ) + .count() + .orderBy(F.col("count").desc()) + # Add a comma formatted column for display + .withColumn("count", F.format_number(F.col("count"), 0)) +) +graphlet_count_df.show() + +# G5: Divergent Triangles +paths = g.find("(a)-[e1]->(b); (a)-[e2]->(c); (c)-[e3]->(b)") + +graphlet_type_df = paths.select( + F.col("a.Type").alias("A_Type"), + F.col("e1.relationship").alias("(a)-[e1]->(b)"), + F.col("b.Type").alias("B_Type"), + 
F.col("e2.relationship").alias("(a)-[e2]->(c)"), + F.col("c.Type").alias("C_Type"), + F.col("e3.relationship").alias("(c)-[e3]->(b)"), +) + +graphlet_count_df = ( + graphlet_type_df.groupby( + "A_Type", "(a)-[e1]->(b)", "B_Type", "(a)-[e2]->(c)", "C_Type", "(c)-[e3]->(b)" + ) + .count() + .orderBy(F.col("count").desc()) + # Add a comma formatted column for display + .withColumn("count", F.format_number(F.col("count"), 0)) +) +graphlet_count_df.show() + +# G17: A directed 3-path is a surprisingly diverse graphlet +paths = g.find("(a)-[e1]->(b); (b)-[e2]->(c); (d)-[e3]->(c)") + +# Visualize the four-path by counting instances of paths by node / edge type +graphlet_type_df = paths.select( + F.col("a.Type").alias("A_Type"), + F.col("e1.relationship").alias("(a)-[e1]->(b)"), + F.col("b.Type").alias("B_Type"), + F.col("e2.relationship").alias("(b)-[e2]->(c)"), + F.col("c.Type").alias("C_Type"), + F.col("e3.relationship").alias("(d)-[e3]->(c)"), + F.col("d.Type").alias("D_Type"), +) +graphlet_count_df = ( + graphlet_type_df.groupby( + "A_Type", + "(a)-[e1]->(b)", + "B_Type", + "(b)-[e2]->(c)", + "C_Type", + "(d)-[e3]->(c)", + "D_Type", + ) + .count() + .orderBy(F.col("count").desc()) + # Add a comma formatted column for display + .withColumn("count", F.format_number(F.col("count"), 0)) +) +graphlet_count_df.show() + +graphlet_count_df.orderBy( + [ + "A_Type", + "(a)-[e1]->(b)", + "B_Type", + "(b)-[e2]->(c)", + "C_Type", + "(d)-[e3]->(c)", + "D_Type", + ], + ascending=False, +).show(104) + +# A vote (a) is cast for a question (b) that links to another question (c), which also has a vote (d) cast for it. 
+linked_vote_paths = paths.filter( + (F.col("a.Type") == "Vote") + & (F.col("e1.relationship") == "CastFor") + & (F.col("b.Type") == "Question") + & (F.col("e2.relationship") == "Links") + & (F.col("c.Type") == "Question") + & (F.col("e3.relationship") == "CastFor") + & (F.col("d.Type") == "Vote") +) + +# Sanity check the count - it should match the table above +linked_vote_paths.count() + +b_vote_counts = linked_vote_paths.select("a", "b").distinct().groupBy("b").count() +c_vote_counts = linked_vote_paths.select("c", "d").distinct().groupBy("c").count() + +linked_vote_counts = ( + linked_vote_paths.filter((F.col("a.VoteTypeId") == 2) & (F.col("d.VoteTypeId") == 2)) + .select("b", "c") + .join(b_vote_counts, on="b", how="inner") + .withColumnRenamed("count", "b_count") + .join(c_vote_counts, on="c", how="inner") + .withColumnRenamed("count", "c_count") +) +linked_vote_counts.stat.corr("b_count", "c_count") diff --git a/python/graphframes/tutorials/stackexchange.py b/python/graphframes/tutorials/stackexchange.py new file mode 100644 index 000000000..c52f323bb --- /dev/null +++ b/python/graphframes/tutorials/stackexchange.py @@ -0,0 +1,579 @@ +# Build a Graph out of the Stack Exchange Data Dump XML files + +# +# Interactive Usage: pyspark --packages com.databricks:spark-xml_2.12:0.18.0 +# +# Batch Usage: spark-submit --packages com.databricks:spark-xml_2.12:0.18.0 python/graphframes/tutorials/stackexchange.py +# + +import re +from typing import List, Tuple + +import pyspark.sql.functions as F +import pyspark.sql.types as T +from pyspark.sql import DataFrame, SparkSession + +# Change me if you download a different stackexchange site +STACKEXCHANGE_SITE = "stats.meta.stackexchange.com" +BASE_PATH = f"python/graphframes/tutorials/data/{STACKEXCHANGE_SITE}" + + +# +# Some utility functions +# + + +def remove_prefix(df: DataFrame) -> DataFrame: + """Remove the _ prefix present in the fields of the DataFrame""" + field_names = [x.name for x in df.schema] + new_field_names 
= [x[1:] for x in field_names] + s = [] + + # Substitute the old name for the new one + for old, new in zip(field_names, new_field_names): + s.append(F.col(old).alias(new)) + return df.select(s) + + +@F.udf(returnType=T.ArrayType(T.StringType())) +def split_tags(tags: str) -> List[str]: + if not tags: + return [] + # Remove < and > and split into array + return re.findall(r"<([^>]+)>", tags) + + +# +# Initialize a SparkSession with case sensitivity +# + +spark: SparkSession = ( + SparkSession.builder.appName("Stack Exchange Graph Builder") + # Lets the Id:(Stack Overflow int) and id:(GraphFrames UUID) coexist + .config("spark.sql.caseSensitive", True).getOrCreate() +) + +print("Loading data for stats.meta.stackexchange.com ...") + + +# +# Load the Posts... +# +posts_df: DataFrame = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="posts") + .load(f"{BASE_PATH}/Posts.xml") +) +print(f"\nTotal Posts: {posts_df.count():,}") + +# Remove the _ prefix from field names +posts_df = remove_prefix(posts_df) + +# Create a list of tags +posts_df = ( + posts_df.withColumn( + "ParsedTags", F.split(F.regexp_replace(F.col("Tags"), "^\\||\\|$", ""), "\\|") + ) + .drop("Tags") + .withColumnRenamed("ParsedTags", "Tags") +) + + +# +# Building blocks: separate the questions and answers +# + +# Do the questions look ok? Questions have NO parent ID and DO have a Title +questions_df: DataFrame = posts_df.filter(posts_df.ParentId.isNull()) +questions_df = questions_df.withColumn("Type", F.lit("Question")).cache() +print(f"\nTotal questions: {questions_df.count():,}\n") + +questions_df.select("ParentId", "Title", "Body").show(10) + +# Answers DO have a ParentId parent post and no Title +answers_df: DataFrame = posts_df.filter(posts_df.ParentId.isNotNull()) +answers_df = answers_df.withColumn("Type", F.lit("Answer")).cache() +print(f"\nTotal answers: {answers_df.count():,}\n") + +answers_df.select("ParentId", "Title", "Body").show(10) + + +# +# Load the PostLinks... 
+# + +post_links_df = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="postlinks") + .load(f"{BASE_PATH}/PostLinks.xml") +) +print(f"Total PostLinks: {post_links_df.count():,}") + +# Remove the _ prefix from field names +post_links_df = ( + remove_prefix(post_links_df) + .withColumn( + "LinkType", + F.when(F.col("LinkTypeId") == 1, "Linked") + .when(F.col("LinkTypeId") == 3, "Duplicate") + .otherwise("Unknown"), + ) + .withColumn("Type", F.lit("PostLinks")) +) + + +# +# Load the PostHistory... +# + +post_history_df = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="posthistory") + .load(f"{BASE_PATH}/PostHistory.xml") +) +print(f"Total PostHistory: {post_history_df.count():,}") + +# Remove the _ prefix from field names +post_history_df = remove_prefix(post_history_df).withColumn("Type", F.lit("PostHistory")) + + +# +# Load the Comments... +# + +comments_df = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="comments") + .load(f"{BASE_PATH}/Comments.xml") +) +print(f"Total Comments: {comments_df.count():,}") + +# Remove the _ prefix from field names +comments_df = remove_prefix(comments_df).withColumn("Type", F.lit("Comment")) + + +# +# Load the Users... +# + +users_df = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="users") + .load(f"{BASE_PATH}/Users.xml") +) +print(f"Total Users: {users_df.count():,}") + +# Remove the _ prefix from field names +users_df = remove_prefix(users_df).withColumn("Type", F.lit("User")) + + +# +# Load the Votes... 
+# + +votes_df = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="votes") + .load(f"{BASE_PATH}/Votes.xml") +) +print(f"Total Votes: {votes_df.count():,}") + +# Remove the _ prefix from field names +votes_df = remove_prefix(votes_df).withColumn("Type", F.lit("Vote")) + +# Add a VoteType column +votes_df = votes_df.withColumn( + "VoteType", + F.when(F.col("VoteTypeId") == 2, "UpVote") + .when(F.col("VoteTypeId") == 3, "DownVote") + .when(F.col("VoteTypeId") == 4, "Favorite") + .when(F.col("VoteTypeId") == 5, "Close") + .when(F.col("VoteTypeId") == 6, "Reopen") + .when(F.col("VoteTypeId") == 7, "BountyStart") + .when(F.col("VoteTypeId") == 8, "BountyClose") + .when(F.col("VoteTypeId") == 9, "Deletion") + .when(F.col("VoteTypeId") == 10, "Undeletion") + .when(F.col("VoteTypeId") == 11, "Spam") + .when(F.col("VoteTypeId") == 12, "InformModerator") + .otherwise("Unknown"), +) + + +# +# Load the Tags... +# + +tags_df = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="tags") + .load(f"{BASE_PATH}/Tags.xml") +) +print(f"Total Tags: {tags_df.count():,}") + +# Remove the _ prefix from field names +tags_df = remove_prefix(tags_df).withColumn("Type", F.lit("Tag")) + + +# +# Load the Badges... 
+# + +badges_df = ( + spark.read.format("xml") + .options(rowTag="row") + .options(rootTag="badges") + .load(f"{BASE_PATH}/Badges.xml") +) +print(f"Total Badges: {badges_df.count():,}\n") + +# Remove the _ prefix from field names +badges_df = remove_prefix(badges_df).withColumn("Type", F.lit("Badge")) + + +# +# Form the nodes from the UNION of posts, users, votes and their combined schemas +# + +all_cols: List[Tuple[str, T.StructField]] = list( + set( + list(zip(answers_df.columns, answers_df.schema)) + + list(zip(questions_df.columns, questions_df.schema)) + + list(zip(post_links_df.columns, post_links_df.schema)) + + list(zip(comments_df.columns, comments_df.schema)) + + list(zip(users_df.columns, users_df.schema)) + + list(zip(votes_df.columns, votes_df.schema)) + + list(zip(tags_df.columns, tags_df.schema)) + + list(zip(badges_df.columns, badges_df.schema)) + ) +) +all_column_names: List[str] = sorted([x[0] for x in all_cols]) + + +def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]]) -> DataFrame: + """Add any missing columns from any DataFrame among several we want to merge.""" + for col_name, schema_field in all_cols: + if col_name not in df.columns: + df = df.withColumn(col_name, F.lit(None).cast(schema_field.dataType)) + return df + + +# Now apply this function to each of your DataFrames to get a consistent schema +# posts_df = add_missing_columns(posts_df, all_cols).select(all_column_names) +questions_df = add_missing_columns(questions_df, all_cols).select(all_column_names) +answers_df = add_missing_columns(answers_df, all_cols).select(all_column_names) +post_links_df = add_missing_columns(post_links_df, all_cols).select(all_column_names) +users_df = add_missing_columns(users_df, all_cols).select(all_column_names) +votes_df = add_missing_columns(votes_df, all_cols).select(all_column_names) +tags_df = add_missing_columns(tags_df, all_cols).select(all_column_names) +badges_df = add_missing_columns(badges_df, 
all_cols).select(all_column_names) +assert ( + set(questions_df.columns) + == set(answers_df.columns) + == set(post_links_df.columns) + == set(users_df.columns) + == set(votes_df.columns) + == set(all_column_names) + == set(tags_df.columns) + == set(badges_df.columns) +) + +# Now union them together and remove duplicates +nodes_df: DataFrame = ( + questions_df.unionByName(answers_df) + .unionByName(post_links_df) + .unionByName(users_df) + .unionByName(votes_df) + .unionByName(tags_df) + .unionByName(badges_df) + .distinct() +) +print(f"Total distinct nodes: {nodes_df.count():,}") + +# Now add a unique ID field +nodes_df = nodes_df.withColumn("id", F.expr("uuid()")).select("id", *all_column_names) + +# Now create posts - combined questions and answers for things that can apply to them both +posts_df = questions_df.unionByName(answers_df).cache() + +# +# Store the nodes to disk, reload and cache +# + +NODES_PATH: str = f"{BASE_PATH}/Nodes.parquet" + +# Write to disk and load back again +nodes_df.write.mode("overwrite").parquet(NODES_PATH) +nodes_df = spark.read.parquet(NODES_PATH) + +nodes_df.select("id", "Type").groupBy("Type").count().orderBy(F.col("count").desc()).show() + +# +---------+------+ +# | Type| count| +# +---------+------+ +# | Badge|43,029| +# | Vote|42,593| +# | User|37,709| +# | Answer| 2,978| +# | Question| 2,025| +# |PostLinks| 1,274| +# | Tag| 143| +# +---------+------+ + +# Helps performance of GraphFrames' algorithms +nodes_df = nodes_df.cache() + +# Make sure we have the right columns and cached data +posts_df = nodes_df.filter(nodes_df.Type.isin("Question", "Answer")).cache() +questions_df = nodes_df.filter(nodes_df.Type == "Question").cache() +answers_df = nodes_df.filter(nodes_df.Type == "Answer").cache() +post_links_df = nodes_df.filter(nodes_df.Type == "PostLinks").cache() +users_df = nodes_df.filter(nodes_df.Type == "User").cache() +votes_df = nodes_df.filter(nodes_df.Type == "Vote").cache() +tags_df = nodes_df.filter(nodes_df.Type == 
"Tag").cache() +badges_df = nodes_df.filter(nodes_df.Type == "Badge").cache() + + +# +# Build the edges DataFrame: +# +# * [Vote]--CastFor-->[Post] +# * [User]--Asks-->[Question] +# * [User]--Posts-->[Answer] +# * [Post]--Answers-->[Question] +# * [Tag]--Tags-->[Post] +# * [User]--Earns-->[Badge] +# * [Post]--Links-->[Post] +# +# Remember: 'src', 'dst' and 'relationship' are standard edge fields in GraphFrames +# Remember: we must produce src/dst based on lowercase 'id' UUID, not 'Id' which is Stack Overflow's integer. +# + +# +# Create a [Vote]--CastFor-->[Post] edge... remember a Post is a Question or Answer +# + +src_vote_df: DataFrame = votes_df.select( + F.col("id").alias("src"), + F.col("Id").alias("VoteId"), + # Everything has all the fields - should build from base records but need UUIDs + F.col("PostId").alias("VotePostId"), +) +cast_for_edge_df: DataFrame = src_vote_df.join( + posts_df, on=src_vote_df.VotePostId == posts_df.Id, how="inner" +).select( + # 'src' comes from the votes' 'id' + "src", + # 'dst' comes from the posts' 'id' + F.col("id").alias("dst"), + # All edges have a 'relationship' field + F.lit("CastFor").alias("relationship"), +) +print(f"Total CastFor edges: {cast_for_edge_df.count():,}") +print(f"Percentage of linked votes: {cast_for_edge_df.count() / votes_df.count():.2%}\n") + +# +# Create a [User]--Asks-->[Question] edge +# + +questions_asked_df: DataFrame = questions_df.select( + F.col("OwnerUserId").alias("QuestionUserId"), + F.col("id").alias("dst"), + F.lit("Asks").alias("relationship"), +) +user_asks_edges_df: DataFrame = questions_asked_df.join( + users_df, on=questions_asked_df.QuestionUserId == users_df.Id, how="inner" +).select( + # 'src' comes from the users' 'id' + F.col("id").alias("src"), + # 'dst' comes from the posts' 'id' + "dst", + # All edges have a 'relationship' field + "relationship", +) +print(f"Total Asks edges: {user_asks_edges_df.count():,}") +print( + f"Percentage of asked questions linked to users: 
{user_asks_edges_df.count() / questions_df.count():.2%}\n" +) + +# +# Create a [User]--Posts-->[Answer] edge. +# + +user_answers_df: DataFrame = answers_df.select( + F.col("OwnerUserId").alias("AnswerUserId"), + F.col("id").alias("dst"), + F.lit("Posts").alias("relationship"), +) +user_answers_edges_df = user_answers_df.join( + users_df, on=user_answers_df.AnswerUserId == users_df.Id, how="inner" +).select( + # 'src' comes from the users' 'id' + F.col("id").alias("src"), + # 'dst' comes from the posts' 'id' + "dst", + # All edges have a 'relationship' field + "relationship", +) +print(f"Total User Answers edges: {user_answers_edges_df.count():,}") +print( + f"Percentage of answers linked to users: {user_answers_edges_df.count() / answers_df.count():.2%}\n" +) + +# +# Create a [Answer]--Answers-->[Question] edge +# + +src_answers_df: DataFrame = answers_df.select( + F.col("id").alias("src"), + F.col("Id").alias("AnswerId"), + F.col("ParentId").alias("AnswerParentId"), +) +question_answers_edges_df: DataFrame = src_answers_df.join( + questions_df, on=src_answers_df.AnswerParentId == questions_df.Id, how="inner" +).select( + # 'src' comes from the answers' 'id' + "src", + # 'dst' comes from the questions' 'id' - an answer's ParentId is its question's 'Id' + F.col("id").alias("dst"), + # All edges have a 'relationship' field + F.lit("Answers").alias("relationship"), +) +print(f"Total Posts Answers edges: {question_answers_edges_df.count():,}") +print( + f"Percentage of linked answers: {question_answers_edges_df.count() / answers_df.count():.2%}\n" +) + +# +# Create a [Tag]--Tags-->[Post] edge...
remember a Post is a Question or Answer +# + +src_tags_df: DataFrame = posts_df.select( + F.col("id").alias("dst"), + # 'Tags' is an array column; explode it into one row per (post, tag) pair + F.explode("Tags").alias("Tag"), +) +tags_edge_df: DataFrame = src_tags_df.join( + tags_df, on=src_tags_df.Tag == tags_df.TagName, how="inner" +).select( + # 'src' comes from the tags' 'id' + F.col("id").alias("src"), + # 'dst' comes from the posts' 'id' + "dst", + # All edges have a 'relationship' field + F.lit("Tags").alias("relationship"), +) +print(f"Total Tags edges: {tags_edge_df.count():,}") +print(f"Percentage of linked tags: {tags_edge_df.count() / posts_df.count():.2%}\n") + +# +# Create a [User]--Earns-->[Badge] edge +# + +earns_edges_df: DataFrame = badges_df.select( + F.col("UserId").alias("BadgeUserId"), + F.col("id").alias("dst"), + F.lit("Earns").alias("relationship"), +) +earns_edges_df = earns_edges_df.join( + users_df, on=earns_edges_df.BadgeUserId == users_df.Id, how="inner" +).select( + # 'src' comes from the users' 'id' + F.col("id").alias("src"), + # 'dst' comes from the badges' 'id' + "dst", + # All edges have a 'relationship' field + "relationship", +) +print(f"Total Earns edges: {earns_edges_df.count():,}") +print(f"Percentage of earned badges: {earns_edges_df.count() / badges_df.count():.2%}\n") + +# +# Create a [Post]--Links-->[Post] edge... remember a Post is a Question or Answer +# Also a [Post]--Duplicates-->[Post] edge...
remember a Post is a Question or Answer +# + +trim_links_df: DataFrame = post_links_df.select( + F.col("PostId").alias("SrcPostId"), + F.col("RelatedPostId").alias("DstPostId"), + "LinkType", +) +links_src_edge_df: DataFrame = trim_links_df.join( + posts_df.drop("LinkType"), on=trim_links_df.SrcPostId == posts_df.Id, how="inner" +).select( + # 'src' comes from the linking post's 'id' + F.col("id").alias("src"), + "DstPostId", + "LinkType", +) +raw_links_edge_df = links_src_edge_df.join( + posts_df.drop("LinkType"), on=links_src_edge_df.DstPostId == posts_df.Id, how="inner" +).select( + "src", + # 'dst' comes from the related post's 'id' + F.col("id").alias("dst"), + # All edges have a 'relationship' field + F.lit("Links").alias("relationship"), + "LinkType", +) + +duplicates_edge_df: DataFrame = ( + raw_links_edge_df.filter(F.col("LinkType") == "Duplicate") + .withColumn("relationship", F.lit("Duplicates")) + .select("src", "dst", "relationship") +) +print(f"Total Duplicates edges: {duplicates_edge_df.count():,}") +print(f"Percentage of duplicate posts: {duplicates_edge_df.count() / post_links_df.count():.2%}\n") + +linked_edge_df = ( + raw_links_edge_df.filter(F.col("LinkType") == "Linked") + .withColumn("relationship", F.lit("Links")) + .select("src", "dst", "relationship") +) +print(f"Total Links edges: {linked_edge_df.count():,}") +print(f"Percentage of linked posts: {linked_edge_df.count() / post_links_df.count():.2%}\n") + + +# +# Combine all the edges together into one relationships DataFrame +# + +relationships_df: DataFrame = ( + cast_for_edge_df.unionByName(user_asks_edges_df) + .unionByName(user_answers_edges_df) + .unionByName(question_answers_edges_df) + .unionByName(tags_edge_df) + .unionByName(earns_edges_df) + .unionByName(duplicates_edge_df) + .unionByName(linked_edge_df) +) +relationships_df.groupBy("relationship").count().orderBy(F.col("count").desc()).withColumn( + "count", F.format_number(F.col("count"), 0) +).show() + +# +------------+------+ +#
|relationship| count| +# +------------+------+ +# | Earns|43,029| +# | CastFor|40,701| +# | Tags| 4,427| +# | Answers| 2,978| +# | Posts| 2,767| +# | Asks| 1,934| +# | Links| 1,180| +# | Duplicates| 88| +# +------------+------+ + +EDGES_PATH: str = f"{BASE_PATH}/Edges.parquet" + +# Write to disk and back again +relationships_df.write.mode("overwrite").parquet(EDGES_PATH) + +spark.stop() +print("Spark stopped.") diff --git a/python/graphframes/tutorials/utils.py b/python/graphframes/tutorials/utils.py new file mode 100644 index 000000000..54ef40f8b --- /dev/null +++ b/python/graphframes/tutorials/utils.py @@ -0,0 +1,122 @@ +from pyspark.sql import DataFrame +from graphframes import GraphFrame +from pyspark.sql import functions as F + + +def three_edge_count(paths: DataFrame) -> DataFrame: + """three_edge_count View the counts of the different types of 3-node graphlets in the graph. + + Parameters + ---------- + paths : pyspark.sql.DataFrame + A DataFrame of 3-edge motif matches (a, e1, b, e2, c, e3); vertex 'd' is optional. + + Returns + ------- + DataFrame + A DataFrame of the counts of the different types of 3-node graphlets in the graph. + """ + graphlet_type_df = paths.select( + F.col("a.Type").alias("A_Type"), + F.col("e1.relationship").alias("E_relationship"), + F.col("b.Type").alias("B_Type"), + F.col("e2.relationship").alias("E2_relationship"), + F.col("c.Type").alias("C_Type"), + F.col("e3.relationship").alias("E3_relationship"), + *([F.col("d.Type").alias("D_Type")] if "d" in paths.columns else []), + ) + graphlet_count_df = ( + graphlet_type_df.groupby( + "A_Type", "E_relationship", "B_Type", "E2_relationship", "C_Type", "E3_relationship" + ) + .count() + .orderBy(F.col("count").desc()) + # Add a comma formatted column for display + .withColumn("count", F.format_number(F.col("count"), 0)) + ) + return graphlet_count_df + + +def four_edge_count(paths: DataFrame) -> DataFrame: + """four_edge_count View the counts of the different types of 4-node graphlets in the graph.
+ + Parameters + ---------- + paths : DataFrame + A DataFrame of 4-edge motif matches (a..d, e1..e4); vertex 'e' is optional. + + Returns + ------- + DataFrame + A DataFrame of the counts of the different types of 4-node graphlets in the graph. + """ + + graphlet_type_df = paths.select( + F.col("a.Type").alias("A_Type"), + F.col("e1.relationship").alias("E_relationship"), + F.col("b.Type").alias("B_Type"), + F.col("e2.relationship").alias("E2_relationship"), + F.col("c.Type").alias("C_Type"), + F.col("e3.relationship").alias("E3_relationship"), + F.col("d.Type").alias("D_Type"), + F.col("e4.relationship").alias("E4_relationship"), + *([F.col("e.Type").alias("E_Type")] if "e" in paths.columns else []), + ) + graphlet_count_df = ( + graphlet_type_df.groupby( + "A_Type", + "E_relationship", + "B_Type", + "E2_relationship", + "C_Type", + "E3_relationship", + "D_Type", + "E4_relationship", + ) + .count() + .orderBy(F.col("count").desc()) + # Add a comma formatted column for display + .withColumn("count", F.format_number(F.col("count"), 0)) + ) + return graphlet_count_df + + +def add_degree(g: GraphFrame) -> GraphFrame: + """add_degree compute the degree, adding it as a property of the nodes in the GraphFrame. + + Parameters + ---------- + g : GraphFrame + Any valid GraphFrame + + Returns + ------- + GraphFrame + Same GraphFrame with a 'degree' property added + """ + degree_vertices: DataFrame = g.vertices.join(g.degrees, on="id") + return GraphFrame(degree_vertices, g.edges) + + +def add_type_degree(g: GraphFrame) -> DataFrame: + """add_type_degree add a map property to the vertices with the degree by each type of relationship.
+ + Parameters + ---------- + g : GraphFrame + Any valid GraphFrame + + Returns + ------- + DataFrame + The vertices with a map[relationship:out-edge count] 'type_degree' field (null if no out-edges) + """ + type_degree: DataFrame = ( + g.edges.select(F.col("src").alias("id"), "relationship") + .filter(F.col("id").isNotNull()) + .groupby("id", "relationship") + .count() + ) + entries = F.collect_list(F.struct("relationship", "count")) + type_degree = type_degree.groupby("id").agg(F.map_from_entries(entries).alias("type_degree")) + return g.vertices.join(type_degree, on="id", how="left") From 0a1fabad7ba44d8463b0b4b23cdb360181b583cb Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 10:37:23 -0800 Subject: [PATCH 43/53] Created tutorials dependency group to minimize main bloat --- python/poetry.lock | 848 +++++++++++++++++++++++++++++++++++++++++- python/pyproject.toml | 5 + 2 files changed, 850 insertions(+), 3 deletions(-) diff --git a/python/poetry.lock b/python/poetry.lock index 0fb5fb139..a96131b72 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -47,13 +47,385 @@ d = ["aiohttp (>=3.10)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "brotli" +version = "1.1.0" +description = "Python bindings for the Brotli compression library" +optional = false +python-versions = "*" +groups = ["tutorials"] +markers = "platform_python_implementation == \"CPython\"" +files = [ + {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e1140c64812cb9b06c922e77f1c26a75ec5e3f0fb2bf92cc8c58720dec276752"}, + {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8fd5270e906eef71d4a8d19b7c6a43760c6abcfcc10c9101d14eb2357418de9"}, + {file = "Brotli-1.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ae56aca0402a0f9a3431cddda62ad71666ca9d4dc3a10a142b9dce2e3c0cda3"}, + {file =
"Brotli-1.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43ce1b9935bfa1ede40028054d7f48b5469cd02733a365eec8a329ffd342915d"}, + {file = "Brotli-1.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7c4855522edb2e6ae7fdb58e07c3ba9111e7621a8956f481c68d5d979c93032e"}, + {file = "Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:38025d9f30cf4634f8309c6874ef871b841eb3c347e90b0851f63d1ded5212da"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e6a904cb26bfefc2f0a6f240bdf5233be78cd2488900a2f846f3c3ac8489ab80"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"}, + {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"}, + {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"}, + {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"}, 
+ {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"}, + {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c8146669223164fc87a7e3de9f81e9423c67a79d6b3447994dfb9c95da16e2d6"}, + {file = "Brotli-1.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30924eb4c57903d5a7526b08ef4a584acc22ab1ffa085faceb521521d2de32dd"}, + {file = "Brotli-1.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ceb64bbc6eac5a140ca649003756940f8d6a7c444a68af170b3187623b43bebf"}, + {file = "Brotli-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a469274ad18dc0e4d316eefa616d1d0c2ff9da369af19fa6f3daa4f09671fd61"}, + {file = "Brotli-1.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:524f35912131cc2cabb00edfd8d573b07f2d9f21fa824bd3fb19725a9cf06327"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5b3cc074004d968722f51e550b41a27be656ec48f8afaeeb45ebf65b561481dd"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, + {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, + {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, + {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, + {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7f4bf76817c14aa98cc6697ac02f3972cb8c3da93e9ef16b9c66573a68014f91"}, + {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0c5516f0aed654134a2fc936325cc2e642f8a0e096d075209672eb321cff408"}, + {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c3020404e0b5eefd7c9485ccf8393cfb75ec38ce75586e046573c9dc29967a0"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4ed11165dd45ce798d99a136808a794a748d5dc38511303239d4e2363c0695dc"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = 
"sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, + {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, + {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, + {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, + {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, + {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, + {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, + {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, + {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4d4a848d1837973bf0f4b5e54e3bec977d99be36a7895c61abb659301b02c112"}, + {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fdc3ff3bfccdc6b9cc7c342c03aa2400683f0cb891d46e94b64a197910dc4064"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = 
"sha256:5eeb539606f18a0b232d4ba45adccde4125592f3f636a6182b4a8a436548b914"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, + {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, + {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, + {file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, + {file = "Brotli-1.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f733d788519c7e3e71f0855c96618720f5d3d60c3cb829d8bbb722dddce37985"}, + {file = "Brotli-1.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:929811df5462e182b13920da56c6e0284af407d1de637d8e536c5cd00a7daf60"}, + {file = "Brotli-1.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0b63b949ff929fbc2d6d3ce0e924c9b93c9785d877a21a1b678877ffbbc4423a"}, + 
{file = "Brotli-1.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:d192f0f30804e55db0d0e0a35d83a9fead0e9a359a9ed0285dbacea60cc10a84"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f296c40e23065d0d6650c4aefe7470d2a25fffda489bcc3eb66083f3ac9f6643"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, + {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, + {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, + {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, + {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:03d20af184290887bdea3f0f78c4f737d126c74dc2f3ccadf07e54ceca3bf208"}, + {file = "Brotli-1.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6172447e1b368dcbc458925e5ddaf9113477b0ed542df258d84fa28fc45ceea7"}, + {file = "Brotli-1.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a743e5a28af5f70f9c080380a5f908d4d21d40e8f0e0c8901604d15cfa9ba751"}, + {file = "Brotli-1.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0541e747cce78e24ea12d69176f6a7ddb690e62c425e01d31cc065e69ce55b48"}, + {file = "Brotli-1.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cdbc1fc1bc0bff1cef838eafe581b55bfbffaed4ed0318b724d0b71d4d377619"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:890b5a14ce214389b2cc36ce82f3093f96f4cc730c1cffdbefff77a7c71f2a97"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, + {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, + {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, + {file = 
"Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, + {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7905193081db9bfa73b1219140b3d315831cbff0d8941f22da695832f0dd188f"}, + {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a77def80806c421b4b0af06f45d65a136e7ac0bdca3c09d9e2ea4e515367c7e9"}, + {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dadd1314583ec0bf2d1379f7008ad627cd6336625d6679cf2f8e67081b83acf"}, + {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:901032ff242d479a0efa956d853d16875d42157f98951c0230f69e69f9c09bac"}, + {file = "Brotli-1.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:22fc2a8549ffe699bfba2256ab2ed0421a7b8fadff114a3d201794e45a9ff578"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ae15b066e5ad21366600ebec29a7ccbc86812ed267e4b28e860b8ca16a2bc474"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, + {file = 
"Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, + {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, + {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, + {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, +] + +[[package]] +name = "brotlicffi" +version = "1.1.0.0" +description = "Python CFFI bindings to the Brotli library" +optional = false +python-versions = ">=3.7" +groups = ["tutorials"] +markers = "platform_python_implementation == \"PyPy\"" +files = [ + {file = "brotlicffi-1.1.0.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9b7ae6bd1a3f0df532b6d67ff674099a96d22bc0948955cb338488c31bfb8851"}, + {file = "brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19ffc919fa4fc6ace69286e0a23b3789b4219058313cf9b45625016bf7ff996b"}, + {file = "brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9feb210d932ffe7798ee62e6145d3a757eb6233aa9a4e7db78dd3690d7755814"}, + {file = "brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84763dbdef5dd5c24b75597a77e1b30c66604725707565188ba54bab4f114820"}, + {file = "brotlicffi-1.1.0.0-cp37-abi3-win32.whl", hash = "sha256:1b12b50e07c3911e1efa3a8971543e7648100713d4e0971b13631cce22c587eb"}, + {file = "brotlicffi-1.1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:994a4f0681bb6c6c3b0925530a1926b7a189d878e6e5e38fae8efa47c5d9c613"}, + {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2e4aeb0bd2540cb91b069dbdd54d458da8c4334ceaf2d25df2f4af576d6766ca"}, + {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:4b7b0033b0d37bb33009fb2fef73310e432e76f688af76c156b3594389d81391"}, + {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54a07bb2374a1eba8ebb52b6fafffa2afd3c4df85ddd38fcc0511f2bb387c2a8"}, + {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7901a7dc4b88f1c1475de59ae9be59799db1007b7d059817948d8e4f12e24e35"}, + {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce01c7316aebc7fce59da734286148b1d1b9455f89cf2c8a4dfce7d41db55c2d"}, + {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:246f1d1a90279bb6069de3de8d75a8856e073b8ff0b09dcca18ccc14cec85979"}, + {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc4bc5d82bc56ebd8b514fb8350cfac4627d6b0743382e46d033976a5f80fab6"}, + {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37c26ecb14386a44b118ce36e546ce307f4810bc9598a6e6cb4f7fca725ae7e6"}, + {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca72968ae4eaf6470498d5c2887073f7efe3b1e7d7ec8be11a06a79cc810e990"}, + {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:add0de5b9ad9e9aa293c3aa4e9deb2b61e99ad6c1634e01d01d98c03e6a354cc"}, + {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9b6068e0f3769992d6b622a1cd2e7835eae3cf8d9da123d7f51ca9c1e9c333e5"}, + {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8557a8559509b61e65083f8782329188a250102372576093c88930c875a69838"}, + {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:2a7ae37e5d79c5bdfb5b4b99f2715a6035e6c5bf538c3746abc8e26694f92f33"}, + {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391151ec86bb1c683835980f4816272a87eaddc46bb91cbf44f62228b84d8cca"}, + {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2f3711be9290f0453de8eed5275d93d286abe26b08ab4a35d7452caa1fef532f"}, + {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a807d760763e398bbf2c6394ae9da5815901aa93ee0a37bca5efe78d4ee3171"}, + {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa8ca0623b26c94fccc3a1fdd895be1743b838f3917300506d04aa3346fd2a14"}, + {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3de0cf28a53a3238b252aca9fed1593e9d36c1d116748013339f0949bfc84112"}, + {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6be5ec0e88a4925c91f3dea2bb0013b3a2accda6f77238f76a34a1ea532a1cb0"}, + {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d9eb71bb1085d996244439154387266fd23d6ad37161f6f52f1cd41dd95a3808"}, + {file = "brotlicffi-1.1.0.0.tar.gz", hash = "sha256:b77827a689905143f87915310b93b273ab17888fd43ef350d4832c4a71083c13"}, +] + +[package.dependencies] +cffi = ">=1.0.0" + +[[package]] +name = "certifi" +version = "2025.1.31" +description = "Python package for providing Mozilla's CA Bundle." 
+optional = false +python-versions = ">=3.6" +groups = ["tutorials"] +files = [ + {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, + {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, +] + +[[package]] +name = "cffi" +version = "1.17.1" +description = "Foreign Function Interface for Python calling C code." +optional = false +python-versions = ">=3.8" +groups = ["tutorials"] +markers = "platform_python_implementation == \"PyPy\"" +files = [ + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = 
"cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = 
"cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, +] + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "3.4.1" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7" +groups = ["tutorials"] +files = [ + {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807"}, + {file = 
"charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = 
"sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d"}, + {file = 
"charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_aarch64.whl", 
hash = "sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash 
= "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = "sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765"}, + {file = "charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85"}, + {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, +] + [[package]] name = "click" version = "8.1.8" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" -groups = ["dev"] +groups = ["dev", "tutorials"] files = [ {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, {file = "click-8.1.8.tar.gz", hash = 
"sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, @@ -68,7 +440,7 @@ version = "0.4.6" description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -groups = ["dev"] +groups = ["dev", "tutorials"] markers = "platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, @@ -92,6 +464,77 @@ mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.12.0,<2.13.0" pyflakes = ">=3.2.0,<3.3.0" +[[package]] +name = "idna" +version = "3.10" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.6" +groups = ["tutorials"] +files = [ + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + +[[package]] +name = "inflate64" +version = "1.0.1" +description = "deflate64 compression/decompression library" +optional = false +python-versions = ">=3.9" +groups = ["tutorials"] +files = [ + {file = "inflate64-1.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5122a188995e47a735ab969edc9129d42bbd97b993df5a3f0819b87205ce81b4"}, + {file = "inflate64-1.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:975ed694c680e46a5c0bb872380a9c9da271a91f9c0646561c58e8f3714347d4"}, + {file = "inflate64-1.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8bcaf445d9cda5f7358e0c2b78144641560f8ce9e3e4351099754c49d26a34e8"}, + {file = "inflate64-1.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:daede09baba24117279109b30fdf935195e91957e31b995b86f8dd01711376ee"}, + {file = 
"inflate64-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df40eaaba4fb8379d5c4fa5f56cc24741c4f1a91d4aef66438207473351ceaa"}, + {file = "inflate64-1.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ef90855ff63d53c8fd3bfbf85b5280b22f82b9ab2e21a7eee45b8a19d9866c42"}, + {file = "inflate64-1.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5daa4566c0b009c9ab8a6bf18ce407d14f5dbbb0d3068f3a43af939a17e117a7"}, + {file = "inflate64-1.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:d58a360b59685561a8feacee743479a9d7cc17c8d210aa1f2ae221f2513973cb"}, + {file = "inflate64-1.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:31198c5f156806cee05b69b149074042b7b7d39274ff4c259b898e617294ac17"}, + {file = "inflate64-1.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4ab693bb1cd92573a997f8fe7b90a2ec1e17a507884598f5640656257b95ef49"}, + {file = "inflate64-1.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:95b6a60e305e6e759e37d6c36691fcb87678922c56b3ddc2df06cd56e04f41f6"}, + {file = "inflate64-1.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:711ef889bdb3b3b296881d1e49830a3a896938fba7033c4287f1aed9b9a20111"}, + {file = "inflate64-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3178495970ecb5c6a32167a8b57fdeef3bf4e2843eaf8f2d8f816f523741e36"}, + {file = "inflate64-1.0.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e8373b7feedf10236eb56d21598a19a3eb51077c3702d0ce3456b827374025e1"}, + {file = "inflate64-1.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cf026d5c885f2d2bbf233e9a0c8c6d046ec727e2467024ffe0ac76b5be308258"}, + {file = "inflate64-1.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:3aa7489241e6c6f6d34b9561efdf06031c35305b864267a5b8f406abcd3e85c5"}, + {file = "inflate64-1.0.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b81b3d373190ecd82901f42afd90b7127e9bdef341032a94db381c750ed3ddb2"}, + {file = 
"inflate64-1.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbfddc5dac975227c20997f0ac515917a15421767c6bff0c209ac6ff9d7b17cc"}, + {file = "inflate64-1.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2adeabe79cc2f90bca832673520c8cbad7370f86353e151293add7ca529bed34"}, + {file = "inflate64-1.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b235c97a05dbe2f92f0f057426e4d05a449e1fccf8e9aa88075ea9c6a06a182"}, + {file = "inflate64-1.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19b74e30734dca5f1c83ca07074e1f25bf7b63f4a5ee7e074d9a4cb05af65cd5"}, + {file = "inflate64-1.0.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b298feb85204b5ef148ccf807744c836fffed7c1ed3ec8bc9b4e323a03163291"}, + {file = "inflate64-1.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8a4c75241bc442267f79b8242135f2ded29405662c44b9353d34fbd4fa6e56b3"}, + {file = "inflate64-1.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:7b210392f0830ab27371e36478592f47757f5ea6c09ddb96e2125847b309eb5e"}, + {file = "inflate64-1.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8dd58aa1adc4f98bf9b52baffa8f2ddf589e071a90db2f2bec9024328d4608cf"}, + {file = "inflate64-1.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c108be2b87e88c966570f84f839eb37f489b45dc3fa3046dc228327af6e921bb"}, + {file = "inflate64-1.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:63971c6b096c0d533c0e38b4257f5a7748501a8bc04d00cf239bd06467888703"}, + {file = "inflate64-1.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d0077edb6b1cabfa2223b71a4a725e5755148f551a7a396c7d5698e45fb8828"}, + {file = "inflate64-1.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f05b5f2a6f1bf2f70e9c20d997261711cbc1ae477379662b05b36911da60a67"}, + {file = "inflate64-1.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f3c7402165f7e15789caa0787e5a349465d9a454105d0c3a0ccf2e9cdfb8117"}, + {file = 
"inflate64-1.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:39bced168822e4bf2f545d1b6dbeded6db01c32629d9e4549ef2cd1604a12e1b"}, + {file = "inflate64-1.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:70bb6a22d300d8ca25c26bc60afb5662c5a96d97a801962874d0461568512789"}, + {file = "inflate64-1.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f3d5ea758358a1cc50f9e8e41de2134e9b5c5ca8bbcd88d1cd135d0e953d0fa8"}, + {file = "inflate64-1.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8fa102c834314c3d7edbf249d1be0bce5d12a9e122228a7ac3f861ee82c3dc5c"}, + {file = "inflate64-1.0.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c2ae56a34e6cc2a712418ac82332e5d550ef8599e0ffb64c19b86d63a7df0c5"}, + {file = "inflate64-1.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:9808ae50b5db661770992566e51e648cac286c32bd80892b151e7b1eca81afe8"}, + {file = "inflate64-1.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:04b2788c6a26e1e525f53cc3d8c58782d41f18bef8d2a34a3d58beaaf0bfdd3b"}, + {file = "inflate64-1.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67fd5b1f9e433b0abab8cb91f4da94d16223a5241008268a57f4729fdbfc4dbc"}, + {file = "inflate64-1.0.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6f3b00c17ae365e82fc3d48ff9a7a566820a6c8c55b4e16c6cfbcbd46505a72"}, + {file = "inflate64-1.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:91c0c1d41c1655fb0189630baaa894a3b778d77062bb90ca11db878422948395"}, + {file = "inflate64-1.0.1.tar.gz", hash = "sha256:3b1c83c22651b5942b35829df526e89602e494192bf021e0d7d0b600e76c429d"}, +] + +[package.extras] +check = ["check-manifest", "flake8", "flake8-black", "flake8-deprecated", "flake8-isort", "mypy (>=1.10.0)", "mypy_extensions (>=0.4.1)", "pygments", "readme-renderer", "twine"] +docs = ["docutils", "sphinx (>=5.0)"] +test = ["pytest"] + [[package]] name = "isort" version = 
"6.0.0" @@ -120,6 +563,23 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "multivolumefile" +version = "0.2.3" +description = "multi volume file wrapper library" +optional = false +python-versions = ">=3.6" +groups = ["tutorials"] +files = [ + {file = "multivolumefile-0.2.3-py3-none-any.whl", hash = "sha256:237f4353b60af1703087cf7725755a1f6fcaeeea48421e1896940cd1c920d678"}, + {file = "multivolumefile-0.2.3.tar.gz", hash = "sha256:a0648d0aafbc96e59198d5c17e9acad7eb531abea51035d08ce8060dcad709d6"}, +] + +[package.extras] +check = ["check-manifest", "flake8", "flake8-black", "isort (>=5.0.3)", "pygments", "readme-renderer", "twine"] +test = ["coverage[toml] (>=5.2)", "coveralls (>=2.1.1)", "hypothesis", "pyannotate", "pytest", "pytest-cov"] +type = ["mypy", "mypy-extensions"] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -241,6 +701,31 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.11.2)"] +[[package]] +name = "psutil" +version = "7.0.0" +description = "Cross-platform lib for process and system monitoring in Python. NOTE: the syntax of this script MUST be kept compatible with Python 2.7." 
+optional = false +python-versions = ">=3.6" +groups = ["tutorials"] +markers = "sys_platform != \"cygwin\"" +files = [ + {file = "psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25"}, + {file = "psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da"}, + {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91"}, + {file = "psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34"}, + {file = "psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993"}, + {file = "psutil-7.0.0-cp36-cp36m-win32.whl", hash = "sha256:84df4eb63e16849689f76b1ffcb36db7b8de703d1bc1fe41773db487621b6c17"}, + {file = "psutil-7.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1e744154a6580bc968a0195fd25e80432d3afec619daf145b9e5ba16cc1d688e"}, + {file = "psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99"}, + {file = "psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553"}, + {file = "psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456"}, +] + +[package.extras] +dev = ["abi3audit", "black (==24.10.0)", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest", "pytest-cov", "pytest-xdist", "requests", "rstcheck", "ruff", "setuptools", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] +test = ["pytest", "pytest-xdist", "setuptools"] + [[package]] 
name = "py4j" version = "0.10.9.7" @@ -253,6 +738,92 @@ files = [ {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, ] +[[package]] +name = "py7zr" +version = "0.22.0" +description = "Pure python 7-zip library" +optional = false +python-versions = ">=3.8" +groups = ["tutorials"] +files = [ + {file = "py7zr-0.22.0-py3-none-any.whl", hash = "sha256:993b951b313500697d71113da2681386589b7b74f12e48ba13cc12beca79d078"}, + {file = "py7zr-0.22.0.tar.gz", hash = "sha256:c6c7aea5913535184003b73938490f9a4d8418598e533f9ca991d3b8e45a139e"}, +] + +[package.dependencies] +brotli = {version = ">=1.1.0", markers = "platform_python_implementation == \"CPython\""} +brotlicffi = {version = ">=1.1.0.0", markers = "platform_python_implementation == \"PyPy\""} +inflate64 = ">=1.0.0,<1.1.0" +multivolumefile = ">=0.2.3" +psutil = {version = "*", markers = "sys_platform != \"cygwin\""} +pybcj = ">=1.0.0,<1.1.0" +pycryptodomex = ">=3.16.0" +pyppmd = ">=1.1.0,<1.2.0" +pyzstd = ">=0.15.9" +texttable = "*" + +[package.extras] +check = ["black (>=23.1.0)", "check-manifest", "flake8 (<8)", "flake8-black (>=0.3.6)", "flake8-deprecated", "flake8-isort", "isort (>=5.0.3)", "lxml", "mypy (>=0.940)", "mypy-extensions (>=0.4.1)", "pygments", "readme-renderer", "twine", "types-psutil"] +debug = ["pytest", "pytest-leaks", "pytest-profiling"] +docs = ["docutils", "sphinx (>=5.0)", "sphinx-a4doc", "sphinx-py3doc-enhanced-theme"] +test = ["coverage[toml] (>=5.2)", "coveralls (>=2.1.1)", "py-cpuinfo", "pytest", "pytest-benchmark", "pytest-cov", "pytest-remotedata", "pytest-timeout"] +test-compat = ["libarchive-c"] + +[[package]] +name = "pybcj" +version = "1.0.3" +description = "bcj filter library" +optional = false +python-versions = ">=3.9" +groups = ["tutorials"] +files = [ + {file = "pybcj-1.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0bd8afeacf9173af091a08783aa9111500f5619ce0ae486bffb5ee4d08a331b4"}, + {file = 
"pybcj-1.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fc81d3c941485e7d3c2812834ca005849fe91a624977ed5227658cf952d19696"}, + {file = "pybcj-1.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f01b75621452578ccd48a79819bc95ddac41535e16aa163ea1d86b14258afa00"}, + {file = "pybcj-1.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e08431845702173d50d66cbbd169969d7b7cf67992f5fb7bc27a8c67e19d3d1f"}, + {file = "pybcj-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:476f3c815b85e563d13238c4310b9cb47aefd0c51ac1b33312e41fcd079ea94f"}, + {file = "pybcj-1.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:97bfd712bfce0d58099a02acc05b15b1d1aa3e6edf4dd8e018f43349182ffa3f"}, + {file = "pybcj-1.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d1374806cde777bc6e371f79c7f3acfb2b0906a418e04cf5331866a321633c3"}, + {file = "pybcj-1.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9245039e0fc87921f702133c019722e333934e61f1c90408f16618d585ff88ec"}, + {file = "pybcj-1.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae30aa62deff1ba40e4f13ef6964cf083ece541dbfb3ec3731c1fc58cc218b7d"}, + {file = "pybcj-1.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6639f5443bc696a981a502c37e1393398a7182d61820eb39ee6d122076b6ad8c"}, + {file = "pybcj-1.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4502c5afa2a41e569b94527bbb46185ee1a378a4fb3e9d7806ad10e892ecdf58"}, + {file = "pybcj-1.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4ff48aaadd8fd91ac02557eec225ce7c1a3b627a6832d6cb723469891b3b242"}, + {file = "pybcj-1.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62668bd0a1aedaa3b779615cf129d9469fd709ab8d944aa07aad68dc189de349"}, + {file = "pybcj-1.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8af60d5eeed32fd1a9f6a2a11eef47cb7ebd80fe9853e709a2c1d9e29108cdf2"}, + {file = 
"pybcj-1.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:68e1bd1b0836e216cce3d9a33795501dfc956c61ff52768737e26286e65a3771"}, + {file = "pybcj-1.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:05738d44a987422e21f4ee15023a8c4f38a5509fdf6e6f6dfaaf43ca05cef7db"}, + {file = "pybcj-1.0.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:c68a3fe847f22a8393fe71b1b16450b6b9e8ef36faa36d0c03759f58740f6eff"}, + {file = "pybcj-1.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:17f610ede3a766c0ff1869a4dd7750db78d39e4bfc9997f6bef050fe794c051b"}, + {file = "pybcj-1.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:15f776925a4d6f69b344cde9035fc8f1fd02f1f2a4ccb76f4047406c0ea4241d"}, + {file = "pybcj-1.0.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bdda28e0a20214c7f0e7de9e260122b9197106231249bf07a5ca5b84a5d38a1"}, + {file = "pybcj-1.0.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:764cba20166fcd9ff580f4d877f17807be057da7d1234caaf54fd5fd5c591387"}, + {file = "pybcj-1.0.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:97cf7f788560c3283a8afe3de585abb849bb1338d007e53fb6441d6ccd202e0a"}, + {file = "pybcj-1.0.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:26d201f773d17d5e8a88785f00fa73a6647e080d933e75ddeb33da7f0baff657"}, + {file = "pybcj-1.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:990047ac176317d42e7059b3cd357ff7c7201f3e3f08b35d083b2004d066cd39"}, + {file = "pybcj-1.0.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3bbbf22687c9f6c57cc9b605a3a60937230843ff1b5560e2a42133fd4dd5dc73"}, + {file = "pybcj-1.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e0a75d5ec3fa40af865f93f29e613d93fb67dc016fc60e64a4b3a4621076fecd"}, + {file = "pybcj-1.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:631bcdea0d47ae562f118f8404fb6ef5813eb2dcfbcc53c7b9ac6bc5d4c2ef32"}, + {file = "pybcj-1.0.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:75c9430a10e69fbea336668944c0f4a9979e0bb3ab5de820315025c157baa2ae"}, + {file = "pybcj-1.0.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5221652a9c656f6b27fda389cc4888354a287d3e0f6ea6d5b70718b6d9ec110d"}, + {file = "pybcj-1.0.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:f6a6c3a776aa9b579c51768d2c727d3912cd8e1c2add61898dc6794b269e7ab3"}, + {file = "pybcj-1.0.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:cb50276bd804f58690571c13e2e6eb26eca6c4a39a611591e2202136dca1b7a5"}, + {file = "pybcj-1.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:623a4eef080f5cb0405ce19f90fa9824e2477f4a85d8b888e613cf7f146b84d1"}, + {file = "pybcj-1.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:47d2a0f33dfd55dfa961502922d2b0f090857585b321f838f1c2510de4e66a9a"}, + {file = "pybcj-1.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cf8ac15785412aa6924818fb86e250ae15e8238b7db7d410e28d3ae0743cdbd3"}, + {file = "pybcj-1.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:de02d2933fef5b26d845d2e002996c5e22c710af5b5dfc930285dff09db885cf"}, + {file = "pybcj-1.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40a0f542dba6d079d702c1c129cc8cdc0f20bf2c5cb45defba8d5ac8e2d691a1"}, + {file = "pybcj-1.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace508285fd4788845a208dd00f1c7af8e68dd222cf7797ae525562a2eb22bab"}, + {file = "pybcj-1.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6da2b0c120a415fa5620b76110bab487de20f8a108756499fd4df9c92fc10098"}, + {file = "pybcj-1.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9c6347f1e2c78cf2584fddebe6fb9dc036b75020887facec1bab149fd6056c6"}, + {file = "pybcj-1.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:be309c0fbf06b1e8cd1c40b24dd621271b5fb5d9fe7a0becb40ed64ac92ff50b"}, + {file = "pybcj-1.0.3.tar.gz", hash = "sha256:b8873637f0be00ceaa372d0fb81693604b4bbc8decdb2b1ae5f9b84d196788d9"}, +] + +[package.extras] +check = 
["check-manifest", "flake8 (<5)", "flake8-black", "flake8-colors", "flake8-isort", "flake8-pyi", "flake8-typing-imports", "mypy (>=1.10.0)", "pygments", "readme-renderer"] +test = ["coverage[toml] (>=5.2)", "hypothesis", "pytest (>=6.0)", "pytest-cov"] + [[package]] name = "pycodestyle" version = "2.12.1" @@ -265,6 +836,61 @@ files = [ {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"}, ] +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = false +python-versions = ">=3.8" +groups = ["tutorials"] +markers = "platform_python_implementation == \"PyPy\"" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + +[[package]] +name = "pycryptodomex" +version = "3.21.0" +description = "Cryptographic library for Python" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["tutorials"] +files = [ + {file = "pycryptodomex-3.21.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:dbeb84a399373df84a69e0919c1d733b89e049752426041deeb30d68e9867822"}, + {file = "pycryptodomex-3.21.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:a192fb46c95489beba9c3f002ed7d93979423d1b2a53eab8771dbb1339eb3ddd"}, + {file = "pycryptodomex-3.21.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:1233443f19d278c72c4daae749872a4af3787a813e05c3561c73ab0c153c7b0f"}, + {file = "pycryptodomex-3.21.0-cp27-cp27m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbb07f88e277162b8bfca7134b34f18b400d84eac7375ce73117f865e3c80d4c"}, + {file = "pycryptodomex-3.21.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:e859e53d983b7fe18cb8f1b0e29d991a5c93be2c8dd25db7db1fe3bd3617f6f9"}, + {file = 
"pycryptodomex-3.21.0-cp27-cp27m-win32.whl", hash = "sha256:ef046b2e6c425647971b51424f0f88d8a2e0a2a63d3531817968c42078895c00"}, + {file = "pycryptodomex-3.21.0-cp27-cp27m-win_amd64.whl", hash = "sha256:da76ebf6650323eae7236b54b1b1f0e57c16483be6e3c1ebf901d4ada47563b6"}, + {file = "pycryptodomex-3.21.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:c07e64867a54f7e93186a55bec08a18b7302e7bee1b02fd84c6089ec215e723a"}, + {file = "pycryptodomex-3.21.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:56435c7124dd0ce0c8bdd99c52e5d183a0ca7fdcd06c5d5509423843f487dd0b"}, + {file = "pycryptodomex-3.21.0-cp27-cp27mu-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65d275e3f866cf6fe891411be9c1454fb58809ccc5de6d3770654c47197acd65"}, + {file = "pycryptodomex-3.21.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:5241bdb53bcf32a9568770a6584774b1b8109342bd033398e4ff2da052123832"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:34325b84c8b380675fd2320d0649cdcbc9cf1e0d1526edbe8fce43ed858cdc7e"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:103c133d6cd832ae7266feb0a65b69e3a5e4dbbd6f3a3ae3211a557fd653f516"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77ac2ea80bcb4b4e1c6a596734c775a1615d23e31794967416afc14852a639d3"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9aa0cf13a1a1128b3e964dc667e5fe5c6235f7d7cfb0277213f0e2a783837cc2"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46eb1f0c8d309da63a2064c28de54e5e614ad17b7e2f88df0faef58ce192fc7b"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:cc7e111e66c274b0df5f4efa679eb31e23c7545d702333dfd2df10ab02c2a2ce"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_i686.whl", hash = 
"sha256:770d630a5c46605ec83393feaa73a9635a60e55b112e1fb0c3cea84c2897aa0a"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:52e23a0a6e61691134aa8c8beba89de420602541afaae70f66e16060fdcd677e"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-win32.whl", hash = "sha256:a3d77919e6ff56d89aada1bd009b727b874d464cb0e2e3f00a49f7d2e709d76e"}, + {file = "pycryptodomex-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b0e9765f93fe4890f39875e6c90c96cb341767833cfa767f41b490b506fa9ec0"}, + {file = "pycryptodomex-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:feaecdce4e5c0045e7a287de0c4351284391fe170729aa9182f6bd967631b3a8"}, + {file = "pycryptodomex-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:365aa5a66d52fd1f9e0530ea97f392c48c409c2f01ff8b9a39c73ed6f527d36c"}, + {file = "pycryptodomex-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3efddfc50ac0ca143364042324046800c126a1d63816d532f2e19e6f2d8c0c31"}, + {file = "pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df2608682db8279a9ebbaf05a72f62a321433522ed0e499bc486a6889b96bf3"}, + {file = "pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5823d03e904ea3e53aebd6799d6b8ec63b7675b5d2f4a4bd5e3adcb512d03b37"}, + {file = "pycryptodomex-3.21.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:27e84eeff24250ffec32722334749ac2a57a5fd60332cd6a0680090e7c42877e"}, + {file = "pycryptodomex-3.21.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8ef436cdeea794015263853311f84c1ff0341b98fc7908e8a70595a68cefd971"}, + {file = "pycryptodomex-3.21.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a1058e6dfe827f4209c5cae466e67610bcd0d66f2f037465daa2a29d92d952b"}, + {file = "pycryptodomex-3.21.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:9ba09a5b407cbb3bcb325221e346a140605714b5e880741dc9a1e9ecf1688d42"}, + {file = "pycryptodomex-3.21.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:8a9d8342cf22b74a746e3c6c9453cb0cfbb55943410e3a2619bd9164b48dc9d9"}, + {file = "pycryptodomex-3.21.0.tar.gz", hash = "sha256:222d0bd05381dd25c32dd6065c071ebf084212ab79bab4599ba9e6a3e0009e6c"}, +] + [[package]] name = "pyflakes" version = "3.2.0" @@ -277,6 +903,77 @@ files = [ {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"}, ] +[[package]] +name = "pyppmd" +version = "1.1.1" +description = "PPMd compression/decompression library" +optional = false +python-versions = ">=3.9" +groups = ["tutorials"] +files = [ + {file = "pyppmd-1.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:406b184132c69e3f60ea9621b69eaa0c5494e83f82c307b3acce7b86a4f8f888"}, + {file = "pyppmd-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2cf003bb184adf306e1ac1828107307927737dde63474715ba16462e266cbef"}, + {file = "pyppmd-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:71c8fd0ecc8d4760e852dd6df19d1a827427cb9e6c9e568cbf5edba7d860c514"}, + {file = "pyppmd-1.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6b5edee08b66ad6c39fd4d34a7ef4cfeb4b69fd6d68957e59cd2db674611a9e"}, + {file = "pyppmd-1.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e95bd23eb1543ab3149f24fe02f6dd2695023326027a4b989fb2c6dba256e75e"}, + {file = "pyppmd-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e633ee4cc19d0c71b3898092c3c4cc20a10bd5e6197229fffac29d68ad5d83b8"}, + {file = "pyppmd-1.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ecaafe2807ef557f0c49b8476a4fa04091b43866072fbcf31b3ceb01a96c9168"}, + {file = "pyppmd-1.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c182fccff60ae8f24f28f5145c36a60708b5b041a25d36b67f23c44923552fa4"}, + {file = 
"pyppmd-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:70c93d19efe67cdac3e7fa2d4e171650a2c4f90127a9781b25e496a43f12fbbc"}, + {file = "pyppmd-1.1.1-cp310-cp310-win32.whl", hash = "sha256:57c75856920a210ed72b553885af7bc06eddfd30ff26b62a3a63cb8f86f3d217"}, + {file = "pyppmd-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:d5293f10dc8c1d571b780e0d54426d3d858c19bbd8cb0fe972dcea3906acd05c"}, + {file = "pyppmd-1.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:753c5297c91c059443caef33bccbffb10764221739d218046981638aeb9bc5f2"}, + {file = "pyppmd-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9b5a73da09de480a94793c9064876af14a01be117de872737935ac447b7cde3c"}, + {file = "pyppmd-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:89c6febb7114dea02a061143d78d04751a945dfcadff77560e9a3d3c7583c24b"}, + {file = "pyppmd-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0001e467c35e35e6076a8c32ed9074aa45833615ee16115de9282d5c0985a1d8"}, + {file = "pyppmd-1.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c76820db25596afc859336ba06c01c9be0ff326480beec9c699fd378a546a77f"}, + {file = "pyppmd-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b67f0a228f8c58750a21ba667c170ae957283e08fd580857f13cb686334e5b3e"}, + {file = "pyppmd-1.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b18f24c14f0b0f1757a42c458ae7b6fd7aa0bce8147ac1016a9c134068c1ccc2"}, + {file = "pyppmd-1.1.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c9e43729161cc3b6ad5b04b16bae7665d3c0cc803de047d8a979aa9232a4f94a"}, + {file = "pyppmd-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fe057d254528b4eeebe2800baefde47d6af679bae184d3793c13a06f794df442"}, + {file = "pyppmd-1.1.1-cp311-cp311-win32.whl", hash = "sha256:faa51240493a5c53c9b544c99722f70303eea702742bf90f3c3064144342da4a"}, + {file = "pyppmd-1.1.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:62486f544d6957e1381147e3961eee647b7f4421795be4fb4f1e29d52aee6cb5"}, + {file = "pyppmd-1.1.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9877ef273e2c0efdec740855e28004a708ada9012e0db6673df4bb6eba3b05e0"}, + {file = "pyppmd-1.1.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f816a5cbccceced80e15335389eeeaf1b56a605fb7eebe135b1c85bd161e288c"}, + {file = "pyppmd-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6bddabf8f2c6b991d15d6785e603d9d414ae4a791f131b1a729bb8a5d31133d1"}, + {file = "pyppmd-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:855bc2b0d19c3fead5815d72dbe350b4f765334336cbf8bcb504d46edc9e9dd2"}, + {file = "pyppmd-1.1.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a95b11b3717c083b912f0879678ba72f301bbdb9b69efed46dbc5df682aa3ce7"}, + {file = "pyppmd-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38b645347b6ea217b0c58e8edac27473802868f152db520344ac8c7490981849"}, + {file = "pyppmd-1.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f8f94b6222262def5b532f2b9716554ef249ad8411fd4da303596cc8c2e8eda1"}, + {file = "pyppmd-1.1.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1c0306f69ceddf385ef689ebd0218325b7e523c48333d87157b37393466cfa1e"}, + {file = "pyppmd-1.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4ba510457a56535522a660098399e3fa8722e4de55808d089c9d13435d87069"}, + {file = "pyppmd-1.1.1-cp312-cp312-win32.whl", hash = "sha256:032f040a89fd8348109e8638f94311bd4c3c693fb4cad213ad06a37c203690b1"}, + {file = "pyppmd-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:2be8cbd13dd59fad1a0ad38062809e28596f3673b77a799dfe82b287986265ed"}, + {file = "pyppmd-1.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9458f972f090f3846fc5bea0a6f7363da773d3c4b2d4654f1d4ca3c11f6ecbfa"}, + {file = "pyppmd-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:44811a9d958873d857ca81cebf7ba646a0952f8a7bbf8a60cf6ec5d002faa040"}, + {file = "pyppmd-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a1b12460958885ca44e433986644009d0599b87a444f668ce3724a46ce588924"}, + {file = "pyppmd-1.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:200c74f05b97b00f047cf60607914a0b50f80991f1fb3677f624a85aa79d9458"}, + {file = "pyppmd-1.1.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ebe0d98a341b32f164e860059243e125398865cc0363b32ffc31f953460fe87"}, + {file = "pyppmd-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf93e1e047a82f1e7e194fcf49da166d2b9d8dc98d7c0b5cd844dc4360d9c1f5"}, + {file = "pyppmd-1.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f5b0b8c746bde378ae3b4df42a11fd8599ba3e5808dfea36e16d722b74bd0506"}, + {file = "pyppmd-1.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bcdd5207b6c79887f25639632ca2623a399d8c54f567973e9ba474b5ebae2b1c"}, + {file = "pyppmd-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7bfcca94e5452b6d54ac24a11c2402f6a193c331e5dc221c1f1df71773624374"}, + {file = "pyppmd-1.1.1-cp39-cp39-win32.whl", hash = "sha256:18e99c074664f996f511bc6e87aab46bc4c75f5bd0157d3210292919be35e22c"}, + {file = "pyppmd-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:b29788d5a0f8f39ea46a1255cd886daddf9c64ba9d4cb64677bc93bd3859ac0e"}, + {file = "pyppmd-1.1.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:28648ef56793bf1ed0ff24728642f56fa39cb96ea161dec6ee2d26f97c0cdd28"}, + {file = "pyppmd-1.1.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:427d6f9b9c011e032db9529b2a15773f2e2944ca490b67d5757f4af33bbda406"}, + {file = "pyppmd-1.1.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34c7a07197a03656c1920fd88e05049c155a955c4de4b8b8a8e5fec19a97b45b"}, + {file = "pyppmd-1.1.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:e1fea2eee28beca61165c4714dcd032de76af318553791107d308b4b08575ecc"}, + {file = "pyppmd-1.1.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:04391e4f82c8c2c316ba60e480300ad1af37ec12bdb5c20f06b502030ff35975"}, + {file = "pyppmd-1.1.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:cf08a354864c352a94e6e53733009baeab1e7c570010c4f5be226923ecfa09d1"}, + {file = "pyppmd-1.1.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:334e5fe5d75764b87c591a16d2b2df6f9939e2ad114dacf98bb4b0e7c90911e9"}, + {file = "pyppmd-1.1.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15d5928b25f04f5431585d17c835cd509a34e1c9f1416653db8d2815e97d4e20"}, + {file = "pyppmd-1.1.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af06329796a4965788910ac40f1b012d2e173ede08456ceea0ec7fc4d2e69d62"}, + {file = "pyppmd-1.1.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:4ccdd3751e432e71e02de96f16fc8824e4f4bfc47a8b470f0c7aae88dae4c666"}, + {file = "pyppmd-1.1.1.tar.gz", hash = "sha256:f1a812f1e7628f4c26d05de340b91b72165d7b62778c27d322b82ce2e8ff00cb"}, +] + +[package.extras] +check = ["check-manifest", "flake8", "flake8-black", "flake8-isort", "mypy (>=1.10.0)", "pygments", "readme-renderer"] +docs = ["sphinx", "sphinx_rtd_theme"] +fuzzer = ["atheris", "hypothesis"] +test = ["coverage[toml] (>=5.2)", "hypothesis", "pytest (>=6.0)", "pytest-benchmark", "pytest-cov", "pytest-timeout"] + [[package]] name = "pyspark" version = "3.5.4" @@ -298,6 +995,133 @@ mllib = ["numpy (>=1.15,<2)"] pandas-on-spark = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] sql = ["numpy (>=1.15,<2)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +[[package]] +name = "pyzstd" +version = "0.16.2" +description = "Python bindings to Zstandard (zstd) compression library." 
+optional = false +python-versions = ">=3.5" +groups = ["tutorials"] +files = [ + {file = "pyzstd-0.16.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:637376c8f8cbd0afe1cab613f8c75fd502bd1016bf79d10760a2d5a00905fe62"}, + {file = "pyzstd-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3e7a7118cbcfa90ca2ddbf9890c7cb582052a9a8cf2b7e2c1bbaf544bee0f16a"}, + {file = "pyzstd-0.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a74cb1ba05876179525144511eed3bd5a509b0ab2b10632c1215a85db0834dfd"}, + {file = "pyzstd-0.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c084dde218ffbf112e507e72cbf626b8f58ce9eb23eec129809e31037984662"}, + {file = "pyzstd-0.16.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4646459ebd3d7a59ddbe9312f020bcf7cdd1f059a2ea07051258f7af87a0b31"}, + {file = "pyzstd-0.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14bfc2833cc16d7657fc93259edeeaa793286e5031b86ca5dc861ba49b435fce"}, + {file = "pyzstd-0.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f27d488f19e5bf27d1e8aa1ae72c6c0a910f1e1ffbdf3c763d02ab781295dd27"}, + {file = "pyzstd-0.16.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91e134ca968ff7dcfa8b7d433318f01d309b74ee87e0d2bcadc117c08e1c80db"}, + {file = "pyzstd-0.16.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:6b5f64cd3963c58b8f886eb6139bb8d164b42a74f8a1bb95d49b4804f4592d61"}, + {file = "pyzstd-0.16.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:0b4a8266871b9e0407f9fd8e8d077c3558cf124d174e6357b523d14f76971009"}, + {file = "pyzstd-0.16.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1bb19f7acac30727354c25125922aa59f44d82e0e6a751df17d0d93ff6a73853"}, + {file = "pyzstd-0.16.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3008325b7368e794d66d4d98f2ee1d867ef5afd09fd388646ae02b25343c420d"}, + {file = 
"pyzstd-0.16.2-cp310-cp310-win32.whl", hash = "sha256:66f2d5c0bbf5bf32c577aa006197b3525b80b59804450e2c32fbcc2d16e850fd"}, + {file = "pyzstd-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:5fe5f5459ebe1161095baa7a86d04ab625b35148f6c425df0347ed6c90a2fd58"}, + {file = "pyzstd-0.16.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c1bdbe7f01c7f37d5cd07be70e32a84010d7dfd6677920c0de04cf7d245b60d"}, + {file = "pyzstd-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1882a3ceaaf9adc12212d587d150ec5e58cfa9a765463d803d739abbd3ac0f7a"}, + {file = "pyzstd-0.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea46a8b9d60f6a6eba29facba54c0f0d70328586f7ef0da6f57edf7e43db0303"}, + {file = "pyzstd-0.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d7865bc06589cdcecdede0deefe3da07809d5b7ad9044c224d7b2a0867256957"}, + {file = "pyzstd-0.16.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:52f938a65b409c02eb825e8c77fc5ea54508b8fc44b5ce226db03011691ae8cc"}, + {file = "pyzstd-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e97620d3f53a0282947304189deef7ca7f7d0d6dfe15033469dc1c33e779d5e5"}, + {file = "pyzstd-0.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7c40e9983d017108670dc8df68ceef14c7c1cf2d19239213274783041d0e64c"}, + {file = "pyzstd-0.16.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7cd4b3b2c6161066e4bde6af1cf78ed3acf5d731884dd13fdf31f1db10830080"}, + {file = "pyzstd-0.16.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:454f31fd84175bb203c8c424f2255a343fa9bd103461a38d1bf50487c3b89508"}, + {file = "pyzstd-0.16.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:5ef754a93743f08fb0386ce3596780bfba829311b49c8f4107af1a4bcc16935d"}, + {file = "pyzstd-0.16.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = 
"sha256:be81081db9166e10846934f0e3576a263cbe18d81eca06e6a5c23533f8ce0dc6"}, + {file = "pyzstd-0.16.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:738bcb2fa1e5f1868986f5030955e64de53157fa1141d01f3a4daf07a1aaf644"}, + {file = "pyzstd-0.16.2-cp311-cp311-win32.whl", hash = "sha256:0ea214c9b97046867d1657d55979021028d583704b30c481a9c165191b08d707"}, + {file = "pyzstd-0.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:c17c0fc02f0e75b0c7cd21f8eaf4c6ce4112333b447d93da1773a5f705b2c178"}, + {file = "pyzstd-0.16.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4081fd841a9efe9ded7290ee7502dbf042c4158b90edfadea3b8a072c8ec4e1"}, + {file = "pyzstd-0.16.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fd3fa45d2aeb65367dd702806b2e779d13f1a3fa2d13d5ec777cfd09de6822de"}, + {file = "pyzstd-0.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8b5f0d2c07994a5180d8259d51df6227a57098774bb0618423d7eb4a7303467"}, + {file = "pyzstd-0.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60c9d25b15c7ae06ed5d516d096a0d8254f9bed4368b370a09cccf191eaab5cb"}, + {file = "pyzstd-0.16.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29acf31ce37254f6cad08deb24b9d9ba954f426fa08f8fae4ab4fdc51a03f4ae"}, + {file = "pyzstd-0.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec77612a17697a9f7cf6634ffcee616eba9b997712fdd896e77fd19ab3a0618"}, + {file = "pyzstd-0.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:313ea4974be93be12c9a640ab40f0fc50a023178aae004a8901507b74f190173"}, + {file = "pyzstd-0.16.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e91acdefc8c2c6c3b8d5b1b5fe837dce4e591ecb7c0a2a50186f552e57d11203"}, + {file = "pyzstd-0.16.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:929bd91a403539e72b5b5cb97f725ac4acafe692ccf52f075e20cd9bf6e5493d"}, + {file = 
"pyzstd-0.16.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:740837a379aa32d110911ebcbbc524f9a9b145355737527543a884bd8777ca4f"}, + {file = "pyzstd-0.16.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:adfc0e80dd157e6d1e0b0112c8ecc4b58a7a23760bd9623d74122ef637cfbdb6"}, + {file = "pyzstd-0.16.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:79b183beae1c080ad3dca39019e49b7785391947f9aab68893ad85d27828c6e7"}, + {file = "pyzstd-0.16.2-cp312-cp312-win32.whl", hash = "sha256:b8d00631a3c466bc313847fab2a01f6b73b3165de0886fb03210e08567ae3a89"}, + {file = "pyzstd-0.16.2-cp312-cp312-win_amd64.whl", hash = "sha256:c0d43764e9a60607f35d8cb3e60df772a678935ab0e02e2804d4147377f4942c"}, + {file = "pyzstd-0.16.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3ae9ae7ad730562810912d7ecaf1fff5eaf4c726f4b4dfe04784ed5f06d7b91f"}, + {file = "pyzstd-0.16.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2ce8d3c213f76a564420f3d0137066ac007ce9fb4e156b989835caef12b367a7"}, + {file = "pyzstd-0.16.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2c14dac23c865e2d78cebd9087e148674b7154f633afd4709b4cd1520b99a61"}, + {file = "pyzstd-0.16.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4527969d66a943e36ef374eda847e918077de032d58b5df84d98ffd717b6fa77"}, + {file = "pyzstd-0.16.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd8256149b88e657e99f31e6d4b114c8ff2935951f1d8bb8e1fe501b224999c0"}, + {file = "pyzstd-0.16.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5bd1f1822d65c9054bf36d35307bf8ed4aa2d2d6827431761a813628ff671b1d"}, + {file = "pyzstd-0.16.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6733f4d373ec9ad2c1976cf06f973a3324c1f9abe236d114d6bb91165a397d"}, + {file = "pyzstd-0.16.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:7bec165ab6524663f00b69bfefd13a46a69fed3015754abaf81b103ec73d92c6"}, + {file = "pyzstd-0.16.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e4460fa6949aac6528a1ad0de8871079600b12b3ef4db49316306786a3598321"}, + {file = "pyzstd-0.16.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:75df79ea0315c97d88337953a17daa44023dbf6389f8151903d371513f503e3c"}, + {file = "pyzstd-0.16.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:93e1d45f4a196afb6f18682c79bdd5399277ead105b67f30b35c04c207966071"}, + {file = "pyzstd-0.16.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:075e18b871f38a503b5d23e40a661adfc750bd4bd0bb8b208c1e290f3ceb8fa2"}, + {file = "pyzstd-0.16.2-cp313-cp313-win32.whl", hash = "sha256:9e4295eb299f8d87e3487852bca033d30332033272a801ca8130e934475e07a9"}, + {file = "pyzstd-0.16.2-cp313-cp313-win_amd64.whl", hash = "sha256:18deedc70f858f4cf574e59f305d2a0678e54db2751a33dba9f481f91bc71c28"}, + {file = "pyzstd-0.16.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a9892b707ef52f599098b1e9528df0e7849c5ec01d3e8035fb0e67de4b464839"}, + {file = "pyzstd-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4fbd647864341f3c174c4a6d7f20e6ea6b4be9d840fb900dc0faf0849561badc"}, + {file = "pyzstd-0.16.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ac2c15656cc6194c4fed1cb0e8159f9394d4ea1d58be755448743d2ec6c9c4"}, + {file = "pyzstd-0.16.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b239fb9a20c1be3374b9a2bd183ba624fd22ad7a3f67738c0d80cda68b4ae1d3"}, + {file = "pyzstd-0.16.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc52400412cdae2635e0978b8d6bcc0028cc638fdab2fd301f6d157675d26896"}, + {file = "pyzstd-0.16.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b766a6aeb8dbb6c46e622e7a1aebfa9ab03838528273796941005a5ce7257b1"}, + {file = "pyzstd-0.16.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:abd4b8676052f9d59579242bf3cfe5fd02532b6a9a93ab7737c118ae3b8509dc"}, + {file = "pyzstd-0.16.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1c6c0a677aac7c0e3d2d2605d4d68ffa9893fdeeb2e071040eb7c8750969d463"}, + {file = "pyzstd-0.16.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:15f9c2d612e7e2023d68d321d1b479846751f792af89141931d44e82ae391394"}, + {file = "pyzstd-0.16.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:11740bff847aad23beef4085a1bb767d101895881fe891f0a911aa27d43c372c"}, + {file = "pyzstd-0.16.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:b9067483ebe860e4130a03ee665b3d7be4ec1608b208e645d5e7eb3492379464"}, + {file = "pyzstd-0.16.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:988f0ba19b14c2fe0afefc444ac1edfb2f497b7d7c3212b2f587504cc2ec804e"}, + {file = "pyzstd-0.16.2-cp39-cp39-win32.whl", hash = "sha256:8855acb1c3e3829030b9e9e9973b19e2d70f33efb14ad5c474b4d086864c959c"}, + {file = "pyzstd-0.16.2-cp39-cp39-win_amd64.whl", hash = "sha256:018e88378df5e76f5e1d8cf4416576603b6bc4a103cbc66bb593eaac54c758de"}, + {file = "pyzstd-0.16.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4b631117b97a42ff6dfd0ffc885a92fff462d7c34766b28383c57b996f863338"}, + {file = "pyzstd-0.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:56493a3fbe1b651a02102dd0902b0aa2377a732ff3544fb6fb3f114ca18db52f"}, + {file = "pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1eae9bdba4a1e5d3181331f403114ff5b8ce0f4b569f48eba2b9beb2deef1e4"}, + {file = "pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1be6972391c8aeecc7e61feb96ffc8e77a401bcba6ed994e7171330c45a1948"}, + {file = "pyzstd-0.16.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:761439d687e3a5687c2ff5c6a1190e1601362a4a3e8c6c82ff89719d51d73e19"}, + {file = "pyzstd-0.16.2-pp310-pypy310_pp73-win_amd64.whl", 
hash = "sha256:f5fbdb8cf31b60b2dc586fecb9b73e2f172c21a0b320ed275f7b8d8a866d9003"}, + {file = "pyzstd-0.16.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:183f26e34f9becf0f2db38be9c0bfb136753d228bcb47c06c69175901bea7776"}, + {file = "pyzstd-0.16.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:88318b64b5205a67748148d6d244097fa6cf61fcea02ad3435511b9e7155ae16"}, + {file = "pyzstd-0.16.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:73142aa2571b6480136a1865ebda8257e09eabbc8bcd54b222202f6fa4febe1e"}, + {file = "pyzstd-0.16.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d3f8877c29a97f1b1bba16f3d3ab01ad10ad3da7bad317aecf36aaf8848b37c"}, + {file = "pyzstd-0.16.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1f25754562473ac7de856b8331ebd5964f5d85601045627a5f0bb0e4e899990"}, + {file = "pyzstd-0.16.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6ce17e84310080c55c02827ad9bb17893c00a845c8386a328b346f814aabd2c1"}, + {file = "pyzstd-0.16.2.tar.gz", hash = "sha256:179c1a2ea1565abf09c5f2fd72f9ce7c54b2764cf7369e05c0bfd8f1f67f63d2"}, +] + +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." 
+optional = false +python-versions = ">=3.8" +groups = ["tutorials"] +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "texttable" +version = "1.7.0" +description = "module to create simple ASCII tables" +optional = false +python-versions = "*" +groups = ["tutorials"] +files = [ + {file = "texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917"}, + {file = "texttable-1.7.0.tar.gz", hash = "sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638"}, +] + [[package]] name = "tomli" version = "2.2.1" @@ -354,7 +1178,25 @@ files = [ {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] +[[package]] +name = "urllib3" +version = "2.3.0" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.9" +groups = ["tutorials"] +files = [ + {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, + {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + [metadata] lock-version = "2.1" python-versions = ">=3.9 <3.13" -content-hash = "52c129fee3e94e69edf727f219bc7582ddbfcedf6c43547a7f67a876051bd7c4" +content-hash = "33ae7f96a3999d6822af7778f9b7878355d811534a4b5fec14d51ec29aa8dce2" diff --git a/python/pyproject.toml b/python/pyproject.toml index 8c0c1ba05..36097e2c9 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -34,6 +34,11 @@ black = "^25.1.0" flake8 = "^7.1.1" isort = "^6.0.0" +[tool.poetry.group.tutorials.dependencies] +py7zr = "^0.22.0" +requests = "^2.32.3" +click = "^8.1.8" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From c0d6d7b58175a04d08f81de7d574979ea2af4610 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 10:57:04 -0800 Subject: [PATCH 44/53] Make motif.py execute in whole again --- python/graphframes/tutorials/motif.py | 52 ++++++++++++++------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/python/graphframes/tutorials/motif.py b/python/graphframes/tutorials/motif.py index 4a2189c56..2f5eb030c 100644 --- a/python/graphframes/tutorials/motif.py +++ b/python/graphframes/tutorials/motif.py @@ -16,7 +16,8 @@ spark: SparkSession = ( SparkSession.builder.appName("Stack Overflow Motif Analysis") # Lets the Id:(Stack Overflow int) and id:(GraphFrames ULID) coexist - .config("spark.sql.caseSensitive", True).getOrCreate() + .config("spark.sql.caseSensitive", True) + .getOrCreate() ) sc: SparkContext = 
spark.sparkContext sc.setCheckpointDir("/tmp/graphframes-checkpoints") @@ -25,8 +26,9 @@ STACKEXCHANGE_SITE = "stats.meta.stackexchange.com" BASE_PATH = f"python/graphframes/tutorials/data/{STACKEXCHANGE_SITE}" + # -# Load the nodes and edges from disk, repartition, checkpoint [plan got long for some reason] and cache. +# Load the nodes and edges from disk, repartition, checkpoint [plan got long for some reason] and cache. # # We created these in stackexchange.py from Stack Exchange data dump XML files @@ -45,7 +47,8 @@ # What kind of nodes we do we have to work with? node_counts = ( - nodes_df.select("id", F.col("Type").alias("Node Type")) + nodes_df + .select("id", F.col("Type").alias("Node Type")) .groupBy("Node Type") .count() .orderBy(F.col("count").desc()) @@ -56,7 +59,8 @@ # What kind of edges do we have to work with? edge_counts = ( - edges_df.select("src", "dst", F.col("relationship").alias("Edge Type")) + edges_df + .select("src", "dst", F.col("relationship").alias("Edge Type")) .groupBy("Edge Type") .count() .orderBy(F.col("count").desc()) @@ -65,7 +69,7 @@ ) edge_counts.show() -g = GraphFrame(nodes_df, edges_df) +g = GraphFrame(nodes_df, edges_df) g.vertices.show(10) print(f"Node columns: {g.vertices.columns}") @@ -166,28 +170,25 @@ ) graphlet_count_df.show() -graphlet_count_df.orderBy( - [ - "A_Type", - "(a)-[e1]->(b)", - "B_Type", - "(b)-[e2]->(c)", - "C_Type", - "(d)-[e3]->(c)", - "D_Type", - ], - ascending=False, -).show(104) +graphlet_count_df.orderBy([ + "A_Type", + "(a)-[e1]->(b)", + "B_Type", + "(b)-[e2]->(c)", + "C_Type", + "(d)-[e3]->(c)", + "D_Type", +], ascending=False).show(104) # A user answers an answer that answers a question that links to an answer. 
linked_vote_paths = paths.filter( - (F.col("a.Type") == "Vote") - & (F.col("e1.relationship") == "CastFor") - & (F.col("b.Type") == "Question") - & (F.col("e2.relationship") == "Links") - & (F.col("c.Type") == "Question") - & (F.col("e3.relationship") == "CastFor") - & (F.col("d.Type") == "Vote") + (F.col("a.Type") == "Vote") & + (F.col("e1.relationship") == "CastFor") & + (F.col("b.Type") == "Question") & + (F.col("e2.relationship") == "Links") & + (F.col("c.Type") == "Question") & + (F.col("e3.relationship") == "CastFor") & + (F.col("d.Type") == "Vote") ) # Sanity check the count - it should match the table above @@ -197,7 +198,8 @@ c_vote_counts = linked_vote_paths.select("c", "d").distinct().groupBy("c").count() linked_vote_counts = ( - linked_vote_paths.filter((F.col("a.VoteTypeId") == 2) & (F.col("d.VoteTypeId") == 2)) + linked_vote_paths + .filter((F.col("a.VoteTypeId") == 2) & (F.col("d.VoteTypeId") == 2)) .select("b", "c") .join(b_vote_counts, on="b", how="inner") .withColumnRenamed("count", "b_count") From 5bb4c26b101b524193076e2870a5f5894e0a9c16 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 11:55:50 -0800 Subject: [PATCH 45/53] Minor isort format and cleanup of download.py --- python/graphframes/tutorials/download.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/graphframes/tutorials/download.py b/python/graphframes/tutorials/download.py index 154d84c14..e81eff8b9 100755 --- a/python/graphframes/tutorials/download.py +++ b/python/graphframes/tutorials/download.py @@ -1,14 +1,21 @@ #!/usr/bin/env python +"""Download and decompress the Stack Exchange data dump from the Internet Archive.""" + import os + import click -import requests import py7zr +import requests # type: ignore @click.command() @click.argument("subdomain") -@click.option("--data-dir", default="python/graphframes/tutorials/data", help="Directory to store downloaded files") +@click.option( + "--data-dir", + 
default="python/graphframes/tutorials/data", + help="Directory to store downloaded files", +) @click.option( "--extract/--no-extract", default=True, help="Whether to extract the archive after download" ) From 99e6a4d14e6eb7cdc2c001ebefc1c3312ff43ced Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 11:56:13 -0800 Subject: [PATCH 46/53] Minor isort format and cleanup of utils.py --- python/graphframes/tutorials/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/graphframes/tutorials/utils.py b/python/graphframes/tutorials/utils.py index 54ef40f8b..46db14d96 100644 --- a/python/graphframes/tutorials/utils.py +++ b/python/graphframes/tutorials/utils.py @@ -1,7 +1,10 @@ +"""Utilities for Network Moitif Finding Tutorial""" + from pyspark.sql import DataFrame -from graphframes import GraphFrame from pyspark.sql import functions as F +from graphframes import GraphFrame + def three_edge_count(paths: DataFrame) -> DataFrame: """three_edge_count View the counts of the different types of 3-node graphlets in the graph. From 662e197960a424c1f58c151b663c46d9d63da6be Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 11:57:40 -0800 Subject: [PATCH 47/53] Removed case sensitivity from the script - that was confusing people who just pasted or tried to run the code without a new SparkSession. 
--- python/graphframes/tutorials/stackexchange.py | 48 ++++++++++++------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/python/graphframes/tutorials/stackexchange.py b/python/graphframes/tutorials/stackexchange.py index c52f323bb..5e029746e 100644 --- a/python/graphframes/tutorials/stackexchange.py +++ b/python/graphframes/tutorials/stackexchange.py @@ -1,4 +1,4 @@ -# Build a Graph out of the Stack Exchange Data Dump XML files +"""Build a Graph out of the Stack Exchange Data Dump XML files.""" # # Interactive Usage: pyspark --packages com.databricks:spark-xml_2.12:0.18.0 @@ -47,11 +47,9 @@ def split_tags(tags: str) -> List[str]: # Initialize a SparkSession with case sensitivity # -spark: SparkSession = ( - SparkSession.builder.appName("Stack Exchange Graph Builder") - # Lets the Id:(Stack Overflow int) and id:(GraphFrames UUID) coexist - .config("spark.sql.caseSensitive", True).getOrCreate() -) +spark: SparkSession = SparkSession.builder.appName("Stack Exchange Graph Builder").getOrCreate() +sc = spark.sparkContext +sc.setCheckpointDir("/tmp/graphframes-checkpoints") print("Loading data for stats.meta.stackexchange.com ...") @@ -296,12 +294,23 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] ) print(f"Total distinct nodes: {nodes_df.count():,}") -# Now add a unique ID field +# Now add a unique lowercase 'id' field - standard for GraphFrames - moving the original... +# Stack Exchange Id to StackId +nodes_df = nodes_df.withColumnRenamed("Id", "StackId").drop("Id") + +# Update the column list... +if "Id" in all_column_names: + all_column_names.remove("Id") +all_column_names += ["StackId"] +all_column_names = sorted(all_column_names) + +# Add the UUID 'id' field for GraphFrames. 
It will go in edges as 'src' and 'dst' nodes_df = nodes_df.withColumn("id", F.expr("uuid()")).select("id", *all_column_names) # Now create posts - combined questions and answers for things that can apply to them both posts_df = questions_df.unionByName(answers_df).cache() + # # Store the nodes to disk, reload and cache # @@ -361,12 +370,12 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] src_vote_df: DataFrame = votes_df.select( F.col("id").alias("src"), - F.col("Id").alias("VoteId"), + F.col("StackId").alias("VoteId"), # Everything has all the fields - should build from base records but need UUIDs F.col("PostId").alias("VotePostId"), ) cast_for_edge_df: DataFrame = src_vote_df.join( - posts_df, on=src_vote_df.VotePostId == posts_df.Id, how="inner" + posts_df, on=src_vote_df.VotePostId == posts_df.StackId, how="inner" ).select( # 'src' comes from the votes' 'id' "src", @@ -378,6 +387,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] print(f"Total CastFor edges: {cast_for_edge_df.count():,}") print(f"Percentage of linked votes: {cast_for_edge_df.count() / votes_df.count():.2%}\n") + # # Create a [User]--Asks-->[Question] edge # @@ -388,7 +398,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] F.lit("Asks").alias("relationship"), ) user_asks_edges_df: DataFrame = questions_asked_df.join( - users_df, on=questions_asked_df.QuestionUserId == users_df.Id, how="inner" + users_df, on=questions_asked_df.QuestionUserId == users_df.StackId, how="inner" ).select( # 'src' comes from the users' 'id' F.col("id").alias("src"), @@ -402,6 +412,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] f"Percentage of asked questions linked to users: {user_asks_edges_df.count() / questions_df.count():.2%}\n" ) + # # Create a [User]--Posts-->[Answer] edge. 
# @@ -412,7 +423,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] F.lit("Posts").alias("relationship"), ) user_answers_edges_df = user_answers_df.join( - users_df, on=user_answers_df.AnswerUserId == users_df.Id, how="inner" + users_df, on=user_answers_df.AnswerUserId == users_df.StackId, how="inner" ).select( # 'src' comes from the users' 'id' F.col("id").alias("src"), @@ -426,17 +437,18 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] f"Percentage of answers linked to users: {user_answers_edges_df.count() / answers_df.count():.2%}\n" ) + # # Create a [Answer]--Answers-->[Question] edge # src_answers_df: DataFrame = answers_df.select( F.col("id").alias("src"), - F.col("Id").alias("AnswerId"), + F.col("StackId").alias("AnswerId"), F.col("ParentId").alias("AnswerParentId"), ) question_answers_edges_df: DataFrame = src_answers_df.join( - posts_df, on=src_answers_df.AnswerParentId == questions_df.Id, how="inner" + posts_df, on=src_answers_df.AnswerParentId == questions_df.StackId, how="inner" ).select( # 'src' comes from the answers' 'id' "src", @@ -450,6 +462,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] f"Percentage of linked answers: {question_answers_edges_df.count() / answers_df.count():.2%}\n" ) + # # Create a [Tag]--Tags-->[Post] edge... 
remember a Post is a Question or Answer # @@ -472,6 +485,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] print(f"Total Tags edges: {tags_edge_df.count():,}") print(f"Percentage of linked tags: {tags_edge_df.count() / posts_df.count():.2%}\n") + # # Create a [User]--Earns-->[Badge] edge # @@ -482,7 +496,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] F.lit("Earns").alias("relationship"), ) earns_edges_df = earns_edges_df.join( - users_df, on=earns_edges_df.BadgeUserId == users_df.Id, how="inner" + users_df, on=earns_edges_df.BadgeUserId == users_df.StackId, how="inner" ).select( # 'src' comes from the users' 'id' F.col("id").alias("src"), @@ -494,6 +508,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] print(f"Total Earns edges: {earns_edges_df.count():,}") print(f"Percentage of earned badges: {earns_edges_df.count() / badges_df.count():.2%}\n") + # # Create a [Post]--Links-->[Post] edge... remember a Post is a Question or Answer # Also a [Post]--Duplicates-->[Post] edge... 
remember a Post is a Question or Answer @@ -505,7 +520,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] "LinkType", ) links_src_edge_df: DataFrame = trim_links_df.join( - posts_df.drop("LinkType"), on=trim_links_df.SrcPostId == posts_df.Id, how="inner" + posts_df.drop("LinkType"), on=trim_links_df.SrcPostId == posts_df.StackId, how="inner" ).select( # 'dst' comes from the posts' 'id' F.col("id").alias("src"), @@ -513,7 +528,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] "LinkType", ) raw_links_edge_df = links_src_edge_df.join( - posts_df.drop("LinkType"), on=links_src_edge_df.DstPostId == posts_df.Id, how="inner" + posts_df.drop("LinkType"), on=links_src_edge_df.DstPostId == posts_df.StackId, how="inner" ).select( "src", # 'src' comes from the posts' 'id' @@ -557,6 +572,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] "count", F.format_number(F.col("count"), 0) ).show() + # +------------+------+ # |relationship| count| # +------------+------+ From beaa35d60be2a8635e3f2743b3543631875cadcb Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Mon, 17 Feb 2025 11:58:29 -0800 Subject: [PATCH 48/53] motif.py now matches tutorial code, runs and handles case insensitivity. --- python/graphframes/tutorials/motif.py | 57 ++++++++++++--------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/python/graphframes/tutorials/motif.py b/python/graphframes/tutorials/motif.py index 2f5eb030c..a4a82953a 100644 --- a/python/graphframes/tutorials/motif.py +++ b/python/graphframes/tutorials/motif.py @@ -1,4 +1,4 @@ -# Demonstrate GraphFrames network motif finding capabilities +"""Demonstrate GraphFrames network motif finding capabilities. 
Code from the Network Motif Finding Tutorial.""" # # Interactive Usage: pyspark --packages graphframes:graphframes:0.8.4-spark3.5-s_2.12 @@ -13,12 +13,7 @@ from graphframes import GraphFrame # Initialize a SparkSession -spark: SparkSession = ( - SparkSession.builder.appName("Stack Overflow Motif Analysis") - # Lets the Id:(Stack Overflow int) and id:(GraphFrames ULID) coexist - .config("spark.sql.caseSensitive", True) - .getOrCreate() -) +spark: SparkSession = SparkSession.builder.appName("Stack Overflow Motif Analysis").getOrCreate() sc: SparkContext = spark.sparkContext sc.setCheckpointDir("/tmp/graphframes-checkpoints") @@ -28,7 +23,7 @@ # -# Load the nodes and edges from disk, repartition, checkpoint [plan got long for some reason] and cache. +# Load the nodes and edges from disk, repartition, checkpoint [plan got long for some reason] and cache. # # We created these in stackexchange.py from Stack Exchange data dump XML files @@ -47,8 +42,7 @@ # What kind of nodes we do we have to work with? node_counts = ( - nodes_df - .select("id", F.col("Type").alias("Node Type")) + nodes_df.select("id", F.col("Type").alias("Node Type")) .groupBy("Node Type") .count() .orderBy(F.col("count").desc()) @@ -59,8 +53,7 @@ # What kind of edges do we have to work with? 
edge_counts = ( - edges_df - .select("src", "dst", F.col("relationship").alias("Edge Type")) + edges_df.select("src", "dst", F.col("relationship").alias("Edge Type")) .groupBy("Edge Type") .count() .orderBy(F.col("count").desc()) @@ -69,7 +62,7 @@ ) edge_counts.show() -g = GraphFrame(nodes_df, edges_df) +g = GraphFrame(nodes_df, edges_df) g.vertices.show(10) print(f"Node columns: {g.vertices.columns}") @@ -170,25 +163,28 @@ ) graphlet_count_df.show() -graphlet_count_df.orderBy([ - "A_Type", - "(a)-[e1]->(b)", - "B_Type", - "(b)-[e2]->(c)", - "C_Type", - "(d)-[e3]->(c)", - "D_Type", -], ascending=False).show(104) +graphlet_count_df.orderBy( + [ + "A_Type", + "(a)-[e1]->(b)", + "B_Type", + "(b)-[e2]->(c)", + "C_Type", + "(d)-[e3]->(c)", + "D_Type", + ], + ascending=False, +).show(104) # A user answers an answer that answers a question that links to an answer. linked_vote_paths = paths.filter( - (F.col("a.Type") == "Vote") & - (F.col("e1.relationship") == "CastFor") & - (F.col("b.Type") == "Question") & - (F.col("e2.relationship") == "Links") & - (F.col("c.Type") == "Question") & - (F.col("e3.relationship") == "CastFor") & - (F.col("d.Type") == "Vote") + (F.col("a.Type") == "Vote") + & (F.col("e1.relationship") == "CastFor") + & (F.col("b.Type") == "Question") + & (F.col("e2.relationship") == "Links") + & (F.col("c.Type") == "Question") + & (F.col("e3.relationship") == "CastFor") + & (F.col("d.Type") == "Vote") ) # Sanity check the count - it should match the table above @@ -198,8 +194,7 @@ c_vote_counts = linked_vote_paths.select("c", "d").distinct().groupBy("c").count() linked_vote_counts = ( - linked_vote_paths - .filter((F.col("a.VoteTypeId") == 2) & (F.col("d.VoteTypeId") == 2)) + linked_vote_paths.filter((F.col("a.VoteTypeId") == 2) & (F.col("d.VoteTypeId") == 2)) .select("b", "c") .join(b_vote_counts, on="b", how="inner") .withColumnRenamed("count", "b_count") From ef19784b9dd1befdab3d422fadf660139291f9b8 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Fri, 
21 Feb 2025 11:11:31 +0100 Subject: [PATCH 49/53] Set up a 'graphframes stackexchange' command. --- python/graphframes/console.py | 19 +++++++++++++++++++ python/graphframes/tutorials/download.py | 4 ++-- python/pyproject.toml | 6 ++++++ 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 python/graphframes/console.py diff --git a/python/graphframes/console.py b/python/graphframes/console.py new file mode 100644 index 000000000..d2b38d28b --- /dev/null +++ b/python/graphframes/console.py @@ -0,0 +1,19 @@ +import click +from graphframes.tutorials import download + + +@click.group() +def cli(): + """GraphFrames CLI: a collection of commands for graphframes.""" + pass + + +cli.add_command(download.stackexchange) + + +def main(): + cli() + + +if __name__ == "__main__": + main() diff --git a/python/graphframes/tutorials/download.py b/python/graphframes/tutorials/download.py index e81eff8b9..049b1fa15 100755 --- a/python/graphframes/tutorials/download.py +++ b/python/graphframes/tutorials/download.py @@ -19,7 +19,7 @@ @click.option( "--extract/--no-extract", default=True, help="Whether to extract the archive after download" ) -def download_stackexchange(subdomain: str, data_dir: str, extract: bool) -> None: +def stackexchange(subdomain: str, data_dir: str, extract: bool) -> None: """Download Stack Exchange archive for a given SUBDOMAIN. 
Example: python/graphframes/tutorials/download.py stats.meta @@ -68,4 +68,4 @@ def download_stackexchange(subdomain: str, data_dir: str, extract: bool) -> None if __name__ == "__main__": - download_stackexchange() + stackexchange() diff --git a/python/pyproject.toml b/python/pyproject.toml index 36097e2c9..819d2bbdd 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -39,6 +39,9 @@ py7zr = "^0.22.0" requests = "^2.32.3" click = "^8.1.8" +[tool.poetry.scripts] +graphframes = "graphframes.console:main" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" @@ -48,6 +51,9 @@ line-length = 100 target-version = ["py39"] include = ["graphframes"] +[tool.flake8] +max-line-length = 100 + [tool.isort] profile = "black" src_paths = ["graphframes"] From 4400cb4335a9237363ee033c40e44cbb7b3041c0 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Fri, 21 Feb 2025 11:13:29 +0100 Subject: [PATCH 50/53] Make graphframes.tutorials.motif use a unique checkpoint dir, set directly via SparkSession.sparkContext. 
Use click.echo instead of print --- python/graphframes/tutorials/motif.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/graphframes/tutorials/motif.py b/python/graphframes/tutorials/motif.py index a4a82953a..59691946a 100644 --- a/python/graphframes/tutorials/motif.py +++ b/python/graphframes/tutorials/motif.py @@ -6,16 +6,15 @@ # Batch Usage: spark-submit --packages graphframes:graphframes:0.8.4-spark3.5-s_2.12 python/graphframes/tutorials/motif.py # +import click import pyspark.sql.functions as F -from pyspark import SparkContext from pyspark.sql import DataFrame, SparkSession from graphframes import GraphFrame # Initialize a SparkSession spark: SparkSession = SparkSession.builder.appName("Stack Overflow Motif Analysis").getOrCreate() -sc: SparkContext = spark.sparkContext -sc.setCheckpointDir("/tmp/graphframes-checkpoints") +spark.sparkContext.setCheckpointDir("/tmp/graphframes-checkpoints/motif") # Change me if you download a different stackexchange site STACKEXCHANGE_SITE = "stats.meta.stackexchange.com" @@ -65,7 +64,7 @@ g = GraphFrame(nodes_df, edges_df) g.vertices.show(10) -print(f"Node columns: {g.vertices.columns}") +click.echo(f"Node columns: {g.vertices.columns}") g.edges.sample(0.0001).show(10) @@ -82,7 +81,7 @@ assert ( edge_count == valid_edge_count ), f"Edge count {edge_count} != valid edge count {valid_edge_count}" -print(f"Edge count: {edge_count:,} == Valid edge count: {valid_edge_count:,}") +click.echo(f"Edge count: {edge_count:,} == Valid edge count: {valid_edge_count:,}") # G4: Continuous Triangles paths = g.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)") From d549c566c7a500d4b16319851e88f8ffbd4df61e Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Fri, 21 Feb 2025 11:23:19 +0100 Subject: [PATCH 51/53] Use spark.sparkContext.setCheckpointDir directly instead of instantiating a SparkContext. 
print-->click.echo --- python/graphframes/tutorials/stackexchange.py | 63 +++++++++---------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/python/graphframes/tutorials/stackexchange.py b/python/graphframes/tutorials/stackexchange.py index 5e029746e..72185c446 100644 --- a/python/graphframes/tutorials/stackexchange.py +++ b/python/graphframes/tutorials/stackexchange.py @@ -5,10 +5,10 @@ # # Batch Usage: spark-submit --packages com.databricks:spark-xml_2.12:0.18.0 python/graphframes/tutorials/stackexchange.py # - import re from typing import List, Tuple +import click import pyspark.sql.functions as F import pyspark.sql.types as T from pyspark.sql import DataFrame, SparkSession @@ -48,10 +48,9 @@ def split_tags(tags: str) -> List[str]: # spark: SparkSession = SparkSession.builder.appName("Stack Exchange Graph Builder").getOrCreate() -sc = spark.sparkContext -sc.setCheckpointDir("/tmp/graphframes-checkpoints") +spark.sparkContext.setCheckpointDir("/tmp/graphframes-checkpoints/stackexchange") -print("Loading data for stats.meta.stackexchange.com ...") +click.echo("Loading data for stats.meta.stackexchange.com ...") # @@ -63,7 +62,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="posts") .load(f"{BASE_PATH}/Posts.xml") ) -print(f"\nTotal Posts: {posts_df.count():,}") +click.echo(f"\nTotal Posts: {posts_df.count():,}") # Remove the _ prefix from field names posts_df = remove_prefix(posts_df) @@ -85,14 +84,14 @@ def split_tags(tags: str) -> List[str]: # Do the questions look ok? 
Questions have NO parent ID and DO have a Title questions_df: DataFrame = posts_df.filter(posts_df.ParentId.isNull()) questions_df = questions_df.withColumn("Type", F.lit("Question")).cache() -print(f"\nTotal questions: {questions_df.count():,}\n") +click.echo(f"\nTotal questions: {questions_df.count():,}\n") questions_df.select("ParentId", "Title", "Body").show(10) # Answers DO have a ParentId parent post and no Title answers_df: DataFrame = posts_df.filter(posts_df.ParentId.isNotNull()) answers_df = answers_df.withColumn("Type", F.lit("Answer")).cache() -print(f"\nTotal answers: {answers_df.count():,}\n") +click.echo(f"\nTotal answers: {answers_df.count():,}\n") answers_df.select("ParentId", "Title", "Body").show(10) @@ -107,7 +106,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="postlinks") .load(f"{BASE_PATH}/PostLinks.xml") ) -print(f"Total PostLinks: {post_links_df.count():,}") +click.echo(f"Total PostLinks: {post_links_df.count():,}") # Remove the _ prefix from field names post_links_df = ( @@ -132,7 +131,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="posthistory") .load(f"{BASE_PATH}/PostHistory.xml") ) -print(f"Total PostHistory: {post_history_df.count():,}") +click.echo(f"Total PostHistory: {post_history_df.count():,}") # Remove the _ prefix from field names post_history_df = remove_prefix(post_history_df).withColumn("Type", F.lit("PostHistory")) @@ -148,7 +147,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="comments") .load(f"{BASE_PATH}/Comments.xml") ) -print(f"Total Comments: {comments_df.count():,}") +click.echo(f"Total Comments: {comments_df.count():,}") # Remove the _ prefix from field names comments_df = remove_prefix(comments_df).withColumn("Type", F.lit("Comment")) @@ -164,7 +163,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="users") .load(f"{BASE_PATH}/Users.xml") ) -print(f"Total Users: {users_df.count():,}") +click.echo(f"Total Users: {users_df.count():,}") # Remove the _ prefix 
from field names users_df = remove_prefix(users_df).withColumn("Type", F.lit("User")) @@ -180,7 +179,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="votes") .load(f"{BASE_PATH}/Votes.xml") ) -print(f"Total Votes: {votes_df.count():,}") +click.echo(f"Total Votes: {votes_df.count():,}") # Remove the _ prefix from field names votes_df = remove_prefix(votes_df).withColumn("Type", F.lit("Vote")) @@ -213,7 +212,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="tags") .load(f"{BASE_PATH}/Tags.xml") ) -print(f"Total Tags: {tags_df.count():,}") +click.echo(f"Total Tags: {tags_df.count():,}") # Remove the _ prefix from field names tags_df = remove_prefix(tags_df).withColumn("Type", F.lit("Tag")) @@ -229,7 +228,7 @@ def split_tags(tags: str) -> List[str]: .options(rootTag="badges") .load(f"{BASE_PATH}/Badges.xml") ) -print(f"Total Badges: {badges_df.count():,}\n") +click.echo(f"Total Badges: {badges_df.count():,}\n") # Remove the _ prefix from field names badges_df = remove_prefix(badges_df).withColumn("Type", F.lit("Badge")) @@ -292,7 +291,7 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] .unionByName(badges_df) .distinct() ) -print(f"Total distinct nodes: {nodes_df.count():,}") +click.echo(f"Total distinct nodes: {nodes_df.count():,}") # Now add a unique lowercase 'id' field - standard for GraphFrames - moving the original... 
# Stack Exchange Id to StackId @@ -384,8 +383,8 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] # All edges have a 'relationship' field F.lit("CastFor").alias("relationship"), ) -print(f"Total CastFor edges: {cast_for_edge_df.count():,}") -print(f"Percentage of linked votes: {cast_for_edge_df.count() / votes_df.count():.2%}\n") +click.echo(f"Total CastFor edges: {cast_for_edge_df.count():,}") +click.echo(f"Percentage of linked votes: {cast_for_edge_df.count() / votes_df.count():.2%}\n") # @@ -407,8 +406,8 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] # All edges have a 'relationship' field "relationship", ) -print(f"Total Asks edges: {user_asks_edges_df.count():,}") -print( +click.echo(f"Total Asks edges: {user_asks_edges_df.count():,}") +click.echo( f"Percentage of asked questions linked to users: {user_asks_edges_df.count() / questions_df.count():.2%}\n" ) @@ -432,8 +431,8 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] # All edges have a 'relationship' field "relationship", ) -print(f"Total User Answers edges: {user_answers_edges_df.count():,}") -print( +click.echo(f"Total User Answers edges: {user_answers_edges_df.count():,}") +click.echo( f"Percentage of answers linked to users: {user_answers_edges_df.count() / answers_df.count():.2%}\n" ) @@ -457,8 +456,8 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] # All edges have a 'relationship' field F.lit("Answers").alias("relationship"), ) -print(f"Total Posts Answers edges: {question_answers_edges_df.count():,}") -print( +click.echo(f"Total Posts Answers edges: {question_answers_edges_df.count():,}") +click.echo( f"Percentage of linked answers: {question_answers_edges_df.count() / answers_df.count():.2%}\n" ) @@ -482,8 +481,8 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] # All edges have a 'relationship' field 
F.lit("Tags").alias("relationship"), ) -print(f"Total Tags edges: {tags_edge_df.count():,}") -print(f"Percentage of linked tags: {tags_edge_df.count() / posts_df.count():.2%}\n") +click.echo(f"Total Tags edges: {tags_edge_df.count():,}") +click.echo(f"Percentage of linked tags: {tags_edge_df.count() / posts_df.count():.2%}\n") # @@ -505,8 +504,8 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] # All edges have a 'relationship' field "relationship", ) -print(f"Total Earns edges: {earns_edges_df.count():,}") -print(f"Percentage of earned badges: {earns_edges_df.count() / badges_df.count():.2%}\n") +click.echo(f"Total Earns edges: {earns_edges_df.count():,}") +click.echo(f"Percentage of earned badges: {earns_edges_df.count() / badges_df.count():.2%}\n") # @@ -543,16 +542,16 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] .withColumn("relationship", F.lit("Duplicates")) .select("src", "dst", "relationship") ) -print(f"Total Duplicates edges: {duplicates_edge_df.count():,}") -print(f"Percentage of duplicate posts: {duplicates_edge_df.count() / post_links_df.count():.2%}\n") +click.echo(f"Total Duplicates edges: {duplicates_edge_df.count():,}") +click.echo(f"Percentage of duplicate posts: {duplicates_edge_df.count() / post_links_df.count():.2%}\n") linked_edge_df = ( raw_links_edge_df.filter(F.col("LinkType") == "Linked") .withColumn("relationship", F.lit("Links")) .select("src", "dst", "relationship") ) -print(f"Total Links edges: {linked_edge_df.count():,}") -print(f"Percentage of linked posts: {linked_edge_df.count() / post_links_df.count():.2%}\n") +click.echo(f"Total Links edges: {linked_edge_df.count():,}") +click.echo(f"Percentage of linked posts: {linked_edge_df.count() / post_links_df.count():.2%}\n") # @@ -592,4 +591,4 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] relationships_df.write.mode("overwrite").parquet(EDGES_PATH) spark.stop() -print("Spark 
stopped.") +click.echo("Spark stopped.") From b97063677aca43d918ad775469a58cff39eefb3a Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Fri, 21 Feb 2025 11:49:44 +0100 Subject: [PATCH 52/53] Using 'from __future__ import annotations' instead of List and Tuple --- python/graphframes/tutorials/stackexchange.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/graphframes/tutorials/stackexchange.py b/python/graphframes/tutorials/stackexchange.py index 72185c446..02ebb2bb5 100644 --- a/python/graphframes/tutorials/stackexchange.py +++ b/python/graphframes/tutorials/stackexchange.py @@ -5,8 +5,9 @@ # # Batch Usage: spark-submit --packages com.databricks:spark-xml_2.12:0.18.0 python/graphframes/tutorials/stackexchange.py # +from __future__ import annotations + import re -from typing import List, Tuple import click import pyspark.sql.functions as F @@ -36,7 +37,7 @@ def remove_prefix(df: DataFrame) -> DataFrame: @F.udf(returnType=T.ArrayType(T.StringType())) -def split_tags(tags: str) -> List[str]: +def split_tags(tags: str) -> list[str]: if not tags: return [] # Remove < and > and split into array @@ -238,7 +239,7 @@ def split_tags(tags: str) -> List[str]: # Form the nodes from the UNION of posts, users, votes and their combined schemas # -all_cols: List[Tuple[str, T.StructField]] = list( +all_cols: list[tuple[str, T.StructField]] = list( set( list(zip(answers_df.columns, answers_df.schema)) + list(zip(questions_df.columns, questions_df.schema)) @@ -250,10 +251,10 @@ def split_tags(tags: str) -> List[str]: + list(zip(badges_df.columns, badges_df.schema)) ) ) -all_column_names: List[str] = sorted([x[0] for x in all_cols]) +all_column_names: list[str] = sorted([x[0] for x in all_cols]) -def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]]) -> DataFrame: +def add_missing_columns(df: DataFrame, all_cols: list[tuple[str, T.StructField]]) -> DataFrame: """Add any missing columns from any DataFrame among 
several we want to merge.""" for col_name, schema_field in all_cols: if col_name not in df.columns: @@ -543,7 +544,9 @@ def add_missing_columns(df: DataFrame, all_cols: List[Tuple[str, T.StructField]] .select("src", "dst", "relationship") ) click.echo(f"Total Duplicates edges: {duplicates_edge_df.count():,}") -click.echo(f"Percentage of duplicate posts: {duplicates_edge_df.count() / post_links_df.count():.2%}\n") +click.echo( + f"Percentage of duplicate posts: {duplicates_edge_df.count() / post_links_df.count():.2%}\n" +) linked_edge_df = ( raw_links_edge_df.filter(F.col("LinkType") == "Linked") From 378894125e6baff2e0a6deab0635224e05f3ad26 Mon Sep 17 00:00:00 2001 From: Russell Jurney Date: Fri, 21 Feb 2025 12:11:18 +0100 Subject: [PATCH 53/53] Now retry three times if we can't connect for any reason in 'graphframes stackexchange' command. --- python/graphframes/tutorials/download.py | 25 ++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/python/graphframes/tutorials/download.py b/python/graphframes/tutorials/download.py index 049b1fa15..4eadfa647 100755 --- a/python/graphframes/tutorials/download.py +++ b/python/graphframes/tutorials/download.py @@ -36,13 +36,30 @@ def stackexchange(subdomain: str, data_dir: str, extract: bool) -> None: click.echo(f"Downloading archive from {archive_url}") try: - # Download the file - response = requests.get(archive_url, stream=True) - response.raise_for_status() # Raise exception for bad status codes + # Download the file with retries + max_retries = 3 + retry_count = 0 + + while retry_count < max_retries: + try: + response = requests.get(archive_url, stream=True) + response.raise_for_status() # Raise exception for bad status codes + break + except ( + requests.exceptions.RequestException, + requests.exceptions.ConnectionError, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + ) as e: + retry_count += 1 + if retry_count == max_retries: + click.echo(f"Failed to download after 
{max_retries} attempts: {e}", err=True) + raise click.Abort() + click.echo(f"Download attempt {retry_count} failed, retrying...") total_size = int(response.headers.get("content-length", 0)) - with click.progressbar(length=total_size, label="Downloading") as bar: + with click.progressbar(length=total_size, label="Downloading") as bar: # type: ignore with open(archive_path, "wb") as f: for chunk in response.iter_content(chunk_size=8192): if chunk: