From cb6d7ec687f62b930fb8e774117770937ab3aaba Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Tue, 1 Jul 2025 11:14:28 -0400 Subject: [PATCH 1/7] Try new method for shading --- build.sbt | 45 ++++++++++++++++++++-------------------- python/tests/conftest.py | 4 ++-- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/build.sbt b/build.sbt index 9787a7cf9..c53f2d49b 100644 --- a/build.sbt +++ b/build.sbt @@ -116,16 +116,6 @@ lazy val root = (project in file(".")) Compile / unmanagedSourceDirectories += (Compile / baseDirectory).value / "src" / "main" / s"scala-spark-$sparkMajorVer", - // Assembly settings - assembly / test := {}, // No tests in assembly - assemblyPackageScala / assembleArtifact := false, - assembly / assemblyMergeStrategy := { - case PathList("META-INF", xs @ _*) => MergeStrategy.discard - case x if x.endsWith("module-info.class") => MergeStrategy.discard - case x => - val oldStrategy = (assembly / assemblyMergeStrategy).value - oldStrategy(x) - }, Test / packageBin / publishArtifact := false, Test / packageDoc / publishArtifact := false, Test / packageSrc / publishArtifact := false, @@ -133,12 +123,13 @@ lazy val root = (project in file(".")) Compile / packageDoc / publishArtifact := true, Compile / packageSrc / publishArtifact := true) -lazy val connect = (project in file("graphframes-connect")) +// Dedicated project for creating the shaded JAR that doesn't get published +lazy val connectAssembly = (project in file("graphframes-connect")) .dependsOn(root) .settings( + name := s"graphframes-connect-assembly", + moduleName := s"graphframes-connect-spark${sparkMajorVer}", commonSetting, - name := s"graphframes-connect", - moduleName := s"${name.value}-spark${sparkMajorVer}", Compile / unmanagedSourceDirectories += (Compile / baseDirectory).value / "src" / "main" / s"scala-spark-$sparkMajorVer", Compile / PB.targets := Seq(PB.gens.java -> (Compile / sourceManaged).value), Compile / PB.includePaths ++= Seq(file("src/main/protobuf")), @@ -147,8 +138,8 @@ lazy val connect = (project in file("graphframes-connect")) "org.apache.spark" %% "spark-connect" % sparkVer % "provided" cross CrossVersion.for3Use2_13), // Assembly and shading + assembly / assemblyJarName := s"${moduleName.value}_${(scalaBinaryVersion).value}-${version.value}.jar", assembly / test := {}, - assemblyPackageScala / assembleArtifact := false, assembly / assemblyShadeRules := Seq( ShadeRule.rename("com.google.protobuf.**" -> protobufShadingPattern).inAll), assembly / assemblyMergeStrategy := { @@ -157,18 +148,28 @@ lazy val connect = (project in file("graphframes-connect")) case x if x.endsWith("module-info.class") => MergeStrategy.discard case x => MergeStrategy.first }, - assembly / assemblyExcludedJars := (Compile / fullClasspath).value.filter { className => - className.data - .getName() - .contains("scala-library-") || className.data - .getName() - .contains("slf4j-api-") + assembly / assemblyExcludedJars := { + val cp = (assembly / fullClasspath).value + val allowedPrefixes = Set("protobuf-java") + cp.filter { f => + !allowedPrefixes.exists(prefix => f.data.getName.startsWith(prefix)) + } }, - publish / skip := false, + publish / skip := true, Compile / packageBin := assembly.value, Test / packageBin / publishArtifact := false, Test / packageDoc / publishArtifact := false, Test / packageSrc / publishArtifact := false, - Compile / packageBin / publishArtifact := true, + Compile / packageBin / publishArtifact := false, Compile / packageDoc / publishArtifact := false, Compile / packageSrc / publishArtifact := false) + +// Publish the shaded JAR with the correct dependencies in the POM +lazy val connect = project + .dependsOn(root) + .settings( + commonSetting, + name := s"graphframes-connect", + moduleName := s"${name.value}-spark${sparkMajorVer}", + Compile / packageBin := (connectAssembly / Compile / assembly).value + ) diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 6265cc33e..9cfdc5ed5 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -36,7 +36,7 @@ def get_gf_jar_locations() -> Tuple[str, str]: core_jar: Optional[str] = None connect_jar: Optional[str] = None - for pp in core_dir.glob("graphframes-assembly-*.jar"): + for pp in core_dir.glob(f"graphframes-spark{spark_major_version}-*.jar"): assert isinstance(pp, pathlib.PosixPath) # type checking core_jar = str(pp.absolute()) @@ -45,7 +45,7 @@ def get_gf_jar_locations() -> Tuple[str, str]: f"Failed to find graphframes jar for Spark {spark_major_version} in {core_dir}" ) - for pp in connect_dir.glob("graphframes-connect-assembly-*.jar"): + for pp in connect_dir.glob(f"graphframes-connect-spark{spark_major_version}-*.jar"): assert isinstance(pp, pathlib.PosixPath) # type checking connect_jar = str(pp.absolute()) From dc9b47cf32c33a4588e8e9970c24978ef29b54db Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Tue, 1 Jul 2025 11:51:17 -0400 Subject: [PATCH 2/7] Fix tests --- python/dev/build_jar.py | 4 ++-- python/tests/conftest.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dev/build_jar.py b/python/dev/build_jar.py index d9920a7ca..3bcce30ca 100644 --- a/python/dev/build_jar.py +++ b/python/dev/build_jar.py @@ -16,9 +16,9 @@ def build(spark_versions: Sequence[str] = ["3.5.5"]): sbt_executable, f"-Dspark.version={spark_version}", "clean", - "assembly", + "package", "connect/clean", - "connect/assembly" + "connect/package" ] sbt_build = subprocess.Popen( sbt_build_command, diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 9cfdc5ed5..aeb543cbe 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -36,7 +36,7 @@ def get_gf_jar_locations() -> Tuple[str, str]: core_jar: Optional[str] = None connect_jar: Optional[str] = None - for pp in core_dir.glob(f"graphframes-spark{spark_major_version}-*.jar"): + for pp in core_dir.glob(f"graphframes-spark{spark_major_version}*.jar"): assert isinstance(pp, pathlib.PosixPath) # type checking core_jar = str(pp.absolute()) @@ -45,7 +45,7 @@ def get_gf_jar_locations() -> Tuple[str, str]: f"Failed to find graphframes jar for Spark {spark_major_version} in {core_dir}" ) - for pp in connect_dir.glob(f"graphframes-connect-spark{spark_major_version}-*.jar"): + for pp in connect_dir.glob(f"graphframes-connect-spark{spark_major_version}*.jar"): assert isinstance(pp, pathlib.PosixPath) # type checking connect_jar = str(pp.absolute()) From 702d0d6865c9ed61ad0501aab84cb3b1f7727f64 Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Tue, 1 Jul 2025 20:16:15 -0400 Subject: [PATCH 3/7] Add exportJars --- build.sbt | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sbt b/build.sbt index c53f2d49b..877d9d011 100644 --- a/build.sbt +++ b/build.sbt @@ -108,6 +108,7 @@ lazy val root = (project in file(".")) commonSetting, name := "graphframes", moduleName := s"${name.value}-spark$sparkMajorVer", + exportJars := true, // Global settings Global / concurrentRestrictions := Seq(Tags.limitAll(1)), From 43d372dbcdc3b2c0f300e1f5e27308834346eac6 Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Tue, 1 Jul 2025 20:32:31 -0400 Subject: [PATCH 4/7] Just use target dir --- src/test/scala/org/graphframes/ldbc/TestLDBCCases.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/scala/org/graphframes/ldbc/TestLDBCCases.scala b/src/test/scala/org/graphframes/ldbc/TestLDBCCases.scala index a89f0a1a7..0e4b4df1a 100644 --- a/src/test/scala/org/graphframes/ldbc/TestLDBCCases.scala +++ b/src/test/scala/org/graphframes/ldbc/TestLDBCCases.scala @@ -11,11 +11,12 @@ import org.graphframes.GraphFrameTestSparkContext import org.graphframes.SparkFunSuite import org.graphframes.examples.LDBCUtils +import java.io.File import java.nio.file._ import java.util.Properties class TestLDBCCases extends SparkFunSuite with GraphFrameTestSparkContext { - private val resourcesPath = Paths.get(getClass().getResource("/").toURI()) + private val resourcesPath = Path.of(new File("target").toURI()) private val unreachableID = 9223372036854775807L private def readUndirectedUnweighted(pathPrefix: String): GraphFrame = { From 4fec8ec8a95f938fa611dbce61a99e1723ea5f96 Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Tue, 1 Jul 2025 20:33:08 -0400 Subject: [PATCH 5/7] Add comment --- build.sbt | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sbt b/build.sbt index 877d9d011..f3903a1ce 100644 --- a/build.sbt +++ b/build.sbt @@ -108,6 +108,7 @@ lazy val root = (project in file(".")) commonSetting, name := "graphframes", moduleName := s"${name.value}-spark$sparkMajorVer", + // Export the JAR so that this can be excluded from shading in connect exportJars := true, // Global settings From b7c51cdef7299f9060fb73cf1ce93e5e41020ce4 Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Wed, 2 Jul 2025 11:22:08 -0400 Subject: [PATCH 6/7] Don't shade anything, just rename --- build.sbt | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/build.sbt b/build.sbt index f3903a1ce..0339247d0 100644 --- a/build.sbt +++ b/build.sbt @@ -144,19 +144,8 @@ lazy val connectAssembly = (project in file("graphframes-connect")) assembly / test := {}, assembly / assemblyShadeRules := Seq( ShadeRule.rename("com.google.protobuf.**" -> protobufShadingPattern).inAll), - assembly / assemblyMergeStrategy := { - case PathList("google", "protobuf", xs @ _*) => MergeStrategy.discard - case PathList("META-INF", xs @ _*) => MergeStrategy.discard - case x if x.endsWith("module-info.class") => MergeStrategy.discard - case x => MergeStrategy.first - }, - assembly / assemblyExcludedJars := { - val cp = (assembly / fullClasspath).value - val allowedPrefixes = Set("protobuf-java") - cp.filter { f => - !allowedPrefixes.exists(prefix => f.data.getName.startsWith(prefix)) - } - }, + // Don't actually shade anything, we just need to rename the protobuf packages to what's bundled with Spark + assembly / assemblyExcludedJars := (assembly / fullClasspath).value, publish / skip := true, Compile / packageBin := assembly.value, Test / packageBin / publishArtifact := false, From 687ea18174dc340fd33e4365bd91df5ab067134f Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Wed, 2 Jul 2025 11:37:50 -0400 Subject: [PATCH 7/7] Don't actually need the extra project --- build.sbt | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/build.sbt b/build.sbt index 0339247d0..925365dfa 100644 --- a/build.sbt +++ b/build.sbt @@ -125,17 +125,17 @@ lazy val root = (project in file(".")) Compile / packageDoc / publishArtifact := true, Compile / packageSrc / publishArtifact := true) -// Dedicated project for creating the shaded JAR that doesn't get published -lazy val connectAssembly = (project in file("graphframes-connect")) +lazy val connect = (project in file("graphframes-connect")) .dependsOn(root) .settings( - name := s"graphframes-connect-assembly", - moduleName := s"graphframes-connect-spark${sparkMajorVer}", + name := s"graphframes-connect", + moduleName := s"${name.value}-spark${sparkMajorVer}", commonSetting, Compile / unmanagedSourceDirectories += (Compile / baseDirectory).value / "src" / "main" / s"scala-spark-$sparkMajorVer", Compile / PB.targets := Seq(PB.gens.java -> (Compile / sourceManaged).value), Compile / PB.includePaths ++= Seq(file("src/main/protobuf")), PB.protocVersion := protocVersion, + PB.additionalDependencies := Nil, libraryDependencies ++= Seq( "org.apache.spark" %% "spark-connect" % sparkVer % "provided" cross CrossVersion.for3Use2_13), @@ -146,21 +146,11 @@ lazy val connectAssembly = (project in file("graphframes-connect")) ShadeRule.rename("com.google.protobuf.**" -> protobufShadingPattern).inAll), // Don't actually shade anything, we just need to rename the protobuf packages to what's bundled with Spark assembly / assemblyExcludedJars := (assembly / fullClasspath).value, - publish / skip := true, Compile / packageBin := assembly.value, Test / packageBin / publishArtifact := false, Test / packageDoc / publishArtifact := false, Test / packageSrc / publishArtifact := false, - Compile / packageBin / publishArtifact := false, + Compile / packageBin / publishArtifact := true, Compile / packageDoc / publishArtifact := false, - Compile / packageSrc / publishArtifact := false) - -// Publish the shaded JAR with the correct dependencies in the POM -lazy val connect = project - .dependsOn(root) - .settings( - commonSetting, - name := s"graphframes-connect", - moduleName := s"${name.value}-spark${sparkMajorVer}", - Compile / packageBin := (connectAssembly / Compile / assembly).value + Compile / packageSrc / publishArtifact := false )