diff --git a/README.md b/README.md index 64efd47b3..7e1eece68 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,9 @@ This will also run the Scala unit tests. To run the Python unit tests, run the `run-tests.sh` script from the `python/` directory. You will need to set `SPARK_HOME` to your local Spark installation directory. +## Release new version +Please see guide `dev/release_guide.md`. + ## Spark version compatibility This project is compatible with Spark 2.4+. However, significant speed improvements have been diff --git a/dev/build-docs-in-docker.sh b/dev/build-docs-in-docker.sh deleted file mode 100755 index fa110ab7b..000000000 --- a/dev/build-docs-in-docker.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - -cd ./docs -SKIP_SCALADOC=0 PRODUCTION=1 jekyll build diff --git a/dev/build-docs.sh b/dev/build-docs.sh deleted file mode 100755 index 47be6a247..000000000 --- a/dev/build-docs.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -set -euxo pipefail - -if [[ $(uname -sm) == "Darwin arm64" ]]; then - # https://github.com/docker/for-mac/issues/5419#issuecomment-834624859 - PLATFORM_OPT="--platform=linux/arm64" -else - PLATFORM_OPT= -fi - -docker build $PLATFORM_OPT -t graphframes/dev . - -# build the docs image -docker build $PLATFORM_OPT -t graphframes/docs docs/ - -# build the API docs -# TODO fix docker on linux to only create files as current user. 
-docker run $PLATFORM_OPT --rm \ - -v "$(pwd):/mnt/graphframes" \ - -v "$HOME/.sbt:/root/.sbt" \ - -v "$HOME/.ivy2:/root/.ivy2" \ - graphframes/docs bash -i -c dev/build-docs-in-docker.sh diff --git a/dev/release.py b/dev/release.py deleted file mode 100755 index 719886671..000000000 --- a/dev/release.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python -import click -from datetime import datetime -from subprocess import check_call, check_output -import sys - -DATABRICKS_REMOTE = "git@github.com:graphframes/graphframes.git" -PUBLISH_MODES = { - "local": "publishLocal", - "m2": "publishM2", - "spark-package-publish": "spDist", -} -PUBLISH_DOCS_DEFAULT = True - -WORKING_BRANCH = "WORKING_BRANCH_RELEASE_%s_@%s" -# lower case "z" puts the branch at the end of the github UI. -WORKING_DOCS_BRANCH = "zWORKING_BRANCH_DOCS_%s_@%s" -RELEASE_TAG = "v%s" - - -def prominentPrint(x): - click.echo(click.style(x, underline=True)) - - -def verify(prompt, interactive): - if not interactive: - return True - return click.confirm(prompt, show_default=True) - - -@click.command() -@click.argument("release-version", type=str) -@click.argument("next-version", type=str) -@click.option("--publish-to", default="local", show_default=True, - help="Where to publish artifact, one of: %s" % list(PUBLISH_MODES.keys())) -@click.option("--no-prompt", is_flag=True, help="Automated mode with no user prompts.") -@click.option("--git-remote", default=DATABRICKS_REMOTE, - help="Push current branch and docs to this git remote.") -@click.option("--publish-docs", type=bool, default=PUBLISH_DOCS_DEFAULT, show_default=True, - help="Publish docs to github-pages.") -@click.option("--spark-version", multiple=True, show_default=True, - default=["3.2.4", "3.3.3", "3.4.1", "3.5.0"]) -def main(release_version, next_version, publish_to, no_prompt, git_remote, publish_docs, - spark_version): - interactive = not no_prompt - - time = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - if publish_to not in PUBLISH_MODES: 
- modes = list(PUBLISH_MODES.keys()) - prominentPrint("Unknown publish target, --publish-to should be one of: %s." % modes) - sys.exit(1) - - if not next_version.endswith("SNAPSHOT"): - next_version += "-SNAPSHOT" - - if not verify("Publishing version: %s\n" - "Next version will be: %s\n" - "Continue?" % (release_version, next_version), interactive): - sys.exit(1) - - current_branch = check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).strip() - if current_branch == b"HEAD": - prominentPrint("Cannot build from detached head state. Please make a branch.") - sys.exit(1) - if current_branch != b"master": - if not verify("You're not on the master branch do you want to continue?", - interactive): - sys.exit(1) - - uncommitted_changes = check_output(["git", "diff", "--stat"]) - if uncommitted_changes != b"": - prominentPrint("There seem to be uncommitted changes on your current branch. Please commit or " - "stash them and try again.") - prominentPrint(uncommitted_changes) - sys.exit(1) - - working_branch = WORKING_BRANCH % (release_version, time) - gh_pages_branch = WORKING_DOCS_BRANCH % (release_version, time) - - release_tag = RELEASE_TAG % release_version - target_tags = [release_tag] - - existing_tags = check_output(["git", "tag"]).decode().split() - conflict_tags = list(filter(lambda a: a in existing_tags, target_tags)) - if conflict_tags: - msg = ("The following tags already exist:\n" - " %s\n" - "Please delete them and try.") - msg = msg % "\n ".join(conflict_tags) - prominentPrint(msg) - sys.exit(1) - - prominentPrint("Creating working branch for this release.") - check_call(["git", "checkout", "-b", working_branch]) - - prominentPrint("Creating release tag and updating snapshot version.") - update_version = "release release-version %s next-version %s" % (release_version, next_version) - check_call(["./build/sbt", update_version]) - - prominentPrint("Building and testing with sbt.") - check_call(["git", "checkout", release_tag]) - - publish_target = 
PUBLISH_MODES[publish_to] - for version in spark_version: - check_call(["./build/sbt", "-Dspark.version=%s" % version, "clean", publish_target]) - - prominentPrint("Updating local branch: %s" % current_branch) - check_call(["git", "checkout", current_branch]) - check_call(["git", "merge", "--ff", working_branch]) - check_call(["git", "branch", "-d", working_branch]) - - prominentPrint("Local branch updated") - if verify("Would you like to push local branch & version tag to remote: %s?" % git_remote, - interactive): - check_call(["git", "push", git_remote, current_branch]) - check_call(["git", "push", git_remote, release_tag]) - - prominentPrint("Building release docs") - - if not (publish_docs and verify("Would you like to build release docs?", interactive)): - # All done, exit happy - sys.exit(0) - - check_call(["git", "checkout", "-b", gh_pages_branch, release_tag]) - check_call(["./dev/build-docs.sh"]) - - commit_message = "Build docs for release %s." % release_version - check_call(["git", "add", "-f", "docs/_site"]) - check_call(["git", "commit", "-m", commit_message]) - msg = "Would you like to push docs branch to %s and update gh-pages branch?" - msg %= git_remote - if verify(msg, interactive): - check_call(["git", "push", git_remote, gh_pages_branch]) - check_call(["git", "push", "-f", git_remote, gh_pages_branch+":gh-pages"]) - - check_call(["git", "checkout", current_branch]) - check_call(["git", "branch", "-D", gh_pages_branch]) - - -if __name__ == "__main__": - main() diff --git a/dev/release_guide.md b/dev/release_guide.md new file mode 100644 index 000000000..19be87366 --- /dev/null +++ b/dev/release_guide.md @@ -0,0 +1,54 @@ +# Guide for releasing a new GraphFrame version + +## How to build GraphFrame package ? 
+ +To build a GraphFrame package for release, you only need to run the following commands: + +``` +cd graphframe_repo + +# build graphframe against Scala version 2.12.12 +build/sbt ++2.12.12 clean spDist + +# build graphframe against Scala version 2.13.8 +build/sbt ++2.13.8 clean spDist +``` + +Each of the above commands generates a zip file at the following path: +``` +target/graphframes-{graphframe-version}-spark{spark-version}-s_{scala_version}.zip +``` +The zip file is the GraphFrame package we need to publish; it contains a JAR file and a POM file. +Note that the Python module files are included in the JAR file. + +## How to publish the GraphFrame package ? + +To publish the GraphFrame package, you need to have the "admin" role on the https://github.com/graphframes/graphframes project. + +Then log in to the https://spark-packages.org/package/graphframes/graphframes website +and upload the zip file generated by the instructions in the "How to build GraphFrame package" section. + +## How to publish the GraphFrame doc ? + +The GraphFrame doc is hosted at 'https://graphframes.github.io/graphframes/'. To publish the doc, +you just need to build the doc content, then push it to the gh-pages branch of the https://github.com/graphframes/graphframes project. + +Before building the doc, you need to install jekyll; please refer to 'docs/README.md' for details. 
+ +The following commands build and publish the doc: +``` +cd graphframe_repo + +cd ./docs +SKIP_SCALADOC=0 PRODUCTION=1 jekyll build + +git fetch upstream gh-pages:gh-pages +git checkout gh-pages + +# The doc content is under the docs/_site directory +git add -f docs/_site + +git commit -m "doc update for version xx" +git push upstream gh-pages +``` + diff --git a/src/main/scala/org/graphframes/GraphFrame.scala b/src/main/scala/org/graphframes/GraphFrame.scala index 416a1b053..523026092 100644 --- a/src/main/scala/org/graphframes/GraphFrame.scala +++ b/src/main/scala/org/graphframes/GraphFrame.scala @@ -636,7 +636,14 @@ object GraphFrame extends Serializable with Logging { } } - /** Column name for vertex IDs in [[GraphFrame.vertices]] */ + /** + * Column name for vertex IDs in [[GraphFrame.vertices]]. + * Note that GraphFrame assigns a unique long ID to each vertex. + * If the vertex ID type is one of byte / int / long / short, + * GraphFrame casts the original IDs to long and uses them as the unique long IDs; + * otherwise GraphFrame generates the unique long IDs with the Spark function + * ``monotonically_increasing_id``, which is less performant. + */ val ID: String = "id" /**