Skip to content

Commit 7ebfe51

Browse files
authored
Add an example for Dataproc PySpark Hudi (GoogleCloudPlatform#8828)
## Description Add an example for creating/writing/reading Hudi table with PySpark on Dataproc. ## Checklist - [ ] I have followed [Sample Guidelines from AUTHORING_GUIDE.MD](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md) - [ ] README is updated to include [all relevant information](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#readme-file) - [ ] **Tests** pass: `nox -s py-3.9` (see [Test Environment Setup](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#test-environment-setup)) - [ ] **Lint** pass: `nox -s lint` (see [Test Environment Setup](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/AUTHORING_GUIDE.md#test-environment-setup)) - [ ] These samples need a new **API enabled** in testing projects to pass (let us know which ones) - [ ] These samples need a new/updated **env vars** in testing projects set to pass (let us know which ones) - [ ] Please **merge** this PR for me once it is approved. - [ ] This sample adds a new sample directory, and I updated the [CODEOWNERS file](https://togithub.com/GoogleCloudPlatform/python-docs-samples/blob/main/.github/CODEOWNERS) with the codeowners for this sample
1 parent 935e10d commit 7ebfe51

File tree

1 file changed

+168
-0
lines changed

1 file changed

+168
-0
lines changed

dataproc/snippets/pyspark_hudi.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2022 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""Pyspark Hudi example."""
18+
19+
import sys
20+
21+
# pylint: disable=import-error
22+
from pyspark.sql import SparkSession
23+
24+
25+
def create_hudi_table(spark, table_name, table_uri):
    """Create the Hudi table at the given location if it does not exist.

    The table is declared with type 'cow' (copy-on-write) in its
    TBLPROPERTIES, keyed by `uuid`, pre-combined on `ts`, and
    partitioned by `partitionpath`.

    Args:
        spark: Active SparkSession with Hudi support configured.
        table_name: Name of the Hudi table to create.
        table_uri: Storage location (e.g. a GCS URI) backing the table.
    """
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            uuid string,
            begin_lat double,
            begin_lon double,
            end_lat double,
            end_lon double,
            driver string,
            rider string,
            fare double,
            partitionpath string,
            ts long
        ) USING hudi
        LOCATION '{table_uri}'
        TBLPROPERTIES (
            type = 'cow',
            primaryKey = 'uuid',
            preCombineField = 'ts'
        )
        PARTITIONED BY (partitionpath)
    """)
49+
50+
51+
def delete_hudi_table(spark, table_name):
    """Drop the Hudi table if it exists."""
    drop_sql = f'DROP TABLE IF EXISTS {table_name}'
    spark.sql(drop_sql)
54+
55+
56+
def generate_test_dataframe(spark, n_rows):
    """Generate a test DataFrame with Hudi's built-in data generator.

    Uses the JVM-side `org.apache.hudi.QuickstartUtils` helper (reached
    through the SparkContext gateway) to produce synthetic trip records.

    Args:
        spark: Active SparkSession.
        n_rows: Number of insert records to generate.

    Returns:
        A DataFrame parsed from the generated JSON records.
    """
    sc = spark.sparkContext
    # pylint: disable=protected-access
    quickstart = sc._jvm.org.apache.hudi.QuickstartUtils
    generator = quickstart.DataGenerator()
    json_rows = quickstart.convertToStringList(generator.generateInserts(n_rows))
    return spark.read.json(sc.parallelize(json_rows, 2))
64+
65+
66+
def write_hudi_table(name, uri, dataframe):
    """Upsert the DataFrame into the Hudi table at the given location.

    Args:
        name: Hudi table name (used for both read and write table names).
        uri: Storage location of the table.
        dataframe: DataFrame whose rows are upserted (keyed by `uuid`,
            de-duplicated on `ts`, partitioned by `partitionpath`).
    """
    hudi_options = {
        'hoodie.table.name': name,
        'hoodie.datasource.write.recordkey.field': 'uuid',
        'hoodie.datasource.write.partitionpath.field': 'partitionpath',
        'hoodie.datasource.write.table.name': name,
        'hoodie.datasource.write.operation': 'upsert',
        'hoodie.datasource.write.precombine.field': 'ts',
        'hoodie.upsert.shuffle.parallelism': 2,
        'hoodie.insert.shuffle.parallelism': 2,
    }
    writer = dataframe.write.format('hudi').options(**hudi_options)
    writer.mode('append').save(uri)
79+
80+
81+
def query_commit_history(spark, name, uri):
    """Return the table's distinct commit timestamps, newest first.

    Registers a temporary view over the table at `uri` and queries the
    Hudi metadata column `_hoodie_commit_time`.
    """
    view_name = f'{name}_commit_history'
    history_df = spark.read.format('hudi').load(uri)
    history_df.createOrReplaceTempView(view_name)
    return spark.sql(f"""
        SELECT DISTINCT(_hoodie_commit_time)
        FROM {view_name}
        ORDER BY _hoodie_commit_time
        DESC
    """)
92+
93+
94+
def read_hudi_table(spark, table_name, table_uri, commit_ts=''):
    """Read a snapshot of the Hudi table, optionally at a past commit.

    Args:
        spark: Active SparkSession.
        table_name: Hudi table name (used to derive the temp view name).
        table_uri: Storage location of the table.
        commit_ts: Commit timestamp for a time-travel read; an empty
            string (the default) reads the latest snapshot.

    Returns:
        A DataFrame with the data columns plus `_hoodie_commit_time`.
    """
    read_options = {'as.of.instant': commit_ts} if commit_ts else {}
    view_name = f'{table_name}_snapshot'
    snapshot_df = spark.read.format('hudi').options(**read_options).load(table_uri)
    snapshot_df.createOrReplaceTempView(view_name)
    return spark.sql(f"""
        SELECT _hoodie_commit_time, begin_lat, begin_lon,
               driver, end_lat, end_lon, fare, partitionpath,
               rider, ts, uuid
        FROM {view_name}
    """)
111+
112+
113+
def main():
    """Create, write, and read a Hudi table end to end.

    Expects two command-line arguments: the table name and the table
    storage URI (e.g. a GCS path).

    Raises:
        ValueError: If the expected command-line arguments are missing.
    """
    if len(sys.argv) != 3:
        # Use a specific exception type rather than the overly broad
        # `Exception` (pylint: broad-exception-raised). Still caught by
        # any caller handling `Exception`.
        raise ValueError('Expected arguments: <table_name> <table_uri>')

    table_name = sys.argv[1]
    table_uri = sys.argv[2]

    app_name = f'pyspark-hudi-test_{table_name}'
    print(f'Creating Spark session {app_name} ...')
    spark = SparkSession.builder.appName(app_name).getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    print(f'Creating Hudi table {table_name} at {table_uri} ...')
    create_hudi_table(spark, table_name, table_uri)

    print('Generating test data batch 1...')
    n_rows1 = 10
    input_df1 = generate_test_dataframe(spark, n_rows1)
    input_df1.show(truncate=False)

    print('Writing Hudi table, batch 1 ...')
    write_hudi_table(table_name, table_uri, input_df1)

    print('Generating test data batch 2...')
    n_rows2 = 10
    input_df2 = generate_test_dataframe(spark, n_rows2)
    input_df2.show(truncate=False)

    print('Writing Hudi table, batch 2 ...')
    write_hudi_table(table_name, table_uri, input_df2)

    print('Querying commit history ...')
    commits_df = query_commit_history(spark, table_name, table_uri)
    commits_df.show(truncate=False)
    # Two batches were written above, so the history has two commits;
    # the query sorts newest first, making index 1 the earlier commit.
    # pylint: disable=protected-access
    previous_commit = commits_df.collect()[1]._hoodie_commit_time

    print('Reading the Hudi table snapshot at the latest commit ...')
    output_df1 = read_hudi_table(spark, table_name, table_uri)
    output_df1.show(truncate=False)

    print(f'Reading the Hudi table snapshot at {previous_commit} ...')
    output_df2 = read_hudi_table(spark, table_name, table_uri, previous_commit)
    output_df2.show(truncate=False)

    print('Deleting Hudi table ...')
    delete_hudi_table(spark, table_name)

    print('Stopping Spark session ...')
    spark.stop()

    print('All done')
166+
167+
168+
# Guard the entry point so importing this module (e.g. from tests) does
# not launch a Spark job; spark-submit still runs it as __main__.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)