Skip to content

Commit 97e7e82

Browse files
Pub/Sub: enable parallel writes to GCS in Pub/Sub Dataflow example (GoogleCloudPlatform#5547)
* feat: enable parallel writes * address david's comments * address david's 2nd round of comments
1 parent 8e154fb commit 97e7e82

3 files changed

Lines changed: 97 additions & 80 deletions

File tree

pubsub/streaming-analytics/PubSubToGCS.py

Lines changed: 59 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -14,119 +14,122 @@
1414

1515
# [START pubsub_to_gcs]
import argparse
from datetime import datetime
import logging
import random

from apache_beam import DoFn, GroupByKey, io, ParDo, Pipeline, PTransform, WindowInto, WithKeys
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import FixedWindows


26-
class GroupWindowsIntoBatches(beam.PTransform):
27-
"""A composite transform that groups Pub/Sub messages based on publish
28-
time and outputs a list of dictionaries, where each contains one message
29-
and its publish timestamp.
26+
class GroupMessagesByFixedWindows(PTransform):
27+
"""A composite transform that groups Pub/Sub messages based on publish time
28+
and outputs a list of tuples, each containing a message and its publish time.
3029
"""
3130

32-
def __init__(self, window_size):
33-
# Convert minutes into seconds.
31+
def __init__(self, window_size, num_shards=5):
32+
# Set window size to 60 seconds.
3433
self.window_size = int(window_size * 60)
34+
self.num_shards = num_shards
3535

3636
def expand(self, pcoll):
3737
return (
3838
pcoll
39-
# Assigns window info to each Pub/Sub message based on its
40-
# publish timestamp.
41-
| "Window into Fixed Intervals"
42-
>> beam.WindowInto(window.FixedWindows(self.window_size))
43-
| "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
44-
# Use a dummy key to group the elements in the same window.
45-
# Note that all the elements in one window must fit into memory
46-
# for this. If the windowed elements do not fit into memory,
47-
# please consider using `beam.util.BatchElements`.
48-
# https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
49-
| "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
50-
| "Groupby" >> beam.GroupByKey()
51-
| "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
39+
# Bind window info to each element using element timestamp (or publish time).
40+
| "Window into fixed intervals"
41+
>> WindowInto(FixedWindows(self.window_size))
42+
| "Add timestamp to windowed elements" >> ParDo(AddTimestamp())
43+
# Assign a random key to each windowed element based on the number of shards.
44+
| "Add key" >> WithKeys(lambda _: random.randint(0, self.num_shards - 1))
45+
# Group windowed elements by key. All the elements in the same window must fit
46+
# memory for this. If not, you need to use `beam.util.BatchElements`.
47+
| "Group by key" >> GroupByKey()
5248
)
5349

5450

55-
class AddTimestamps(beam.DoFn):
56-
def process(self, element, publish_time=beam.DoFn.TimestampParam):
57-
"""Processes each incoming windowed element by extracting the Pub/Sub
58-
message and its publish timestamp into a dictionary. `publish_time`
59-
defaults to the publish timestamp returned by the Pub/Sub server. It
60-
is bound to each element by Beam at runtime.
51+
class AddTimestamp(DoFn):
52+
def process(self, element, publish_time=DoFn.TimestampParam):
53+
"""Processes each windowed element by extracting the message body and its
54+
publish time into a tuple.
6155
"""
62-
63-
yield {
64-
"message_body": element.decode("utf-8"),
65-
"publish_time": datetime.datetime.utcfromtimestamp(
66-
float(publish_time)
67-
).strftime("%Y-%m-%d %H:%M:%S.%f"),
68-
}
56+
yield (
57+
element.decode("utf-8"),
58+
datetime.utcfromtimestamp(float(publish_time)).strftime(
59+
"%Y-%m-%d %H:%M:%S.%f"
60+
),
61+
)
6962

7063

71-
class WriteBatchesToGCS(beam.DoFn):
64+
class WriteToGCS(DoFn):
7265
def __init__(self, output_path):
7366
self.output_path = output_path
7467

75-
def process(self, batch, window=beam.DoFn.WindowParam):
76-
"""Write one batch per file to a Google Cloud Storage bucket. """
68+
def process(self, key_value, window=DoFn.WindowParam):
69+
"""Write messages in a batch to Google Cloud Storage."""
7770

7871
ts_format = "%H:%M"
7972
window_start = window.start.to_utc_datetime().strftime(ts_format)
8073
window_end = window.end.to_utc_datetime().strftime(ts_format)
81-
filename = "-".join([self.output_path, window_start, window_end])
74+
shard_id, batch = key_value
75+
filename = "-".join([self.output_path, window_start, window_end, str(shard_id)])
8276

83-
with beam.io.gcp.gcsio.GcsIO().open(filename=filename, mode="w") as f:
84-
for element in batch:
85-
f.write("{}\n".format(json.dumps(element)).encode("utf-8"))
77+
with io.gcsio.GcsIO().open(filename=filename, mode="w") as f:
78+
for message_body, publish_time in batch:
79+
f.write(f"{message_body},{publish_time}\n".encode("utf-8"))
8680

8781

88-
def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
89-
# `save_main_session` is set to true because some DoFn's rely on
90-
# globally imported modules.
82+
def run(input_topic, output_path, window_size=1.0, num_shards=5, pipeline_args=None):
83+
# Set `save_main_session` to True so DoFns can access globally imported modules.
9184
pipeline_options = PipelineOptions(
9285
pipeline_args, streaming=True, save_main_session=True
9386
)
9487

95-
with beam.Pipeline(options=pipeline_options) as pipeline:
88+
with Pipeline(options=pipeline_options) as pipeline:
9689
(
9790
pipeline
98-
| "Read PubSub Messages"
99-
>> beam.io.ReadFromPubSub(topic=input_topic)
100-
| "Window into" >> GroupWindowsIntoBatches(window_size)
101-
| "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
91+
# Because `timestamp_attribute` is unspecified in `ReadFromPubSub`, Beam
92+
# binds the publish time returned by the Pub/Sub server for each message
93+
# to the element's timestamp parameter, accessible via `DoFn.TimestampParam`.
94+
# https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
95+
| "Read from Pub/Sub" >> io.ReadFromPubSub(topic=input_topic)
96+
| "Window into" >> GroupMessagesByFixedWindows(window_size, num_shards)
97+
| "Write to GCS" >> ParDo(WriteToGCS(output_path))
10298
)
10399

104100

105-
if __name__ == "__main__": # noqa
101+
if __name__ == "__main__":
106102
logging.getLogger().setLevel(logging.INFO)
107103

108104
parser = argparse.ArgumentParser()
109105
parser.add_argument(
110106
"--input_topic",
111-
help="The Cloud Pub/Sub topic to read from.\n"
112-
'"projects/<PROJECT_NAME>/topics/<TOPIC_NAME>".',
107+
help="The Cloud Pub/Sub topic to read from."
108+
'"projects/<PROJECT_ID>/topics/<TOPIC_ID>".',
113109
)
114110
parser.add_argument(
115111
"--window_size",
116112
type=float,
117113
default=1.0,
118-
help="Output file's window size in number of minutes.",
114+
help="Output file's window size in minutes.",
119115
)
120116
parser.add_argument(
121117
"--output_path",
122-
help="GCS Path of the output file including filename prefix.",
118+
help="Path of the output GCS file including the prefix.",
119+
)
120+
parser.add_argument(
121+
"--num_shards",
122+
type=int,
123+
default=5,
124+
help="Number of shards to use when writing windowed elements to GCS.",
123125
)
124126
known_args, pipeline_args = parser.parse_known_args()
125127

126128
run(
127129
known_args.input_topic,
128130
known_args.output_path,
129131
known_args.window_size,
132+
known_args.num_shards,
130133
pipeline_args,
131134
)
132135
# [END pubsub_to_gcs]

pubsub/streaming-analytics/PubSubToGCS_test.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import os
1616
import uuid
1717

18-
import apache_beam as beam
18+
from apache_beam.io.gcp.gcsio import GcsIO
1919
from apache_beam.testing.test_pipeline import TestPipeline
2020
from apache_beam.testing.test_stream import TestStream
2121
from apache_beam.testing.test_utils import TempDir
@@ -47,8 +47,9 @@
4747
def test_pubsub_to_gcs():
4848
PubSubToGCS.run(
4949
input_topic="unused", # mocked by TestStream
50-
output_path="gs://{}/pubsub/{}/output".format(BUCKET, UUID),
50+
output_path=f"gs://{BUCKET}/pubsub/{UUID}/output",
5151
window_size=1, # 1 minute
52+
num_shards=1,
5253
pipeline_args=[
5354
"--project",
5455
PROJECT,
@@ -58,8 +59,8 @@ def test_pubsub_to_gcs():
5859
)
5960

6061
# Check for output files on GCS.
61-
gcs_client = beam.io.gcp.gcsio.GcsIO()
62-
files = gcs_client.list_prefix("gs://{}/pubsub/{}".format(BUCKET, UUID))
62+
gcs_client = GcsIO()
63+
files = gcs_client.list_prefix(f"gs://{BUCKET}/pubsub/{UUID}")
6364
assert len(files) > 0
6465

6566
# Clean up.

pubsub/streaming-analytics/README.md

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ Sample(s) showing how to use [Google Cloud Pub/Sub] with [Google Cloud Dataflow]
1414
or via the `gcloud` command line tool.
1515

1616
```sh
17-
export PROJECT_NAME=your-google-cloud-project-id
18-
gcloud projects create $PROJECT_NAME
17+
export PROJECT_ID=your-google-cloud-project-id
18+
gcloud projects create $PROJECT_ID
1919
```
2020

2121
1. [Enable billing].
@@ -41,15 +41,16 @@ Sample(s) showing how to use [Google Cloud Pub/Sub] with [Google Cloud Dataflow]
4141
Alternatively, you can use `gcloud` through the command line.
4242

4343
```sh
44-
export PROJECT_NAME=$(gcloud config get-value project)
45-
export SA_NAME=samples
46-
export IAM_ACCOUNT=$SA_NAME@$PROJECT_NAME.iam.gserviceaccount.com
44+
export PROJECT_ID=$(gcloud config get-value project)
45+
export SERVICE_ACCOUNT_NAME=samples
46+
export IAM_ACCOUNT=$SERVICE_ACCOUNT_NAME@$PROJECT_ID.iam.gserviceaccount.com
4747

4848
# Create the service account.
49-
gcloud iam service-accounts create $SA_NAME --display-name $SA_NAME
49+
gcloud iam service-accounts create $SERVICE_ACCOUNT_NAME \
50+
--display-name $SERVICE_ACCOUNT_NAME
5051

5152
# Set the role to Project Owner (*).
52-
gcloud projects add-iam-policy-binding $PROJECT_NAME \
53+
gcloud projects add-iam-policy-binding $PROJECT_ID \
5354
--member serviceAccount:$IAM_ACCOUNT \
5455
--role roles/owner
5556

@@ -77,16 +78,18 @@ Sample(s) showing how to use [Google Cloud Pub/Sub] with [Google Cloud Dataflow]
7778
1. Create a Cloud Storage bucket.
7879

7980
```bash
80-
export BUCKET_NAME=your-gcs-bucket
81+
export BUCKET_ID=your-gcs-bucket-id
8182

82-
gsutil mb gs://$BUCKET_NAME
83+
gsutil mb gs://$BUCKET_ID
8384
```
8485

8586
1. Start a [Google Cloud Scheduler] job that publishes one message to a [Google Cloud Pub/Sub] topic every minute. This will create an [App Engine] app if one has never been created on the project.
8687

8788
```bash
89+
export TOPIC_ID=your-topic-id
90+
8891
# Create a Pub/Sub topic.
89-
gcloud pubsub topics create cron-topic
92+
gcloud pubsub topics create $TOPIC_ID
9093

9194
# Create a Cloud Scheduler job
9295
gcloud scheduler jobs create pubsub publisher-job --schedule="* * * * *" \
@@ -134,33 +137,43 @@ The following instructions will help you prepare your development environment.
134137

135138
* [PubSubToGCS.py](PubSubToGCS.py)
136139

137-
The following example will run a streaming pipeline. It will read messages from a Pub/Sub topic, then window them into fixed-sized intervals, and write one file per window into a GCS location.
140+
The following example will run a streaming pipeline. The pipeline does the following:
141+
1. Reads messages from a Pub/Sub topic.
142+
1. Groups messages into batches, one batch per window.
143+
1. Adds window start and end time to each element/message.
144+
1. Adds publish timestamp to each element/message.
145+
1. Adds a random shard ID as key to each windowed element. *Sharding* lets you split the elements in the same window into multiple small batches. This way, multiple workers can each write a batch of elements into Cloud Storage. This results in one file per shard.
146+
1. Groups the elements by their shard ID for every window.
147+
1. Writes the grouped elements to a file on Cloud Storage.
138148

139149
+ `--project`: sets the Google Cloud project ID to run the pipeline on
140150
+ `--region`: sets the Dataflow [regional endpoint](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints)
141151
+ `--input_topic`: sets the input Pub/Sub topic to read messages from
142152
+ `--output_path`: sets the output GCS path prefix to write files to
143153
+ `--runner`: specifies the runner to run the pipeline, if not set to `DataflowRunner`, `DirectRunner` is used
144154
+ `--window_size [optional]`: specifies the window size in minutes, defaults to 1.0
155+
+ `--num_shards [optional]`: sets the number of shards when writing windowed elements to GCS, defaults to 5.
145156
+ `--temp_location`: needed for executing the pipeline
146157

147158
```bash
148159
python PubSubToGCS.py \
149-
--project=$PROJECT_NAME \
160+
--project=$PROJECT_ID \
150161
--region=us-central1 \
151-
--input_topic=projects/$PROJECT_NAME/topics/$TOPIC_NAME \
152-
--output_path=gs://$BUCKET_NAME/samples/output \
162+
--input_topic=projects/$PROJECT_ID/topics/$TOPIC_ID \
163+
--output_path=gs://$BUCKET_ID/samples/output \
153164
--runner=DataflowRunner \
154-
--window_size=2 \
155-
--temp_location=gs://$BUCKET_NAME/temp
165+
--window_size=1 \
166+
# If set, you will write up to `num_shards` files per window to GCS.
167+
# --num_shards=2 \
168+
--temp_location=gs://$BUCKET_ID/temp
156169
```
157170

158171
After the job has been submitted, you can check its status in the [GCP Console Dataflow page].
159172

160173
You can also check the output to your GCS bucket using the command line below or in the [GCP Console Storage page]. You may need to wait a few minutes for the files to appear.
161174

162175
```bash
163-
gsutil ls gs://$BUCKET_NAME/samples/
176+
gsutil ls gs://$BUCKET_ID/samples/
164177
```
165178

166179
## Cleanup
@@ -178,17 +191,17 @@ gsutil ls gs://$BUCKET_NAME/samples/
178191
1. Delete the topic. [Google Cloud Dataflow] will automatically delete the subscription associated with the streaming pipeline when the job is canceled.
179192

180193
```bash
181-
gcloud pubsub topics delete cron-topic
194+
gcloud pubsub topics delete $TOPIC_ID
182195
```
183196

184197
1. Lastly, to avoid incurring charges to your GCP account for the resources created in this tutorial:
185198

186199
```bash
187200
# Delete only the files created by this sample.
188-
gsutil -m rm -rf "gs://$BUCKET_NAME/samples/output*"
201+
gsutil -m rm -rf "gs://$BUCKET_ID/samples/output*"
189202
190203
# [optional] Remove the Cloud Storage bucket.
191-
gsutil rb gs://$BUCKET_NAME
204+
gsutil rb gs://$BUCKET_ID
192205
```
193206

194207
[Apache Beam]: https://beam.apache.org/

0 commit comments

Comments
 (0)