Skip to content

Commit c739c95

Browse files
committed
Added sample Python and Java mergers.
1 parent 30c9cb2 commit c739c95

File tree

2 files changed

+275
-0
lines changed

2 files changed

+275
-0
lines changed

CallMerger.java

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.beam.examples;
19+
20+
import org.apache.beam.sdk.Pipeline;
21+
import org.apache.beam.sdk.io.TextIO;
22+
import org.apache.beam.sdk.options.Default;
23+
import org.apache.beam.sdk.options.Description;
24+
import org.apache.beam.sdk.options.PipelineOptions;
25+
import org.apache.beam.sdk.options.PipelineOptionsFactory;
26+
import org.apache.beam.sdk.options.Validation.Required;
27+
import org.apache.beam.sdk.transforms.DoFn;
28+
import org.apache.beam.sdk.transforms.GroupByKey;
29+
import org.apache.beam.sdk.transforms.PTransform;
30+
import org.apache.beam.sdk.transforms.ParDo;
31+
import org.apache.beam.sdk.values.KV;
32+
import org.apache.beam.sdk.values.PCollection;
33+
34+
/**
35+
* A naive simulation of the Variant Transforms pipeline.
36+
* Modified from the WordCount example of Beam Java SDK.
37+
*/
38+
public class CallMerger {
39+
40+
/**
41+
* This DoFn filters out lines starting with "##" and create a key for others.
42+
*/
43+
static class FilterOrKeyDoFn extends DoFn<String, KV<String, String>> {
44+
45+
@ProcessElement
46+
public void processElement(ProcessContext c) {
47+
if (c.element().startsWith("##")) return;
48+
// Split the line into words.
49+
String[] words = c.element().trim().split("\\s+");
50+
if (words.length < 5) return;
51+
StringBuilder b = new StringBuilder();
52+
b.append(words[0]).append(":");
53+
b.append(words[1]).append(":");
54+
b.append(words[2]).append(":");
55+
b.append(words[3]).append(":");
56+
b.append(words[4]);
57+
String key = b.toString();
58+
59+
c.output(KV.of(key, c.element()));
60+
}
61+
}
62+
63+
/**
64+
* This DoFn merges lines with the same key by taking the first full line and
65+
* adding the last word of other lines (e.g., simulating merging samples).
66+
*/
67+
static class MergeDoFn extends DoFn<KV<String, Iterable<String>>, String> {
68+
69+
@ProcessElement
70+
public void processElement(ProcessContext c) {
71+
boolean first = true;
72+
StringBuilder b = new StringBuilder();
73+
for (String line : c.element().getValue()) {
74+
if (first) {
75+
b.append(line);
76+
first = false;
77+
} else {
78+
String[] words = line.split("\\s+");
79+
// This section is added for making this a CPU intensive DoFn.
80+
int s = 1;
81+
for (int i = 0; i < 10000; i++) {
82+
for (String word : words) {
83+
s = (s * (i + word.length()) + 1) % 1000;
84+
}
85+
}
86+
// End of dummy CPU intensive part.
87+
if (words.length > 0) {
88+
b.append("\t").append(words[words.length - 1]).append("\t").append(s);
89+
}
90+
}
91+
}
92+
c.output(b.toString());
93+
}
94+
}
95+
96+
/**
97+
* A PTransform that converts a PCollection containing lines of text into a
98+
* PCollection of merged lines based on line keys.
99+
*/
100+
public static class MergeCalls extends PTransform<PCollection<String>,
101+
PCollection<String>> {
102+
@Override
103+
public PCollection<String> expand(PCollection<String> lines) {
104+
105+
// Convert lines of text into individual words.
106+
PCollection<KV<String, String>> keyLines = lines.apply(
107+
ParDo.of(new FilterOrKeyDoFn()));
108+
109+
PCollection<KV<String, Iterable<String>>> groupedLines =
110+
keyLines.apply(GroupByKey.<String, String>create());
111+
112+
PCollection<String> mergedLines = groupedLines.apply(
113+
ParDo.of(new MergeDoFn()));
114+
return mergedLines;
115+
}
116+
}
117+
118+
/**
119+
* Options supported by {@link CallMerger}.
120+
*/
121+
public interface CallMergerOptions extends PipelineOptions {
122+
123+
/**
124+
* By default, this example reads from a public dataset containing the text
125+
* of King Lear. Set this option to choose a different input file or glob.
126+
*/
127+
@Description("Path of the file to read from")
128+
@Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
129+
String getInputFile();
130+
void setInputFile(String value);
131+
132+
/**
133+
* Set this required option to specify where to write the output.
134+
*/
135+
@Description("Path of the file to write to")
136+
@Required
137+
String getOutput();
138+
void setOutput(String value);
139+
}
140+
141+
public static void main(String[] args) {
142+
CallMergerOptions options = PipelineOptionsFactory.fromArgs(args)
143+
.withValidation().as(CallMergerOptions.class);
144+
Pipeline p = Pipeline.create(options);
145+
146+
p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
147+
.apply(new MergeCalls())
148+
.apply("WriteCounts", TextIO.write().to(options.getOutput()));
149+
150+
p.run().waitUntilFinish();
151+
}
152+
}

merger.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
# Copied then modified from the wordcount.py example of Beam Python SDK.
18+
19+
"""A naive simulation of the variant merging pipeline."""
20+
21+
from __future__ import absolute_import
22+
23+
import argparse
24+
import logging
25+
26+
import apache_beam as beam
27+
from apache_beam.io import ReadFromText
28+
from apache_beam.io import WriteToText
29+
from apache_beam.options.pipeline_options import PipelineOptions
30+
from apache_beam.options.pipeline_options import SetupOptions
31+
32+
33+
class FilterOrKeyDoFn(beam.DoFn):
  """Parses each line of input text and filters those starting with '##'.

  Surviving lines are emitted as (key, line) pairs, where the key is the
  first five whitespace-separated fields of the line joined by ':'.
  """

  # NOTE: the redundant no-op __init__ (which only called super().__init__())
  # has been removed; the inherited constructor is used instead.

  def process(self, element):
    """Yields a (key, stripped_line) pair unless the line is filtered out.

    Args:
      element: the input line being processed.

    Yields:
      A (key, line) tuple for lines that do not start with '##' and have at
      least five whitespace-separated fields; nothing otherwise.
    """
    text_line = element.strip()
    # '##' marks a meta-information/header line; drop it.
    if text_line.startswith('##'):
      return
    parts = text_line.split()
    # A valid record needs at least five fields to form a key.
    if len(parts) < 5:
      return
    # Slice-join is equivalent to joining parts[0]..parts[4] by hand.
    key_str = ':'.join(parts[:5])
    yield (key_str, text_line)
56+
57+
58+
class MergeDoFn(beam.DoFn):
  """Adds all 'calls' to the same variant.

  The first line of the group is kept in full; for every other line the last
  word plus a dummy checksum is appended. The checksum loop exists only to
  make this DoFn CPU intensive.
  """

  def process(self, element):
    """Merges the grouped lines for one key into a single tab-joined line.

    Args:
      element: a (key, lines) pair as produced by GroupByKey.

    Yields:
      The merged line, or nothing if the group is empty.
    """
    # Tuple parameter unpacking in the signature (``def process(self, (key,
    # lines))``) is Python 2 only syntax (removed by PEP 3113); unpack
    # explicitly so the code also runs on Python 3.
    _key, lines = element
    output = []
    for line in lines:
      if not output:
        # Keep the first line of the group verbatim.
        output.append(line)
      else:
        words = line.split()
        # This section is added for making this a CPU intensive DoFn.
        s = 1
        for i in range(10000):
          for w in words:
            s = (s * (i + len(w)) + 1) % 1000
        # End of dummy CPU intensive part.
        if words:
          output.append(words[-1])
          output.append(str(s))
    if output:
      yield '\t'.join(output)
83+
84+
85+
def run(argv=None):
  """Main entry point; defines and runs the line-merging pipeline.

  Args:
    argv: optional list of command-line arguments; defaults to sys.argv.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Merges lines with the same "key".
  merged_lines = (
      lines
      | 'filter_or_key' >> beam.ParDo(FilterOrKeyDoFn())
      | 'group' >> beam.GroupByKey()
      | 'merge' >> beam.ParDo(MergeDoFn()))

  # Write the output using a "Write" transform that has side effects.
  merged_lines | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()
119+
120+
121+
if __name__ == '__main__':
  # Show INFO-level pipeline progress logs when run as a script.
  logging.getLogger().setLevel(logging.INFO)
  run()

0 commit comments

Comments
 (0)