Merge pull request #15603 from zhoufek/fat

Kyle Weaver · web-flow · commit abe3aa768e1e · 2021-10-18T13:24:04.000-07:00
[BEAM-9487] Various Trigger.may_lose_data fixes
diff --git a/sdks/python/apache_beam/transforms/ptransform_test.py b/sdks/python/apache_beam/transforms/ptransform_test.py
@@ -50,7 +50,6 @@
 from apache_beam.testing.util import SortLists
 from apache_beam.testing.util import assert_that
 from apache_beam.testing.util import equal_to
-from apache_beam.testing.util import is_empty
 from apache_beam.transforms import WindowInto
 from apache_beam.transforms import trigger
 from apache_beam.transforms import window
@@ -507,12 +506,10 @@ def test_group_by_key_allow_unsafe_triggers(self):
           | beam.Create([(1, 1), (1, 2), (1, 3), (1, 4)])
           | WindowInto(
               window.GlobalWindows(),
-              trigger=trigger.AfterCount(5),
+              trigger=trigger.AfterCount(4),
               accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
           | beam.GroupByKey())
-      # We need five, but it only has four - Displays how this option is
-      # dangerous.
-      assert_that(pcoll, is_empty())
+      assert_that(pcoll, equal_to([(1, [1, 2, 3, 4])]))
 
   def test_group_by_key_reiteration(self):
     class MyDoFn(beam.DoFn):
diff --git a/sdks/python/apache_beam/transforms/trigger.py b/sdks/python/apache_beam/transforms/trigger.py
@@ -30,9 +30,7 @@
 from abc import abstractmethod
 from enum import Flag
 from enum import auto
-from functools import reduce
 from itertools import zip_longest
-from operator import or_
 
 from apache_beam.coders import coder_impl
 from apache_beam.coders import observable
@@ -161,12 +159,34 @@ def with_prefix(self, prefix):
 
 
 class DataLossReason(Flag):
-  """Enum defining potential reasons that a trigger may cause data loss."""
+  """Enum defining potential reasons that a trigger may cause data loss.
+
+  These flags should only cover when the trigger is the cause, though windowing
+  can be taken into account. For instance, AfterWatermark may not flag itself
+  as finishing if the windowing doesn't allow lateness.
+  """
+
+  # Trigger will never be the source of data loss.
   NO_POTENTIAL_LOSS = 0
+
+  # Trigger may finish. In this case, data that comes in after the trigger may
+  # be lost. Example: AfterCount(1) will stop firing after the first element.
   MAY_FINISH = auto()
+
+  # Deprecated: Beam will emit buffered data at GC time. Any other behavior
+  # should be treated as a bug with the runner used.
   CONDITION_NOT_GUARANTEED = auto()
 
 
+# Convenience functions for checking if a flag is included. Each is equivalent
+# to `(reason & flag) == flag`.
+
+
+def _IncludesMayFinish(reason):
+  # type: (DataLossReason) -> bool
+  return reason & DataLossReason.MAY_FINISH == DataLossReason.MAY_FINISH
+
+
 # pylint: disable=unused-argument
 # TODO(robertwb): Provisional API, Java likely to change as well.
 class TriggerFn(metaclass=ABCMeta):
@@ -260,12 +280,6 @@ def may_lose_data(self, unused_windowing):
           scenario is only accounted for if the windowing strategy allows
           late data. Otherwise, the trigger is not responsible for the data
           loss.
-        * The trigger condition may not be met. For instance,
-          Repeatedly(AfterCount(N)) may not fire due to N not being met. This
-          is only accounted for if the condition itself led to data loss.
-          Repeatedly(AfterCount(1)) is safe, since it would only not fire if
-          there is no data to lose, but Repeatedly(AfterCount(2)) can cause
-          data loss if there is only one record.
 
     Note that this only returns the potential for loss. It does not mean that
     there will be data loss. It also only accounts for loss related to the
@@ -278,9 +292,7 @@ def may_lose_data(self, unused_windowing):
     Returns:
       The DataLossReason. If there is no potential loss,
         DataLossReason.NO_POTENTIAL_LOSS is returned. Otherwise, all the
-        potential reasons are returned as a single value. For instance, if
-        data loss can result from finishing or not having the condition met,
-        the result will be DataLossReason.MAY_FINISH|CONDITION_NOT_GUARANTEED.
+        potential reasons are returned as a single value.
     """
     # For backwards compatibility's sake, we're assuming the trigger is safe.
     return DataLossReason.NO_POTENTIAL_LOSS
@@ -390,6 +402,7 @@ def reset(self, window, context):
     pass
 
   def may_lose_data(self, unused_windowing):
+    """AfterProcessingTime may finish."""
     return DataLossReason.MAY_FINISH
 
   @staticmethod
@@ -444,6 +457,7 @@ def on_fire(self, watermark, window, context):
     return False
 
   def may_lose_data(self, unused_windowing):
+    """No potential loss, since the trigger always fires."""
     return DataLossReason.NO_POTENTIAL_LOSS
 
   @staticmethod
@@ -494,7 +508,7 @@ def may_lose_data(self, unused_windowing):
     """No potential data loss.
 
     Though Never doesn't explicitly trigger, it still collects data on
-    windowing closing, so any data loss is due to windowing closing.
+    window closing.
     """
     return DataLossReason.NO_POTENTIAL_LOSS
 
@@ -591,13 +605,7 @@ def reset(self, window, context):
       self.late.reset(window, NestedContext(context, 'late'))
 
   def may_lose_data(self, windowing):
-    """May cause data loss if the windowing allows lateness and either:
-
-      * The late trigger is not set
-      * The late trigger may cause data loss.
-
-    The second case is equivalent to Repeatedly(late).may_lose_data(windowing)
-    """
+    """May cause data loss if lateness allowed and no late trigger set."""
     if windowing.allowed_lateness == 0:
       return DataLossReason.NO_POTENTIAL_LOSS
     if self.late is None:
@@ -674,10 +682,8 @@ def reset(self, window, context):
     context.clear_state(self.COUNT_TAG)
 
   def may_lose_data(self, unused_windowing):
-    reason = DataLossReason.MAY_FINISH
-    if self.count > 1:
-      reason |= DataLossReason.CONDITION_NOT_GUARANTEED
-    return reason
+    """AfterCount may finish."""
+    return DataLossReason.MAY_FINISH
 
   @staticmethod
   def from_runner_api(proto, unused_context):
@@ -787,6 +793,13 @@ def on_fire(self, watermark, window, context):
         finished.append(trigger.on_fire(watermark, window, nested_context))
     return self.combine_op(finished)
 
+  def may_lose_data(self, windowing):
+    may_finish = self.combine_op(
+        _IncludesMayFinish(t.may_lose_data(windowing)) for t in self.triggers)
+    return (
+        DataLossReason.MAY_FINISH
+        if may_finish else DataLossReason.NO_POTENTIAL_LOSS)
+
   def reset(self, window, context):
     for ix, trigger in enumerate(self.triggers):
       trigger.reset(window, self._sub_context(context, ix))
@@ -832,15 +845,6 @@ class AfterAny(_ParallelTriggerFn):
   """
   combine_op = any
 
-  def may_lose_data(self, windowing):
-    reason = DataLossReason.NO_POTENTIAL_LOSS
-    for trigger in self.triggers:
-      t_reason = trigger.may_lose_data(windowing)
-      if t_reason == DataLossReason.NO_POTENTIAL_LOSS:
-        return t_reason
-      reason |= t_reason
-    return reason
-
 
 class AfterAll(_ParallelTriggerFn):
   """Fires when all subtriggers have fired.
@@ -849,9 +853,6 @@ class AfterAll(_ParallelTriggerFn):
   """
   combine_op = all
 
-  def may_lose_data(self, windowing):
-    return reduce(or_, (t.may_lose_data(windowing) for t in self.triggers))
-
 
 class AfterEach(TriggerFn):
 
@@ -908,7 +909,12 @@ def reset(self, window, context):
       trigger.reset(window, self._sub_context(context, ix))
 
   def may_lose_data(self, windowing):
-    return reduce(or_, (t.may_lose_data(windowing) for t in self.triggers))
+    """If all sub-triggers may finish, this may finish."""
+    may_finish = all(
+        _IncludesMayFinish(t.may_lose_data(windowing)) for t in self.triggers)
+    return (
+        DataLossReason.MAY_FINISH
+        if may_finish else DataLossReason.NO_POTENTIAL_LOSS)
 
   @staticmethod
   def _sub_context(context, index):
diff --git a/sdks/python/apache_beam/transforms/trigger_test.py b/sdks/python/apache_beam/transforms/trigger_test.py
@@ -449,8 +449,8 @@ def _test(self, trigger, lateness, expected):
   def test_default_trigger(self):
     self._test(DefaultTrigger(), 0, DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_after_processing_time(self):
-    self._test(AfterProcessingTime(), 0, DataLossReason.MAY_FINISH)
+  def test_after_processing(self):
+    self._test(AfterProcessingTime(42), 0, DataLossReason.MAY_FINISH)
 
   def test_always(self):
     self._test(Always(), 0, DataLossReason.NO_POTENTIAL_LOSS)
@@ -461,7 +461,7 @@ def test_never(self):
   def test_after_watermark_no_allowed_lateness(self):
     self._test(AfterWatermark(), 0, DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_after_watermark_late_none(self):
+  def test_after_watermark_no_late_trigger(self):
     self._test(AfterWatermark(), 60, DataLossReason.MAY_FINISH)
 
   def test_after_watermark_no_allowed_lateness_safe_late(self):
@@ -470,93 +470,58 @@ def test_after_watermark_no_allowed_lateness_safe_late(self):
         0,
         DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_after_watermark_safe_late(self):
+  def test_after_watermark_allowed_lateness_safe_late(self):
     self._test(
         AfterWatermark(late=DefaultTrigger()),
         60,
         DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_after_watermark_no_allowed_lateness_may_finish_late(self):
-    self._test(
-        AfterWatermark(late=AfterProcessingTime()),
-        0,
-        DataLossReason.NO_POTENTIAL_LOSS)
-
-  def test_after_watermark_may_finish_late(self):
-    self._test(
-        AfterWatermark(late=AfterProcessingTime()),
-        60,
-        DataLossReason.NO_POTENTIAL_LOSS)
-
-  def test_after_watermark_no_allowed_lateness_condition_late(self):
-    self._test(
-        AfterWatermark(late=AfterCount(5)), 0, DataLossReason.NO_POTENTIAL_LOSS)
-
-  def test_after_watermark_condition_late(self):
-    self._test(
-        AfterWatermark(late=AfterCount(5)),
-        60,
-        DataLossReason.NO_POTENTIAL_LOSS)
-
-  def test_after_count_one(self):
-    self._test(AfterCount(1), 0, DataLossReason.MAY_FINISH)
-
-  def test_after_count_gt_one(self):
-    self._test(
-        AfterCount(2),
-        0,
-        DataLossReason.MAY_FINISH | DataLossReason.CONDITION_NOT_GUARANTEED)
+  def test_after_count(self):
+    self._test(AfterCount(42), 0, DataLossReason.MAY_FINISH)
 
   def test_repeatedly_safe_underlying(self):
     self._test(
         Repeatedly(DefaultTrigger()), 0, DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_repeatedly_may_finish_underlying(self):
-    self._test(Repeatedly(AfterCount(1)), 0, DataLossReason.NO_POTENTIAL_LOSS)
-
-  def test_repeatedly_condition_underlying(self):
-    self._test(Repeatedly(AfterCount(2)), 0, DataLossReason.NO_POTENTIAL_LOSS)
+  def test_repeatedly_unsafe_underlying(self):
+    self._test(Repeatedly(AfterCount(42)), 0, DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_after_any_some_unsafe(self):
+  def test_after_any_one_may_finish(self):
     self._test(
-        AfterAny(AfterCount(1), DefaultTrigger()),
-        0,
-        DataLossReason.NO_POTENTIAL_LOSS)
-
-  def test_after_any_same_reason(self):
-    self._test(
-        AfterAny(AfterCount(1), AfterProcessingTime()),
+        AfterAny(AfterCount(42), DefaultTrigger()),
         0,
         DataLossReason.MAY_FINISH)
 
-  def test_after_any_different_reasons(self):
+  def test_after_any_all_safe(self):
     self._test(
-        AfterAny(AfterCount(2), AfterProcessingTime()),
+        AfterAny(Repeatedly(AfterCount(42)), DefaultTrigger()),
         0,
-        DataLossReason.MAY_FINISH | DataLossReason.CONDITION_NOT_GUARANTEED)
-
-  def test_after_all_some_unsafe(self):
-    self._test(
-        AfterAll(AfterCount(1), DefaultTrigger()), 0, DataLossReason.MAY_FINISH)
+        DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_after_all_safe(self):
+  def test_after_all_some_may_finish(self):
     self._test(
-        AfterAll(Repeatedly(AfterCount(1)), DefaultTrigger()),
+        AfterAll(AfterCount(1), DefaultTrigger()),
         0,
         DataLossReason.NO_POTENTIAL_LOSS)
 
-  def test_after_each_some_unsafe(self):
+  def test_after_all_all_may_finish(self):
     self._test(
-        AfterEach(AfterCount(1), DefaultTrigger()),
+        AfterAll(AfterCount(42), AfterProcessingTime(42)),
         0,
         DataLossReason.MAY_FINISH)
 
-  def test_after_each_all_safe(self):
+  def test_after_each_at_least_one_safe(self):
     self._test(
-        AfterEach(Repeatedly(AfterCount(1)), DefaultTrigger()),
+        AfterEach(AfterCount(1), DefaultTrigger(), AfterCount(2)),
         0,
         DataLossReason.NO_POTENTIAL_LOSS)
 
+  def test_after_each_all_may_finish(self):
+    self._test(
+        AfterEach(AfterCount(1), AfterCount(2), AfterCount(3)),
+        0,
+        DataLossReason.MAY_FINISH)
+
 
 class RunnerApiTest(unittest.TestCase):
   def test_trigger_encoding(self):
@@ -606,6 +571,35 @@ def format_result(k, vs):
                   'B-3': {10, 15, 16},
               }.items())))
 
+  def test_after_count_streaming(self):
+    test_options = PipelineOptions(
+        flags=['--allow_unsafe_triggers', '--streaming'])
+    with TestPipeline(options=test_options) as p:
+      # yapf: disable
+      test_stream = (
+          TestStream()
+          .advance_watermark_to(0)
+          .add_elements([('A', 1), ('A', 2), ('A', 3)])
+          .add_elements([('A', 4), ('A', 5), ('A', 6)])
+          .add_elements([('B', 1), ('B', 2), ('B', 3)])
+          .advance_watermark_to_infinity())
+      # yapf: enable
+
+      results = (
+          p
+          | test_stream
+          | beam.WindowInto(
+              FixedWindows(10),
+              trigger=AfterCount(3),
+              accumulation_mode=AccumulationMode.ACCUMULATING)
+          | beam.GroupByKey())
+
+      assert_that(
+          results,
+          equal_to(list({
+            'A': [1, 2, 3], # 4 - 6 discarded because trigger finished
+            'B': [1, 2, 3]}.items())))
+
   def test_always(self):
     with TestPipeline() as p:
 
@@ -714,7 +708,8 @@ def test_on_pane_watermark_hold_no_pipeline_stall(self):
     test_stream.advance_processing_time(START_TIMESTAMP + 2)
     test_stream.advance_watermark_to(START_TIMESTAMP + 2)
 
-    with TestPipeline(options=PipelineOptions(['--streaming'])) as p:
+    with TestPipeline(options=PipelineOptions(
+        ['--streaming', '--allow_unsafe_triggers'])) as p:
       # pylint: disable=expression-not-assigned
       (
           p