Commit cabe817

Use a step counter rather than physics.time() to enforce episode time limits
This circumvents a problem where the step count sometimes differs from `time_limit / control_timestep` due to accumulation of rounding error in `mjData->time`.

PiperOrigin-RevId: 192746383
1 parent b7c85e5 commit cabe817
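
The rounding problem is easy to reproduce outside MuJoCo. Below is a minimal sketch (not part of the commit; the timestep and limit values are illustrative): accumulating a fixed timestep in a float, as `mjData->time` does, drifts away from the exact product, so a `time >= time_limit` check can end an episode a step early or late, whereas an integer step counter cannot drift.

timestep = 0.01    # physics timestep, as in the updated test below
time_limit = 10.0  # intended to correspond to exactly 1000 steps

time, steps = 0.0, 0
while time < time_limit:  # old-style check against accumulated time
  time += timestep        # rounds on every addition, like mjData->time
  steps += 1

# Accumulated error typically makes this loop run 1001 steps rather than
# the intended 1000 on IEEE-754 doubles; the exact drift is
# platform-dependent.
print(steps, time)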

2 files changed: 33 additions & 13 deletions


dm_control/rl/control.py

Lines changed: 15 additions & 8 deletions
@@ -23,8 +23,6 @@
 import collections
 import contextlib

-# Internal dependencies.
-
 import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -64,7 +62,6 @@ def __init__(self,
     """
     self._task = task
     self._physics = physics
-    self._time_limit = time_limit
     self._flat_observation = flat_observation

     if n_sub_steps is not None and control_timestep is not None:
@@ -77,11 +74,18 @@ def __init__(self,
     else:
       self._n_sub_steps = 1

+    if time_limit == float('inf'):
+      self._step_limit = float('inf')
+    else:
+      self._step_limit = time_limit / (
+          self._physics.timestep() * self._n_sub_steps)
+    self._step_count = 0
     self._reset_next_step = True

   def reset(self):
     """Starts a new episode and returns the first `TimeStep`."""
     self._reset_next_step = False
+    self._step_count = 0
     with self._physics.reset_context():
       self._task.initialize_episode(self._physics)

@@ -111,18 +115,21 @@ def step(self, action):
     if self._flat_observation:
       observation = flatten_observation(observation)

-    if self.physics.time() >= self._time_limit:
+    self._step_count += 1
+    if self._step_count >= self._step_limit:
       discount = 1.0
     else:
       discount = self._task.get_termination(self._physics)

-    if discount is None:
-      return environment.TimeStep(
-          environment.StepType.MID, reward, 1.0, observation)
-    else:
+    episode_over = discount is not None
+
+    if episode_over:
       self._reset_next_step = True
       return environment.TimeStep(
           environment.StepType.LAST, reward, discount, observation)
+    else:
+      return environment.TimeStep(
+          environment.StepType.MID, reward, 1.0, observation)

   def action_spec(self):
     """Returns the action specification for this environment."""

dm_control/rl/control_test.py

Lines changed: 18 additions & 5 deletions
@@ -74,12 +74,25 @@ def test_environment_calls(self):

     self.assertEquals(_CONSTANT_REWARD_VALUE, time_step.reward)

-  def test_timeout(self):
-    self._physics.time = mock.Mock(return_value=2.)
+  @parameterized.parameters(
+      {'physics_timestep': .01, 'control_timestep': None,
+       'expected_steps': 1000},
+      {'physics_timestep': .01, 'control_timestep': .05,
+       'expected_steps': 5000})
+  def test_timeout(self, expected_steps, physics_timestep, control_timestep):
+    self._physics.timestep.return_value = physics_timestep
+    time_limit = expected_steps * (control_timestep or physics_timestep)
     env = control.Environment(
-        physics=self._physics, task=self._task, time_limit=1.)
-    env.reset()
-    time_step = env.step([1])
+        physics=self._physics, task=self._task, time_limit=time_limit,
+        control_timestep=control_timestep)
+
+    time_step = env.reset()
+    steps = 0
+    while not time_step.last():
+      time_step = env.step([1])
+      steps += 1
+
+    self.assertEqual(steps, expected_steps)
     self.assertTrue(time_step.last())

     time_step = env.step([1])
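
For readers unfamiliar with the idiom, `parameterized.parameters` from `absl.testing` runs the decorated test once per entry, passing each dict as keyword arguments, and `control_timestep or physics_timestep` falls back to one physics step per control step when no control timestep is given. Here is a self-contained sketch of the same arithmetic (a hypothetical stand-in test, not part of the commit):

# Standalone sketch (hypothetical test, not from the commit) of the
# parameterized idiom and the step-limit arithmetic exercised above.
from absl.testing import absltest
from absl.testing import parameterized


class StepLimitArithmeticTest(parameterized.TestCase):

  @parameterized.parameters(
      {'physics_timestep': .01, 'control_timestep': None,
       'expected_steps': 1000},
      {'physics_timestep': .01, 'control_timestep': .05,
       'expected_steps': 5000})
  def test_step_limit(self, physics_timestep, control_timestep,
                      expected_steps):
    # When control_timestep is None, one control step is one physics step.
    effective = control_timestep or physics_timestep
    time_limit = expected_steps * effective
    n_sub_steps = round(effective / physics_timestep)
    step_limit = time_limit / (physics_timestep * n_sub_steps)
    self.assertEqual(round(step_limit), expected_steps)


if __name__ == '__main__':
  absltest.main()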
