-
-
Notifications
You must be signed in to change notification settings - Fork 524
Expand file tree
/
Copy pathtest_objectives.py
More file actions
106 lines (86 loc) · 4.54 KB
/
test_objectives.py
File metadata and controls
106 lines (86 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Copyright 2020 Tensorforce Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import unittest
from test.unittest_base import UnittestBase
class TestObjectives(UnittestBase, unittest.TestCase):
    # Runs the shared `self.unittest` driver (inherited from UnittestBase) once per
    # objective configuration. Each test method only assembles the keyword arguments;
    # any assertions are presumably made inside the driver -- confirm in UnittestBase.

    def test_deterministic_policy_gradient(self):
        # Float-only action space, 'deterministic_policy_gradient' policy objective and
        # an 'action_value' baseline objective.
        self.start_tests(name='deterministic-policy-gradient')

        action_specs = {
            'gaussian_action1': {
                'type': 'float', 'shape': (1, 2), 'min_value': 1.0, 'max_value': 2.0
            },
            'gaussian_action2': {
                'type': 'float', 'shape': (1,), 'min_value': -2.0, 'max_value': 1.0
            },
            'beta_action': {
                'type': 'float', 'shape': (), 'min_value': 1.0, 'max_value': 2.0
            }
        }

        # TODO: no-RNN restriction can be removed
        policy_spec = {
            'network': {'type': 'auto', 'size': 8, 'depth': 1, 'rnn': False},
            'distributions': {
                'gaussian_action2': {'type': 'gaussian', 'stddev_mode': 'global'},
                'beta_action': 'beta'
            }
        }

        estimation_spec = {
            'horizon': 3,
            'estimate_advantage': True,
            'predict_horizon_values': 'late',
            'predict_action_values': True,
            'return_processing': {'type': 'clipping', 'lower': -1.0, 'upper': 1.0},
            'advantage_processing': 'batch_normalization'
        }

        baseline_spec = {
            'network': {'type': 'auto', 'size': 7, 'depth': 1, 'rnn': False}
        }

        self.unittest(
            actions=action_specs,
            policy=policy_spec,
            objective='deterministic_policy_gradient',
            reward_estimation=estimation_spec,
            baseline=baseline_spec,
            baseline_objective='action_value'
        )

    def test_plus(self):
        # 'plus' objective combining 'policy_gradient' and 'action_value', run on an
        # action space with bool, int and float actions.
        self.start_tests(name='plus')

        action_specs = {
            'bool_action': {'type': 'bool', 'shape': (1,)},
            'int_action1': {'type': 'int', 'shape': (), 'num_values': 4},
            'int_action2': {'type': 'int', 'shape': (2,), 'num_values': 3},
            'int_action3': {'type': 'int', 'shape': (2, 1), 'num_values': 2},
            'gaussian_action1': {
                'type': 'float', 'shape': (1, 2), 'min_value': 1.0, 'max_value': 2.0
            },
            'gaussian_action2': {
                'type': 'float', 'shape': (1,), 'min_value': -2.0, 'max_value': 1.0
            }
        }
        combined_objective = {
            'type': 'plus', 'objective1': 'policy_gradient', 'objective2': 'action_value'
        }

        self.unittest(actions=action_specs, objective=combined_objective)

    def test_policy_gradient(self):
        # 'policy_gradient' objective: the plain string form followed by each option
        # variant, one driver run per variant, in the order listed.
        self.start_tests(name='policy-gradient')

        objective_variants = (
            'policy_gradient',
            {'type': 'policy_gradient', 'importance_sampling': True},
            {'type': 'policy_gradient', 'clipping_value': 1.0},
            {'type': 'policy_gradient', 'importance_sampling': True, 'clipping_value': 0.2},
            {'type': 'policy_gradient', 'early_reduce': True},
        )
        for variant in objective_variants:
            self.unittest(objective=variant)

    def test_value(self):
        # Value objectives on a discrete (bool/int) action space, passed either as the
        # baseline objective or as the main objective; entropy regularization is set to
        # 0.0 in every run.
        self.start_tests(name='value')

        # The same action specification object is handed to all four runs below.
        action_specs = {
            'bool_action': {'type': 'bool', 'shape': (1,)},
            'int_action1': {'type': 'int', 'shape': (), 'num_values': 4},
            'int_action2': {'type': 'int', 'shape': (2,), 'num_values': 3},
            'int_action3': {'type': 'int', 'shape': (2, 1), 'num_values': 2}
        }

        # State value does not affect advantage variables of main policy
        self.unittest(
            actions=action_specs, baseline_objective='state_value', entropy_regularization=0.0
        )

        # Action-value as the main objective, with a policy network configured with rnn=2.
        recurrent_policy = {'network': {'type': 'auto', 'size': 8, 'depth': 1, 'rnn': 2}}
        action_value_objective = {'type': 'value', 'value': 'action'}
        self.unittest(
            actions=action_specs,
            policy=recurrent_policy,
            objective=action_value_objective,
            entropy_regularization=0.0
        )

        # State-value baseline objective with huber_loss=1.0.
        huber_state_value = {'type': 'value', 'value': 'state', 'huber_loss': 1.0}
        self.unittest(
            actions=action_specs, baseline_objective=huber_state_value,
            entropy_regularization=0.0
        )

        # Action-value baseline objective with early_reduce enabled.
        early_reduce_action_value = {'type': 'action_value', 'early_reduce': True}
        self.unittest(
            actions=action_specs, baseline_objective=early_reduce_action_value,
            entropy_regularization=0.0
        )