Commit 8c6711c8 authored by pkpk, committed by Yibing Liu

test=develop (#3205)

Parent 05795e85
...
@@ -4,3 +4,6 @@
 [submodule "PaddleNLP/language_representations_kit/ERNIE"]
 	path = PaddleNLP/language_representations_kit/ERNIE
 	url = https://github.com/PaddlePaddle/ERNIE
+[submodule "PaddleRL"]
+	path = PaddleRL
+	url = https://github.com/PaddlePaddle/PARL
...
+Subproject commit a884635519c529c69c34e1134ca6c9d99f2c0007
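This pins PaddleRL as a git submodule pointing at the PARL repository, fixed to the commit above; after checking out this revision, the standard `git submodule update --init PaddleRL` fetches that pinned commit.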
...
@@ -71,17 +71,17 @@ class DQNModel(object):
         optimizer.minimize(cost)
 
         vars = list(self.train_program.list_vars())
-        target_vars = list(filter(
-            lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
+        target_vars = list(
+            filter(lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
 
         policy_vars_name = [
-            x.name.replace('target', 'policy') for x in target_vars]
-        policy_vars = list(filter(
-            lambda x: x.name in policy_vars_name, vars))
+            x.name.replace('target', 'policy') for x in target_vars
+        ]
+        policy_vars = list(filter(lambda x: x.name in policy_vars_name, vars))
 
         policy_vars.sort(key=lambda x: x.name)
         target_vars.sort(key=lambda x: x.name)
 
         with fluid.program_guard(self._sync_program):
             sync_ops = []
             for i, var in enumerate(policy_vars):
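The hunk above (repeated for each DQN variant below) only re-wraps lines, but the logic it touches is the name-convention pairing used to sync the target network from the policy network. A minimal plain-Python sketch of that pairing, with a hypothetical `Var` class standing in for the variables of `train_program`:

```python
# Sketch of the target/policy pairing above; Var is a hypothetical
# stand-in for Paddle program variables (only .name is needed here).
class Var(object):
    def __init__(self, name):
        self.name = name

vars = [Var('policy_fc1_w'), Var('policy_fc1_b'),
        Var('target_fc1_w'), Var('target_fc1_b'),
        Var('policy_fc1_w@GRAD')]  # gradient vars must be filtered out

# Non-gradient parameters of the target network.
target_vars = list(
    filter(lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
# Matching policy-network parameters, located by name substitution.
policy_vars_name = [x.name.replace('target', 'policy') for x in target_vars]
policy_vars = list(filter(lambda x: x.name in policy_vars_name, vars))

# Sorting both lists by name aligns each policy/target pair, so the
# sync program can emit one assign op per pair (policy -> target).
policy_vars.sort(key=lambda x: x.name)
target_vars.sort(key=lambda x: x.name)
for policy, target in zip(policy_vars, target_vars):
    print('%s -> %s' % (policy.name, target.name))
```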
...
@@ -153,7 +153,6 @@ class DQNModel(object):
             bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field)))
         return out
 
-
     def act(self, state, train_or_test):
         sample = np.random.random()
         if train_or_test == 'train' and sample < self.exploration:
...
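The `act` method whose opening lines appear above (its body is truncated in this diff) is an epsilon-greedy policy. A self-contained numpy sketch of that selection rule; the function name `epsilon_greedy` and the `q_values` argument are illustrative, since the real method obtains Q-values by running the prediction program for the given state:

```python
import numpy as np

def epsilon_greedy(q_values, exploration, train_or_test):
    sample = np.random.random()
    if train_or_test == 'train' and sample < exploration:
        # Explore: pick a uniformly random action.
        return np.random.randint(len(q_values))
    # Exploit: pick the action with the highest predicted Q-value.
    return int(np.argmax(q_values))

print(epsilon_greedy(np.array([0.1, 0.7, 0.2]),
                     exploration=0.1, train_or_test='train'))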
...
@@ -64,9 +64,11 @@ class DoubleDQNModel(object):
         greedy_action = fluid.layers.argmax(next_s_predcit_value, axis=1)
         greedy_action = fluid.layers.unsqueeze(greedy_action, axes=[1])
 
-        predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
+        predict_onehot = fluid.layers.one_hot(greedy_action,
+                                              self.action_dim)
         best_v = fluid.layers.reduce_sum(
-            fluid.layers.elementwise_mul(predict_onehot, targetQ_predict_value),
+            fluid.layers.elementwise_mul(predict_onehot,
+                                         targetQ_predict_value),
             dim=1)
         best_v.stop_gradient = True
 
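The change above only re-wraps long lines, but the code it touches is the core of Double DQN: the policy network chooses the greedy next action, and the target network evaluates it. A numpy sketch of the same selection with toy values (the arrays and their contents are made up for illustration; variable names mirror the diff, including the original `predcit` spelling):

```python
import numpy as np

next_s_predcit_value = np.array([[0.2, 0.9, 0.1]])   # policy net Q(s', .)
targetQ_predict_value = np.array([[0.5, 0.3, 0.8]])  # target net Q(s', .)

# Argmax over actions comes from the policy network...
greedy_action = next_s_predcit_value.argmax(axis=1)            # -> [1]
# ...and a one-hot mask extracts that action's value from the target net.
predict_onehot = np.eye(next_s_predcit_value.shape[1])[greedy_action]
best_v = (predict_onehot * targetQ_predict_value).sum(axis=1)  # -> [0.3]
print(best_v)
```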
...
@@ -79,17 +81,17 @@ class DoubleDQNModel(object):
         optimizer.minimize(cost)
 
         vars = list(self.train_program.list_vars())
-        target_vars = list(filter(
-            lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
+        target_vars = list(
+            filter(lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
 
         policy_vars_name = [
-            x.name.replace('target', 'policy') for x in target_vars]
-        policy_vars = list(filter(
-            lambda x: x.name in policy_vars_name, vars))
+            x.name.replace('target', 'policy') for x in target_vars
+        ]
+        policy_vars = list(filter(lambda x: x.name in policy_vars_name, vars))
 
         policy_vars.sort(key=lambda x: x.name)
         target_vars.sort(key=lambda x: x.name)
 
         with fluid.program_guard(self._sync_program):
             sync_ops = []
             for i, var in enumerate(policy_vars):
...
@@ -161,7 +163,6 @@ class DoubleDQNModel(object):
             bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field)))
         return out
 
-
     def act(self, state, train_or_test):
         sample = np.random.random()
         if train_or_test == 'train' and sample < self.exploration:
...
...
@@ -71,17 +71,17 @@ class DuelingDQNModel(object):
         optimizer.minimize(cost)
 
         vars = list(self.train_program.list_vars())
-        target_vars = list(filter(
-            lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
+        target_vars = list(
+            filter(lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
 
         policy_vars_name = [
-            x.name.replace('target', 'policy') for x in target_vars]
-        policy_vars = list(filter(
-            lambda x: x.name in policy_vars_name, vars))
+            x.name.replace('target', 'policy') for x in target_vars
+        ]
+        policy_vars = list(filter(lambda x: x.name in policy_vars_name, vars))
 
         policy_vars.sort(key=lambda x: x.name)
         target_vars.sort(key=lambda x: x.name)
 
         with fluid.program_guard(self._sync_program):
             sync_ops = []
             for i, var in enumerate(policy_vars):
...
@@ -163,7 +163,6 @@ class DuelingDQNModel(object):
                 advantage, dim=1, keep_dim=True))
         return Q
 
-
     def act(self, state, train_or_test):
         sample = np.random.random()
         if train_or_test == 'train' and sample < self.exploration:
...
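The DuelingDQNModel hunk above shows the tail of the standard dueling aggregation, Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)), with the mean taken over actions via `reduce_mean(..., dim=1, keep_dim=True)`. A numpy sketch with toy values (the arrays are made up for illustration):

```python
import numpy as np

value = np.array([[2.0]])                 # state value V(s), shape (1, 1)
advantage = np.array([[1.0, -1.0, 0.0]])  # advantages A(s, a), shape (1, 3)

# Subtracting the per-state mean advantage keeps the decomposition
# identifiable before adding the state value back in.
Q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(Q)  # [[3. 1. 2.]]
```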