提交 8c6711c8 编写于 作者: P pkpk 提交者: Yibing Liu

test=develop (#3205)

上级 05795e85
...@@ -4,3 +4,6 @@ ...@@ -4,3 +4,6 @@
[submodule "PaddleNLP/language_representations_kit/ERNIE"] [submodule "PaddleNLP/language_representations_kit/ERNIE"]
path = PaddleNLP/language_representations_kit/ERNIE path = PaddleNLP/language_representations_kit/ERNIE
url = https://github.com/PaddlePaddle/ERNIE url = https://github.com/PaddlePaddle/ERNIE
[submodule "PaddleRL"]
path = PaddleRL
url = https://github.com/PaddlePaddle/PARL
Subproject commit a884635519c529c69c34e1134ca6c9d99f2c0007
...@@ -71,17 +71,17 @@ class DQNModel(object): ...@@ -71,17 +71,17 @@ class DQNModel(object):
optimizer.minimize(cost) optimizer.minimize(cost)
vars = list(self.train_program.list_vars()) vars = list(self.train_program.list_vars())
target_vars = list(filter( target_vars = list(
lambda x: 'GRAD' not in x.name and 'target' in x.name, vars)) filter(lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
policy_vars_name = [ policy_vars_name = [
x.name.replace('target', 'policy') for x in target_vars] x.name.replace('target', 'policy') for x in target_vars
policy_vars = list(filter( ]
lambda x: x.name in policy_vars_name, vars)) policy_vars = list(filter(lambda x: x.name in policy_vars_name, vars))
policy_vars.sort(key=lambda x: x.name) policy_vars.sort(key=lambda x: x.name)
target_vars.sort(key=lambda x: x.name) target_vars.sort(key=lambda x: x.name)
with fluid.program_guard(self._sync_program): with fluid.program_guard(self._sync_program):
sync_ops = [] sync_ops = []
for i, var in enumerate(policy_vars): for i, var in enumerate(policy_vars):
...@@ -153,7 +153,6 @@ class DQNModel(object): ...@@ -153,7 +153,6 @@ class DQNModel(object):
bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field))) bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field)))
return out return out
def act(self, state, train_or_test): def act(self, state, train_or_test):
sample = np.random.random() sample = np.random.random()
if train_or_test == 'train' and sample < self.exploration: if train_or_test == 'train' and sample < self.exploration:
......
...@@ -64,9 +64,11 @@ class DoubleDQNModel(object): ...@@ -64,9 +64,11 @@ class DoubleDQNModel(object):
greedy_action = fluid.layers.argmax(next_s_predcit_value, axis=1) greedy_action = fluid.layers.argmax(next_s_predcit_value, axis=1)
greedy_action = fluid.layers.unsqueeze(greedy_action, axes=[1]) greedy_action = fluid.layers.unsqueeze(greedy_action, axes=[1])
predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim) predict_onehot = fluid.layers.one_hot(greedy_action,
self.action_dim)
best_v = fluid.layers.reduce_sum( best_v = fluid.layers.reduce_sum(
fluid.layers.elementwise_mul(predict_onehot, targetQ_predict_value), fluid.layers.elementwise_mul(predict_onehot,
targetQ_predict_value),
dim=1) dim=1)
best_v.stop_gradient = True best_v.stop_gradient = True
...@@ -79,17 +81,17 @@ class DoubleDQNModel(object): ...@@ -79,17 +81,17 @@ class DoubleDQNModel(object):
optimizer.minimize(cost) optimizer.minimize(cost)
vars = list(self.train_program.list_vars()) vars = list(self.train_program.list_vars())
target_vars = list(filter( target_vars = list(
lambda x: 'GRAD' not in x.name and 'target' in x.name, vars)) filter(lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
policy_vars_name = [ policy_vars_name = [
x.name.replace('target', 'policy') for x in target_vars] x.name.replace('target', 'policy') for x in target_vars
policy_vars = list(filter( ]
lambda x: x.name in policy_vars_name, vars)) policy_vars = list(filter(lambda x: x.name in policy_vars_name, vars))
policy_vars.sort(key=lambda x: x.name) policy_vars.sort(key=lambda x: x.name)
target_vars.sort(key=lambda x: x.name) target_vars.sort(key=lambda x: x.name)
with fluid.program_guard(self._sync_program): with fluid.program_guard(self._sync_program):
sync_ops = [] sync_ops = []
for i, var in enumerate(policy_vars): for i, var in enumerate(policy_vars):
...@@ -161,7 +163,6 @@ class DoubleDQNModel(object): ...@@ -161,7 +163,6 @@ class DoubleDQNModel(object):
bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field))) bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field)))
return out return out
def act(self, state, train_or_test): def act(self, state, train_or_test):
sample = np.random.random() sample = np.random.random()
if train_or_test == 'train' and sample < self.exploration: if train_or_test == 'train' and sample < self.exploration:
......
...@@ -71,17 +71,17 @@ class DuelingDQNModel(object): ...@@ -71,17 +71,17 @@ class DuelingDQNModel(object):
optimizer.minimize(cost) optimizer.minimize(cost)
vars = list(self.train_program.list_vars()) vars = list(self.train_program.list_vars())
target_vars = list(filter( target_vars = list(
lambda x: 'GRAD' not in x.name and 'target' in x.name, vars)) filter(lambda x: 'GRAD' not in x.name and 'target' in x.name, vars))
policy_vars_name = [ policy_vars_name = [
x.name.replace('target', 'policy') for x in target_vars] x.name.replace('target', 'policy') for x in target_vars
policy_vars = list(filter( ]
lambda x: x.name in policy_vars_name, vars)) policy_vars = list(filter(lambda x: x.name in policy_vars_name, vars))
policy_vars.sort(key=lambda x: x.name) policy_vars.sort(key=lambda x: x.name)
target_vars.sort(key=lambda x: x.name) target_vars.sort(key=lambda x: x.name)
with fluid.program_guard(self._sync_program): with fluid.program_guard(self._sync_program):
sync_ops = [] sync_ops = []
for i, var in enumerate(policy_vars): for i, var in enumerate(policy_vars):
...@@ -163,7 +163,6 @@ class DuelingDQNModel(object): ...@@ -163,7 +163,6 @@ class DuelingDQNModel(object):
advantage, dim=1, keep_dim=True)) advantage, dim=1, keep_dim=True))
return Q return Q
def act(self, state, train_or_test): def act(self, state, train_or_test):
sample = np.random.random() sample = np.random.random()
if train_or_test == 'train' and sample < self.exploration: if train_or_test == 'train' and sample < self.exploration:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册