Commit 6b9dd1d8 authored by ZHANG, Zijie

fix PaddleRL policy_gradient bug

Parent 3a33a0bb
@@ -22,6 +22,8 @@ class PolicyGradient:
         self.place = fluid.CPUPlace()
         self.exe = fluid.Executor(self.place)
+        self.all_act_prob = None
+
     def build_net(self):
         obs = fluid.layers.data(
@@ -31,10 +33,10 @@ class PolicyGradient:
         # fc1
         fc1 = fluid.layers.fc(input=obs, size=10, act="tanh")  # tanh activation
         # fc2
-        all_act_prob = fluid.layers.fc(input=fc1,
+        self.all_act_prob = fluid.layers.fc(input=fc1,
                                        size=self.n_actions,
                                        act="softmax")
-        self.inferece_program = fluid.defaul_main_program().clone()
+        self.inferece_program = fluid.default_main_program().clone()
         # to maximize total reward (log_p * R) is to minimize -(log_p * R)
         neg_log_prob = fluid.layers.cross_entropy(
             input=self.all_act_prob,
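
For context: the softmax output is promoted to self.all_act_prob so that the cloned inferece_program can fetch that tensor later when selecting actions. A minimal, hypothetical sketch of such an action-sampling method (the method name choose_action, the feed key "obs", and the observation shape are assumptions, not part of this diff):

import numpy as np

def choose_action(self, observation):
    # Run only the cloned forward/inference program and fetch the
    # softmax tensor stored on self (this is what the commit enables).
    prob = self.exe.run(self.inferece_program,
                        feed={"obs": observation[np.newaxis, :].astype("float32")},
                        fetch_list=[self.all_act_prob])[0]
    # prob has shape (1, n_actions); sample an action from the policy.
    return np.random.choice(range(self.n_actions), p=prob.ravel())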
@@ -43,9 +43,9 @@ class Env():
     def draw(self, new_line=False):
         if new_line:
-            print ""
+            print("")
         else:
-            print "\r",
+            print("\r")
         for i in range(self.stage_len):
             if i == self.position:
                 sys.stdout.write("O")
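
A note on the second change (an observation, not part of the commit): in Python 2, the trailing comma in print "\r", suppressed the newline, so each draw() call redrew the same terminal line; Python 3's print("\r") appends a newline instead. A closer behavior-preserving port would be:

# end="" suppresses the newline, returning the cursor to column 0 so the
# subsequent sys.stdout.write calls overwrite the previous frame.
print("\r", end="")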