diff --git a/fluid/policy_gradient/brain.py b/fluid/policy_gradient/brain.py
index 9c7041e698b0cdc03d614ffdef7d779f4f632327..272d0f817fad14835993e2d0a8d733f78288d537 100644
--- a/fluid/policy_gradient/brain.py
+++ b/fluid/policy_gradient/brain.py
@@ -36,9 +36,10 @@ class PolicyGradient:
             act="tanh"  # tanh activation
         )
         # fc2
         self.all_act_prob = fluid.layers.fc(input=fc1,
                                             size=self.n_actions,
                                             act="softmax")
+        self.inference_program = fluid.default_main_program().clone()
         # to maximize total reward (log_p * R) is to minimize -(log_p * R)
         neg_log_prob = fluid.layers.cross_entropy(
             input=self.all_act_prob,
@@ -53,7 +54,7 @@ class PolicyGradient:
 
     def choose_action(self, observation):
         prob_weights = self.exe.run(
-            fluid.default_main_program().prune(self.all_act_prob),
+            self.inference_program,
             feed={"obs": observation[np.newaxis, :]},
             fetch_list=[self.all_act_prob])
         prob_weights = np.array(prob_weights[0])
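
Note on the change above: the clone only works as an inference program because it is taken right after the softmax layer, before cross_entropy and the optimizer ops are appended to fluid.default_main_program(). A minimal, self-contained sketch of that pattern follows (assuming PaddlePaddle's legacy 1.x fluid API; names such as n_features, n_actions and the Adam optimizer are illustrative and not taken from this patch):

    # Sketch only: clone-before-training-ops pattern, not part of the patch.
    import numpy as np
    import paddle.fluid as fluid

    n_features, n_actions = 4, 2

    # Forward (inference) graph.
    obs = fluid.layers.data(name="obs", shape=[n_features], dtype="float32")
    fc1 = fluid.layers.fc(input=obs, size=10, act="tanh")
    all_act_prob = fluid.layers.fc(input=fc1, size=n_actions, act="softmax")

    # Clone the default main program *before* any training ops are appended,
    # so the clone contains only the forward pass needed for action selection.
    inference_program = fluid.default_main_program().clone()

    # Training-only ops are added after the clone and stay out of it.
    acts = fluid.layers.data(name="acts", shape=[1], dtype="int64")
    vt = fluid.layers.data(name="vt", shape=[1], dtype="float32")
    neg_log_prob = fluid.layers.cross_entropy(input=all_act_prob, label=acts)
    loss = fluid.layers.reduce_mean(
        fluid.layers.elementwise_mul(neg_log_prob, vt))
    fluid.optimizer.Adam(learning_rate=0.01).minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # Inference runs the cloned program, so it only needs "obs" in the feed;
    # this is what choose_action() relies on instead of prune().
    observation = np.random.rand(n_features).astype("float32")
    prob_weights = exe.run(inference_program,
                           feed={"obs": observation[np.newaxis, :]},
                           fetch_list=[all_act_prob])[0]
    print(prob_weights)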