# brain.py
import numpy as np
import paddle.fluid as fluid
# reproducible
np.random.seed(1)


class PolicyGradient:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.95,
            output_graph=False, ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

        self.place = fluid.CPUPlace()
        self.exe = fluid.Executor(self.place)
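        # (CPUPlace above could be swapped for fluid.CUDAPlace(0) to run on a GPU build)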

    def build_net(self):

        obs = fluid.layers.data(
            name='obs', shape=[self.n_features], dtype='float32')
        acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
        vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
        # fc1
        fc1 = fluid.layers.fc(input=obs, size=10, act="tanh")  # tanh activation
        # fc2
        self.all_act_prob = fluid.layers.fc(
            input=fc1, size=self.n_actions, act="softmax")
        self.inference_program = fluid.default_main_program().clone()
        # to maximize total reward (log_p * R) is to minimize -(log_p * R)
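        # cross_entropy on a softmax output with an integer label equals
        # -log(prob of the chosen action), so weighting it by vt below gives
        # the REINFORCE loss: mean(-log pi(a|s) * vt)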
        neg_log_prob = fluid.layers.cross_entropy(
            input=self.all_act_prob,
            label=acts)  # this is negative log of chosen action
        neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt)
        loss = fluid.layers.reduce_mean(
            neg_log_prob_weight)  # reward guided loss

        sgd_optimizer = fluid.optimizer.SGD(self.lr)
        sgd_optimizer.minimize(loss)
        self.exe.run(fluid.default_startup_program())

    def choose_action(self, observation):
        prob_weights = self.exe.run(self.inference_program,
                                    feed={"obs": observation[np.newaxis, :]},
                                    fetch_list=[self.all_act_prob])
        prob_weights = np.array(prob_weights[0])
        # select an action w.r.t. the action probabilities
        action = np.random.choice(
            range(prob_weights.shape[1]), p=prob_weights.ravel())
        return action

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
        # discount and normalize episode reward
        discounted_ep_rs_norm = self._discount_and_norm_rewards()
        tensor_obs = np.vstack(self.ep_obs).astype("float32")
        tensor_as = np.array(self.ep_as).astype("int64")
        tensor_as = tensor_as.reshape([tensor_as.shape[0], 1])
        tensor_vt = discounted_ep_rs_norm.astype("float32")[:, np.newaxis]
        # train on episode
        self.exe.run(
            fluid.default_main_program(),
            feed={
                "obs": tensor_obs,  # shape=[None, n_obs]
                "acts": tensor_as,  # shape=[None, ]
                "vt": tensor_vt  # shape=[None, ]
            })
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
        return discounted_ep_rs_norm

    def _discount_and_norm_rewards(self):
        # discount episode rewards
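        # e.g. with gamma = 0.9 and ep_rs = [1, 1, 1]:
        #   t=2: 1.0;  t=1: 1 + 0.9*1.0 = 1.9;  t=0: 1 + 0.9*1.9 = 2.71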
        discounted_ep_rs = np.zeros_like(
            self.ep_rs, dtype="float32")  # float buffer avoids integer truncation
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):
            running_add = running_add * self.gamma + self.ep_rs[t]
            discounted_ep_rs[t] = running_add

        # normalize episode rewards
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
        return discounted_ep_rs
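

# --- Minimal usage sketch (not part of the original brain.py) ---
# A hedged example of how this agent might be driven: it assumes the classic
# `gym` CartPole environment and the old 4-tuple `env.step()` return value
# (obs, reward, done, info). `gym`, "CartPole-v0", and the episode count are
# illustrative assumptions, not something defined by this file.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v0")
    agent = PolicyGradient(
        n_actions=env.action_space.n,
        n_features=env.observation_space.shape[0])
    agent.build_net()

    for episode in range(200):
        observation = env.reset()
        while True:
            # sample an action from the current policy
            action = agent.choose_action(observation.astype("float32"))
            observation_, reward, done, _ = env.step(action)
            agent.store_transition(observation, action, reward)
            if done:
                agent.learn()  # one policy-gradient update per finished episode
                break
            observation = observation_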