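"""Monte Carlo policy-gradient (REINFORCE) agent built on PaddlePaddle Fluid.

PolicyGradient defines a small softmax policy network, samples actions from
it, buffers per-episode transitions, and updates the network weights using
discounted, normalized episode rewards.
"""
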
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
# reproducible
np.random.seed(1)


class PolicyGradient:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.95,
            output_graph=False, ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

        self.place = fluid.CPUPlace()
        self.exe = fluid.Executor(self.place)

    def build_net(self):
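        """Build the policy network and its training objective.

        obs -> fc(size=10, tanh) -> fc(size=n_actions, softmax); the loss is
        the mean of -log(pi(a|s)) weighted by the discounted return vt, and
        it is minimized with SGD at learning rate self.lr.
        """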

        obs = fluid.layers.data(
            name='obs', shape=[self.n_features], dtype='float32')
        acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
        vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
        # fc1
        fc1 = fluid.layers.fc(
            input=obs,
            size=10,
            act="tanh"  # tanh activation
        )
        # fc2
        self.all_act_prob = fluid.layers.fc(input=fc1,
                                            size=self.n_actions,
                                            act="softmax")
        # to maximize total reward (log_p * R) is to minimize -(log_p * R)
        neg_log_prob = fluid.layers.cross_entropy(
            input=self.all_act_prob,
            label=acts)  # negative log-probability of the chosen action
        neg_log_prob_weight = fluid.layers.elementwise_mul(
            x=neg_log_prob, y=vt)
        loss = fluid.layers.reduce_mean(
            x=neg_log_prob_weight)  # reward guided loss

        sgd_optimizer = fluid.optimizer.SGD(self.lr)
        sgd_optimizer.minimize(loss)
        self.exe.run(fluid.default_startup_program())

    def choose_action(self, observation):
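        """Sample an action from the current policy for one observation.

        Only the forward (inference) part of the program, pruned to the
        softmax output, is executed here.
        """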
        prob_weights = self.exe.run(
            fluid.default_main_program().prune(self.all_act_prob),
            feed={"obs": observation[np.newaxis, :]},
            fetch_list=[self.all_act_prob])
        prob_weights = np.array(prob_weights[0])
        action = np.random.choice(
            range(prob_weights.shape[1]),
            p=prob_weights.ravel())  # select action w.r.t the actions prob
        return action

    def store_transition(self, s, a, r):
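        """Buffer one (observation, action, reward) transition."""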
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
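        """Run one policy-gradient update on the buffered episode.

        Returns the discounted, normalized episode rewards used as vt.
        """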
        # discount and normalize episode reward
        discounted_ep_rs_norm = self._discount_and_norm_rewards()
        tensor_obs = np.vstack(self.ep_obs).astype("float32")
        tensor_as = np.array(self.ep_as).astype("int64")
        tensor_as = tensor_as.reshape([tensor_as.shape[0], 1])
        tensor_vt = discounted_ep_rs_norm.astype("float32")[:, np.newaxis]
        # train on episode
        self.exe.run(
            fluid.default_main_program(),
            feed={
                "obs": tensor_obs,  # shape=[None, n_features]
                "acts": tensor_as,  # shape=[None, 1]
                "vt": tensor_vt  # shape=[None, 1]
            })
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
        return discounted_ep_rs_norm

    def _discount_and_norm_rewards(self):
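        """Discount the episode rewards and standardize them."""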
        # discount episode rewards
        discounted_ep_rs = np.zeros_like(self.ep_rs)
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):
            running_add = running_add * self.gamma + self.ep_rs[t]
            discounted_ep_rs[t] = running_add

        # normalize episode rewards
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
        return discounted_ep_rs
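

# A minimal usage sketch: it assumes the classic Gym API (env.reset() returning
# an observation, env.step() returning four values) and the CartPole-v0
# environment; the environment name, episode count, and float32 cast are
# illustrative choices, not requirements of PolicyGradient.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v0")
    agent = PolicyGradient(
        n_actions=env.action_space.n,
        n_features=env.observation_space.shape[0])
    agent.build_net()

    for episode in range(200):
        observation = env.reset()
        while True:
            # sample an action from the current policy and step the environment
            action = agent.choose_action(observation.astype("float32"))
            observation_, reward, done, _ = env.step(action)
            agent.store_transition(observation, action, reward)
            observation = observation_
            if done:
                agent.learn()  # policy update on the finished episode
                break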