# brain.py
import numpy as np
import paddle.fluid as fluid
# reproducible
np.random.seed(1)


class PolicyGradient:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.95,
            output_graph=False, ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay

        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

        self.place = fluid.CPUPlace()
        self.exe = fluid.Executor(self.place)

    def build_net(self):

        obs = fluid.layers.data(
            name='obs', shape=[self.n_features], dtype='float32')
        acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
        vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
        # fc1
        fc1 = fluid.layers.fc(input=obs, size=10, act="tanh")  # tanh activation
        # fc2
        self.all_act_prob = fluid.layers.fc(
            input=fc1, size=self.n_actions, act="softmax")
        # snapshot of the forward-only program (before loss/optimizer) for inference
        self.inference_program = fluid.default_main_program().clone()
        # to maximize total reward (log_p * R) is to minimize -(log_p * R)
        neg_log_prob = fluid.layers.cross_entropy(
            input=self.all_act_prob,
            label=acts)  # this is negative log of chosen action
        neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt)
        loss = fluid.layers.reduce_mean(
            neg_log_prob_weight)  # reward guided loss

        sgd_optimizer = fluid.optimizer.SGD(self.lr)
        sgd_optimizer.minimize(loss)
        self.exe.run(fluid.default_startup_program())

    def choose_action(self, observation):
        prob_weights = self.exe.run(self.inference_program,
                                    feed={"obs": observation[np.newaxis, :]},
                                    fetch_list=[self.all_act_prob])
        prob_weights = np.array(prob_weights[0])
        action = np.random.choice(
            range(prob_weights.shape[1]),
            p=prob_weights.ravel())  # select action w.r.t the actions prob
        return action

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def learn(self):
        # discount and normalize episode reward
        discounted_ep_rs_norm = self._discount_and_norm_rewards()
        tensor_obs = np.vstack(self.ep_obs).astype("float32")
        tensor_as = np.array(self.ep_as).astype("int64")
        tensor_as = tensor_as.reshape([tensor_as.shape[0], 1])
        tensor_vt = discounted_ep_rs_norm.astype("float32")[:, np.newaxis]
        # train on episode
        self.exe.run(
            fluid.default_main_program(),
            feed={
                "obs": tensor_obs,  # shape=[None, n_features]
                "acts": tensor_as,  # shape=[None, 1]
                "vt": tensor_vt  # shape=[None, 1]
            })
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
        return discounted_ep_rs_norm

    def _discount_and_norm_rewards(self):
        # discount episode rewards
        discounted_ep_rs = np.zeros_like(
            self.ep_rs, dtype="float32")  # float buffer even if raw rewards are ints
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):
            running_add = running_add * self.gamma + self.ep_rs[t]
            discounted_ep_rs[t] = running_add

        # normalize episode rewards
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
        return discounted_ep_rs
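

# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): a minimal
# training loop showing how this class is meant to be driven. It assumes a
# Gym-style CartPole environment with the classic API (reset() returns an
# observation, step() returns a 4-tuple); `gym`, the environment name and the
# episode count are assumptions made for this sketch.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v0")
    agent = PolicyGradient(
        n_actions=env.action_space.n,
        n_features=env.observation_space.shape[0])
    agent.build_net()  # builds network, loss and optimizer; runs startup program

    for episode in range(10):  # a handful of episodes, just to exercise the API
        observation = env.reset()
        while True:
            action = agent.choose_action(observation.astype("float32"))
            observation_, reward, done, _ = env.step(action)
            agent.store_transition(observation, action, reward)
            observation = observation_
            if done:
                agent.learn()  # update the policy on the finished episode
                break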