Commit bbeb1e98 authored by wanghaoshuang

Add env

Parent 615e4fca
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.framework as framework
-import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.io import save_persistables, load_persistables
-from paddle.v2.fluid.optimizer import SGDOptimizer
+import paddle.v2.fluid as fluid

 # reproducible
 np.random.seed(1)
@@ -25,38 +19,42 @@ class PolicyGradient:
         self.gamma = reward_decay
         self.ep_obs, self.ep_as, self.ep_rs = [], [], []
-        self.build_net(self)
-        self.place = core.CPUPlace()
-        self.exe = Executor(self.place)
+        self.place = fluid.CPUPlace()
+        self.exe = fluid.Executor(self.place)

     def build_net(self):
-        obs = layers.data(
-            name='obs', shape=[self.n_features], data_type='float32')
-        acts = layers.data(name='acts', shape=[1], data_type='int32')
-        vt = layers.data(name='vt', shape=[1], data_type='float32')
+        obs = fluid.layers.data(
+            name='obs', shape=[self.n_features], dtype='float32')
+        acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
+        vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
         # fc1
-        fc1 = layers.fc(
+        fc1 = fluid.layers.fc(
             input=obs,
             size=10,
             act="tanh"  # tanh activation
         )
         # fc2
-        all_act_prob = layers.fc(input=fc1, size=self.n_actions, act="softmax")
+        self.all_act_prob = fluid.layers.fc(
+            input=fc1, size=self.n_actions, act="softmax")
         # to maximize total reward (log_p * R) is to minimize -(log_p * R)
-        neg_log_prob = layers.cross_entropy(
-            input=all_act_prob,
+        neg_log_prob = fluid.layers.cross_entropy(
+            input=self.all_act_prob,
             label=acts)  # this is negative log of chosen action
-        neg_log_prob_weight = layers.elementwise_mul(x=neg_log_prob, y=vt)
-        loss = layers.reduce_mean(x=neg_log_prob_weight)  # reward guided loss
-        self.optimizer = SGDOptimizer(self.lr).minimize(loss)
+        neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt)
+        loss = fluid.layers.reduce_mean(
+            x=neg_log_prob_weight)  # reward guided loss
+        sgd_optimizer = fluid.optimizer.SGD(self.lr)
+        sgd_optimizer.minimize(loss)
+        self.exe.run(fluid.default_startup_program())

     def choose_action(self, observation):
         prob_weights = self.exe.run(
-            framework.default_main_program().prune(all_act_prob),
+            fluid.default_main_program().prune(self.all_act_prob),
             feed={"obs": observation[np.newaxis, :]},
-            fetch_list=[all_act_prob])
+            fetch_list=[self.all_act_prob])
         prob_weights = np.array(prob_weights[0])
         action = np.random.choice(
             range(prob_weights.shape[1]),
@@ -71,23 +69,18 @@ class PolicyGradient:
     def learn(self):
         # discount and normalize episode reward
        discounted_ep_rs_norm = self._discount_and_norm_rewards()
-        #print framework.default_main_program()
-        tensor_obs = core.LoDTensor()
-        tensor_obs.set(np.vstack(self.ep_obs), self.place)
-        tensor_as = core.LoDTensor()
-        tensor_as.set(np.array(self.ep_as), self.place)
-        tensor_vt = core.LoDTensor()
-        tensor_vt.set(discounted_ep_rs_norm, self.place)
+        tensor_obs = np.vstack(self.ep_obs).astype("float32")
+        tensor_as = np.array(self.ep_as).astype("int64")
+        tensor_as = tensor_as.reshape([tensor_as.shape[0], 1])
+        tensor_vt = discounted_ep_rs_norm.astype("float32")[:, np.newaxis]
         # train on episode
         self.exe.run(
-            framework.default_main_program(),
+            fluid.default_main_program(),
             feed={
                 "obs": tensor_obs,  # shape=[None, n_obs]
                 "acts": tensor_as,  # shape=[None, ]
                 "vt": tensor_vt  # shape=[None, ]
             })
         self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
         return discounted_ep_rs_norm
...
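Both learn() above and the training loop further down call two PolicyGradient methods that sit outside the hunks shown in this commit: store_transition and _discount_and_norm_rewards. The sketch below is an assumption added for readability, a typical REINFORCE-style implementation rather than the repository's exact code. The env module added by this commit follows.

# Illustrative sketch only -- assumed bodies, inside class PolicyGradient:

    def store_transition(self, s, a, r):
        # buffer one (observation, action, reward) step of the current episode
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def _discount_and_norm_rewards(self):
        # discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards
        discounted_ep_rs = np.zeros_like(self.ep_rs, dtype="float32")
        running_add = 0.0
        for t in reversed(range(len(self.ep_rs))):
            running_add = running_add * self.gamma + self.ep_rs[t]
            discounted_ep_rs[t] = running_add
        # normalize to zero mean / unit variance to keep gradients well scaled
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= (np.std(discounted_ep_rs) + 1e-8)
        return discounted_ep_rs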
import time
import sys
import numpy as np


class Env():
    def __init__(self, stage_len, interval):
        self.stage_len = stage_len
        self.end = self.stage_len - 1
        self.position = 0
        self.interval = interval
        self.step = 0
        self.epoch = -1
        self.render = False

    def reset(self):
        self.end = self.stage_len - 1
        self.position = 0
        self.epoch += 1
        self.step = 0
        if self.render:
            self.draw(True)

    def status(self):
        s = np.zeros([self.stage_len]).astype("float32")
        s[self.position] = 1
        return s

    def move(self, action):
        self.step += 1
        reward = 0.0
        done = False
        if action == 0:
            self.position = max(0, self.position - 1)
        else:
            self.position = min(self.end, self.position + 1)
        if self.render:
            self.draw()
        if self.position == self.end:
            reward = 1.0
            done = True
        return reward, done, self.status()

    def draw(self, new_line=False):
        if new_line:
            print ""
        else:
            print "\r",
        for i in range(self.stage_len):
            if i == self.position:
                sys.stdout.write("O")
            else:
                sys.stdout.write("-")
        sys.stdout.write(" epoch: %d; steps: %d" % (self.epoch, self.step))
        sys.stdout.flush()
        time.sleep(self.interval)
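For a quick sanity check, Env can be driven on its own without the agent. The snippet below is only an illustration (not part of the commit): it always steps right, so the goal is reached in stage_len - 1 moves and the terminal reward comes back from move(). The updated training script is below.

# Illustrative only: exercise Env directly.
from env import Env

e = Env(10, 0.0)          # stage of length 10, no drawing delay
e.reset()
done = False
while not done:
    reward, done, state = e.move(1)   # action 1 = step right
assert reward == 1.0 and e.position == e.end
assert e.step == 9                    # 9 moves from position 0 to position 9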
 from brain import PolicyGradient
+from env import Env
+import numpy as np

-n_features = 10
-n_actions = 4
+n_actions = 2
+interval = 0.01
+stage_len = 10
+epoches = 10000

 if __name__ == "__main__":
-    brain = PolicyGradient(n_actions, n_features)
-    brain.store_transition([1] * n_features, 1, 1.0)
-    #brain.build_net()
-    brain.learn()
+    brain = PolicyGradient(n_actions, stage_len)
+    e = Env(stage_len, interval)
+    brain.build_net()
+    done = False
+    for epoch in range(epoches):
+        if epoch % 100 == 1:
+            e.render = True
+        else:
+            e.render = False
+        e.reset()
+        while not done:
+            s = e.status()
+            action = brain.choose_action(s)
+            r, done, _ = e.move(action)
+            brain.store_transition(s, action, r)
+        done = False
+        brain.learn()
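Once the loop above finishes, one way to inspect the learned policy is to act greedily on the softmax output instead of sampling. The rollout below is only an illustration and not part of this commit; it reuses the same exe.run / prune pattern that choose_action uses.

# Illustrative only: greedy rollout with rendering after training.
import numpy as np
import paddle.v2.fluid as fluid

e.render = True
e.reset()
done = False
while not done:
    s = e.status()
    probs = brain.exe.run(
        fluid.default_main_program().prune(brain.all_act_prob),
        feed={"obs": s[np.newaxis, :]},
        fetch_list=[brain.all_act_prob])
    action = int(np.argmax(np.array(probs[0])))  # most probable action
    _, done, _ = e.move(action)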