Some questions about parameter initialization
Created by: kj7541
I ran into a problem while doing reinforcement learning. With the exact same network, sometimes the reward climbs very quickly, but sometimes it never goes up at all. Could this be related to how the parameters are initialized internally?

PS: if the reward has not gone up after 200 episodes, I rerun the script from scratch.

One of them is a Policy Gradient algorithm: https://aistudio.baidu.com/aistudio/projectdetail/618692

The other is a DQN written with the dygraph API:

```
import paddle.fluid.dygraph as dygraph
import paddle.fluid as fluid
import gym
from collections import deque
import numpy as np
import random

# Sometimes the network never converges at all; when that happens the run has to be restarted

class Model(fluid.dygraph.Layer):
    def __init__(self):
        super(Model, self).__init__()
        # 4 state features in, 2 action values out (CartPole)
        self.fc1 = dygraph.Linear(input_dim=4, output_dim=100, act='tanh')
        self.fc2 = dygraph.Linear(input_dim=100, output_dim=100, act='tanh')
        self.fc3 = dygraph.Linear(input_dim=100, output_dim=100, act='relu')
        self.fc4 = dygraph.Linear(input_dim=100, output_dim=2, act=None)

    def forward(self, inputs):
        fc1 = self.fc1(inputs)
        fc2 = self.fc2(fc1)
        fc3 = self.fc3(fc2)
        fc4 = self.fc4(fc3)
        return fc4

class Agent(object):
    def __init__(self):
        self.lr = 0.001
        self.memory_buffer = deque(maxlen=1000)  # experience replay buffer
        self.step = 0
        with dygraph.guard():
            self.eval_net = Model()
            self.target_net = Model()
            # self.opt = fluid.optimizer.AdamOptimizer(self.lr,)

    def choose_act(self, obs):
        with dygraph.guard():
            self.eval_net.eval()
            obs = dygraph.base.to_variable(obs)
            act = self.eval_net(obs)
            return np.argmax(act.numpy())

    def train(self, obs, act, reward, done, next_obs):
        self.step += 1
        with dygraph.guard():
            self.eval_net.train()
            self.target_net.eval()
            obs = dygraph.base.to_variable(obs)
            act = dygraph.base.to_variable(act)
            reward = dygraph.base.to_variable(reward)
            done = dygraph.base.to_variable(done)
            next_obs = dygraph.base.to_variable(next_obs)
            # Q(s, a): mask the eval net output with a one-hot of the taken action
            eval_value = self.eval_net(obs)
            eval_q = eval_value * fluid.one_hot(act, 2)
            # TD target: r + gamma * max_a' Q_target(s', a'), zeroed on terminal steps
            target_value = self.target_net(next_obs)
            target_q = reward + (1 - done) * 0.9 * fluid.layers.reduce_max(target_value, dim=1, keep_dim=True)
            cost = fluid.layers.square_error_cost(eval_q, target_q)
            avg_cost = fluid.layers.reduce_mean(cost)
            avg_cost.backward()
            opt = fluid.optimizer.AdamOptimizer(self.lr, parameter_list=self.eval_net.parameters())
            opt.minimize(avg_cost)
            self.eval_net.clear_gradients()
            self.target_net.clear_gradients()
            # copy the online weights into the target network every 50 updates
            if self.step % 50 == 0:
                self.sync_target()

    def sync_target(self):
        self.target_net.set_dict(self.eval_net.state_dict())

    def sample_exp(self):
        exp_batch = random.sample(self.memory_buffer, 32)
        state, act, reward, done, next_state = [], [], [], [], []
        for exp in exp_batch:
            (s, a, r, d, n_s) = exp
            state.append(s)
            act.append(a)
            reward.append(r)
            done.append(d)
            next_state.append(n_s)
        state_arr = np.array(state).astype(np.float32)
        act_arr = np.array(act).astype(np.int64)
        reward_arr = np.expand_dims(reward, axis=1).astype(np.float32)
        done_arr = np.expand_dims(done, axis=1).astype(np.float32)
        next_state_arr = np.array(next_state).astype(np.float32)
        return state_arr, act_arr, reward_arr, done_arr, next_state_arr

    def store_exp(self, exp):
        if len(self.memory_buffer) > 1000:
            self.memory_buffer.popleft()
        self.memory_buffer.append(exp)

env = gym.make('CartPole-v0')
agent = Agent()
for i in range(1000):
    obs = env.reset()
    total_reward = 0
    while True:
        # print(obs.shape)
        act = agent.choose_act(np.array(obs).astype(np.float32))
        next_obs, reward, done, info = env.step(act)
        # if i > 100:
        #     env.render()
        total_reward += reward
        agent.store_exp((obs, act, reward, done, next_obs))
        obs = next_obs
        # start updating once the buffer holds enough experience
        if len(agent.memory_buffer) > 100:
            batch_obs, batch_act, batch_reward, batch_done, batch_next_obs = agent.sample_exp()
            agent.train(batch_obs, batch_act, batch_reward, batch_done, batch_next_obs)
            # print(batch_obs.shape)
            # print(batch_act.shape)
            # print(batch_reward.shape)
            # print(batch_done.shape)
            # print(batch_next_obs.shape)
        if done:
            print('ep:{} || R:{}'.format(i, total_reward))
            break
```
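
For reference, one way to separate "unlucky initialization" from a problem in the update itself is to pin the initializer and the random seeds down explicitly and compare runs. The sketch below sticks to the same Paddle 1.x fluid dygraph API as the code above; the Xavier/zero initializers, the `InitModel` name and the seed value are illustrative choices, not something taken from the linked projects, so treat it as a starting point rather than a fix.

```
import random

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dygraph

SEED = 42  # arbitrary example seed, not from the linked projects


class InitModel(fluid.dygraph.Layer):
    """Same 4-100-100-100-2 network as above, but with the initializers written out."""

    def __init__(self):
        super(InitModel, self).__init__()

        # Xavier for weights, zeros for biases, so every run draws its starting
        # point from the same, explicitly chosen distribution instead of the defaults.
        def w_attr():
            return fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer())

        def b_attr():
            return fluid.ParamAttr(initializer=fluid.initializer.ConstantInitializer(value=0.0))

        self.fc1 = dygraph.Linear(4, 100, param_attr=w_attr(), bias_attr=b_attr(), act='tanh')
        self.fc2 = dygraph.Linear(100, 100, param_attr=w_attr(), bias_attr=b_attr(), act='tanh')
        self.fc3 = dygraph.Linear(100, 100, param_attr=w_attr(), bias_attr=b_attr(), act='relu')
        self.fc4 = dygraph.Linear(100, 2, param_attr=w_attr(), bias_attr=b_attr(), act=None)

    def forward(self, inputs):
        return self.fc4(self.fc3(self.fc2(self.fc1(inputs))))


with dygraph.guard():
    # Fix the framework- and Python-level seeds so two runs start from identical
    # weights. If seeded runs behave the same but unseeded ones still split into
    # "learns fast" / "never learns", the variance is coming from the random
    # starting point (and the sampling), not from the update rule itself.
    fluid.default_startup_program().random_seed = SEED
    fluid.default_main_program().random_seed = SEED
    np.random.seed(SEED)
    random.seed(SEED)

    eval_net = InitModel()
    target_net = InitModel()
```

With the seeds fixed this way, the gym environments of that era also accept env.seed(SEED), which makes two runs see the same episodes and the comparison fully reproducible.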