diff --git a/benchmark/torch/dqn/replay_memory.py b/benchmark/torch/dqn/replay_memory.py index ea8c6565155ddacae568e901566f9b390ee3a8b8..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 100644 --- a/benchmark/torch/dqn/replay_memory.py +++ b/benchmark/torch/dqn/replay_memory.py @@ -16,16 +16,16 @@ import numpy as np import copy from collections import deque, namedtuple -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): + def __init__(self, max_size, obs_shape, context_len): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -48,42 +48,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -92,7 +91,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -107,8 +106,8 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in 
batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] diff --git a/benchmark/torch/dqn/train.py b/benchmark/torch/dqn/train.py index 9db3b8f776fa669772bb2748cbfed0a7067f5909..26d24d8b1a3cc88bdb26aec954094f282e463003 100644 --- a/benchmark/torch/dqn/train.py +++ b/benchmark/torch/dqn/train.py @@ -26,7 +26,7 @@ from parl.utils import tensorboard, logger from parl.algorithms import DQN, DDQN from agent import AtariAgent -from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState +from atari_wrapper import FireResetEnv, FrameStack, LimitLength from model import AtariModel from replay_memory import ReplayMemory, Experience from utils import get_player @@ -43,57 +43,57 @@ GAMMA = 0.99 def run_train_episode(env, agent, rpm): total_reward = 0 all_cost = [] - state = env.reset() + obs = env.reset() steps = 0 while True: steps += 1 - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) if rpm.size() > MEMORY_WARMUP_SIZE: if steps % UPDATE_FREQ == 0: - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) all_cost.append(cost) total_reward += reward - state = next_state + obs = next_obs if isOver: mean_loss = np.mean(all_cost) if all_cost else None return total_reward, steps, mean_loss def run_evaluate_episode(env, agent): - state = env.reset() + obs = env.reset() total_reward = 0 while True: - pred_Q = agent.predict(state) + pred_Q = agent.predict(obs) action = pred_Q.max(1)[1].item() - state, reward, isOver, _ = env.step(action) + obs, reward, isOver, _ = env.step(action) total_reward += reward if isOver: return total_reward -def get_fixed_states(rpm, batch_size): - states = [] +def get_fixed_obs(rpm, batch_size): + obs = [] for _ in range(3): - batch_all_state = rpm.sample_batch(batch_size)[0] - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - states.append(batch_state) - fixed_states = np.concatenate(states, axis=0) - return fixed_states + batch_all_obs = rpm.sample_batch(batch_size)[0] + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + obs.append(batch_obs) + fixed_obs = np.concatenate(obs, axis=0) + return fixed_obs -def evaluate_fixed_Q(agent, states): +def evaluate_fixed_Q(agent, obs): with torch.no_grad(): - max_pred_Q = agent.alg.model(states).max(1)[0].mean() + max_pred_Q = agent.alg.model(obs).max(1)[0].mean() return max_pred_Q.item() @@ -131,9 +131,9 @@ def main(): total_reward, steps, _ = run_train_episode(env, agent, rpm) pbar.update(steps) - # Get fixed states to check value function. 
- fixed_states = get_fixed_states(rpm, args.batch_size) - fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device) + # Get fixed obs to check value function. + fixed_obs = get_fixed_obs(rpm, args.batch_size) + fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device) # train test_flag = 0 @@ -159,7 +159,7 @@ def main(): tensorboard.add_scalar('dqn/exploration', agent.exploration, total_steps) tensorboard.add_scalar('dqn/Q value', - evaluate_fixed_Q(agent, fixed_states), + evaluate_fixed_Q(agent, fixed_obs), total_steps) tensorboard.add_scalar('dqn/grad_norm', get_grad_norm(agent.alg.model), diff --git a/examples/DQN/cartpole_agent.py b/examples/DQN/cartpole_agent.py index bf911f8ac2c8b103caf3a855026d2edc9d4a7631..c7d16cf43a1729f7cfd62f0981679ec373dd77ae 100755 --- a/examples/DQN/cartpole_agent.py +++ b/examples/DQN/cartpole_agent.py @@ -21,13 +21,13 @@ from parl import layers class CartpoleAgent(parl.Agent): def __init__(self, algorithm, - state_dim, + obs_dim, act_dim, e_greed=0.1, e_greed_decrement=0): - assert isinstance(state_dim, int) + assert isinstance(obs_dim, int) assert isinstance(act_dim, int) - self.state_dim = state_dim + self.obs_dim = obs_dim self.act_dim = act_dim super(CartpoleAgent, self).__init__(algorithm) @@ -43,16 +43,16 @@ class CartpoleAgent(parl.Agent): with fluid.program_guard(self.pred_program): obs = layers.data( - name='obs', shape=[self.state_dim], dtype='float32') + name='obs', shape=[self.obs_dim], dtype='float32') self.value = self.alg.predict(obs) with fluid.program_guard(self.learn_program): obs = layers.data( - name='obs', shape=[self.state_dim], dtype='float32') + name='obs', shape=[self.obs_dim], dtype='float32') action = layers.data(name='act', shape=[1], dtype='int32') reward = layers.data(name='reward', shape=[], dtype='float32') next_obs = layers.data( - name='next_obs', shape=[self.state_dim], dtype='float32') + name='next_obs', shape=[self.obs_dim], dtype='float32') terminal = layers.data(name='terminal', shape=[], dtype='bool') lr = layers.data( name='lr', shape=[1], dtype='float32', append_batch_size=False) diff --git a/examples/DQN/replay_memory.py b/examples/DQN/replay_memory.py index d15463ce8f2967431acb01b86c6a9b63080252fa..c9474a0dce8d3cc9f5d5610cafbd1df5b1a03586 100755 --- a/examples/DQN/replay_memory.py +++ b/examples/DQN/replay_memory.py @@ -28,19 +28,19 @@ class ReplayMemory(object): def sample(self, batch_size): mini_batch = random.sample(self.buffer, batch_size) - state_batch, action_batch, reward_batch, next_state_batch, done_batch = [], [], [], [], [] + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] for experience in mini_batch: s, a, r, s_p, done = experience - state_batch.append(s) + obs_batch.append(s) action_batch.append(a) reward_batch.append(r) - next_state_batch.append(s_p) + next_obs_batch.append(s_p) done_batch.append(done) - return np.array(state_batch).astype('float32'), \ + return np.array(obs_batch).astype('float32'), \ np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ - np.array(next_state_batch).astype('float32'), np.array(done_batch).astype('float32') + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') def __len__(self): return len(self.buffer) diff --git a/examples/DQN/train.py b/examples/DQN/train.py index 63fb86058df73fde88a2c43e5d695b501d3d57de..ae504f9f711da14246ebba04a0e12e2dc25bf57a 100755 --- a/examples/DQN/train.py +++ b/examples/DQN/train.py @@ -32,24 +32,24 @@ GAMMA = 
0.99 # discount factor of reward def run_episode(agent, env, rpm): total_reward = 0 - state = env.reset() + obs = env.reset() step = 0 while True: step += 1 - action = agent.sample(state) - next_state, reward, isOver, _ = env.step(action) - rpm.append((state, action, reward, next_state, isOver)) + action = agent.sample(obs) + next_obs, reward, isOver, _ = env.step(action) + rpm.append((obs, action, reward, next_obs, isOver)) # train model if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): - (batch_state, batch_action, batch_reward, batch_next_state, + (batch_obs, batch_action, batch_reward, batch_next_obs, batch_isOver) = rpm.sample(BATCH_SIZE) - train_loss = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver, + train_loss = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver, LEARNING_RATE) total_reward += reward - state = next_state + obs = next_obs if isOver: break return total_reward @@ -59,14 +59,14 @@ def evaluate(agent, env, render=False): # test part, run 5 episodes and average eval_reward = [] for i in range(5): - state = env.reset() + obs = env.reset() episode_reward = 0 isOver = False while not isOver: - action = agent.predict(state) + action = agent.predict(obs) if render: env.render() - state, reward, isOver, _ = env.step(action) + obs, reward, isOver, _ = env.step(action) episode_reward += reward eval_reward.append(episode_reward) return np.mean(eval_reward) @@ -75,7 +75,7 @@ def evaluate(agent, env, render=False): def main(): env = gym.make('CartPole-v1') action_dim = env.action_space.n - state_shape = env.observation_space.shape + obs_shape = env.observation_space.shape rpm = ReplayMemory(MEMORY_SIZE) @@ -83,7 +83,7 @@ def main(): algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA) agent = CartpoleAgent( algorithm, - state_dim=state_shape[0], + obs_dim=obs_shape[0], act_dim=action_dim, e_greed=0.1, # explore e_greed_decrement=1e-6 diff --git a/examples/DQN_variant/replay_memory.py b/examples/DQN_variant/replay_memory.py index ea8c6565155ddacae568e901566f9b390ee3a8b8..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 100644 --- a/examples/DQN_variant/replay_memory.py +++ b/examples/DQN_variant/replay_memory.py @@ -16,16 +16,16 @@ import numpy as np import copy from collections import deque, namedtuple -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): + def __init__(self, max_size, obs_shape, context_len): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -48,42 +48,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for 
k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -92,7 +91,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -107,8 +106,8 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] diff --git a/examples/DQN_variant/train.py b/examples/DQN_variant/train.py index 3149e6b81a34e81aff038a12994e7eb4e91eac22..0bd22797892783758b1059a4df2a06e4096736f6 100644 --- a/examples/DQN_variant/train.py +++ b/examples/DQN_variant/train.py @@ -39,28 +39,28 @@ LEARNING_RATE = 3e-4 def run_train_episode(env, agent, rpm): total_reward = 0 all_cost = [] - state = env.reset() + obs = env.reset() steps = 0 while True: steps += 1 - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) # start training if rpm.size() > MEMORY_WARMUP_SIZE: if steps % UPDATE_FREQ == 0: - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, 
batch_action, batch_reward, + batch_next_obs, batch_isOver) all_cost.append(float(cost)) total_reward += reward - state = next_state + obs = next_obs if isOver: break if all_cost: @@ -70,11 +70,11 @@ def run_train_episode(env, agent, rpm): def run_evaluate_episode(env, agent): - state = env.reset() + obs = env.reset() total_reward = 0 while True: - action = agent.predict(state) - state, reward, isOver, info = env.step(action) + action = agent.predict(obs) + obs, reward, isOver, info = env.step(action) total_reward += reward if isOver: break diff --git a/examples/offline-Q-learning/atari.py b/examples/offline-Q-learning/atari.py index 11909eba8307ef781337b20ca2fe200ed967cc45..e0e1b3cc097be221483d0a8712951b9d38f5da54 120000 --- a/examples/offline-Q-learning/atari.py +++ b/examples/offline-Q-learning/atari.py @@ -1 +1 @@ -../DQN/atari.py \ No newline at end of file +../DQN_variant/atari.py \ No newline at end of file diff --git a/examples/offline-Q-learning/atari_wrapper.py b/examples/offline-Q-learning/atari_wrapper.py index e58186a870b13dc7fff25c52cbdd1d009a18f4ac..2904fb39b7934d104209d0085ca814d5c132fe90 120000 --- a/examples/offline-Q-learning/atari_wrapper.py +++ b/examples/offline-Q-learning/atari_wrapper.py @@ -1 +1 @@ -../DQN/atari_wrapper.py \ No newline at end of file +../DQN_variant/atari_wrapper.py \ No newline at end of file diff --git a/examples/offline-Q-learning/parallel_run.py b/examples/offline-Q-learning/parallel_run.py index 3416f8cd6708d75ce0884584a43b66c674d8c699..281ea4504e030f5b7296349a5912a07b30fdec27 100644 --- a/examples/offline-Q-learning/parallel_run.py +++ b/examples/offline-Q-learning/parallel_run.py @@ -45,21 +45,21 @@ gpu_num = get_gpu_count() def run_train_step(agent, rpm): for step in range(args.train_total_steps): # use the first 80% data to train - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size * gpu_num) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) if step % 100 == 0: # use the last 20% data to evaluate - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - eval_cost = agent.supervised_eval(batch_state, batch_action, - batch_reward, batch_next_state, + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + eval_cost = agent.supervised_eval(batch_obs, batch_action, + batch_reward, batch_next_obs, batch_isOver) logger.info( "train step {}, train costs are {}, eval cost is {}.".format( @@ -67,17 +67,17 @@ def run_train_step(agent, rpm): def collect_exp(env, rpm, agent): - state = env.reset() + obs = env.reset() # collect data to fulfill replay memory for i in tqdm(range(MEMORY_SIZE)): - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - 
rpm.append(Experience(state, action, reward, isOver)) - state = next_state + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) + obs = next_obs def main(): diff --git a/examples/offline-Q-learning/replay_memory.py b/examples/offline-Q-learning/replay_memory.py index 2296ea906ee47a53f697777b6885dad6365460e8..94a43c25d32ac9c9107dfa90a33d1280a5bebd16 100644 --- a/examples/offline-Q-learning/replay_memory.py +++ b/examples/offline-Q-learning/replay_memory.py @@ -18,18 +18,18 @@ import os from collections import deque, namedtuple from parl.utils import logger -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): def __init__(self, max_size, - state_shape, + obs_shape, context_len, load_file=False, file_path=None): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) self.file_path = file_path @@ -38,8 +38,7 @@ class ReplayMemory(object): self.load_memory() logger.info("memory size is {}".format(self._curr_size)) else: - self.state = np.zeros( - (self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -62,42 +61,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -106,7 +104,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward 
self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -129,15 +127,15 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] def save_memory(self): save_data = [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] np.savez(self.file_path, *save_data) @@ -145,7 +143,7 @@ class ReplayMemory(object): def load_memory(self): container = np.load(self.file_path, allow_pickle=True) [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] = [container[key] for key in container] self._curr_size = self._curr_size.astype(int) diff --git a/examples/offline-Q-learning/rom_files b/examples/offline-Q-learning/rom_files index 966a8940cbb2d928de9f816d41efada9aa3c9b6e..c1c50b9a99991f7f5dd34d7f243e999a636ba926 120000 --- a/examples/offline-Q-learning/rom_files +++ b/examples/offline-Q-learning/rom_files @@ -1 +1 @@ -../DQN/rom_files/ \ No newline at end of file +../DQN_variant/rom_files \ No newline at end of file diff --git a/examples/offline-Q-learning/utils.py b/examples/offline-Q-learning/utils.py index 721338d52451903eb1599e2396c9699a410a188d..04c590ec46f98b6cfa6d1ec833112730900fb840 120000 --- a/examples/offline-Q-learning/utils.py +++ b/examples/offline-Q-learning/utils.py @@ -1 +1 @@ -../DQN/utils.py \ No newline at end of file +../DQN_variant/utils.py \ No newline at end of file diff --git a/parl/algorithms/fluid/sac.py b/parl/algorithms/fluid/sac.py index cec92c98568905af7bce64252e9f3ff0531da039..32d7b1edfca1498fb40ece392025d310e162dd50 100644 --- a/parl/algorithms/fluid/sac.py +++ b/parl/algorithms/fluid/sac.py @@ -102,11 +102,11 @@ class SAC(Algorithm): return cost def critic_learn(self, obs, action, reward, next_obs, terminal): - next_state_action, next_state_log_pi = self.sample(next_obs) + next_obs_action, next_obs_log_pi = self.sample(next_obs) qf1_next_target, qf2_next_target = self.target_critic.value( - next_obs, next_state_action) + next_obs, next_obs_action) min_qf_next_target = layers.elementwise_min( - qf1_next_target, qf2_next_target) - next_state_log_pi * self.alpha + qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha terminal = layers.cast(terminal, dtype='float32') target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
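
Note on the `ReplayMemory` changes in `benchmark/torch/dqn/replay_memory.py`, `examples/DQN_variant/replay_memory.py`, and `examples/offline-Q-learning/replay_memory.py`: the buffer stores single frames and assembles a `(context_len + 1)`-frame window at sample time, zeroing out any frames that belong to the previous episode. The sketch below restates that masking logic in plain NumPy; the function name `sample_context` and the standalone-array signature are illustrative, not part of this PR.

```python
import numpy as np

def sample_context(obs_buf, isOver_buf, curr_size, idx, context_len):
    """Assemble a (context_len + 1)-frame window starting at idx.

    Frames recorded before the most recent episode boundary are left as
    zeros, mirroring the masking in ReplayMemory.sample above.
    """
    window = np.zeros((context_len + 1, ) + obs_buf.shape[1:], dtype=np.uint8)
    obs_idx = np.arange(idx, idx + context_len + 1) % curr_size

    # Walk backwards through the context; the first terminal flag we hit
    # marks the end of the previous episode.
    for k in range(context_len - 2, -1, -1):
        if isOver_buf[obs_idx[k]]:
            window[k + 1:] = obs_buf[obs_idx[k + 1:]]
            return window

    window[:] = obs_buf[obs_idx]
    return window
```

Storing single frames and stacking them on demand keeps memory at one copy per frame instead of `context_len` copies, which is why the training loops call `rpm.recent_obs()` and append the current `obs` before `agent.sample(context)`.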
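
The `get_fixed_obs` / `evaluate_fixed_Q` pair in `benchmark/torch/dqn/train.py` implements the usual DQN diagnostic of tracking the average max-Q over a batch of observations frozen after warm-up; the `dqn/Q value` TensorBoard scalar gives a rough sanity check on the value estimates. A minimal PyTorch restatement, assuming `model` maps a float `(N, C, H, W)` tensor to `(N, num_actions)` Q-values as `AtariModel` does:

```python
import torch

@torch.no_grad()
def mean_max_q(model, fixed_obs):
    """Average of max_a Q(obs, a) over a frozen batch of observations."""
    q_values = model(fixed_obs)               # (N, num_actions)
    return q_values.max(dim=1).values.mean().item()
```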
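
For the `agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, batch_isOver, ...)` calls in `examples/DQN/train.py`, the quantity being regressed is the standard one-step DQN backup, y = r + gamma * (1 - done) * max_a Q_target(next_obs, a). A hedged NumPy sketch of just the target computation (the actual update lives in `parl.algorithms.DQN`, not in this diff):

```python
import numpy as np

def dqn_target(batch_reward, batch_isOver, q_next_target, gamma=0.99):
    """One-step DQN target.

    q_next_target: array of shape (batch, num_actions) from the target
    network evaluated at batch_next_obs.
    """
    max_q_next = q_next_target.max(axis=1)
    return batch_reward + gamma * (1.0 - batch_isOver.astype('float32')) * max_q_next
```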
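
The `parl/algorithms/fluid/sac.py` hunk is a pure rename, but for reference the quantity it builds is the soft Bellman target y = r + gamma * (1 - terminal) * (min(Q1_target, Q2_target)(next_obs, a') - alpha * log_pi(a' | next_obs)), with a' drawn from the current policy. A framework-agnostic sketch over NumPy arrays (argument names are illustrative):

```python
import numpy as np

def soft_bellman_target(reward, terminal, gamma, alpha,
                        q1_next_target, q2_next_target, next_log_pi):
    """Soft Bellman target as computed in SAC.critic_learn.

    q*_next_target are the target critics evaluated at (next_obs, next_action);
    next_log_pi is log pi(next_action | next_obs) for the sampled next_action.
    """
    min_q_next = np.minimum(q1_next_target, q2_next_target)
    soft_value = min_q_next - alpha * next_log_pi
    return reward + (1.0 - terminal.astype('float32')) * gamma * soft_value
```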