diff --git a/benchmark/torch/dqn/replay_memory.py b/benchmark/torch/dqn/replay_memory.py index ea8c6565155ddacae568e901566f9b390ee3a8b8..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 100644 --- a/benchmark/torch/dqn/replay_memory.py +++ b/benchmark/torch/dqn/replay_memory.py @@ -16,16 +16,16 @@ import numpy as np import copy from collections import deque, namedtuple -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): + def __init__(self, max_size, obs_shape, context_len): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -48,42 +48,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -92,7 +91,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -107,8 +106,8 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in 
batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] diff --git a/benchmark/torch/dqn/train.py b/benchmark/torch/dqn/train.py index 9db3b8f776fa669772bb2748cbfed0a7067f5909..26d24d8b1a3cc88bdb26aec954094f282e463003 100644 --- a/benchmark/torch/dqn/train.py +++ b/benchmark/torch/dqn/train.py @@ -26,7 +26,7 @@ from parl.utils import tensorboard, logger from parl.algorithms import DQN, DDQN from agent import AtariAgent -from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState +from atari_wrapper import FireResetEnv, FrameStack, LimitLength from model import AtariModel from replay_memory import ReplayMemory, Experience from utils import get_player @@ -43,57 +43,57 @@ GAMMA = 0.99 def run_train_episode(env, agent, rpm): total_reward = 0 all_cost = [] - state = env.reset() + obs = env.reset() steps = 0 while True: steps += 1 - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) if rpm.size() > MEMORY_WARMUP_SIZE: if steps % UPDATE_FREQ == 0: - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) all_cost.append(cost) total_reward += reward - state = next_state + obs = next_obs if isOver: mean_loss = np.mean(all_cost) if all_cost else None return total_reward, steps, mean_loss def run_evaluate_episode(env, agent): - state = env.reset() + obs = env.reset() total_reward = 0 while True: - pred_Q = agent.predict(state) + pred_Q = agent.predict(obs) action = pred_Q.max(1)[1].item() - state, reward, isOver, _ = env.step(action) + obs, reward, isOver, _ = env.step(action) total_reward += reward if isOver: return total_reward -def get_fixed_states(rpm, batch_size): - states = [] +def get_fixed_obs(rpm, batch_size): + obs = [] for _ in range(3): - batch_all_state = rpm.sample_batch(batch_size)[0] - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - states.append(batch_state) - fixed_states = np.concatenate(states, axis=0) - return fixed_states + batch_all_obs = rpm.sample_batch(batch_size)[0] + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + obs.append(batch_obs) + fixed_obs = np.concatenate(obs, axis=0) + return fixed_obs -def evaluate_fixed_Q(agent, states): +def evaluate_fixed_Q(agent, obs): with torch.no_grad(): - max_pred_Q = agent.alg.model(states).max(1)[0].mean() + max_pred_Q = agent.alg.model(obs).max(1)[0].mean() return max_pred_Q.item() @@ -131,9 +131,9 @@ def main(): total_reward, steps, _ = run_train_episode(env, agent, rpm) pbar.update(steps) - # Get fixed states to check value function. 
- fixed_states = get_fixed_states(rpm, args.batch_size) - fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device) + # Get fixed obs to check value function. + fixed_obs = get_fixed_obs(rpm, args.batch_size) + fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device) # train test_flag = 0 @@ -159,7 +159,7 @@ def main(): tensorboard.add_scalar('dqn/exploration', agent.exploration, total_steps) tensorboard.add_scalar('dqn/Q value', - evaluate_fixed_Q(agent, fixed_states), + evaluate_fixed_Q(agent, fixed_obs), total_steps) tensorboard.add_scalar('dqn/grad_norm', get_grad_norm(agent.alg.model), diff --git a/examples/DQN/cartpole_agent.py b/examples/DQN/cartpole_agent.py index bf911f8ac2c8b103caf3a855026d2edc9d4a7631..c7d16cf43a1729f7cfd62f0981679ec373dd77ae 100755 --- a/examples/DQN/cartpole_agent.py +++ b/examples/DQN/cartpole_agent.py @@ -21,13 +21,13 @@ from parl import layers class CartpoleAgent(parl.Agent): def __init__(self, algorithm, - state_dim, + obs_dim, act_dim, e_greed=0.1, e_greed_decrement=0): - assert isinstance(state_dim, int) + assert isinstance(obs_dim, int) assert isinstance(act_dim, int) - self.state_dim = state_dim + self.obs_dim = obs_dim self.act_dim = act_dim super(CartpoleAgent, self).__init__(algorithm) @@ -43,16 +43,16 @@ class CartpoleAgent(parl.Agent): with fluid.program_guard(self.pred_program): obs = layers.data( - name='obs', shape=[self.state_dim], dtype='float32') + name='obs', shape=[self.obs_dim], dtype='float32') self.value = self.alg.predict(obs) with fluid.program_guard(self.learn_program): obs = layers.data( - name='obs', shape=[self.state_dim], dtype='float32') + name='obs', shape=[self.obs_dim], dtype='float32') action = layers.data(name='act', shape=[1], dtype='int32') reward = layers.data(name='reward', shape=[], dtype='float32') next_obs = layers.data( - name='next_obs', shape=[self.state_dim], dtype='float32') + name='next_obs', shape=[self.obs_dim], dtype='float32') terminal = layers.data(name='terminal', shape=[], dtype='bool') lr = layers.data( name='lr', shape=[1], dtype='float32', append_batch_size=False) diff --git a/examples/DQN/replay_memory.py b/examples/DQN/replay_memory.py index d15463ce8f2967431acb01b86c6a9b63080252fa..c9474a0dce8d3cc9f5d5610cafbd1df5b1a03586 100755 --- a/examples/DQN/replay_memory.py +++ b/examples/DQN/replay_memory.py @@ -28,19 +28,19 @@ class ReplayMemory(object): def sample(self, batch_size): mini_batch = random.sample(self.buffer, batch_size) - state_batch, action_batch, reward_batch, next_state_batch, done_batch = [], [], [], [], [] + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] for experience in mini_batch: s, a, r, s_p, done = experience - state_batch.append(s) + obs_batch.append(s) action_batch.append(a) reward_batch.append(r) - next_state_batch.append(s_p) + next_obs_batch.append(s_p) done_batch.append(done) - return np.array(state_batch).astype('float32'), \ + return np.array(obs_batch).astype('float32'), \ np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ - np.array(next_state_batch).astype('float32'), np.array(done_batch).astype('float32') + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') def __len__(self): return len(self.buffer) diff --git a/examples/DQN/train.py b/examples/DQN/train.py index 63fb86058df73fde88a2c43e5d695b501d3d57de..ae504f9f711da14246ebba04a0e12e2dc25bf57a 100755 --- a/examples/DQN/train.py +++ b/examples/DQN/train.py @@ -32,24 +32,24 @@ GAMMA = 
0.99 # discount factor of reward def run_episode(agent, env, rpm): total_reward = 0 - state = env.reset() + obs = env.reset() step = 0 while True: step += 1 - action = agent.sample(state) - next_state, reward, isOver, _ = env.step(action) - rpm.append((state, action, reward, next_state, isOver)) + action = agent.sample(obs) + next_obs, reward, isOver, _ = env.step(action) + rpm.append((obs, action, reward, next_obs, isOver)) # train model if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): - (batch_state, batch_action, batch_reward, batch_next_state, + (batch_obs, batch_action, batch_reward, batch_next_obs, batch_isOver) = rpm.sample(BATCH_SIZE) - train_loss = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver, + train_loss = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver, LEARNING_RATE) total_reward += reward - state = next_state + obs = next_obs if isOver: break return total_reward @@ -59,14 +59,14 @@ def evaluate(agent, env, render=False): # test part, run 5 episodes and average eval_reward = [] for i in range(5): - state = env.reset() + obs = env.reset() episode_reward = 0 isOver = False while not isOver: - action = agent.predict(state) + action = agent.predict(obs) if render: env.render() - state, reward, isOver, _ = env.step(action) + obs, reward, isOver, _ = env.step(action) episode_reward += reward eval_reward.append(episode_reward) return np.mean(eval_reward) @@ -75,7 +75,7 @@ def evaluate(agent, env, render=False): def main(): env = gym.make('CartPole-v1') action_dim = env.action_space.n - state_shape = env.observation_space.shape + obs_shape = env.observation_space.shape rpm = ReplayMemory(MEMORY_SIZE) @@ -83,7 +83,7 @@ def main(): algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA) agent = CartpoleAgent( algorithm, - state_dim=state_shape[0], + obs_dim=obs_shape[0], act_dim=action_dim, e_greed=0.1, # explore e_greed_decrement=1e-6 diff --git a/examples/DQN_variant/replay_memory.py b/examples/DQN_variant/replay_memory.py index ea8c6565155ddacae568e901566f9b390ee3a8b8..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 100644 --- a/examples/DQN_variant/replay_memory.py +++ b/examples/DQN_variant/replay_memory.py @@ -16,16 +16,16 @@ import numpy as np import copy from collections import deque, namedtuple -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): + def __init__(self, max_size, obs_shape, context_len): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -48,42 +48,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for 
k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -92,7 +91,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -107,8 +106,8 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] diff --git a/examples/DQN_variant/train.py b/examples/DQN_variant/train.py index 3149e6b81a34e81aff038a12994e7eb4e91eac22..0bd22797892783758b1059a4df2a06e4096736f6 100644 --- a/examples/DQN_variant/train.py +++ b/examples/DQN_variant/train.py @@ -39,28 +39,28 @@ LEARNING_RATE = 3e-4 def run_train_episode(env, agent, rpm): total_reward = 0 all_cost = [] - state = env.reset() + obs = env.reset() steps = 0 while True: steps += 1 - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) # start training if rpm.size() > MEMORY_WARMUP_SIZE: if steps % UPDATE_FREQ == 0: - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, 
batch_action, batch_reward, + batch_next_obs, batch_isOver) all_cost.append(float(cost)) total_reward += reward - state = next_state + obs = next_obs if isOver: break if all_cost: @@ -70,11 +70,11 @@ def run_train_episode(env, agent, rpm): def run_evaluate_episode(env, agent): - state = env.reset() + obs = env.reset() total_reward = 0 while True: - action = agent.predict(state) - state, reward, isOver, info = env.step(action) + action = agent.predict(obs) + obs, reward, isOver, info = env.step(action) total_reward += reward if isOver: break diff --git a/examples/offline-Q-learning/atari.py b/examples/offline-Q-learning/atari.py index 11909eba8307ef781337b20ca2fe200ed967cc45..e0e1b3cc097be221483d0a8712951b9d38f5da54 120000 --- a/examples/offline-Q-learning/atari.py +++ b/examples/offline-Q-learning/atari.py @@ -1 +1 @@ -../DQN/atari.py \ No newline at end of file +../DQN_variant/atari.py \ No newline at end of file diff --git a/examples/offline-Q-learning/atari_wrapper.py b/examples/offline-Q-learning/atari_wrapper.py index e58186a870b13dc7fff25c52cbdd1d009a18f4ac..2904fb39b7934d104209d0085ca814d5c132fe90 120000 --- a/examples/offline-Q-learning/atari_wrapper.py +++ b/examples/offline-Q-learning/atari_wrapper.py @@ -1 +1 @@ -../DQN/atari_wrapper.py \ No newline at end of file +../DQN_variant/atari_wrapper.py \ No newline at end of file diff --git a/examples/offline-Q-learning/parallel_run.py b/examples/offline-Q-learning/parallel_run.py index 3416f8cd6708d75ce0884584a43b66c674d8c699..281ea4504e030f5b7296349a5912a07b30fdec27 100644 --- a/examples/offline-Q-learning/parallel_run.py +++ b/examples/offline-Q-learning/parallel_run.py @@ -45,21 +45,21 @@ gpu_num = get_gpu_count() def run_train_step(agent, rpm): for step in range(args.train_total_steps): # use the first 80% data to train - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size * gpu_num) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) if step % 100 == 0: # use the last 20% data to evaluate - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - eval_cost = agent.supervised_eval(batch_state, batch_action, - batch_reward, batch_next_state, + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + eval_cost = agent.supervised_eval(batch_obs, batch_action, + batch_reward, batch_next_obs, batch_isOver) logger.info( "train step {}, train costs are {}, eval cost is {}.".format( @@ -67,17 +67,17 @@ def run_train_step(agent, rpm): def collect_exp(env, rpm, agent): - state = env.reset() + obs = env.reset() # collect data to fulfill replay memory for i in tqdm(range(MEMORY_SIZE)): - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - 
rpm.append(Experience(state, action, reward, isOver)) - state = next_state + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) + obs = next_obs def main(): diff --git a/examples/offline-Q-learning/replay_memory.py b/examples/offline-Q-learning/replay_memory.py index 2296ea906ee47a53f697777b6885dad6365460e8..94a43c25d32ac9c9107dfa90a33d1280a5bebd16 100644 --- a/examples/offline-Q-learning/replay_memory.py +++ b/examples/offline-Q-learning/replay_memory.py @@ -18,18 +18,18 @@ import os from collections import deque, namedtuple from parl.utils import logger -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): def __init__(self, max_size, - state_shape, + obs_shape, context_len, load_file=False, file_path=None): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) self.file_path = file_path @@ -38,8 +38,7 @@ class ReplayMemory(object): self.load_memory() logger.info("memory size is {}".format(self._curr_size)) else: - self.state = np.zeros( - (self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -62,42 +61,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -106,7 +104,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward 
self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -129,15 +127,15 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] def save_memory(self): save_data = [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] np.savez(self.file_path, *save_data) @@ -145,7 +143,7 @@ class ReplayMemory(object): def load_memory(self): container = np.load(self.file_path, allow_pickle=True) [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] = [container[key] for key in container] self._curr_size = self._curr_size.astype(int) diff --git a/examples/offline-Q-learning/rom_files b/examples/offline-Q-learning/rom_files index 966a8940cbb2d928de9f816d41efada9aa3c9b6e..c1c50b9a99991f7f5dd34d7f243e999a636ba926 120000 --- a/examples/offline-Q-learning/rom_files +++ b/examples/offline-Q-learning/rom_files @@ -1 +1 @@ -../DQN/rom_files/ \ No newline at end of file +../DQN_variant/rom_files \ No newline at end of file diff --git a/examples/offline-Q-learning/utils.py b/examples/offline-Q-learning/utils.py index 721338d52451903eb1599e2396c9699a410a188d..04c590ec46f98b6cfa6d1ec833112730900fb840 120000 --- a/examples/offline-Q-learning/utils.py +++ b/examples/offline-Q-learning/utils.py @@ -1 +1 @@ -../DQN/utils.py \ No newline at end of file +../DQN_variant/utils.py \ No newline at end of file diff --git a/parl/algorithms/fluid/sac.py b/parl/algorithms/fluid/sac.py index cec92c98568905af7bce64252e9f3ff0531da039..32d7b1edfca1498fb40ece392025d310e162dd50 100644 --- a/parl/algorithms/fluid/sac.py +++ b/parl/algorithms/fluid/sac.py @@ -102,11 +102,11 @@ class SAC(Algorithm): return cost def critic_learn(self, obs, action, reward, next_obs, terminal): - next_state_action, next_state_log_pi = self.sample(next_obs) + next_obs_action, next_obs_log_pi = self.sample(next_obs) qf1_next_target, qf2_next_target = self.target_critic.value( - next_obs, next_state_action) + next_obs, next_obs_action) min_qf_next_target = layers.elementwise_min( - qf1_next_target, qf2_next_target) - next_state_log_pi * self.alpha + qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha terminal = layers.cast(terminal, dtype='float32') target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
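
Note on the `ReplayMemory` changes in `benchmark/torch/dqn/replay_memory.py`, `examples/DQN_variant/replay_memory.py`, and `examples/offline-Q-learning/replay_memory.py`: the buffer stores single frames and assembles a `(context_len + 1)`-frame window at sample time, zeroing out any frames that belong to the previous episode. The sketch below restates that masking logic in plain NumPy; the function name `sample_context` and the standalone-array signature are illustrative, not part of this PR.

```python
import numpy as np

def sample_context(obs_buf, isOver_buf, curr_size, idx, context_len):
    """Assemble a (context_len + 1)-frame window starting at idx.

    Frames recorded before the most recent episode boundary are left as
    zeros, mirroring the masking in ReplayMemory.sample above.
    """
    window = np.zeros((context_len + 1, ) + obs_buf.shape[1:], dtype=np.uint8)
    obs_idx = np.arange(idx, idx + context_len + 1) % curr_size

    # Walk backwards through the context; the first terminal flag we hit
    # marks the end of the previous episode.
    for k in range(context_len - 2, -1, -1):
        if isOver_buf[obs_idx[k]]:
            window[k + 1:] = obs_buf[obs_idx[k + 1:]]
            return window

    window[:] = obs_buf[obs_idx]
    return window
```

Storing single frames and stacking them on demand keeps memory at one copy per frame instead of `context_len` copies, which is why the training loops call `rpm.recent_obs()` and append the current `obs` before `agent.sample(context)`.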
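
The `get_fixed_obs` / `evaluate_fixed_Q` pair in `benchmark/torch/dqn/train.py` implements the usual DQN diagnostic of tracking the average max-Q over a batch of observations frozen after warm-up; the `dqn/Q value` TensorBoard scalar gives a rough sanity check on the value estimates. A minimal PyTorch restatement, assuming `model` maps a float `(N, C, H, W)` tensor to `(N, num_actions)` Q-values as `AtariModel` does:

```python
import torch

@torch.no_grad()
def mean_max_q(model, fixed_obs):
    """Average of max_a Q(obs, a) over a frozen batch of observations."""
    q_values = model(fixed_obs)               # (N, num_actions)
    return q_values.max(dim=1).values.mean().item()
```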
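
For the `agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, batch_isOver, ...)` calls in `examples/DQN/train.py`, the quantity being regressed is the standard one-step DQN backup, y = r + gamma * (1 - done) * max_a Q_target(next_obs, a). A hedged NumPy sketch of just the target computation (the actual update lives in `parl.algorithms.DQN`, not in this diff):

```python
import numpy as np

def dqn_target(batch_reward, batch_isOver, q_next_target, gamma=0.99):
    """One-step DQN target.

    q_next_target: array of shape (batch, num_actions) from the target
    network evaluated at batch_next_obs.
    """
    max_q_next = q_next_target.max(axis=1)
    return batch_reward + gamma * (1.0 - batch_isOver.astype('float32')) * max_q_next
```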
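
The `parl/algorithms/fluid/sac.py` hunk is a pure rename, but for reference the quantity it builds is the soft Bellman target y = r + gamma * (1 - terminal) * (min(Q1_target, Q2_target)(next_obs, a') - alpha * log_pi(a' | next_obs)), with a' drawn from the current policy. A framework-agnostic sketch over NumPy arrays (argument names are illustrative):

```python
import numpy as np

def soft_bellman_target(reward, terminal, gamma, alpha,
                        q1_next_target, q2_next_target, next_log_pi):
    """Soft Bellman target as computed in SAC.critic_learn.

    q*_next_target are the target critics evaluated at (next_obs, next_action);
    next_log_pi is log pi(next_action | next_obs) for the sampled next_action.
    """
    min_q_next = np.minimum(q1_next_target, q2_next_target)
    soft_value = min_q_next - alpha * next_log_pi
    return reward + (1.0 - terminal.astype('float32')) * gamma * soft_value
```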