diff --git a/fluid/DeepASR/examples/aishell/prepare_data.sh b/fluid/DeepASR/examples/aishell/prepare_data.sh index d2c051c4d9ea10547f5ba4cc20213f430bf6dfce..3b4a8753a7558c8fe5dc9b1045862ec3d29b2734 100644 --- a/fluid/DeepASR/examples/aishell/prepare_data.sh +++ b/fluid/DeepASR/examples/aishell/prepare_data.sh @@ -1,7 +1,7 @@ data_dir=~/.cache/paddle/dataset/speech/deep_asr_data/aishell data_url='http://deep-asr-data.gz.bcebos.com/aishell_data.tar.gz' lst_url='http://deep-asr-data.gz.bcebos.com/aishell_lst.tar.gz' -md5=e017d858d9e509c8a84b73f673f08b9a +md5=17669b8d63331c9326f4a9393d289bfb if [ ! -e $data_dir ]; then mkdir -p $data_dir diff --git a/fluid/DeepQNetwork/DQN.py b/fluid/DeepQNetwork/DQN.py deleted file mode 100644 index b4dcae6fbdb7a5df03ed6ca50a4d8183e26ee288..0000000000000000000000000000000000000000 --- a/fluid/DeepQNetwork/DQN.py +++ /dev/null @@ -1,88 +0,0 @@ -#-*- coding: utf-8 -*- -#File: DQN.py - -from agent import Model -import gym -import argparse -from tqdm import tqdm -from expreplay import ReplayMemory, Experience -import numpy as np -import os - -UPDATE_FREQ = 4 - -MEMORY_WARMUP_SIZE = 1000 - - -def run_episode(agent, env, exp, train_or_test): - assert train_or_test in ['train', 'test'], train_or_test - total_reward = 0 - state = env.reset() - for step in range(200): - action = agent.act(state, train_or_test) - next_state, reward, isOver, _ = env.step(action) - if train_or_test == 'train': - exp.append(Experience(state, action, reward, isOver)) - # train model - # start training - if len(exp) > MEMORY_WARMUP_SIZE: - batch_idx = np.random.randint( - len(exp) - 1, size=(args.batch_size)) - if step % UPDATE_FREQ == 0: - batch_state, batch_action, batch_reward, \ - batch_next_state, batch_isOver = exp.sample(batch_idx) - agent.train(batch_state, batch_action, batch_reward, \ - batch_next_state, batch_isOver) - total_reward += reward - state = next_state - if isOver: - break - return total_reward - - -def train_agent(): - env = gym.make(args.env) - state_shape = env.observation_space.shape - exp = ReplayMemory(args.mem_size, state_shape) - action_dim = env.action_space.n - agent = Model(state_shape[0], action_dim, gamma=0.99) - - while len(exp) < MEMORY_WARMUP_SIZE: - run_episode(agent, env, exp, train_or_test='train') - - max_episode = 4000 - - # train - total_episode = 0 - pbar = tqdm(total=max_episode) - recent_100_reward = [] - for episode in xrange(max_episode): - # start epoch - total_reward = run_episode(agent, env, exp, train_or_test='train') - pbar.set_description('[train]exploration:{}'.format(agent.exploration)) - pbar.update() - - # recent 100 reward - total_reward = run_episode(agent, env, exp, train_or_test='test') - recent_100_reward.append(total_reward) - if len(recent_100_reward) > 100: - recent_100_reward = recent_100_reward[1:] - pbar.write("episode:{} test_reward:{}".format(\ - episode, np.mean(recent_100_reward))) - - pbar.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--env', type=str, default='MountainCar-v0', \ - help='enviroment to train DQN model, e.g CartPole-v0') - parser.add_argument('--gamma', type=float, default=0.99, \ - help='discount factor for accumulated reward computation') - parser.add_argument('--mem_size', type=int, default=500000, \ - help='memory size for experience replay') - parser.add_argument('--batch_size', type=int, default=192, \ - help='batch size for training') - args = parser.parse_args() - - train_agent() diff --git a/fluid/DeepQNetwork/DQN_agent.py 
b/fluid/DeepQNetwork/DQN_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..3e377c9d0eb34f4bef844427da6d77b3fe19e331 --- /dev/null +++ b/fluid/DeepQNetwork/DQN_agent.py @@ -0,0 +1,188 @@ +#-*- coding: utf-8 -*- + +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +import numpy as np +import math +from tqdm import tqdm +from utils import fluid_flatten + + +class DQNModel(object): + def __init__(self, state_dim, action_dim, gamma, hist_len, use_cuda=False): + self.img_height = state_dim[0] + self.img_width = state_dim[1] + self.action_dim = action_dim + self.gamma = gamma + self.exploration = 1.1 + self.update_target_steps = 10000 // 4 + self.hist_len = hist_len + self.use_cuda = use_cuda + + self.global_step = 0 + self._build_net() + + def _get_inputs(self): + return fluid.layers.data( + name='state', + shape=[self.hist_len, self.img_height, self.img_width], + dtype='float32'), \ + fluid.layers.data( + name='action', shape=[1], dtype='int32'), \ + fluid.layers.data( + name='reward', shape=[], dtype='float32'), \ + fluid.layers.data( + name='next_s', + shape=[self.hist_len, self.img_height, self.img_width], + dtype='float32'), \ + fluid.layers.data( + name='isOver', shape=[], dtype='bool') + + def _build_net(self): + state, action, reward, next_s, isOver = self._get_inputs() + self.pred_value = self.get_DQN_prediction(state) + self.predict_program = fluid.default_main_program().clone() + + reward = fluid.layers.clip(reward, min=-1.0, max=1.0) + + action_onehot = fluid.layers.one_hot(action, self.action_dim) + action_onehot = fluid.layers.cast(action_onehot, dtype='float32') + + pred_action_value = fluid.layers.reduce_sum( + fluid.layers.elementwise_mul(action_onehot, self.pred_value), dim=1) + + targetQ_predict_value = self.get_DQN_prediction(next_s, target=True) + best_v = fluid.layers.reduce_max(targetQ_predict_value, dim=1) + best_v.stop_gradient = True + + target = reward + (1.0 - fluid.layers.cast( + isOver, dtype='float32')) * self.gamma * best_v + cost = fluid.layers.square_error_cost(pred_action_value, target) + cost = fluid.layers.reduce_mean(cost) + + self._sync_program = self._build_sync_target_network() + + optimizer = fluid.optimizer.Adam(1e-3 * 0.5, epsilon=1e-3) + optimizer.minimize(cost) + + # define program + self.train_program = fluid.default_main_program() + + # fluid exe + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + self.exe = fluid.Executor(place) + self.exe.run(fluid.default_startup_program()) + + def get_DQN_prediction(self, image, target=False): + image = image / 255.0 + + variable_field = 'target' if target else 'policy' + + conv1 = fluid.layers.conv2d( + input=image, + num_filters=32, + filter_size=[5, 5], + stride=[1, 1], + padding=[2, 2], + act='relu', + param_attr=ParamAttr(name='{}_conv1'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv1_b'.format(variable_field))) + max_pool1 = fluid.layers.pool2d( + input=conv1, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv2 = fluid.layers.conv2d( + input=max_pool1, + num_filters=32, + filter_size=[5, 5], + stride=[1, 1], + padding=[2, 2], + act='relu', + param_attr=ParamAttr(name='{}_conv2'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv2_b'.format(variable_field))) + max_pool2 = fluid.layers.pool2d( + input=conv2, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv3 = fluid.layers.conv2d( + input=max_pool2, + num_filters=64, + filter_size=[4, 4], + stride=[1, 1], + padding=[1, 1], + act='relu', 
+ param_attr=ParamAttr(name='{}_conv3'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv3_b'.format(variable_field))) + max_pool3 = fluid.layers.pool2d( + input=conv3, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv4 = fluid.layers.conv2d( + input=max_pool3, + num_filters=64, + filter_size=[3, 3], + stride=[1, 1], + padding=[1, 1], + act='relu', + param_attr=ParamAttr(name='{}_conv4'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv4_b'.format(variable_field))) + + flatten = fluid_flatten(conv4) + + out = fluid.layers.fc( + input=flatten, + size=self.action_dim, + param_attr=ParamAttr(name='{}_fc1'.format(variable_field)), + bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field))) + return out + + def _build_sync_target_network(self): + vars = list(fluid.default_main_program().list_vars()) + policy_vars = filter( + lambda x: 'GRAD' not in x.name and 'policy' in x.name, vars) + target_vars = filter( + lambda x: 'GRAD' not in x.name and 'target' in x.name, vars) + policy_vars.sort(key=lambda x: x.name) + target_vars.sort(key=lambda x: x.name) + + sync_program = fluid.default_main_program().clone() + with fluid.program_guard(sync_program): + sync_ops = [] + for i, var in enumerate(policy_vars): + sync_op = fluid.layers.assign(policy_vars[i], target_vars[i]) + sync_ops.append(sync_op) + sync_program = sync_program.prune(sync_ops) + return sync_program + + def act(self, state, train_or_test): + sample = np.random.random() + if train_or_test == 'train' and sample < self.exploration: + act = np.random.randint(self.action_dim) + else: + if np.random.random() < 0.01: + act = np.random.randint(self.action_dim) + else: + state = np.expand_dims(state, axis=0) + pred_Q = self.exe.run(self.predict_program, + feed={'state': state.astype('float32')}, + fetch_list=[self.pred_value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + if train_or_test == 'train': + self.exploration = max(0.1, self.exploration - 1e-6) + return act + + def train(self, state, action, reward, next_state, isOver): + if self.global_step % self.update_target_steps == 0: + self.sync_target_network() + self.global_step += 1 + + action = np.expand_dims(action, -1) + self.exe.run(self.train_program, + feed={ + 'state': state.astype('float32'), + 'action': action.astype('int32'), + 'reward': reward, + 'next_s': next_state.astype('float32'), + 'isOver': isOver + }) + + def sync_target_network(self): + self.exe.run(self._sync_program) diff --git a/fluid/DeepQNetwork/DoubleDQN_agent.py b/fluid/DeepQNetwork/DoubleDQN_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..a46b2bd62e5c989b14858ed893f7fb132a0c5767 --- /dev/null +++ b/fluid/DeepQNetwork/DoubleDQN_agent.py @@ -0,0 +1,195 @@ +#-*- coding: utf-8 -*- + +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +import numpy as np +from tqdm import tqdm +import math +from utils import fluid_argmax, fluid_flatten + + +class DoubleDQNModel(object): + def __init__(self, state_dim, action_dim, gamma, hist_len, use_cuda=False): + self.img_height = state_dim[0] + self.img_width = state_dim[1] + self.action_dim = action_dim + self.gamma = gamma + self.exploration = 1.1 + self.update_target_steps = 10000 // 4 + self.hist_len = hist_len + self.use_cuda = use_cuda + + self.global_step = 0 + self._build_net() + + def _get_inputs(self): + return fluid.layers.data( + name='state', + shape=[self.hist_len, self.img_height, self.img_width], + dtype='float32'), \ + fluid.layers.data( + name='action', 
shape=[1], dtype='int32'), \ + fluid.layers.data( + name='reward', shape=[], dtype='float32'), \ + fluid.layers.data( + name='next_s', + shape=[self.hist_len, self.img_height, self.img_width], + dtype='float32'), \ + fluid.layers.data( + name='isOver', shape=[], dtype='bool') + + def _build_net(self): + state, action, reward, next_s, isOver = self._get_inputs() + self.pred_value = self.get_DQN_prediction(state) + self.predict_program = fluid.default_main_program().clone() + + reward = fluid.layers.clip(reward, min=-1.0, max=1.0) + + action_onehot = fluid.layers.one_hot(action, self.action_dim) + action_onehot = fluid.layers.cast(action_onehot, dtype='float32') + + pred_action_value = fluid.layers.reduce_sum( + fluid.layers.elementwise_mul(action_onehot, self.pred_value), dim=1) + + targetQ_predict_value = self.get_DQN_prediction(next_s, target=True) + + next_s_predict_value = self.get_DQN_prediction(next_s) + greedy_action = fluid_argmax(next_s_predict_value) + + predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim) + best_v = fluid.layers.reduce_sum( + fluid.layers.elementwise_mul(predict_onehot, targetQ_predict_value), + dim=1) + best_v.stop_gradient = True + + target = reward + (1.0 - fluid.layers.cast( + isOver, dtype='float32')) * self.gamma * best_v + cost = fluid.layers.square_error_cost(pred_action_value, target) + cost = fluid.layers.reduce_mean(cost) + + self._sync_program = self._build_sync_target_network() + + optimizer = fluid.optimizer.Adam(1e-3 * 0.5, epsilon=1e-3) + optimizer.minimize(cost) + + # define program + self.train_program = fluid.default_main_program() + + # fluid exe + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + self.exe = fluid.Executor(place) + self.exe.run(fluid.default_startup_program()) + + def get_DQN_prediction(self, image, target=False): + image = image / 255.0 + + variable_field = 'target' if target else 'policy' + + conv1 = fluid.layers.conv2d( + input=image, + num_filters=32, + filter_size=[5, 5], + stride=[1, 1], + padding=[2, 2], + act='relu', + param_attr=ParamAttr(name='{}_conv1'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv1_b'.format(variable_field))) + max_pool1 = fluid.layers.pool2d( + input=conv1, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv2 = fluid.layers.conv2d( + input=max_pool1, + num_filters=32, + filter_size=[5, 5], + stride=[1, 1], + padding=[2, 2], + act='relu', + param_attr=ParamAttr(name='{}_conv2'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv2_b'.format(variable_field))) + max_pool2 = fluid.layers.pool2d( + input=conv2, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv3 = fluid.layers.conv2d( + input=max_pool2, + num_filters=64, + filter_size=[4, 4], + stride=[1, 1], + padding=[1, 1], + act='relu', + param_attr=ParamAttr(name='{}_conv3'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv3_b'.format(variable_field))) + max_pool3 = fluid.layers.pool2d( + input=conv3, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv4 = fluid.layers.conv2d( + input=max_pool3, + num_filters=64, + filter_size=[3, 3], + stride=[1, 1], + padding=[1, 1], + act='relu', + param_attr=ParamAttr(name='{}_conv4'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv4_b'.format(variable_field))) + + flatten = fluid_flatten(conv4) + + out = fluid.layers.fc( + input=flatten, + size=self.action_dim, + param_attr=ParamAttr(name='{}_fc1'.format(variable_field)), + bias_attr=ParamAttr(name='{}_fc1_b'.format(variable_field))) + return out +
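The only behavioral difference from `DQN_agent.py` above is the target computation: the policy network *selects* the greedy action on `next_s` (via `fluid_argmax`), while the target network *evaluates* it. A minimal NumPy sketch of that selection/evaluation split (hypothetical Q-values, not part of this PR):

```python
import numpy as np

# Hypothetical Q-values for a batch of 2 transitions and 3 actions.
q_next_policy = np.array([[1.0, 3.0, 5.0],
                          [2.0, 0.5, 0.1]])  # policy net on next_s
q_next_target = np.array([[0.9, 4.0, 3.5],
                          [1.5, 0.7, 0.2]])  # target net on next_s
reward = np.array([0.0, 1.0])
isOver = np.array([False, True])
gamma = 0.99

# Vanilla DQN: the target net both selects and evaluates the action.
best_v_dqn = q_next_target.max(axis=1)                 # [4.0, 1.5]

# Double DQN: the policy net selects, the target net evaluates,
# which reduces the max-operator's overestimation bias.
greedy = q_next_policy.argmax(axis=1)                  # [2, 0]
best_v_double = q_next_target[np.arange(2), greedy]    # [3.5, 1.5]

target = reward + (1.0 - isOver.astype('float32')) * gamma * best_v_double
```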
+ def _build_sync_target_network(self): + vars = list(fluid.default_main_program().list_vars()) + policy_vars = filter( + lambda x: 'GRAD' not in x.name and 'policy' in x.name, vars) + target_vars = filter( + lambda x: 'GRAD' not in x.name and 'target' in x.name, vars) + policy_vars.sort(key=lambda x: x.name) + target_vars.sort(key=lambda x: x.name) + + sync_program = fluid.default_main_program().clone() + with fluid.program_guard(sync_program): + sync_ops = [] + for i, var in enumerate(policy_vars): + sync_op = fluid.layers.assign(policy_vars[i], target_vars[i]) + sync_ops.append(sync_op) + sync_program = sync_program.prune(sync_ops) + return sync_program + + def act(self, state, train_or_test): + sample = np.random.random() + if train_or_test == 'train' and sample < self.exploration: + act = np.random.randint(self.action_dim) + else: + if np.random.random() < 0.01: + act = np.random.randint(self.action_dim) + else: + state = np.expand_dims(state, axis=0) + pred_Q = self.exe.run(self.predict_program, + feed={'state': state.astype('float32')}, + fetch_list=[self.pred_value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + if train_or_test == 'train': + self.exploration = max(0.1, self.exploration - 1e-6) + return act + + def train(self, state, action, reward, next_state, isOver): + if self.global_step % self.update_target_steps == 0: + self.sync_target_network() + self.global_step += 1 + + action = np.expand_dims(action, -1) + self.exe.run(self.train_program, + feed={ + 'state': state.astype('float32'), + 'action': action.astype('int32'), + 'reward': reward, + 'next_s': next_state.astype('float32'), + 'isOver': isOver + }) + + def sync_target_network(self): + self.exe.run(self._sync_program) diff --git a/fluid/DeepQNetwork/DuelingDQN_agent.py b/fluid/DeepQNetwork/DuelingDQN_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..582111730da408b741ad6b6eeded34685d026f40 --- /dev/null +++ b/fluid/DeepQNetwork/DuelingDQN_agent.py @@ -0,0 +1,196 @@ +#-*- coding: utf-8 -*- + +import paddle.fluid as fluid +from paddle.fluid.param_attr import ParamAttr +import numpy as np +from tqdm import tqdm +import math +from utils import fluid_flatten + + +class DuelingDQNModel(object): + def __init__(self, state_dim, action_dim, gamma, hist_len, use_cuda=False): + self.img_height = state_dim[0] + self.img_width = state_dim[1] + self.action_dim = action_dim + self.gamma = gamma + self.exploration = 1.1 + self.update_target_steps = 10000 // 4 + self.hist_len = hist_len + self.use_cuda = use_cuda + + self.global_step = 0 + self._build_net() + + def _get_inputs(self): + return fluid.layers.data( + name='state', + shape=[self.hist_len, self.img_height, self.img_width], + dtype='float32'), \ + fluid.layers.data( + name='action', shape=[1], dtype='int32'), \ + fluid.layers.data( + name='reward', shape=[], dtype='float32'), \ + fluid.layers.data( + name='next_s', + shape=[self.hist_len, self.img_height, self.img_width], + dtype='float32'), \ + fluid.layers.data( + name='isOver', shape=[], dtype='bool') + + def _build_net(self): + state, action, reward, next_s, isOver = self._get_inputs() + self.pred_value = self.get_DQN_prediction(state) + self.predict_program = fluid.default_main_program().clone() + + reward = fluid.layers.clip(reward, min=-1.0, max=1.0) + + action_onehot = fluid.layers.one_hot(action, self.action_dim) + action_onehot = fluid.layers.cast(action_onehot, dtype='float32') + + pred_action_value = fluid.layers.reduce_sum( + 
fluid.layers.elementwise_mul(action_onehot, self.pred_value), dim=1) + + targetQ_predict_value = self.get_DQN_prediction(next_s, target=True) + best_v = fluid.layers.reduce_max(targetQ_predict_value, dim=1) + best_v.stop_gradient = True + + target = reward + (1.0 - fluid.layers.cast( + isOver, dtype='float32')) * self.gamma * best_v + cost = fluid.layers.square_error_cost(pred_action_value, target) + cost = fluid.layers.reduce_mean(cost) + + self._sync_program = self._build_sync_target_network() + + optimizer = fluid.optimizer.Adam(1e-3 * 0.5, epsilon=1e-3) + optimizer.minimize(cost) + + # define program + self.train_program = fluid.default_main_program() + + # fluid exe + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + self.exe = fluid.Executor(place) + self.exe.run(fluid.default_startup_program()) + + def get_DQN_prediction(self, image, target=False): + image = image / 255.0 + + variable_field = 'target' if target else 'policy' + + conv1 = fluid.layers.conv2d( + input=image, + num_filters=32, + filter_size=[5, 5], + stride=[1, 1], + padding=[2, 2], + act='relu', + param_attr=ParamAttr(name='{}_conv1'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv1_b'.format(variable_field))) + max_pool1 = fluid.layers.pool2d( + input=conv1, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv2 = fluid.layers.conv2d( + input=max_pool1, + num_filters=32, + filter_size=[5, 5], + stride=[1, 1], + padding=[2, 2], + act='relu', + param_attr=ParamAttr(name='{}_conv2'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv2_b'.format(variable_field))) + max_pool2 = fluid.layers.pool2d( + input=conv2, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv3 = fluid.layers.conv2d( + input=max_pool2, + num_filters=64, + filter_size=[4, 4], + stride=[1, 1], + padding=[1, 1], + act='relu', + param_attr=ParamAttr(name='{}_conv3'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv3_b'.format(variable_field))) + max_pool3 = fluid.layers.pool2d( + input=conv3, pool_size=[2, 2], pool_stride=[2, 2], pool_type='max') + + conv4 = fluid.layers.conv2d( + input=max_pool3, + num_filters=64, + filter_size=[3, 3], + stride=[1, 1], + padding=[1, 1], + act='relu', + param_attr=ParamAttr(name='{}_conv4'.format(variable_field)), + bias_attr=ParamAttr(name='{}_conv4_b'.format(variable_field))) + + flatten = fluid_flatten(conv4) + + value = fluid.layers.fc( + input=flatten, + size=1, + param_attr=ParamAttr(name='{}_value_fc'.format(variable_field)), + bias_attr=ParamAttr(name='{}_value_fc_b'.format(variable_field))) + + advantage = fluid.layers.fc( + input=flatten, + size=self.action_dim, + param_attr=ParamAttr(name='{}_advantage_fc'.format(variable_field)), + bias_attr=ParamAttr( + name='{}_advantage_fc_b'.format(variable_field))) + + Q = advantage + (value - fluid.layers.reduce_mean( + advantage, dim=1, keep_dim=True)) + return Q + + def _build_sync_target_network(self): + vars = list(fluid.default_main_program().list_vars()) + policy_vars = filter( + lambda x: 'GRAD' not in x.name and 'policy' in x.name, vars) + target_vars = filter( + lambda x: 'GRAD' not in x.name and 'target' in x.name, vars) + policy_vars.sort(key=lambda x: x.name) + target_vars.sort(key=lambda x: x.name) + + sync_program = fluid.default_main_program().clone() + with fluid.program_guard(sync_program): + sync_ops = [] + for i, var in enumerate(policy_vars): + sync_op = fluid.layers.assign(policy_vars[i], target_vars[i]) + sync_ops.append(sync_op) + sync_program = sync_program.prune(sync_ops) + 
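# Note: prune() keeps only the sync_ops assembled above, so running this
+ # cloned program simply copies each policy parameter into its
+ # target-network counterpart.
+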
return sync_program + + def act(self, state, train_or_test): + sample = np.random.random() + if train_or_test == 'train' and sample < self.exploration: + act = np.random.randint(self.action_dim) + else: + if np.random.random() < 0.01: + act = np.random.randint(self.action_dim) + else: + state = np.expand_dims(state, axis=0) + pred_Q = self.exe.run(self.predict_program, + feed={'state': state.astype('float32')}, + fetch_list=[self.pred_value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + if train_or_test == 'train': + self.exploration = max(0.1, self.exploration - 1e-6) + return act + + def train(self, state, action, reward, next_state, isOver): + if self.global_step % self.update_target_steps == 0: + self.sync_target_network() + self.global_step += 1 + + action = np.expand_dims(action, -1) + self.exe.run(self.train_program, \ + feed={'state': state.astype('float32'), \ + 'action': action.astype('int32'), \ + 'reward': reward, \ + 'next_s': next_state.astype('float32'), \ + 'isOver': isOver}) + + def sync_target_network(self): + self.exe.run(self._sync_program) diff --git a/fluid/DeepQNetwork/README.md b/fluid/DeepQNetwork/README.md index a69835271675a0fa5087b279e30643dd1cd5adc0..6df88ecbf50e5d0375070c772e8b5b2340791b78 100644 --- a/fluid/DeepQNetwork/README.md +++ b/fluid/DeepQNetwork/README.md @@ -1,31 +1,44 @@ - +# Reproduce DQN, DoubleDQN, DuelingDQN models with the fluid version of PaddlePaddle -# Reproduce DQN model - + DQN in: ++ DQN in: [Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html) ++ DoubleDQN in: +[Deep Reinforcement Learning with Double Q-Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12389) ++ DuelingDQN in: +[Dueling Network Architectures for Deep Reinforcement Learning](http://proceedings.mlr.press/v48/wangf16.html) -# Mountain-CAR benchmark & performance -[MountainCar-v0](https://gym.openai.com/envs/MountainCar-v0/) +# Atari benchmark & performance +## [Atari games introduction](https://gym.openai.com/envs/#atari) -A car is on a one-dimensional track, positioned between two "mountains". The goal is to drive up the mountain on the right; however, the car's engine is not strong enough to scale the mountain in a single pass. Therefore, the only way to succeed is to drive back and forth to build up momentum.
++ Pong game result +![DQN result](assets/dqn.png) +# How to use ++ Dependencies: + + python2.7 + + gym + + tqdm + + paddlepaddle-gpu==0.12.0 ++ Start Training: + ``` + # To train a model for the Pong game on GPU (the DQN model is used by default) + python train.py --rom ./rom_files/pong.bin --use_cuda - + # To train a model for Pong with DoubleDQN + python train.py --rom ./rom_files/pong.bin --use_cuda --alg DoubleDQN + # To train a model for Pong with DuelingDQN + python train.py --rom ./rom_files/pong.bin --use_cuda --alg DuelingDQN + ``` +To train on more games, you can install more ROM files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms) -# How to use -+ Dependencies: - + python2.7 - + gym - + tqdm - + paddle-fluid -+ Start Training: - ``` - # use mountain-car enviroment as default - python DQN.py ++ Start Testing: + ``` + # Play the game with a saved model and calculate the average reward + python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong/stepXXXXX - # use other enviorment - python DQN.py --env CartPole-v0 - ``` + # Play the game with visualization + python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong/stepXXXXX --viz 0.01 + ``` diff --git a/fluid/DeepQNetwork/agent.py b/fluid/DeepQNetwork/agent.py deleted file mode 100644 index 928ce86e573ed1f042d1b8a85d5443405ea109e1..0000000000000000000000000000000000000000 --- a/fluid/DeepQNetwork/agent.py +++ /dev/null @@ -1,148 +0,0 @@ -#-*- coding: utf-8 -*- -#File: agent.py - -import paddle.fluid as fluid -from paddle.fluid.param_attr import ParamAttr -import numpy as np -from tqdm import tqdm -import math - -UPDATE_TARGET_STEPS = 200 - - -class Model(object): - def __init__(self, state_dim, action_dim, gamma): - self.global_step = 0 - self.state_dim = state_dim - self.action_dim = action_dim - self.gamma = gamma - self.exploration = 1.0 - - self._build_net() - - def _get_inputs(self): - return [fluid.layers.data(\ - name='state', shape=[self.state_dim], dtype='float32'), - fluid.layers.data(\ - name='action', shape=[1], dtype='int32'), - fluid.layers.data(\ - name='reward', shape=[], dtype='float32'), - fluid.layers.data(\ - name='next_s', shape=[self.state_dim], dtype='float32'), - fluid.layers.data(\ - name='isOver', shape=[], dtype='bool')] - - def _build_net(self): - state, action, reward, next_s, isOver = self._get_inputs() - self.pred_value = self.get_DQN_prediction(state) - self.predict_program = fluid.default_main_program().clone() - - action_onehot = fluid.layers.one_hot(action, self.action_dim) - action_onehot = fluid.layers.cast(action_onehot, dtype='float32') - - pred_action_value = fluid.layers.reduce_sum(\ - fluid.layers.elementwise_mul(action_onehot, self.pred_value), dim=1) - - targetQ_predict_value = self.get_DQN_prediction(next_s, target=True) - best_v = fluid.layers.reduce_max(targetQ_predict_value, dim=1) - best_v.stop_gradient = True - - target = reward + (1.0 - fluid.layers.cast(\ - isOver, dtype='float32')) * self.gamma * best_v - cost = fluid.layers.square_error_cost(\ - input=pred_action_value, label=target) - cost = fluid.layers.reduce_mean(cost) - - self._sync_program = self._build_sync_target_network() - - optimizer = fluid.optimizer.Adam(1e-3) - optimizer.minimize(cost) - - # define program - self.train_program = fluid.default_main_program() - - # fluid exe - place = fluid.CUDAPlace(0) - self.exe = fluid.Executor(place) - self.exe.run(fluid.default_startup_program()) - - def get_DQN_prediction(self, state, target=False): -
variable_field = 'target' if target else 'policy' - # layer fc1 - param_attr = ParamAttr(name='{}_fc1'.format(variable_field)) - bias_attr = ParamAttr(name='{}_fc1_b'.format(variable_field)) - fc1 = fluid.layers.fc(input=state, - size=256, - act='relu', - param_attr=param_attr, - bias_attr=bias_attr) - - param_attr = ParamAttr(name='{}_fc2'.format(variable_field)) - bias_attr = ParamAttr(name='{}_fc2_b'.format(variable_field)) - fc2 = fluid.layers.fc(input=fc1, - size=128, - act='tanh', - param_attr=param_attr, - bias_attr=bias_attr) - - param_attr = ParamAttr(name='{}_fc3'.format(variable_field)) - bias_attr = ParamAttr(name='{}_fc3_b'.format(variable_field)) - value = fluid.layers.fc(input=fc2, - size=self.action_dim, - param_attr=param_attr, - bias_attr=bias_attr) - - return value - - def _build_sync_target_network(self): - vars = fluid.default_main_program().list_vars() - policy_vars = [] - target_vars = [] - for var in vars: - if 'GRAD' in var.name: continue - if 'policy' in var.name: - policy_vars.append(var) - elif 'target' in var.name: - target_vars.append(var) - - policy_vars.sort(key=lambda x: x.name.split('policy_')[1]) - target_vars.sort(key=lambda x: x.name.split('target_')[1]) - - sync_program = fluid.default_main_program().clone() - with fluid.program_guard(sync_program): - sync_ops = [] - for i, var in enumerate(policy_vars): - sync_op = fluid.layers.assign(policy_vars[i], target_vars[i]) - sync_ops.append(sync_op) - sync_program = sync_program.prune(sync_ops) - return sync_program - - def act(self, state, train_or_test): - sample = np.random.random() - if train_or_test == 'train' and sample < self.exploration: - act = np.random.randint(self.action_dim) - else: - state = np.expand_dims(state, axis=0) - pred_Q = self.exe.run(self.predict_program, - feed={'state': state.astype('float32')}, - fetch_list=[self.pred_value])[0] - pred_Q = np.squeeze(pred_Q, axis=0) - act = np.argmax(pred_Q) - self.exploration = max(0.1, self.exploration - 1e-6) - return act - - def train(self, state, action, reward, next_state, isOver): - if self.global_step % UPDATE_TARGET_STEPS == 0: - self.sync_target_network() - self.global_step += 1 - - action = np.expand_dims(action, -1) - self.exe.run(self.train_program, \ - feed={'state': state, \ - 'action': action, \ - 'reward': reward, \ - 'next_s': next_state, \ - 'isOver': isOver}) - - def sync_target_network(self): - self.exe.run(self._sync_program) diff --git a/fluid/DeepQNetwork/assets/dqn.png b/fluid/DeepQNetwork/assets/dqn.png new file mode 100644 index 0000000000000000000000000000000000000000..f8f8d12f9887cdab62f09b52597ec187a4c8107c Binary files /dev/null and b/fluid/DeepQNetwork/assets/dqn.png differ diff --git a/fluid/DeepQNetwork/atari.py b/fluid/DeepQNetwork/atari.py new file mode 100644 index 0000000000000000000000000000000000000000..5006de4d4e6b57110ca7301395e170666d24e8b4 --- /dev/null +++ b/fluid/DeepQNetwork/atari.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import os +import cv2 +import threading + +import gym +from gym import spaces +from gym.envs.atari.atari_env import ACTION_MEANING + +from ale_python_interface import ALEInterface + +__all__ = ['AtariPlayer'] + +ROM_URL = "https://github.com/openai/atari-py/tree/master/atari_py/atari_roms" +_ALE_LOCK = threading.Lock() +""" +The following AtariPlayer are copied or modified from tensorpack/tensorpack: + https://github.com/tensorpack/tensorpack/blob/master/examples/DeepQNetwork/atari.py +""" + + +class AtariPlayer(gym.Env): + """ + A wrapper for ALE emulator, 
with configurations to mimic DeepMind DQN settings. + Info: + score: the accumulated reward in the current game + gameOver: True when the current game is over + """ + + def __init__(self, + rom_file, + viz=0, + frame_skip=4, + nullop_start=30, + live_lost_as_eoe=True, + max_num_frames=0): + """ + Args: + rom_file: path to the rom + frame_skip: skip every k frames and repeat the action + viz: visualization to be done. + Set to 0 to disable. + Set to a positive number to be the delay between frames to show. + Set to a string to be a directory to store frames. + nullop_start: start with a random number of null ops. + live_lost_as_eoe: consider loss of lives as end of episode. Useful for training. + max_num_frames: maximum number of frames per episode. + """ + super(AtariPlayer, self).__init__() + assert os.path.isfile(rom_file), \ + "rom {} not found. Please download at {}".format(rom_file, ROM_URL) + + try: + ALEInterface.setLoggerMode(ALEInterface.Logger.Error) + except AttributeError: + print "You're not using the latest ALE" + + # avoid simulator bugs: https://github.com/mgbellemare/Arcade-Learning-Environment/issues/86 + with _ALE_LOCK: + self.ale = ALEInterface() + self.ale.setInt(b"random_seed", np.random.randint(0, 30000)) + self.ale.setInt(b"max_num_frames_per_episode", max_num_frames) + self.ale.setBool(b"showinfo", False) + + self.ale.setInt(b"frame_skip", 1) + self.ale.setBool(b'color_averaging', False) + # manual.pdf suggests otherwise. + self.ale.setFloat(b'repeat_action_probability', 0.0) + + # viz setup + if isinstance(viz, str): + assert os.path.isdir(viz), viz + self.ale.setString(b'record_screen_dir', viz) + viz = 0 + if isinstance(viz, int): + viz = float(viz) + self.viz = viz + if self.viz and isinstance(self.viz, float): + self.windowname = os.path.basename(rom_file) + cv2.startWindowThread() + cv2.namedWindow(self.windowname) + + self.ale.loadROM(rom_file.encode('utf-8')) + self.width, self.height = self.ale.getScreenDims() + self.actions = self.ale.getMinimalActionSet() + + self.live_lost_as_eoe = live_lost_as_eoe + self.frame_skip = frame_skip + self.nullop_start = nullop_start + + self.action_space = spaces.Discrete(len(self.actions)) + self.observation_space = spaces.Box(low=0, + high=255, + shape=(self.height, self.width), + dtype=np.uint8) + self._restart_episode() + + def get_action_meanings(self): + return [ACTION_MEANING[i] for i in self.actions] + + def _grab_raw_image(self): + """ + :returns: the current 3-channel image + """ + m = self.ale.getScreenRGB() + return m.reshape((self.height, self.width, 3)) + + def _current_state(self): + """ + returns: a gray-scale (h, w) uint8 image + """ + ret = self._grab_raw_image() + # avoid missing frame issue: max-pooled over the last screen + ret = np.maximum(ret, self.last_raw_screen) + if self.viz: + if isinstance(self.viz, float): + cv2.imshow(self.windowname, ret) + cv2.waitKey(int(self.viz * 1000)) + ret = ret.astype('float32') + # 0.299, 0.587, 0.114,
same as rgb2y in torch/image + ret = cv2.cvtColor(ret, cv2.COLOR_RGB2GRAY) + return ret.astype('uint8') # to save some memory + + def _restart_episode(self): + with _ALE_LOCK: + self.ale.reset_game() + + # random null-ops start + n = np.random.randint(self.nullop_start) + self.last_raw_screen = self._grab_raw_image() + for k in range(n): + if k == n - 1: + self.last_raw_screen = self._grab_raw_image() + self.ale.act(0) + + def reset(self): + if self.ale.game_over(): + self._restart_episode() + return self._current_state() + + def step(self, act): + oldlives = self.ale.lives() + r = 0 + for k in range(self.frame_skip): + if k == self.frame_skip - 1: + self.last_raw_screen = self._grab_raw_image() + r += self.ale.act(self.actions[act]) + newlives = self.ale.lives() + if self.ale.game_over() or \ + (self.live_lost_as_eoe and newlives < oldlives): + break + + isOver = self.ale.game_over() + if self.live_lost_as_eoe: + isOver = isOver or newlives < oldlives + + info = {'ale.lives': newlives} + return self._current_state(), r, isOver, info diff --git a/fluid/DeepQNetwork/atari_wrapper.py b/fluid/DeepQNetwork/atari_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..81ec7e0ba0ee191f70591c16bfff560a62d3d395 --- /dev/null +++ b/fluid/DeepQNetwork/atari_wrapper.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +import numpy as np +from collections import deque + +import gym +from gym import spaces + +_v0, _v1 = gym.__version__.split('.')[:2] +assert int(_v0) > 0 or int(_v1) >= 10, gym.__version__ +""" +The following wrappers are copied or modified from openai/baselines: +https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py +""" + + +class MapState(gym.ObservationWrapper): + def __init__(self, env, map_func): + gym.ObservationWrapper.__init__(self, env) + self._func = map_func + + def observation(self, obs): + return self._func(obs) + + +class FrameStack(gym.Wrapper): + def __init__(self, env, k): + """Buffer the last k observations and stack them along a new first axis.""" + gym.Wrapper.__init__(self, env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + chan = 1 if len(shp) == 2 else shp[2] + self.observation_space = spaces.Box(low=0, + high=255, + shape=(shp[0], shp[1], chan * k), + dtype=np.uint8) + + def reset(self): + """Clear the buffer, then re-fill it with zero frames plus the first observation.""" + ob = self.env.reset() + for _ in range(self.k - 1): + self.frames.append(np.zeros_like(ob)) + self.frames.append(ob) + return self.observation() + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return self.observation(), reward, done, info + + def observation(self): + assert len(self.frames) == self.k + return np.stack(self.frames, axis=0) + + +class _FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self): + self.env.reset() + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset() + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset() + return obs + + def step(self, action): + return self.env.step(action) + + +def FireResetEnv(env): + if isinstance(env, gym.Wrapper): + baseenv = env.unwrapped + else: + baseenv = env + if 'FIRE' in baseenv.get_action_meanings(): + return _FireResetEnv(env) + return env + + +class 
LimitLength(gym.Wrapper): + def __init__(self, env, k): + gym.Wrapper.__init__(self, env) + self.k = k + + def reset(self): + # This assumes that reset() will really reset the env. + # If the underlying env tries to be smart about reset + # (e.g. end-of-life), the assumption doesn't hold. + ob = self.env.reset() + self.cnt = 0 + return ob + + def step(self, action): + ob, r, done, info = self.env.step(action) + self.cnt += 1 + if self.cnt == self.k: + done = True + return ob, r, done, info diff --git a/fluid/DeepQNetwork/curve.png b/fluid/DeepQNetwork/curve.png deleted file mode 100644 index a283413797c96350f399ea0236750525d2dba1f3..0000000000000000000000000000000000000000 Binary files a/fluid/DeepQNetwork/curve.png and /dev/null differ diff --git a/fluid/DeepQNetwork/expreplay.py b/fluid/DeepQNetwork/expreplay.py index 06599226418ffa7ec04905e5f538d272ef986bf0..5f27ca7286b5db7ac963bc25236be416fad50eb0 100644 --- a/fluid/DeepQNetwork/expreplay.py +++ b/fluid/DeepQNetwork/expreplay.py @@ -1,50 +1,98 @@ -#-*- coding: utf-8 -*- -#File: expreplay.py +# -*- coding: utf-8 -*- -from collections import namedtuple import numpy as np +import copy +from collections import deque, namedtuple Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) class ReplayMemory(object): - def __init__(self, max_size, state_shape): + def __init__(self, max_size, state_shape, context_len): self.max_size = int(max_size) self.state_shape = state_shape + self.context_len = int(context_len) - self.state = np.zeros((self.max_size, ) + state_shape, dtype='float32') + self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') self._curr_size = 0 self._curr_pos = 0 + self._context = deque(maxlen=context_len - 1) def append(self, exp): + """append a new experience into replay memory + """ if self._curr_size < self.max_size: + self._assign(self._curr_pos, exp) + self._curr_size += 1 else: + self._assign(self._curr_pos, exp) + self._curr_pos = (self._curr_pos + 1) % self.max_size + if exp.isOver: + self._context.clear() + else: + self._context.append(exp) + + def recent_state(self): + """ return the most recent context frames, zero-padded at the front""" + lst = list(self._context) + states = [np.zeros(self.state_shape, dtype='uint8')] * \ + (self._context.maxlen - len(lst)) + states.extend([k.state for k in lst]) + return states + + def sample(self, idx): + """ return state, reward, action, isOver; + note that some frames in state may have been generated in the + previous episode; those frames are left as zeros below + """ + state = np.zeros( + (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) + state_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size + + # make sure that no frame in state comes from the previous episode + has_last_episode = False + for k in range(self.context_len - 2, -1, -1): + to_check_idx = state_idx[k] + if self.isOver[to_check_idx]: + has_last_episode = True + state_idx = state_idx[k + 1:] + state[k + 1:] = self.state[state_idx] + break + + if not has_last_episode: + state = self.state[state_idx] + + real_idx = (idx + self.context_len - 1) % self._curr_size + action = self.action[real_idx] + reward = self.reward[real_idx] + isOver = self.isOver[real_idx] + return state, reward, action, isOver + + def __len__(self): + return self._curr_size def _assign(self, pos, exp): self.state[pos] = exp.state - self.action[pos] = exp.action
self.reward[pos] = exp.reward + self.action[pos] = exp.action self.isOver[pos] = exp.isOver - def __len__(self): - return self._curr_size - - def sample(self, batch_idx): - # index mapping to avoid sampling lastest state + def sample_batch(self, batch_size): + """sample a batch from replay memory for training + """ + batch_idx = np.random.randint( + self._curr_size - self.context_len - 1, size=batch_size) batch_idx = (self._curr_pos + batch_idx) % self._curr_size - next_idx = (batch_idx + 1) % self._curr_size - - state = self.state[batch_idx] - reward = self.reward[batch_idx] - action = self.action[batch_idx] - next_state = self.state[next_idx] - isOver = self.isOver[batch_idx] - return (state, action, reward, next_state, isOver) + batch_exp = [self.sample(i) for i in batch_idx] + return self._process_batch(batch_exp) + + def _process_batch(self, batch_exp): + state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + reward = np.asarray([e[1] for e in batch_exp], dtype='float32') + action = np.asarray([e[2] for e in batch_exp], dtype='int8') + isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') + return [state, action, reward, isOver] diff --git a/fluid/DeepQNetwork/mountain_car.gif b/fluid/DeepQNetwork/mountain_car.gif deleted file mode 100644 index 5665d67d2cddbfb9c30dc588a085748e056bb16a..0000000000000000000000000000000000000000 Binary files a/fluid/DeepQNetwork/mountain_car.gif and /dev/null differ diff --git a/fluid/DeepQNetwork/play.py b/fluid/DeepQNetwork/play.py new file mode 100644 index 0000000000000000000000000000000000000000..2920391f105aeca1e99c347174464688edb47dae --- /dev/null +++ b/fluid/DeepQNetwork/play.py @@ -0,0 +1,65 @@ +#-*- coding: utf-8 -*- + +import argparse +import os +import numpy as np +import paddle.fluid as fluid + +from train import get_player +from tqdm import tqdm + + +def predict_action(exe, state, predict_program, feed_names, fetch_targets, + action_dim): + if np.random.randint(100) == 0: + act = np.random.randint(action_dim) + else: + state = np.expand_dims(state, axis=0) + pred_Q = exe.run(predict_program, + feed={feed_names[0]: state.astype('float32')}, + fetch_list=fetch_targets)[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + return act + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--use_cuda', action='store_true', help='if set, use cuda') + parser.add_argument('--rom', type=str, required=True, help='atari rom') + parser.add_argument( + '--model_path', type=str, required=True, help='dirname to load model') + parser.add_argument( + '--viz', + type=float, + default=0, + help='''viz: visualization setting: + Set to 0 to disable; + Set to a positive number to be the delay between frames to show. 
+ ''') + args = parser.parse_args() + + env = get_player(args.rom, viz=args.viz) + + place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + inference_scope = fluid.core.Scope() + with fluid.scope_guard(inference_scope): + [predict_program, feed_names, + fetch_targets] = fluid.io.load_inference_model(args.model_path, exe) + + episode_reward = [] + for _ in tqdm(xrange(30), desc='eval agent'): + state = env.reset() + total_reward = 0 + while True: + action = predict_action(exe, state, predict_program, feed_names, + fetch_targets, env.action_space.n) + state, reward, isOver, info = env.step(action) + total_reward += reward + if isOver: + break + episode_reward.append(total_reward) + eval_reward = np.mean(episode_reward) + print('Average reward of 30 episodes: {}'.format(eval_reward)) diff --git a/fluid/DeepQNetwork/rom_files/breakout.bin b/fluid/DeepQNetwork/rom_files/breakout.bin new file mode 100644 index 0000000000000000000000000000000000000000..abab5a8c0a1890461a11b78d4265f1b794327793 Binary files /dev/null and b/fluid/DeepQNetwork/rom_files/breakout.bin differ diff --git a/fluid/DeepQNetwork/rom_files/pong.bin b/fluid/DeepQNetwork/rom_files/pong.bin new file mode 100644 index 0000000000000000000000000000000000000000..14a5bdfc72548613c059938bdf712efdbb5d3806 Binary files /dev/null and b/fluid/DeepQNetwork/rom_files/pong.bin differ diff --git a/fluid/DeepQNetwork/train.py b/fluid/DeepQNetwork/train.py new file mode 100644 index 0000000000000000000000000000000000000000..6e75fe77bc53df24cab2f5bebad9f59ee88a8a3e --- /dev/null +++ b/fluid/DeepQNetwork/train.py @@ -0,0 +1,187 @@ +#-*- coding: utf-8 -*- + +from DQN_agent import DQNModel +from DoubleDQN_agent import DoubleDQNModel +from DuelingDQN_agent import DuelingDQNModel +from atari import AtariPlayer +import paddle.fluid as fluid +import gym +import argparse +import cv2 +from tqdm import tqdm +from expreplay import ReplayMemory, Experience +import numpy as np +import os + +from datetime import datetime +from atari_wrapper import FrameStack, MapState, FireResetEnv, LimitLength +from collections import deque + +#MEMORY_WARMUP_SIZE = 2000 +MEMORY_SIZE = 1e6 +MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20 +IMAGE_SIZE = (84, 84) +CONTEXT_LEN = 4 +ACTION_REPEAT = 4 # aka FRAME_SKIP +UPDATE_FREQ = 4 + + +def run_train_episode(agent, env, exp): + total_reward = 0 + state = env.reset() + step = 0 + while True: + step += 1 + context = exp.recent_state() + context.append(state) + context = np.stack(context, axis=0) + action = agent.act(context, train_or_test='train') + next_state, reward, isOver, _ = env.step(action) + exp.append(Experience(state, action, reward, isOver)) + # start training once the replay memory has warmed up + if len(exp) > MEMORY_WARMUP_SIZE: + if step % UPDATE_FREQ == 0: + batch_all_state, batch_action, batch_reward, batch_isOver = exp.sample_batch( + args.batch_size) + batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] + batch_next_state = batch_all_state[:, 1:, :, :] + agent.train(batch_state, batch_action, batch_reward, + batch_next_state, batch_isOver) + total_reward += reward + state = next_state + if isOver: + break + return total_reward, step + + +def get_player(rom, viz=False, train=False): + env = AtariPlayer( + rom, + frame_skip=ACTION_REPEAT, + viz=viz, + live_lost_as_eoe=train, + max_num_frames=60000) + env = FireResetEnv(env) + env = MapState(env, lambda im: cv2.resize(im, IMAGE_SIZE)) + if not train: + # in training, context is taken care of in expreplay buffer + env = 
FrameStack(env, CONTEXT_LEN) + return env + + +def eval_agent(agent, env): + episode_reward = [] + for _ in tqdm(xrange(30), desc='eval agent'): + state = env.reset() + total_reward = 0 + step = 0 + while True: + step += 1 + action = agent.act(state, train_or_test='test') + state, reward, isOver, info = env.step(action) + total_reward += reward + if isOver: + break + episode_reward.append(total_reward) + eval_reward = np.mean(episode_reward) + return eval_reward + + +def train_agent(): + env = get_player(args.rom, train=True) + test_env = get_player(args.rom) + exp = ReplayMemory(args.mem_size, IMAGE_SIZE, CONTEXT_LEN) + action_dim = env.action_space.n + + if args.alg == 'DQN': + agent = DQNModel(IMAGE_SIZE, action_dim, args.gamma, CONTEXT_LEN, + args.use_cuda) + elif args.alg == 'DoubleDQN': + agent = DoubleDQNModel(IMAGE_SIZE, action_dim, args.gamma, CONTEXT_LEN, + args.use_cuda) + elif args.alg == 'DuelingDQN': + agent = DuelingDQNModel(IMAGE_SIZE, action_dim, args.gamma, CONTEXT_LEN, + args.use_cuda) + else: + print('Unsupported algorithm: {}'.format(args.alg)) + return + + with tqdm(total=MEMORY_WARMUP_SIZE) as pbar: + while len(exp) < MEMORY_WARMUP_SIZE: + total_reward, step = run_train_episode(agent, env, exp) + pbar.update(step) + + # train + test_flag = 0 + save_flag = 0 + pbar = tqdm(total=1e8) + total_step = 0 + while True: + # start epoch + total_reward, step = run_train_episode(agent, env, exp) + total_step += step + pbar.set_description('[train]exploration:{}'.format(agent.exploration)) + pbar.update(step) + + if total_step // args.test_every_steps == test_flag: + pbar.write("testing") + eval_reward = eval_agent(agent, test_env) + test_flag += 1 + print("eval_agent done, (steps, eval_reward): ({}, {})".format( + total_step, eval_reward)) + + if total_step // args.save_every_steps == save_flag: + save_flag += 1 + save_path = os.path.join(args.model_dirname, '{}-{}'.format( + args.alg, os.path.basename(args.rom).split('.')[0]), + 'step{}'.format(total_step)) + fluid.io.save_inference_model(save_path, ['state'], + agent.pred_value, agent.exe, + agent.predict_program) + pbar.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--alg', + type=str, + default='DQN', + help='Reinforcement learning algorithm; supported: DQN, DoubleDQN, DuelingDQN' + ) + parser.add_argument( + '--use_cuda', action='store_true', help='if set, use cuda') + parser.add_argument( + '--gamma', + type=float, + default=0.99, + help='discount factor for accumulated reward computation') + parser.add_argument( + '--mem_size', + type=int, + default=1000000, + help='memory size for experience replay') + parser.add_argument( + '--batch_size', type=int, default=64, help='batch size for training') + parser.add_argument('--rom', help='atari rom', required=True) + parser.add_argument( + '--model_dirname', + type=str, + default='saved_model', + help='dirname to save model') + parser.add_argument( + '--save_every_steps', + type=int, + default=100000, + help='number of steps between model checkpoints') + parser.add_argument( + '--test_every_steps', + type=int, + default=100000, + help='number of steps between evaluation runs') + args = parser.parse_args() + train_agent() diff --git a/fluid/DeepQNetwork/utils.py b/fluid/DeepQNetwork/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..26ed7fbdb54494c3cf9a983f8ecafdfbcd4d2719 --- /dev/null +++ b/fluid/DeepQNetwork/utils.py @@ -0,0 +1,20 @@ +#-*- coding: utf-8 -*- +#File: utils.py + +import paddle.fluid as fluid
+import numpy as np + + +def fluid_argmax(x): + """ + Get index of max value for the last dimension + """ + _, max_index = fluid.layers.topk(x, k=1) + return max_index + + +def fluid_flatten(x): + """ + Flatten a fluid variable into [batch, -1], keeping the first dimension + """ + return fluid.layers.reshape(x, shape=[-1, np.prod(x.shape[1:])]) diff --git a/fluid/face_detection/.gitignore b/fluid/face_detection/.gitignore index 27735faca6e555e439300fca5dccd893f70ef9a0..13d42af893162c1908a39fea1d072a22929e5430 100644 --- a/fluid/face_detection/.gitignore +++ b/fluid/face_detection/.gitignore @@ -1,5 +1,7 @@ model/ +pretrained/ data/ label/ -pretrained/ *.swp +*.log +infer_results/ diff --git a/fluid/face_detection/infer.py b/fluid/face_detection/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..71a878cb39f9888e3c308ee24e34dd6c3a073d33 --- /dev/null +++ b/fluid/face_detection/infer.py @@ -0,0 +1,303 @@ +import os +import time +import numpy as np +import argparse +import functools +from PIL import Image +from PIL import ImageDraw + +import paddle +import paddle.fluid as fluid +import reader +from pyramidbox import PyramidBox +from utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('use_gpu', bool, True, "Whether use GPU.") +add_arg('use_pyramidbox', bool, False, "Whether use PyramidBox model.") +add_arg('confs_threshold', float, 0.25, "Confidence threshold to draw bbox.") +add_arg('image_path', str, '', "The image path.") +add_arg('model_dir', str, '', "The model path.") +# yapf: enable + + +def draw_bounding_box_on_image(image_path, nms_out, confs_threshold): + image = Image.open(image_path) + draw = ImageDraw.Draw(image) + for dt in nms_out: + xmin, ymin, xmax, ymax, score = dt + if score < confs_threshold: + continue + (left, right, top, bottom) = (xmin, xmax, ymin, ymax) + draw.line( + [(left, top), (left, bottom), (right, bottom), (right, top), + (left, top)], + width=4, + fill='red') + image_name = image_path.split('/')[-1] + image_class = image_path.split('/')[-2] + print("image with bbox drawn saved as {}".format(image_name)) + image.save('./infer_results/' + image_class.encode('utf-8') + '/' + + image_name.encode('utf-8')) + + +def write_to_txt(image_path, f, nms_out): + image_name = image_path.split('/')[-1] + image_class = image_path.split('/')[-2] + f.write('{:s}\n'.format( + image_class.encode('utf-8') + '/' + image_name.encode('utf-8'))) + f.write('{:d}\n'.format(nms_out.shape[0])) + for dt in nms_out: + xmin, ymin, xmax, ymax, score = dt + f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, ( + xmax - xmin + 1), (ymax - ymin + 1), score)) + print("image infer result saved as {}".format(image_name[:-4])) + + +def get_round(x, loc): + str_x = str(x) + if '.' in str_x: + len_after = len(str_x.split('.')[1]) + str_before = str_x.split('.')[0] + str_after = str_x.split('.')[1] + if len_after >= 3: + str_final = str_before + '.'
+ str_after[0:loc] + return float(str_final) + else: + return x + + +def bbox_vote(det): + order = det[:, 4].ravel().argsort()[::-1] + det = det[order, :] + if det.shape[0] == 0: + dets = np.array([[10, 10, 20, 20, 0.002]]) + det = np.empty(shape=[0, 5]) + while det.shape[0] > 0: + # IOU + area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1) + xx1 = np.maximum(det[0, 0], det[:, 0]) + yy1 = np.maximum(det[0, 1], det[:, 1]) + xx2 = np.minimum(det[0, 2], det[:, 2]) + yy2 = np.minimum(det[0, 3], det[:, 3]) + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + o = inter / (area[0] + area[:] - inter) + + # collect the detections to merge and remove them from det + merge_index = np.where(o >= 0.3)[0] + det_accu = det[merge_index, :] + det = np.delete(det, merge_index, 0) + if merge_index.shape[0] <= 1: + if det.shape[0] == 0: + try: + dets = np.row_stack((dets, det_accu)) + except: + dets = det_accu + continue + det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4)) + max_score = np.max(det_accu[:, 4]) + det_accu_sum = np.zeros((1, 5)) + det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], + axis=0) / np.sum(det_accu[:, -1:]) + det_accu_sum[:, 4] = max_score + try: + dets = np.row_stack((dets, det_accu_sum)) + except: + dets = det_accu_sum + dets = dets[0:750, :] + return dets + + +def image_preprocess(image): + img = np.array(image) + # HWC to CHW + if len(img.shape) == 3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # RGB to BGR + img = img[[2, 1, 0], :, :] + img = img.astype('float32') + img -= np.array( + [104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + img = img * 0.007843 + img = [img] + img = np.array(img) + return img + + +def detect_face(image, shrink): + image_shape = [3, image.size[1], image.size[0]] + num_classes = 2 + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + if shrink != 1: + image = image.resize((int(image_shape[2] * shrink), + int(image_shape[1] * shrink)), Image.ANTIALIAS) + image_shape = [ + image_shape[0], int(image_shape[1] * shrink), + int(image_shape[2] * shrink) + ] + print "image_shape:", image_shape + img = image_preprocess(image) + + scope = fluid.core.Scope() + main_program = fluid.Program() + startup_program = fluid.Program() + + with fluid.scope_guard(scope): + with fluid.unique_name.guard(): + with fluid.program_guard(main_program, startup_program): + fetches = [] + network = PyramidBox( + image_shape, + num_classes, + sub_network=args.use_pyramidbox, + is_infer=True) + infer_program, nmsed_out = network.infer(main_program) + fetches = [nmsed_out] + fluid.io.load_persistables( + exe, args.model_dir, main_program=main_program) + + detection, = exe.run(infer_program, + feed={'image': img}, + fetch_list=fetches, + return_numpy=False) + detection = np.array(detection) + # layout: xmin, ymin, xmax,
ymax, score + det_conf = detection[:, 1] + det_xmin = image_shape[2] * detection[:, 2] / shrink + det_ymin = image_shape[1] * detection[:, 3] / shrink + det_xmax = image_shape[2] * detection[:, 4] / shrink + det_ymax = image_shape[1] * detection[:, 5] / shrink + + det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf)) + keep_index = np.where(det[:, 4] >= 0)[0] + det = det[keep_index, :] + return det + + +def flip_test(image, shrink): + img = image.transpose(Image.FLIP_LEFT_RIGHT) + det_f = detect_face(img, shrink) + det_t = np.zeros(det_f.shape) + # image.size: [width, height] + det_t[:, 0] = image.size[0] - det_f[:, 2] + det_t[:, 1] = det_f[:, 1] + det_t[:, 2] = image.size[0] - det_f[:, 0] + det_t[:, 3] = det_f[:, 3] + det_t[:, 4] = det_f[:, 4] + return det_t + + +def multi_scale_test(image, max_shrink): + # shrunk detection: a shrunk image is only used to detect big faces + st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink + det_s = detect_face(image, st) + index = np.where( + np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1) + > 30)[0] + det_s = det_s[index, :] + # enlarge once + bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2 + det_b = detect_face(image, bt) + + # enlarge the image several times to detect small faces + if max_shrink > 2: + bt *= 2 + while bt < max_shrink: + det_b = np.row_stack((det_b, detect_face(image, bt))) + bt *= 2 + det_b = np.row_stack((det_b, detect_face(image, max_shrink))) + + # when enlarging, keep only small faces (big ones come from the shrunk pass) + if bt > 1: + index = np.where( + np.minimum(det_b[:, 2] - det_b[:, 0] + 1, + det_b[:, 3] - det_b[:, 1] + 1) < 100)[0] + det_b = det_b[index, :] + else: + index = np.where( + np.maximum(det_b[:, 2] - det_b[:, 0] + 1, + det_b[:, 3] - det_b[:, 1] + 1) > 30)[0] + det_b = det_b[index, :] + return det_s, det_b + + +def get_im_shrink(image_shape): + max_shrink_v1 = (0x7fffffff / 577.0 / + (image_shape[1] * image_shape[2]))**0.5 + max_shrink_v2 = ( + (678 * 1024 * 2.0 * 2.0) / (image_shape[1] * image_shape[2]))**0.5 + max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3 + + if max_shrink >= 1.5 and max_shrink < 2: + max_shrink = max_shrink - 0.1 + elif max_shrink >= 2 and max_shrink < 3: + max_shrink = max_shrink - 0.2 + elif max_shrink >= 3 and max_shrink < 4: + max_shrink = max_shrink - 0.3 + elif max_shrink >= 4 and max_shrink < 5: + max_shrink = max_shrink - 0.4 + elif max_shrink >= 5: + max_shrink = max_shrink - 0.5 + + print 'max_shrink = ', max_shrink + shrink = max_shrink if max_shrink < 1 else 1 + print "shrink = ", shrink + + return shrink, max_shrink + + +def infer(args, batch_size, data_args): + if not os.path.exists(args.model_dir): + raise ValueError("The model path [%s] does not exist."
% + (args.model_dir)) + + infer_reader = paddle.batch( + reader.test(data_args, file_list), batch_size=batch_size) + + for batch_id, img in enumerate(infer_reader()): + image = img[0][0] + image_path = img[0][1] + + # image.size: [width, height] + image_shape = [3, image.size[1], image.size[0]] + + shrink, max_shrink = get_im_shrink(image_shape) + + det0 = detect_face(image, shrink) + det1 = flip_test(image, shrink) + [det2, det3] = multi_scale_test(image, max_shrink) + det = np.row_stack((det0, det1, det2, det3)) + dets = bbox_vote(det) + + image_name = image_path.split('/')[-1] + image_class = image_path.split('/')[-2] + if not os.path.exists('./infer_results/' + image_class.encode('utf-8')): + os.makedirs('./infer_results/' + image_class.encode('utf-8')) + + f = open('./infer_results/' + image_class.encode('utf-8') + '/' + + image_name.encode('utf-8')[:-4] + '.txt', 'w') + write_to_txt(image_path, f, dets) + # draw_bounding_box_on_image(image_path, dets, args.confs_threshold) + print "Done" + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + + data_dir = 'data/WIDERFACE/WIDER_val/images/' + file_list = 'label/val_gt_widerface.res' + + data_args = reader.Settings( + data_dir=data_dir, + mean_value=[104., 117., 123], + apply_distort=False, + apply_expand=False, + ap_version='11point') + infer(args, batch_size=1, data_args=data_args) diff --git a/fluid/face_detection/pyramidbox.py b/fluid/face_detection/pyramidbox.py index 6e38dabb3570646d35c8b41e39c39d17a7b3f190..ce01cb7a113219e08d4deb2984d2a12b2590faa5 100644 --- a/fluid/face_detection/pyramidbox.py +++ b/fluid/face_detection/pyramidbox.py @@ -4,6 +4,7 @@ import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr from paddle.fluid.initializer import Xavier from paddle.fluid.initializer import Constant +from paddle.fluid.initializer import Bilinear from paddle.fluid.regularizer import L2Decay @@ -38,17 +39,31 @@ def conv_block(input, groups, filters, ksizes, strides=None, with_pool=True): act='relu') if with_pool: pool = fluid.layers.pool2d( - input=conv, pool_size=2, pool_type='max', pool_stride=2) + input=conv, + pool_size=2, + pool_type='max', + pool_stride=2, + ceil_mode=True) return conv, pool else: return conv class PyramidBox(object): - def __init__(self, data_shape, is_infer=False, sub_network=False): + def __init__(self, + data_shape, + num_classes, + use_transposed_conv2d=True, + is_infer=False, + sub_network=False): + """ + TODO(qingqing): add comments. + """ self.data_shape = data_shape self.min_sizes = [16., 32., 64., 128., 256., 512.] self.steps = [4., 8., 16., 32., 64., 128.] 
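        # A quick sanity check on this anchor layout (a sketch assuming a
        # 640x640 training input, which this diff does not pin down): branch i
        # places one square prior of size min_sizes[i] at every cell of a
        # feature map with stride steps[i], so the per-branch prior counts are
        #
        #   [(640 // int(s)) ** 2 for s in [4., 8., 16., 32., 64., 128.]]
        #   == [25600, 6400, 1600, 400, 100, 25]   # 34125 priors in total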
+ self.num_classes = num_classes + self.use_transposed_conv2d = use_transposed_conv2d self.is_infer = is_infer self.sub_network = sub_network @@ -59,6 +74,8 @@ class PyramidBox(object): self._low_level_fpn() self._cpm_module() self._pyramidbox() + else: + self._vgg_ssd() def feeds(self): if self.is_infer: @@ -113,20 +130,32 @@ class PyramidBox(object): b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) conv1 = fluid.layers.conv2d( up_from, ch, 1, act='relu', bias_attr=b_attr) - conv_trans = fluid.layers.conv2d_transpose( - conv1, - ch, - output_size=None, - filter_size=4, - padding=1, - stride=2, - groups=ch, - bias_attr=False) + if self.use_transposed_conv2d: + w_attr = ParamAttr( + learning_rate=0., + regularizer=L2Decay(0.), + initializer=Bilinear()) + upsampling = fluid.layers.conv2d_transpose( + conv1, + ch, + output_size=None, + filter_size=4, + padding=1, + stride=2, + groups=ch, + param_attr=w_attr, + bias_attr=False) + else: + upsampling = fluid.layers.resize_bilinear( + conv1, out_shape=up_to.shape[2:]) + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) conv2 = fluid.layers.conv2d( up_to, ch, 1, act='relu', bias_attr=b_attr) + if self.is_infer: + upsampling = fluid.layers.crop(upsampling, shape=conv2) # eltwise mul - conv_fuse = conv_trans * conv2 + conv_fuse = upsampling * conv2 return conv_fuse self.lfpn2_on_conv5 = fpn(self.conv6, self.conv5) @@ -188,9 +217,10 @@ class PyramidBox(object): """ Get prior-boxes and pyramid-box """ - self.ssh_conv3_norm = self._l2_norm_scale(self.ssh_conv3) - self.ssh_conv4_norm = self._l2_norm_scale(self.ssh_conv4) - self.ssh_conv5_norm = self._l2_norm_scale(self.ssh_conv5) + self.ssh_conv3_norm = self._l2_norm_scale( + self.ssh_conv3, init_scale=10.) + self.ssh_conv4_norm = self._l2_norm_scale(self.ssh_conv4, init_scale=8.) + self.ssh_conv5_norm = self._l2_norm_scale(self.ssh_conv5, init_scale=5.) 
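        # Hedged sketch of what _l2_norm_scale is expected to compute (its
        # body is not part of this hunk): normalize each spatial position's
        # feature vector to unit L2 norm across channels, then apply a
        # learnable per-channel scale initialized to init_scale -- the
        # ParseNet trick SSD-style detectors use on shallow maps whose
        # activations dwarf the deeper ones. In NumPy terms, for x of shape
        # (N, C, H, W):
        #
        #   norm = np.sqrt((x * x).sum(axis=1, keepdims=True)) + 1e-10
        #   out = init_scale * x / norm   # the scale is then trained per channel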
def permute_and_reshape(input, last_dim): trans = fluid.layers.transpose(input, perm=[0, 2, 3, 1]) @@ -234,9 +264,11 @@ class PyramidBox(object): box, var = fluid.layers.prior_box( input, self.image, - min_sizes=[self.min_sizes[1]], + min_sizes=[self.min_sizes[i]], steps=[self.steps[i]] * 2, aspect_ratios=[1.], + clip=False, + flip=True, offset=0.5) box = fluid.layers.reshape(box, shape=[-1, 4]) var = fluid.layers.reshape(var, shape=[-1, 4]) @@ -253,52 +285,125 @@ class PyramidBox(object): self.prior_boxes = fluid.layers.concat(boxes) self.box_vars = fluid.layers.concat(vars) - def vgg_ssd(self, num_classes, image_shape): - self.conv3_norm = self._l2_norm_scale(self.conv3) - self.conv4_norm = self._l2_norm_scale(self.conv4) - self.conv5_norm = self._l2_norm_scale(self.conv5) - - mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head( - inputs=[ - self.conv3_norm, self.conv4_norm, self.conv5_norm, self.conv6, - self.conv7, self.conv8 - ], - image=self.image, - num_classes=num_classes, - # min_ratio=20, - # max_ratio=90, - min_sizes=[16.0, 32.0, 64.0, 128.0, 256.0, 512.0], - max_sizes=[[], [], [], [], [], []], - # max_sizes=[[], 150.0, 195.0, 240.0, 285.0, 300.0], - aspect_ratios=[[1.], [1.], [1.], [1.], [1.], [1.]], - steps=[4.0, 8.0, 16.0, 32.0, 64.0, 128.0], - base_size=image_shape[2], - offset=0.5, - flip=False) - - # locs, confs, box, box_var = vgg_extra_net(num_classes, image, image_shape) - # nmsed_out = fluid.layers.detection_output( - # locs, confs, box, box_var, nms_threshold=args.nms_threshold) - loss = fluid.layers.ssd_loss(mbox_locs, mbox_confs, self.face_box, - self.gt_label, box, box_var) - loss = fluid.layers.reduce_sum(loss) + def _vgg_ssd(self): + self.conv3_norm = self._l2_norm_scale(self.conv3, init_scale=10.) + self.conv4_norm = self._l2_norm_scale(self.conv4, init_scale=8.) + self.conv5_norm = self._l2_norm_scale(self.conv5, init_scale=5.) + + def permute_and_reshape(input, last_dim): + trans = fluid.layers.transpose(input, perm=[0, 2, 3, 1]) + new_shape = [ + trans.shape[0], np.prod(trans.shape[1:]) / last_dim, last_dim + ] + return fluid.layers.reshape(trans, shape=new_shape) + + locs, confs = [], [] + boxes, vars = [], [] + b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.)) + + # conv3 + mbox_loc = fluid.layers.conv2d( + self.conv3_norm, 4, 3, 1, 1, bias_attr=b_attr) + loc = permute_and_reshape(mbox_loc, 4) + mbox_conf = fluid.layers.conv2d( + self.conv3_norm, 4, 3, 1, 1, bias_attr=b_attr) + conf1, conf3 = fluid.layers.split( + mbox_conf, num_or_sections=[1, 3], dim=1) + conf3_maxin = fluid.layers.reduce_max(conf3, dim=1, keep_dim=True) + conf = fluid.layers.concat([conf1, conf3_maxin], axis=1) + conf = permute_and_reshape(conf, 2) + box, var = fluid.layers.prior_box( + self.conv3_norm, + self.image, + min_sizes=[16.], + steps=[4, 4], + aspect_ratios=[1.], + clip=False, + flip=True, + offset=0.5) + box = fluid.layers.reshape(box, shape=[-1, 4]) + var = fluid.layers.reshape(var, shape=[-1, 4]) + + locs.append(loc) + confs.append(conf) + boxes.append(box) + vars.append(var) + + min_sizes = [32., 64., 128., 256., 512.] + steps = [8., 16., 32., 64., 128.] 
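        # Note on the conv3 branch above: it predicts 4 class scores, splits
        # them 1 + 3, and collapses the group of 3 with reduce_max before the
        # two-class concat -- the max-out labeling from S3FD/PyramidBox, used
        # to stabilize classification on the densest (stride-4) anchors. A
        # NumPy illustration of the collapse:
        #
        #   conf = np.concatenate(
        #       [conf1, conf3.max(axis=1, keepdims=True)], axis=1)  # -> 2 ch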
+ inputs = [ + self.conv4_norm, self.conv5_norm, self.conv6, self.conv7, self.conv8 + ] + for i, input in enumerate(inputs): + mbox_loc = fluid.layers.conv2d(input, 4, 3, 1, 1, bias_attr=b_attr) + loc = permute_and_reshape(mbox_loc, 4) + mbox_conf = fluid.layers.conv2d(input, 2, 3, 1, 1, bias_attr=b_attr) + conf = permute_and_reshape(mbox_conf, 2) + box, var = fluid.layers.prior_box( + input, + self.image, + min_sizes=[min_sizes[i]], + steps=[steps[i]] * 2, + aspect_ratios=[1.], + clip=False, + flip=True, + offset=0.5) + box = fluid.layers.reshape(box, shape=[-1, 4]) + var = fluid.layers.reshape(var, shape=[-1, 4]) + + locs.append(loc) + confs.append(conf) + boxes.append(box) + vars.append(var) + + self.face_mbox_loc = fluid.layers.concat(locs, axis=1) + self.face_mbox_conf = fluid.layers.concat(confs, axis=1) + self.prior_boxes = fluid.layers.concat(boxes) + self.box_vars = fluid.layers.concat(vars) + + def vgg_ssd_loss(self): + loss = fluid.layers.ssd_loss( + self.face_mbox_loc, + self.face_mbox_conf, + self.face_box, + self.gt_label, + self.prior_boxes, + self.box_vars, + overlap_threshold=0.35, + neg_overlap=0.35) + loss = fluid.layers.reduce_sum(loss) return loss def train(self): face_loss = fluid.layers.ssd_loss( - self.face_mbox_loc, self.face_mbox_conf, self.face_box, - self.gt_label, self.prior_boxes, self.box_vars) + self.face_mbox_loc, + self.face_mbox_conf, + self.face_box, + self.gt_label, + self.prior_boxes, + self.box_vars, + overlap_threshold=0.35, + neg_overlap=0.35) head_loss = fluid.layers.ssd_loss( - self.head_mbox_loc, self.head_mbox_conf, self.head_box, - self.gt_label, self.prior_boxes, self.box_vars) + self.head_mbox_loc, + self.head_mbox_conf, + self.head_box, + self.gt_label, + self.prior_boxes, + self.box_vars, + overlap_threshold=0.35, + neg_overlap=0.35) face_loss = fluid.layers.reduce_sum(face_loss) head_loss = fluid.layers.reduce_sum(head_loss) total_loss = face_loss + head_loss return face_loss, head_loss, total_loss - def test(self): - test_program = fluid.default_main_program().clone(for_test=True) + def infer(self, main_program=None): + if main_program is None: + test_program = fluid.default_main_program().clone(for_test=True) + else: + test_program = main_program.clone(for_test=True) with fluid.program_guard(test_program): face_nmsed_out = fluid.layers.detection_output( self.face_mbox_loc, @@ -306,24 +411,4 @@ class PyramidBox(object): self.prior_boxes, self.box_vars, nms_threshold=0.45) - head_nmsed_out = fluid.layers.detection_output( - self.head_mbox_loc, - self.head_mbox_conf, - self.prior_boxes, - self.box_vars, - nms_threshold=0.45) - face_map_eval = fluid.evaluator.DetectionMAP( - face_nmsed_out, - self.gt_label, - self.face_box, - class_num=2, - overlap_threshold=0.5, - ap_version='11point') - head_map_eval = fluid.evaluator.DetectionMAP( - head_nmsed_out, - self.gt_label, - self.head_box, - class_num=2, - overlap_threshold=0.5, - ap_version='11point') - return test_program, face_map_eval, head_map_eval + return test_program, face_nmsed_out diff --git a/fluid/face_detection/reader.py b/fluid/face_detection/reader.py index f41f6cfc3cc1d7df2a05aa4bab1bad817c5f7889..42109b1194cad071c6571ffa1eb590526a688033 100644 --- a/fluid/face_detection/reader.py +++ b/fluid/face_detection/reader.py @@ -238,37 +238,71 @@ def pyramidbox(settings, file_list, mode, shuffle): im_width, im_height = im.size # layout: label | xmin | ymin | xmax | ymax - bbox_labels = [] - for index_box in range(len(dict_input_txt[index_image])): - if index_box >= 2: - bbox_sample = [] - 
temp_info_box = dict_input_txt[index_image][ - index_box].split(' ') - xmin = float(temp_info_box[0]) - ymin = float(temp_info_box[1]) - w = float(temp_info_box[2]) - h = float(temp_info_box[3]) - xmax = xmin + w - ymax = ymin + h - - bbox_sample.append(1) - bbox_sample.append(float(xmin) / im_width) - bbox_sample.append(float(ymin) / im_height) - bbox_sample.append(float(xmax) / im_width) - bbox_sample.append(float(ymax) / im_height) - bbox_labels.append(bbox_sample) - - im, sample_labels = preprocess(im, bbox_labels, mode, settings) - sample_labels = np.array(sample_labels) - if len(sample_labels) == 0: continue - im = im.astype('float32') - boxes = sample_labels[:, 1:5] - lbls = [1] * len(boxes) - difficults = [1] * len(boxes) - yield im, boxes, expand_bboxes(boxes), lbls, difficults + if mode == 'train': + bbox_labels = [] + for index_box in range(len(dict_input_txt[index_image])): + if index_box >= 2: + bbox_sample = [] + temp_info_box = dict_input_txt[index_image][ + index_box].split(' ') + xmin = float(temp_info_box[0]) + ymin = float(temp_info_box[1]) + w = float(temp_info_box[2]) + h = float(temp_info_box[3]) + xmax = xmin + w + ymax = ymin + h + + bbox_sample.append(1) + bbox_sample.append(float(xmin) / im_width) + bbox_sample.append(float(ymin) / im_height) + bbox_sample.append(float(xmax) / im_width) + bbox_sample.append(float(ymax) / im_height) + bbox_labels.append(bbox_sample) + + im, sample_labels = preprocess(im, bbox_labels, mode, settings) + sample_labels = np.array(sample_labels) + if len(sample_labels) == 0: continue + im = im.astype('float32') + boxes = sample_labels[:, 1:5] + lbls = [1] * len(boxes) + difficults = [1] * len(boxes) + yield im, boxes, expand_bboxes(boxes), lbls, difficults + + if mode == 'test': + yield im, image_path return reader def train(settings, file_list, shuffle=True): return pyramidbox(settings, file_list, 'train', shuffle) + + +def test(settings, file_list): + return pyramidbox(settings, file_list, 'test', False) + + +def infer(settings, image_path): + def batch_reader(): + img = Image.open(image_path) + if img.mode == 'L': + img = im.convert('RGB') + im_width, im_height = img.size + if settings.resize_w and settings.resize_h: + img = img.resize((settings.resize_w, settings.resize_h), + Image.ANTIALIAS) + img = np.array(img) + # HWC to CHW + if len(img.shape) == 3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # RBG to BGR + img = img[[2, 1, 0], :, :] + img = img.astype('float32') + img -= settings.img_mean + img = img * 0.007843 + img = [img] + img = np.array(img) + return img + + return batch_reader diff --git a/fluid/face_detection/train.py b/fluid/face_detection/train.py index e0643cfe494a9a40c7f4ae8c17506572cb0df89c..c10722b9e33d6c9d05f961d3b2cf73a859b9da3c 100644 --- a/fluid/face_detection/train.py +++ b/fluid/face_detection/train.py @@ -16,11 +16,11 @@ add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('parallel', bool, True, "parallel") -add_arg('learning_rate', float, 0.0001, "Learning rate.") -add_arg('batch_size', int, 16, "Minibatch size.") +add_arg('learning_rate', float, 0.001, "Learning rate.") +add_arg('batch_size', int, 12, "Minibatch size.") add_arg('num_passes', int, 120, "Epoch number.") add_arg('use_gpu', bool, True, "Whether use GPU.") -add_arg('use_pyramidbox', bool, False, "Whether use PyramidBox model.") +add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.") add_arg('dataset', str, 'WIDERFACE', "coco2014, coco2017, and pascalvoc.") 
add_arg('model_save_dir', str, 'model', "The path to save model.") add_arg('pretrained_model', str, './pretrained/', "The init model path.") @@ -40,20 +40,20 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model, image_shape = [3, data_args.resize_h, data_args.resize_w] fetches = [] + network = PyramidBox(image_shape, num_classes, + sub_network=args.use_pyramidbox) if args.use_pyramidbox: - network = PyramidBox(image_shape, sub_network=args.use_pyramidbox) face_loss, head_loss, loss = network.train() fetches = [face_loss, head_loss] else: - network = PyramidBox(image_shape, sub_network=args.use_pyramidbox) - loss = network.vgg_ssd(num_classes, image_shape) + loss = network.vgg_ssd_loss() fetches = [loss] epocs = 12880 / batch_size - boundaries = [epocs * 100, epocs * 125, epocs * 150] + boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100] values = [ - learning_rate, learning_rate * 0.1, learning_rate * 0.01, - learning_rate * 0.001 + learning_rate, learning_rate * 0.5, learning_rate * 0.25, + learning_rate * 0.1, learning_rate * 0.01 ] if optimizer_method == "momentum": @@ -70,12 +70,19 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model, ) optimizer.minimize(loss) + # fluid.memory_optimize(fluid.default_main_program()) place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) + start_pass = 0 if pretrained_model: + if pretrained_model.isdigit(): + start_pass = int(pretrained_model) + 1 + pretrained_model = os.path.join(args.model_save_dir, pretrained_model) + print("Resume from %s " %(pretrained_model)) + if not os.path.exists(pretrained_model): raise ValueError("The pre-trained model path [%s] does not exist." % (pretrained_model)) @@ -98,14 +105,14 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model, print 'save models to %s' % (model_path) fluid.io.save_persistables(exe, model_path) - for pass_id in range(num_passes): + for pass_id in range(start_pass, num_passes): start_time = time.time() prev_start_time = start_time end_time = 0 for batch_id, data in enumerate(train_reader()): prev_start_time = start_time start_time = time.time() - if len(data) < devices_num: continue + if len(data) < 2 * devices_num: continue if args.parallel: fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches], feed=feeder.feed(data)) @@ -126,7 +133,7 @@ def train(args, data_args, learning_rate, batch_size, pretrained_model, batch_id, fetch_vars[0], fetch_vars[1], start_time - prev_start_time)) - if pass_id % 10 == 0 or pass_id == num_passes - 1: + if pass_id % 1 == 0 or pass_id == num_passes - 1: save_model(str(pass_id)) diff --git a/fluid/image_classification/README.md b/fluid/image_classification/README.md index 1000717c87fb60763cdfad78967ad6ec867f5120..b8cd82a68acf4abfd868f70042c7a66facfd5030 100644 --- a/fluid/image_classification/README.md +++ b/fluid/image_classification/README.md @@ -1,38 +1,37 @@ -The minimum PaddlePaddle version needed for the code sample in this directory is the lastest develop branch. If you are on a version of PaddlePaddle earlier than this, [please update your installation](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html). +# Image Classification and Model Zoo +Image classification, which is an important field of computer vision, is to classify an image into pre-defined labels. 
Recently, many researchers developed different kinds of neural networks and highly improve the classification performance. This page introduces how to do image classification with PaddlePaddle Fluid, including [data preparation](#data-preparation), [training](#training-a-model), [finetuning](#finetuning), [evaluation](#evaluation) and [inference](#inference). --- +## Table of Contents +- [Installation](#installation) +- [Data preparation](#data-preparation) +- [Training a model with flexible parameters](#training-a-model) +- [Finetuning](#finetuning) +- [Evaluation](#evaluation) +- [Inference](#inference) +- [Supported models and performances](#supported-models) -# SE-ResNeXt for image classification +## Installation -This model built with paddle fluid is still under active development and is not -the final version. We welcome feedbacks. +Running sample code in this directory requires PaddelPaddle Fluid v0.13.0 and later. If the PaddlePaddle on your device is lower than this version, please follow the instructions in [installation document](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html) and make an update. -## Introduction +## Data preparation -The current code support the training of [SE-ResNeXt](https://arxiv.org/abs/1709.01507) (50/152 layers). +An example for ImageNet classification is as follows. First of all, preparation of imagenet data can be done as: +``` +cd data/ILSVRC2012/ +sh download_imagenet2012.sh +``` -## Data Preparation +In the shell script ```download_imagenet2012.sh```, there are three steps to prepare data: -1. Download ImageNet-2012 dataset -``` -cd data/ -mkdir -p ILSVRC2012/ -cd ILSVRC2012/ -# get training set -wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar -# get validation set -wget http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar -# prepare directory -tar xf ILSVRC2012_img_train.tar -tar xf ILSVRC2012_img_val.tar +**step-1:** Register at ```image-net.org``` first in order to get a pair of ```Username``` and ```AccessKey```, which are used to download ImageNet data. -# unzip all classes data using unzip.sh -sh unzip.sh -``` +**step-2:** Download ImageNet-2012 dataset from website. The training and validation data will be downloaded into folder "train" and "val" respectively. Please note that the size of data is more than 40 GB, it will take much time to download. Users who have downloaded the ImageNet data can organize it into ```data/ILSVRC2012``` directly. -2. Download training and validation label files from [ImageNet2012 url](https://pan.baidu.com/s/1Y6BCo0nmxsm_FsEqmx2hKQ)(password:```wx99```). Untar it into workspace ```ILSVRC2012/```. The files include +**step-3:** Download training and validation label files. There are two label files which contain train and validation image labels respectively: -**train_list.txt**: training list of imagenet 2012 classification task, with each line seperated by SPACE. +* *train_list.txt*: label file of imagenet-2012 training set, with each line seperated by ```SPACE```, like: ``` train/n02483708/n02483708_2436.jpeg 369 train/n03998194/n03998194_7015.jpeg 741 @@ -41,7 +40,7 @@ train/n04596742/n04596742_3032.jpeg 909 train/n03208938/n03208938_7065.jpeg 535 ... ``` -**val_list.txt**: validation list of imagenet 2012 classification task, with each line seperated by SPACE. +* *val_list.txt*: label file of imagenet-2012 validation set, with each line seperated by ```SPACE```, like. 
``` val/ILSVRC2012_val_00000001.jpeg 65 val/ILSVRC2012_val_00000002.jpeg 970 @@ -50,38 +49,160 @@ val/ILSVRC2012_val_00000004.jpeg 809 val/ILSVRC2012_val_00000005.jpeg 516 ... ``` -**synset_words.txt**: the semantic label of each class. -## Training a model +## Training a model with flexible parameters -To start a training task, one can use command line as: +After data preparation, one can start the training step by: ``` -python train.py --num_layers=50 --batch_size=8 --with_mem_opt=True --parallel_exe=False +python train.py \ + --model=SE_ResNeXt50_32x4d \ + --batch_size=32 \ + --total_images=1281167 \ + --class_dim=1000 + --image_shape=3,224,224 \ + --model_save_dir=output/ \ + --with_mem_opt=False \ + --lr_strategy=piecewise_decay \ + --lr=0.1 ``` -## Finetune a model +**parameter introduction:** +* **model**: name model to use. Default: "SE_ResNeXt50_32x4d". +* **num_epochs**: the number of epochs. Default: 120. +* **batch_size**: the size of each mini-batch. Default: 256. +* **use_gpu**: whether to use GPU or not. Default: True. +* **total_images**: total number of images in the training set. Default: 1281167. +* **class_dim**: the class number of the classification task. Default: 1000. +* **image_shape**: input size of the network. Default: "3,224,224". +* **model_save_dir**: the directory to save trained model. Default: "output". +* **with_mem_opt**: whether to use memory optimization or not. Default: False. +* **lr_strategy**: learning rate changing strategy. Default: "piecewise_decay". +* **lr**: initialized learning rate. Default: 0.1. +* **pretrained_model**: model path for pretraining. Default: None. +* **checkpoint**: the checkpoint path to resume. Default: None. + +**data reader introduction:** Data reader is defined in ```reader.py```. In [training stage](#training-a-model), random crop and flipping are used, while center crop is used in [evaluation](#inference) and [inference](#inference) stages. Supported data augmentation includes: +* rotation +* color jitter +* random crop +* center crop +* resize +* flipping + +**training curve:** The training curve can be drawn based on training log. 
For example, the log from training AlexNet is like: ``` -python train.py --num_layers=50 --batch_size=8 --with_mem_opt=True --parallel_exe=False --pretrained_model="pretrain/96/" +End pass 1, train_loss 6.23153877258, train_acc1 0.0150696625933, train_acc5 0.0552518665791, test_loss 5.41981744766, test_acc1 0.0519132651389, test_acc5 0.156150355935 +End pass 2, train_loss 5.15442800522, train_acc1 0.0784279331565, train_acc5 0.211050540209, test_loss 4.45795249939, test_acc1 0.140469551086, test_acc5 0.333163291216 +End pass 3, train_loss 4.51505613327, train_acc1 0.145300447941, train_acc5 0.331567406654, test_loss 3.86548018456, test_acc1 0.219443559647, test_acc5 0.446448504925 +End pass 4, train_loss 4.12735557556, train_acc1 0.19437250495, train_acc5 0.405713528395, test_loss 3.56990146637, test_acc1 0.264536827803, test_acc5 0.507190704346 +End pass 5, train_loss 3.87505435944, train_acc1 0.229518383741, train_acc5 0.453582793474, test_loss 3.35345435143, test_acc1 0.297349333763, test_acc5 0.54753267765 +End pass 6, train_loss 3.6929500103, train_acc1 0.255628824234, train_acc5 0.487188398838, test_loss 3.17112898827, test_acc1 0.326953113079, test_acc5 0.581780135632 +End pass 7, train_loss 3.55882954597, train_acc1 0.275381118059, train_acc5 0.511990904808, test_loss 3.03736782074, test_acc1 0.349035382271, test_acc5 0.606293857098 +End pass 8, train_loss 3.45595097542, train_acc1 0.291462600231, train_acc5 0.530815005302, test_loss 2.96034455299, test_acc1 0.362228929996, test_acc5 0.617390751839 +End pass 9, train_loss 3.3745200634, train_acc1 0.303871691227, train_acc5 0.545210540295, test_loss 2.93932366371, test_acc1 0.37129303813, test_acc5 0.623573005199 +... ``` -TBD -## Inference + +The error rate curves of AlexNet, ResNet50 and SE-ResNeXt-50 are shown in the figure below. +
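To turn such a log into curves, a minimal parsing sketch (relying only on the `End pass ...` format shown above) is:

```python
import re

def parse_train_log(path):
    """Collect per-pass top-1 error rates from 'End pass N, ...' lines."""
    pat = re.compile(
        r'End pass (\d+).*?train_acc1 ([\d.]+).*?test_acc1 ([\d.]+)')
    passes, train_err, test_err = [], [], []
    with open(path) as f:
        for line in f:
            m = pat.search(line)
            if m:
                passes.append(int(m.group(1)))
                train_err.append(1.0 - float(m.group(2)))  # error = 1 - acc1
                test_err.append(1.0 - float(m.group(3)))
    return passes, train_err, test_err
```

The three lists can be handed to any plotting tool to reproduce curves like the one below.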

+*Training and validation Curves*
+ +## Finetuning + +Finetuning is to finetune model weights in a specific task by loading pretrained weights. After initializing ```path_to_pretrain_model```, one can finetune a model as: ``` -python infer.py --num_layers=50 --batch_size=8 --model='model/90' --test_list='' +python train.py + --model=SE_ResNeXt50_32x4d \ + --pretrained_model=${path_to_pretrain_model} \ + --batch_size=32 \ + --total_images=1281167 \ + --class_dim=1000 \ + --image_shape=3,224,224 \ + --model_save_dir=output/ \ + --with_mem_opt=True \ + --lr_strategy=piecewise_decay \ + --lr=0.1 ``` -TBD -## Results +## Evaluation +Evaluation is to evaluate the performance of a trained model. One can download [pretrained models](#supported-models) and set its path to ```path_to_pretrain_model```. Then top1/top5 accuracy can be obtained by running the following command: +``` +python eval.py \ + --model=SE_ResNeXt50_32x4d \ + --batch_size=32 \ + --class_dim=1000 \ + --image_shape=3,224,224 \ + --with_mem_opt=True \ + --pretrained_model=${path_to_pretrain_model} +``` -The SE-ResNeXt-50 model is trained by starting with learning rate ```0.1``` and decaying it by ```0.1``` after each ```10``` epoches. Top-1/Top-5 Validation Accuracy on ImageNet 2012 is listed in table. +According to the congfiguration of evaluation, the output log is like: +``` +Testbatch 0,loss 2.1786134243, acc1 0.625,acc5 0.8125,time 0.48 sec +Testbatch 10,loss 0.898496925831, acc1 0.75,acc5 0.9375,time 0.51 sec +Testbatch 20,loss 1.32524681091, acc1 0.6875,acc5 0.9375,time 0.37 sec +Testbatch 30,loss 1.46830511093, acc1 0.5,acc5 0.9375,time 0.51 sec +Testbatch 40,loss 1.12802267075, acc1 0.625,acc5 0.9375,time 0.35 sec +Testbatch 50,loss 0.881597697735, acc1 0.8125,acc5 1.0,time 0.32 sec +Testbatch 60,loss 0.300163716078, acc1 0.875,acc5 1.0,time 0.48 sec +Testbatch 70,loss 0.692037761211, acc1 0.875,acc5 1.0,time 0.35 sec +Testbatch 80,loss 0.0969972759485, acc1 1.0,acc5 1.0,time 0.41 sec +... +``` -|model | [original paper(Fig.5)](https://arxiv.org/abs/1709.01507) | Pytorch | Paddle fluid -|- | :-: |:-: | -: -|SE-ResNeXt-50 | 77.6%/- | 77.71%/93.63% | 77.42%/93.50% +## Inference +Inference is used to get prediction score or image features based on trained models. +``` +python infer.py \ + --model=SE_ResNeXt50_32x4d \ + --batch_size=32 \ + --class_dim=1000 \ + --image_shape=3,224,224 \ + --with_mem_opt=True \ + --pretrained_model=${path_to_pretrain_model} +``` +The output contains predication results, including maximum score (before softmax) and corresponding predicted label. +``` +Test-0-score: [13.168352], class [491] +Test-1-score: [7.913302], class [975] +Test-2-score: [16.959702], class [21] +Test-3-score: [14.197695], class [383] +Test-4-score: [12.607652], class [878] +Test-5-score: [17.725458], class [15] +Test-6-score: [12.678599], class [118] +Test-7-score: [12.353498], class [505] +Test-8-score: [20.828007], class [747] +Test-9-score: [15.135801], class [315] +Test-10-score: [14.585114], class [920] +Test-11-score: [13.739927], class [679] +Test-12-score: [15.040644], class [386] +... +``` +## Supported models and performances +Models are trained by starting with learning rate ```0.1``` and decaying it by ```0.1``` after each pre-defined epoches, if not special introduced. Available top-1/top-5 validation accuracy on ImageNet 2012 are listed in table. Pretrained models can be downloaded by clicking related model names. 
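The schedule described above (start at ```0.1```, multiplied by ```0.1``` at pre-defined epochs) maps onto the `piecewise_decay` strategy as boundary steps plus one extra value. A hedged sketch, with the decay epochs assumed rather than taken from this repo:

```python
import paddle.fluid as fluid

steps_per_epoch = 1281167 // 256          # total_images / default batch_size
decay_epochs = [30, 60, 90]               # assumed decay points
boundaries = [e * steps_per_epoch for e in decay_epochs]
values = [0.1 * 0.1**i for i in range(len(boundaries) + 1)]  # 0.1 ... 1e-4

optimizer = fluid.optimizer.Momentum(
    learning_rate=fluid.layers.piecewise_decay(
        boundaries=boundaries, values=values),
    momentum=0.9)
```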
-## Released models -|model | Baidu Cloud +|model | top-1/top-5 accuracy |- | -: -|SE-ResNeXt-50 | [url]() -TBD +|[AlexNet](http://paddle-imagenet-models.bj.bcebos.com/alexnet_model.tar) | 57.21%/79.72% +|VGG11 | - +|VGG13 | - +|VGG16 | - +|VGG19 | - +|GoogleNet | - +|InceptionV4 | - +|MobileNet | - +|[ResNet50](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar) | 76.63%/93.10% +|ResNet101 | - +|ResNet152 | - +|[SE_ResNeXt50_32x4d](http://paddle-imagenet-models.bj.bcebos.com/se_resnext_50_model.tar) | 78.33%/93.96% +|SE_ResNeXt101_32x4d | - +|SE_ResNeXt152_32x4d | - +|DPN68 | - +|DPN92 | - +|DPN98 | - +|DPN107 | - +|DPN131 | - diff --git a/fluid/image_classification/README_cn.md b/fluid/image_classification/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..937dd148c70ccb86a24c9ad7fd1705de3d5b4678 --- /dev/null +++ b/fluid/image_classification/README_cn.md @@ -0,0 +1,209 @@ + +# 图像分类以及模型库 +图像分类是计算机视觉的重要领域,它的目标是将图像分类到预定义的标签。近期,需要研究者提出很多不同种类的神经网络,并且极大的提升了分类算法的性能。本页将介绍如何使用PaddlePaddle进行图像分类,包括[数据准备](#data-preparation)、 [训练](#training-a-model)、[参数微调](#finetuning)、[模型评估](#evaluation)以及[模型推断](#inference)。 + +--- +## 内容 +- [安装](#installation) +- [数据准备](#data-preparation) +- [模型训练](#training-a-model) +- [参数微调](#finetuning) +- [模型评估](#evaluation) +- [模型推断](#inference) +- [已有模型及其性能](#supported-models) + +## 安装 + +在当前目录下运行样例代码需要PadddlePaddle Fluid的v0.13.0或以上的版本。如果你的运行环境中的PaddlePaddle低于此版本,请根据[安装文档](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html)中的说明来更新PaddlePaddle。 + +## 数据准备 + +下面给出了ImageNet分类任务的样例,首先,通过如下的方式进行数据的准备: +``` +cd data/ILSVRC2012/ +sh download_imagenet2012.sh +``` +在```download_imagenet2012.sh```脚本中,通过下面三步来准备数据: + +**步骤一:** 首先在```image-net.org```网站上完成注册,用于获得一对```Username```和```AccessKey```。 + +**步骤二:** 从ImageNet官网下载ImageNet-2012的图像数据。训练以及验证数据集会分别被下载到"train" 和 "val" 目录中。请注意,ImaegNet数据的大小超过40GB,下载非常耗时;已经自行下载ImageNet的用户可以直接将数据组织放置到```data/ILSVRC2012```。 + +**步骤三:** 下载训练与验证集合对应的标签文件。下面两个文件分别包含了训练集合与验证集合中图像的标签: + +* *train_list.txt*: ImageNet-2012训练集合的标签文件,每一行采用"空格"分隔图像路径与标注,例如: +``` +train/n02483708/n02483708_2436.jpeg 369 +train/n03998194/n03998194_7015.jpeg 741 +train/n04523525/n04523525_38118.jpeg 884 +train/n04596742/n04596742_3032.jpeg 909 +train/n03208938/n03208938_7065.jpeg 535 +... +``` +* *val_list.txt*: ImageNet-2012验证集合的标签文件,每一行采用"空格"分隔图像路径与标注,例如: +``` +val/ILSVRC2012_val_00000001.jpeg 65 +val/ILSVRC2012_val_00000002.jpeg 970 +val/ILSVRC2012_val_00000003.jpeg 230 +val/ILSVRC2012_val_00000004.jpeg 809 +val/ILSVRC2012_val_00000005.jpeg 516 +... +``` + +## 模型训练 + +数据准备完毕后,可以通过如下的方式启动训练: +``` +python train.py \ + --model=SE_ResNeXt50_32x4d \ + --batch_size=32 \ + --total_images=1281167 \ + --class_dim=1000 + --image_shape=3,224,224 \ + --model_save_dir=output/ \ + --with_mem_opt=False \ + --lr_strategy=piecewise_decay \ + --lr=0.1 +``` +**参数说明:** +* **model**: name model to use. Default: "SE_ResNeXt50_32x4d". +* **num_epochs**: the number of epochs. Default: 120. +* **batch_size**: the size of each mini-batch. Default: 256. +* **use_gpu**: whether to use GPU or not. Default: True. +* **total_images**: total number of images in the training set. Default: 1281167. +* **class_dim**: the class number of the classification task. Default: 1000. +* **image_shape**: input size of the network. Default: "3,224,224". +* **model_save_dir**: the directory to save trained model. Default: "output". +* **with_mem_opt**: whether to use memory optimization or not. Default: False. 
+* **lr_strategy**: learning rate changing strategy. Default: "piecewise_decay". +* **lr**: initialized learning rate. Default: 0.1. +* **pretrained_model**: model path for pretraining. Default: None. +* **checkpoint**: the checkpoint path to resume. Default: None. + +**数据读取器说明:** 数据读取器定义在```reader.py```中。在[训练阶段](#training-a-model), 默认采用的增广方式是随机裁剪与水平翻转, 而在[评估](#inference)与[推断](#inference)阶段用的默认方式是中心裁剪。当前支持的数据增广方式有: +* 旋转 +* 颜色抖动 +* 随机裁剪 +* 中心裁剪 +* 长宽调整 +* 水平翻转 + +**训练曲线:** 通过训练过程中的日志可以画出训练曲线。举个例子,训练AlexNet出来的日志如下所示: +``` +End pass 1, train_loss 6.23153877258, train_acc1 0.0150696625933, train_acc5 0.0552518665791, test_loss 5.41981744766, test_acc1 0.0519132651389, test_acc5 0.156150355935 +End pass 2, train_loss 5.15442800522, train_acc1 0.0784279331565, train_acc5 0.211050540209, test_loss 4.45795249939, test_acc1 0.140469551086, test_acc5 0.333163291216 +End pass 3, train_loss 4.51505613327, train_acc1 0.145300447941, train_acc5 0.331567406654, test_loss 3.86548018456, test_acc1 0.219443559647, test_acc5 0.446448504925 +End pass 4, train_loss 4.12735557556, train_acc1 0.19437250495, train_acc5 0.405713528395, test_loss 3.56990146637, test_acc1 0.264536827803, test_acc5 0.507190704346 +End pass 5, train_loss 3.87505435944, train_acc1 0.229518383741, train_acc5 0.453582793474, test_loss 3.35345435143, test_acc1 0.297349333763, test_acc5 0.54753267765 +End pass 6, train_loss 3.6929500103, train_acc1 0.255628824234, train_acc5 0.487188398838, test_loss 3.17112898827, test_acc1 0.326953113079, test_acc5 0.581780135632 +End pass 7, train_loss 3.55882954597, train_acc1 0.275381118059, train_acc5 0.511990904808, test_loss 3.03736782074, test_acc1 0.349035382271, test_acc5 0.606293857098 +End pass 8, train_loss 3.45595097542, train_acc1 0.291462600231, train_acc5 0.530815005302, test_loss 2.96034455299, test_acc1 0.362228929996, test_acc5 0.617390751839 +End pass 9, train_loss 3.3745200634, train_acc1 0.303871691227, train_acc5 0.545210540295, test_loss 2.93932366371, test_acc1 0.37129303813, test_acc5 0.623573005199 +... +``` + +下图给出了AlexNet、ResNet50以及SE-ResNeXt-50网络的错误率曲线: +

+*训练集合与验证集合上的错误率曲线*
+ + +## 参数微调 + +参数微调是指在特定任务上微调已训练模型的参数。通过初始化```path_to_pretrain_model```,微调一个模型可以采用如下的命令: +``` +python train.py + --model=SE_ResNeXt50_32x4d \ + --pretrained_model=${path_to_pretrain_model} \ + --batch_size=32 \ + --total_images=1281167 \ + --class_dim=1000 \ + --image_shape=3,224,224 \ + --model_save_dir=output/ \ + --with_mem_opt=True \ + --lr_strategy=piecewise_decay \ + --lr=0.1 +``` + +## 模型评估 +模型评估是指对训练完毕的模型评估各类性能指标。用户可以下载[预训练模型](#supported-models)并且设置```path_to_pretrain_model```为模型所在路径。运行如下的命令,可以获得一个模型top-1/top-5精度: +``` +python eval.py \ + --model=SE_ResNeXt50_32x4d \ + --batch_size=32 \ + --class_dim=1000 \ + --image_shape=3,224,224 \ + --with_mem_opt=True \ + --pretrained_model=${path_to_pretrain_model} +``` + +根据这个评估程序的配置,输出日志形式如下: +``` +Testbatch 0,loss 2.1786134243, acc1 0.625,acc5 0.8125,time 0.48 sec +Testbatch 10,loss 0.898496925831, acc1 0.75,acc5 0.9375,time 0.51 sec +Testbatch 20,loss 1.32524681091, acc1 0.6875,acc5 0.9375,time 0.37 sec +Testbatch 30,loss 1.46830511093, acc1 0.5,acc5 0.9375,time 0.51 sec +Testbatch 40,loss 1.12802267075, acc1 0.625,acc5 0.9375,time 0.35 sec +Testbatch 50,loss 0.881597697735, acc1 0.8125,acc5 1.0,time 0.32 sec +Testbatch 60,loss 0.300163716078, acc1 0.875,acc5 1.0,time 0.48 sec +Testbatch 70,loss 0.692037761211, acc1 0.875,acc5 1.0,time 0.35 sec +Testbatch 80,loss 0.0969972759485, acc1 1.0,acc5 1.0,time 0.41 sec +... +``` + + +## 模型推断 +模型推断可以获取一个模型的预测分数或者图像的特征: +``` +python infer.py \ + --model=SE_ResNeXt50_32x4d \ + --batch_size=32 \ + --class_dim=1000 \ + --image_shape=3,224,224 \ + --with_mem_opt=True \ + --pretrained_model=${path_to_pretrain_model} +``` +输出的预测结果包括最高分数(未经过softmax处理)以及相应的预测标签。 +``` +Test-0-score: [13.168352], class [491] +Test-1-score: [7.913302], class [975] +Test-2-score: [16.959702], class [21] +Test-3-score: [14.197695], class [383] +Test-4-score: [12.607652], class [878] +Test-5-score: [17.725458], class [15] +Test-6-score: [12.678599], class [118] +Test-7-score: [12.353498], class [505] +Test-8-score: [20.828007], class [747] +Test-9-score: [15.135801], class [315] +Test-10-score: [14.585114], class [920] +Test-11-score: [13.739927], class [679] +Test-12-score: [15.040644], class [386] +... 
+``` + +## 已有模型及其性能 + +表格中列出了在"models"目录下支持的神经网络种类,并且给出了已完成训练的模型在ImageNet-2012验证集合上的top-1/top-5精度;如无特征说明,训练模型的初始学习率为```0.1```,每隔预定的epochs会下降```0.1```。预训练模型可以通过点击相应模型的名称进行下载。 + +|model | top-1/top-5 accuracy +|- | -: +|[AlexNet](http://paddle-imagenet-models.bj.bcebos.com/alexnet_model.tar) | 57.21%/79.72% +|VGG11 | - +|VGG13 | - +|VGG16 | - +|VGG19 | - +|GoogleNet | - +|InceptionV4 | - +|MobileNet | - +|[ResNet50](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar) | 76.63%/93.10% +|ResNet101 | - +|ResNet152 | - +|[SE_ResNeXt50_32x4d](http://paddle-imagenet-models.bj.bcebos.com/se_resnext_50_model.tar) | 78.33%/93.96% +|SE_ResNeXt101_32x4d | - +|SE_ResNeXt152_32x4d | - +|DPN68 | - +|DPN92 | - +|DPN98 | - +|DPN107 | - +|DPN131 | - diff --git a/fluid/image_classification/__init__.py b/fluid/image_classification/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py index 041d0094b4e262838e33b922cf770715425d6f04..05fbd6b85c2d70124817e7c5a2d5a90e78ba7847 100644 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py @@ -20,8 +20,8 @@ def calc_diff(f1, f2): d1 = np.load(f1) d2 = np.load(f2) - print d1.shape - print d2.shape + #print d1.shape + #print d2.shape #print d1[0, 0, 0:10, 0:10] #print d2[0, 0, 0:10, 0:10] #d1 = d1[:, :, 1:-2, 1:-2] diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py index 57f80d4cfe03acd5f78bc873e0c6245a4d2548e7..9de51e1af9685478c3a30b7692e6472bf2ce17fd 100644 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py @@ -78,6 +78,54 @@ def dump_results(results, names, root): np.save(filename + '.npy', res) +def normalize_name(name_map): + return { + k.replace('/', '_'): v.replace('/', '_') + for k, v in name_map.items() + } + + +def rename_layer_name(names, net): + """ because the names of output layers from caffe maybe changed for 'INPLACE' operation, + and paddle's layers maybe fused, so we need to re-mapping their relationship for comparing + """ + #build a mapping from paddle's name to caffe's name + trace = getattr(net, 'name_trace', None) + cf_trace = trace['caffe'] + real2cf = normalize_name(cf_trace['real2chg']) + + pd_trace = trace['paddle'] + pd2real = normalize_name(pd_trace['chg2real']) + pd_deleted = normalize_name(pd_trace['deleted']) + + pd2cf_name = {} + for pd_name, real_name in pd2real.items(): + if real_name in real2cf: + pd2cf_name[pd_name] = '%s.%s.%s.both_changed' \ + % (real2cf[real_name], real_name, pd_name) + else: + pd2cf_name[pd_name] = '%s.%s.pd_changed' % (real_name, pd_name) + + for pd_name, trace in pd_deleted.items(): + assert pd_name not in pd2cf_name, "this name[%s] has already exist" % ( + pd_name) + pd2cf_name[pd_name] = '%s.pd_deleted' % (pd_name) + + for real_name, cf_name in real2cf.items(): + if cf_name not in pd2cf_name: + pd2cf_name[cf_name] = '%s.cf_deleted' % (cf_name) + + if real_name not in pd2cf_name: + pd2cf_name[real_name] = '%s.%s.cf_changed' % (cf_name, real_name) + + ret = [] + for name in names: + new_name = pd2cf_name[name] if name in pd2cf_name else name + print('remap paddle name[%s] to output name[%s]' % (name, 
new_name)) + ret.append(new_name) + return ret + + def load_model(exe, place, net_file, net_name, net_weight, debug): """ load model using xxxnet.py and xxxnet.npy """ @@ -117,7 +165,8 @@ def load_model(exe, place, net_file, net_name, net_weight, debug): 'feed_names': feed_names, 'fetch_vars': fetch_list_var, 'fetch_names': fetch_list_name, - 'feed_shapes': feed_shapes + 'feed_shapes': feed_shapes, + 'net': net } @@ -171,6 +220,7 @@ def infer(model_path, imgfile, net_file=None, net_name=None, debug=True): fetch_targets = ret['fetch_vars'] fetch_list_name = ret['fetch_names'] feed_shapes = ret['feed_shapes'] + net = ret['net'] input_name = feed_names[0] input_shape = feed_shapes[0] @@ -182,7 +232,8 @@ def infer(model_path, imgfile, net_file=None, net_name=None, debug=True): if debug is True: dump_path = 'results.paddle' - dump_results(results, fetch_list_name, dump_path) + dump_names = rename_layer_name(fetch_list_name, net) + dump_results(results, dump_names, dump_path) print('all result of layers dumped to [%s]' % (dump_path)) else: result = results[0] diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp.sh index 1ed2c8446d3a98aef302fa6a2c82d158a9b08419..54c7b48bf303aeeb0bebc4858dcba6db8700df3f 100755 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp.sh +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp.sh @@ -19,4 +19,6 @@ if [[ $# -eq 3 ]];then else caffe_file="./results/${model_name}.caffe/${2}.npy" fi -python ./compare.py $paddle_file $caffe_file +cmd="python ./compare.py $paddle_file $caffe_file" +echo $cmd +eval $cmd diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp_layers.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp_layers.sh index d080f78bc58b58a121dd577b837786911e44f7a4..37a106eb25bb63814a1e696e8b42a8f708ac6aa9 100755 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp_layers.sh +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/cmp_layers.sh @@ -3,7 +3,7 @@ #function: # a tool used to compare all layers' results # - +#set -x if [[ $# -ne 1 ]];then echo "usage:" echo " bash $0 [model_name]" @@ -13,11 +13,20 @@ fi model_name=$1 prototxt="models.caffe/$model_name/${model_name}.prototxt" -layers=$(cat $prototxt | perl -ne 'if(/^\s+name\s*:\s*\"([^\"]+)/){print $1."\n";}') +cat $prototxt | grep name | perl -ne 'if(/^\s*name\s*:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names + +final_layer=$(cat $prototxt | perl -ne 'if(/^\s*top\s*:\s+\"([^\"]+)/){ print $1."\n";}' | tail -n1) +ret=$(grep "^$final_layer$" .layer_names | wc -l) +if [[ $ret -eq 0 ]];then + echo $final_layer >>.layer_names +fi -for i in $layers;do +for i in $(cat .layer_names);do + i=${i//\//_} cf_npy="results/${model_name}.caffe/${i}.npy" - pd_npy="results/${model_name}.paddle/${i}.npy" + #pd_npy="results/${model_name}.paddle/${i}.npy" + #pd_npy=$(find results/${model_name}.paddle -iname "${i}*.npy" | head -n1) + pd_npy=$(find results/${model_name}.paddle -iname "${i}.*npy" | grep deleted -v | head -n1) if [[ ! 
-e $cf_npy ]];then echo "caffe's result not exist[$cf_npy]" diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/diff.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/diff.sh index 7324ad315608693f448608e112cfcf5ac2337ac5..25e5d3b6c1bc301fbc505ce45103ddf091fd86f7 100755 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/diff.sh +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/diff.sh @@ -29,8 +29,8 @@ fi mkdir -p $results_root -model_prototxt="models.caffe/$model_name/${model_name}.prototxt" -model_caffemodel="models.caffe/${model_name}/${model_name}.caffemodel" +prototxt="models.caffe/$model_name/${model_name}.prototxt" +caffemodel="models.caffe/${model_name}/${model_name}.caffemodel" #1, dump layers' results from paddle paddle_results="$results_root/${model_name}.paddle" @@ -51,7 +51,7 @@ PYTHON=`which cfpython` if [[ -z $PYTHON ]];then PYTHON=`which python` fi -$PYTHON ./infer.py caffe $model_prototxt $model_caffemodel $paddle_results/data.npy +$PYTHON ./infer.py caffe $prototxt $caffemodel $paddle_results/data.npy if [[ $? -ne 0 ]] || [[ ! -e "results.caffe" ]];then echo "not found caffe's results, maybe failed to do inference with caffe" exit 1 @@ -59,10 +59,25 @@ fi mv results.caffe $caffe_results #3, extract layer names -cat $model_prototxt | grep name | perl -ne 'if(/^\s*name:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names +cat $prototxt | grep name | perl -ne 'if(/^\s*name\s*:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names + +final_layer=$(cat $prototxt | perl -ne 'if(/^\s*top\s*:\s+\"([^\"]+)/){ print $1."\n";}' | tail -n1) +ret=$(grep "^$final_layer$" .layer_names | wc -l) +if [[ $ret -eq 0 ]];then + echo $final_layer >>.layer_names +fi #4, compare one by one -for i in $(cat ".layer_names" | tail -n1);do +#for i in $(cat .layer_names);do +for i in $(cat .layer_names | tail -n1);do + i=${i//\//_} echo "process $i" - $PYTHON compare.py $caffe_results/${i}.npy $paddle_results/${i}.npy + pd_npy=$(find $paddle_results/ -iname "${i}.*npy" | grep deleted -v | head -n1) + #pd_npy="$paddle_results/${i}.npy" + if [[ -f $pd_npy ]];then + $PYTHON compare.py $caffe_results/${i}.npy $pd_npy + else + echo "not found npy file[${i}.*npy] for layer[$i]" + exit 1 + fi done diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/run.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/run.sh index d9b2365d57b15df7a1810547a2d7da0a2105c2b5..7eb23f4c1257da259f62af6ce152fb3a5fda3b43 100755 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/run.sh +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/run.sh @@ -71,7 +71,9 @@ if [[ -z $only_convert ]];then if [[ -z $net_name ]];then net_name="MyNet" fi - $PYTHON ./infer.py dump $net_file $weight_file $imgfile $net_name + cmd="$PYTHON ./infer.py dump $net_file $weight_file $imgfile $net_name" + echo $cmd + eval $cmd ret=$? 
fi exit $ret diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/tools/test.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/test.sh new file mode 100755 index 0000000000000000000000000000000000000000..13e5db6381fbbbbdec03529cfb733a1727c894a4 --- /dev/null +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/tools/test.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +# +#script to test all models +# + +models="alexnet vgg16 googlenet resnet152 resnet101 resnet50" +for i in $models;do + echo "begin to process $i" + bash ./tools/diff.sh $i 2>&1 + echo "finished to process $i with ret[$?]" +done diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/argmax.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/argmax.py index 0d884f53a1027d091fe409632209a2d9a579f573..d419832de5d1cd893e38423ee8d9ec8017fd8ae8 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/argmax.py +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/argmax.py @@ -58,11 +58,13 @@ def argmax_layer(input, name, out_max_val=False, top_k=1, axis=-1): if axis < 0: axis += len(input.shape) - topk_var, index_var = fluid.layers.topk(input=input, k=top_k) if out_max_val is True: + topk_var, index_var = fluid.layers.topk(input=input, k=top_k) index_var = fluid.layers.cast(index_var, dtype=topk_var.dtype) - output = fluid.layers.concat([index_var, topk_var], axis=axis) + output = fluid.layers.concat( + [index_var, topk_var], axis=axis, name=name) else: + topk_var, index_var = fluid.layers.topk(input=input, k=top_k, name=name) output = index_var return output diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/axpy.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/axpy.py index 389bb7996e87b2813a7704ef5e0c14332f95ab08..b81d4f25afbdda970a6c72582f315217a41c9999 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/axpy.py +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/axpy.py @@ -43,7 +43,7 @@ def axpy_layer(inputs, name): x = inputs[1] y = inputs[2] output = fluid.layers.elementwise_mul(x, alpha, axis=0) - output = fluid.layers.elementwise_add(output, y) + output = fluid.layers.elementwise_add(output, y, name=name) return output diff --git a/fluid/image_classification/caffe2fluid/kaffe/graph.py b/fluid/image_classification/caffe2fluid/kaffe/graph.py index 9d006aa9bc84dd081c7bd3d20c50e041a79da645..baea3cc1dc9431d07d0d3ca7191a429d1ef0f398 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/graph.py +++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py @@ -63,9 +63,10 @@ class Node(object): class Graph(object): - def __init__(self, nodes=None, name=None): + def __init__(self, nodes=None, name=None, trace={}): self.nodes = nodes or [] self.node_lut = {node.name: node for node in self.nodes} + self.output_trace = trace if name is None or name == '': self.name = 'MyNet' else: @@ -81,6 +82,15 @@ class Graph(object): except KeyError: raise KaffeError('Layer not found: %s' % name) + def add_name_trace(self, trace, which='caffe'): + self.output_trace[which] = trace + + def get_name_trace(self, which=None): + if which is not None: + return self.output_trace[which] + else: + return self.output_trace + def get_input_nodes(self): return [node for node in self.nodes if len(node.parents) == 0] @@ -116,7 +126,7 @@ class Graph(object): *NodeKind.compute_output_shape(node)) def replaced(self, new_nodes): - return Graph(nodes=new_nodes, name=self.name) + return Graph(nodes=new_nodes, 
name=self.name, trace=self.output_trace) def transformed(self, transformers): graph = self @@ -262,6 +272,7 @@ class GraphBuilder(object): # The current implementation only supports single-output nodes (note that a node can still # have multiple children, since multiple child nodes can refer to the single top's name). node_outputs = {} + output_trace = {} for layer in layers: node = graph.get_node(layer.name) for input_name in layer.bottom: @@ -291,7 +302,26 @@ class GraphBuilder(object): # # For both cases, future references to this top re-routes to this node. node_outputs[output_name] = node + if output_name in output_trace: + output_trace[output_name].append(node.name) + else: + output_trace[output_name] = [output_name, node.name] + + #build a mapping from real-name to changed-name(for caffe's INPLACE inference) + real2chg = {} + deleted = {} + for k, v in output_trace.items(): + real2chg[v[-1]] = k + for n in v: + if n in real2chg: + continue + if n not in deleted: + deleted[n] = '%s.%s' % (k, v[-1]) + graph.add_name_trace({ + 'real2chg': real2chg, + 'deleted': deleted + }, 'caffe') graph.compute_output_shapes() return graph diff --git a/fluid/image_classification/caffe2fluid/kaffe/layers.py b/fluid/image_classification/caffe2fluid/kaffe/layers.py index dcdd26040b6918d524f1d5ae58aa92f6da1a9550..f2d54c59fe8ee78840ce7d23a67694e495ceddf8 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/layers.py +++ b/fluid/image_classification/caffe2fluid/kaffe/layers.py @@ -216,7 +216,7 @@ class LayerAdapter(object): s_w = self.get_kernel_value( params.stride_w, params.stride, 1, default=1) p_h = self.get_kernel_value(params.pad_h, params.pad, 0, default=0) - p_w = self.get_kernel_value(params.pad_h, params.pad, 1, default=0) + p_w = self.get_kernel_value(params.pad_w, params.pad, 1, default=0) return KernelParameters(k_h, k_w, s_h, s_w, p_h, p_w) diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py index dabe9fa03c105dd2d4d9acc335c8e81df3377119..e8b0f2c3a91aaafcfc0951524ac64ed9723ad902 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py @@ -47,6 +47,8 @@ class Network(object): self.trainable = trainable # Switch variable for dropout self.paddle_env = None + self.output_names = [] + self.name_trace = None self.setup() def setup(self): @@ -79,6 +81,10 @@ class Network(object): data_dict = np.load(data_path).item() for op_name in data_dict: + if op_name == 'caffe2fluid_name_trace': + self.name_trace = data_dict[op_name] + continue + layer = self.layers[op_name] for param_name, data in data_dict[op_name].iteritems(): try: @@ -117,6 +123,15 @@ class Network(object): ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1 return '%s_%d' % (prefix, ident) + def get_unique_output_name(self, prefix, layertype): + '''Returns an index-suffixed unique name for the given prefix. + This is used for auto-generating layer names based on the type-prefix. 
+ ''' + ident = sum(t.startswith(prefix) for t in self.output_names) + 1 + unique_name = '%s.%s.output.%d' % (prefix, layertype, ident) + self.output_names.append(unique_name) + return unique_name + @layer def conv(self, input, @@ -152,6 +167,7 @@ class Network(object): act = None output = fluid.layers.conv2d( + name=self.get_unique_output_name(name, 'conv2d'), input=input, filter_size=[k_h, k_w], num_filters=c_o, @@ -170,7 +186,8 @@ class Network(object): @layer def relu(self, input, name): fluid = import_fluid() - output = fluid.layers.relu(x=input) + output = fluid.layers.relu( + name=self.get_unique_output_name(name, 'relu'), x=input) return output def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding, @@ -182,6 +199,7 @@ class Network(object): fluid = import_fluid() output = fluid.layers.pool2d( + name=name, input=input, pool_size=k_hw, pool_stride=s_hw, @@ -200,8 +218,16 @@ class Network(object): ceil_mode, padding=[0, 0], name=None): - return self.pool('max', input, k_h, k_w, s_h, s_w, ceil_mode, padding, - name) + return self.pool( + 'max', + input, + k_h, + k_w, + s_h, + s_w, + ceil_mode, + padding, + name=self.get_unique_output_name(name, 'max_pool')) @layer def avg_pool(self, @@ -213,25 +239,41 @@ class Network(object): ceil_mode, padding=[0, 0], name=None): - return self.pool('avg', input, k_h, k_w, s_h, s_w, ceil_mode, padding, - name) + return self.pool( + 'avg', + input, + k_h, + k_w, + s_h, + s_w, + ceil_mode, + padding, + name=self.get_unique_output_name(name, 'avg_pool')) @layer def sigmoid(self, input, name): fluid = import_fluid() - return fluid.layers.sigmoid(input) + return fluid.layers.sigmoid( + input, name=self.get_unique_output_name(name, 'sigmoid')) @layer def lrn(self, input, radius, alpha, beta, name, bias=1.0): fluid = import_fluid() - output = fluid.layers.lrn(input=input, \ - n=radius, k=bias, alpha=alpha, beta=beta, name=name) + output = fluid.layers.lrn(input=input, + n=radius, + k=bias, + alpha=alpha, + beta=beta, + name=self.get_unique_output_name(name, 'lrn')) return output @layer def concat(self, inputs, axis, name): fluid = import_fluid() - output = fluid.layers.concat(input=inputs, axis=axis) + output = fluid.layers.concat( + input=inputs, + axis=axis, + name=self.get_unique_output_name(name, 'concat')) return output @layer @@ -239,7 +281,8 @@ class Network(object): fluid = import_fluid() output = inputs[0] for i in inputs[1:]: - output = fluid.layers.elementwise_add(x=output, y=i) + output = fluid.layers.elementwise_add( + x=output, y=i, name=self.get_unique_output_name(name, 'add')) return output @layer @@ -251,7 +294,7 @@ class Network(object): prefix = name + '_' output = fluid.layers.fc( - name=name, + name=self.get_unique_output_name(name, 'fc'), input=input, size=num_out, act=act, @@ -269,7 +312,8 @@ class Network(object): str(shape)) input = fluid.layers.reshape(input, shape[0:2]) - output = fluid.layers.softmax(input) + output = fluid.layers.softmax( + input, name=self.get_unique_output_name(name, 'softmax')) return output @layer @@ -289,7 +333,7 @@ class Network(object): mean_name = prefix + 'mean' variance_name = prefix + 'variance' output = fluid.layers.batch_norm( - name=name, + name=self.get_unique_output_name(name, 'batch_norm'), input=input, is_test=True, param_attr=param_attr, @@ -308,7 +352,10 @@ class Network(object): output = input else: output = fluid.layers.dropout( - input, dropout_prob=drop_prob, is_test=is_test) + input, + dropout_prob=drop_prob, + is_test=is_test, + name=self.get_unique_output_name(name, 
'dropout')) return output @layer @@ -328,8 +375,16 @@ class Network(object): offset_param = fluid.layers.create_parameter( shape=scale_shape, dtype=input.dtype, name=name, attr=offset_attr) - output = fluid.layers.elementwise_mul(input, scale_param, axis=axis) - output = fluid.layers.elementwise_add(output, offset_param, axis=axis) + output = fluid.layers.elementwise_mul( + input, + scale_param, + axis=axis, + name=self.get_unique_output_name(name, 'scale_mul')) + output = fluid.layers.elementwise_add( + output, + offset_param, + axis=axis, + name=self.get_unique_output_name(name, 'scale_add')) return output def custom_layer_factory(self): @@ -342,5 +397,6 @@ class Network(object): def custom_layer(self, inputs, kind, name, *args, **kwargs): """ make custom layer """ + name = self.get_unique_output_name(name, kind) layer_factory = self.custom_layer_factory() return layer_factory(kind, inputs, name, *args, **kwargs) diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py index 8607b8748a60aa3a72f77a589727190efa2b8a36..02a600bcd0ac7732b5162070064cd10ff1359dc2 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py @@ -3,9 +3,9 @@ import numpy as np from ..errors import KaffeError, print_stderr from ..graph import GraphBuilder, NodeMapper from ..layers import NodeKind -from ..transformers import (DataInjector, DataReshaper, NodeRenamer, ReLUFuser, - BatchNormScaleBiasFuser, BatchNormPreprocessor, - ParameterNamer) +from ..transformers import (DataInjector, DataReshaper, NodeRenamer, + SubNodeFuser, ReLUFuser, BatchNormScaleBiasFuser, + BatchNormPreprocessor, ParameterNamer) from . import network @@ -18,7 +18,7 @@ def get_padding_type(kernel_params, input_shape, output_shape): https://github.com/Yangqing/caffe2/blob/master/caffe2/proto/caffe2_legacy.proto ''' k_h, k_w, s_h, s_w, p_h, p_w = kernel_params - if p_h * p_w > 0: + if p_h > 0 or p_w > 0: return [p_h, p_w] else: return None @@ -315,6 +315,23 @@ class Transformer(object): self.graph = graph.transformed(transformers) + # record the name mapping introduced by fused nodes + trace = SubNodeFuser.traced_names() + chg2real = {} + deleted = {} + for k, v in trace.items(): + chg2real[k] = v[-1]  # mapping from changed name to real name + for n in v: + if n in chg2real: + continue + if n not in deleted: + deleted[n] = '%s.%s' % (k, v[-1]) + + self.graph.add_name_trace({ + 'chg2real': chg2real, + 'deleted': deleted + }, 'paddle') + # Display the graph if self.verbose: print_stderr(self.graph) @@ -339,6 +356,8 @@ class Transformer(object): node.name: node.data for node in self.graph.nodes if node.data } + self.params['caffe2fluid_name_trace'] = self.graph.get_name_trace() + return self.params def transform_source(self): diff --git a/fluid/image_classification/caffe2fluid/kaffe/transformers.py b/fluid/image_classification/caffe2fluid/kaffe/transformers.py index 33919275a93001dec8c2accf10312b18c52c7081..6b53e05a57a657015662c24ae2d974d6f25e7d00 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/transformers.py +++ b/fluid/image_classification/caffe2fluid/kaffe/transformers.py @@ -181,6 +181,20 @@ class SubNodeFuser(object): ''' An abstract helper for merging a single-child with its single-parent.
''' + _traced_names = {} + + @classmethod + def traced_names(cls): + return cls._traced_names + + @classmethod + def trace(cls, fname, tname): + """ record the name mapping: + the value of 'fname' will be replaced by the value of 'tname' + """ + if fname not in cls._traced_names: + cls._traced_names[fname] = [] + cls._traced_names[fname].append(tname) def __call__(self, graph): nodes = graph.nodes @@ -234,6 +248,7 @@ class ReLUFuser(SubNodeFuser): child.kind == NodeKind.ReLU) def merge(self, parent, child): + SubNodeFuser.trace(parent.name, child.name) parent.metadata['relu'] = True parent.metadata['relu_negative_slope'] = child.parameters.negative_slope @@ -255,6 +270,7 @@ class BatchNormScaleBiasFuser(SubNodeFuser): child.parameters.bias_term == True) def merge(self, parent, child): + SubNodeFuser.trace(parent.name, child.name) parent.scale_bias_node = child diff --git a/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh new file mode 100644 index 0000000000000000000000000000000000000000..947b8900bd944759437a55c20fb32bca4a1b9380 --- /dev/null +++ b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh @@ -0,0 +1,40 @@ +set -e +if [ "x${IMAGENET_USERNAME}" == x -o "x${IMAGENET_ACCESS_KEY}" == x ];then + echo "Please create an account on image-net.org." + echo "It will provide you with a username and an access key to download the ImageNet data." + read -p "Username: " IMAGENET_USERNAME + read -p "Accesskey: " IMAGENET_ACCESS_KEY +fi + +root_url=http://www.image-net.org/challenges/LSVRC/2012/nnoupb +valid_tar=ILSVRC2012_img_val.tar +train_tar=ILSVRC2012_img_train.tar +train_folder=train/ +valid_folder=val/ + +echo "Downloading ImageNet training data..." +mkdir -p ${train_folder} +wget -nd -c ${root_url}/${train_tar} +tar xf ${train_tar} -C ${train_folder} + +cd ${train_folder} +for x in `ls *.tar` +do + filename=`basename $x .tar` + mkdir -p $filename + tar -xf $x -C $filename + rm -rf $x +done +cd - + +echo "Downloading ImageNet validation data..."
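+# Note: wget -c resumes interrupted transfers, so this script can safely be
+# re-run after a failed download; already-completed archives are not fetched again.
+# The validation images are extracted directly into ${valid_folder}, while the
+# training archive above unpacks into one tar file per class, each of which is
+# then extracted into its own directory.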
+mkdir -p ${valid_folder} +wget -nd -c ${root_url}/${valid_tar} +tar xf ${valid_tar} -C ${valid_folder} + +echo "Downloading ImageNet label files: val_list.txt & train_list.txt" +label_file=ImageNet_label.tgz +label_url=http://imagenet-data.bj.bcebos.com/${label_file} +wget -nd -c ${label_url} +tar zxf ${label_file} + diff --git a/fluid/image_classification/data/ILSVRC2012/unzip.sh b/fluid/image_classification/data/ILSVRC2012/unzip.sh deleted file mode 100644 index 704a0e55770047313ce750d0e8fa6265408edd2a..0000000000000000000000000000000000000000 --- a/fluid/image_classification/data/ILSVRC2012/unzip.sh +++ /dev/null @@ -1,9 +0,0 @@ -cd train - -dir=./ -for x in `ls *.tar` -do -filename=`basename $x .tar` -mkdir $filename -tar -xvf $x -C ./$filename -done diff --git a/fluid/image_classification/eval.py b/fluid/image_classification/eval.py index dd1c2cc1d0349e03d972d82835dddabfae2ce75a..e0c96d0f1333e69a5260ae7ae88af8729033cdcf 100644 --- a/fluid/image_classification/eval.py +++ b/fluid/image_classification/eval.py @@ -1,83 +1,127 @@ import os -import sys import numpy as np -import argparse -import functools - +import time +import sys import paddle import paddle.fluid as fluid -from utility import add_arguments, print_arguments -from se_resnext import SE_ResNeXt +import models import reader +import argparse +import functools +from models.learning_rate import cosine_decay +from utility import add_arguments, print_arguments +import math parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('batch_size', int, 32, "Minibatch size.") -add_arg('use_gpu', bool, True, "Whether to use GPU or not.") -add_arg('test_list', str, '', "The testing data lists.") -add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.") -add_arg('model_dir', str, '', "The model path.") +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('use_gpu', bool, True, "Whether to use GPU or not.") +add_arg('class_dim', int, 1000, "Class number.") +add_arg('image_shape', str, "3,224,224", "Input image size.") +add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.") +add_arg('pretrained_model', str, None, "The path to a pretrained model.") +add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.") # yapf: enable +model_list = [m for m in dir(models) if "__" not in m] + def eval(args): - class_dim = 1000 - image_shape = [3, 224, 224] + # parameters from arguments + class_dim = args.class_dim + model_name = args.model + pretrained_model = args.pretrained_model + with_memory_optimization = args.with_mem_opt + image_shape = [int(m) for m in args.image_shape.split(",")] + + assert model_name in model_list, "{} is not in the model list: {}".format(args.model, + model_list) + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - out = SE_ResNeXt(input=image, class_dim=class_dim, layers=args.num_layers) - cost = fluid.layers.cross_entropy(input=out, label=label) - acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) - avg_cost = fluid.layers.mean(x=cost) - inference_program = fluid.default_main_program().clone(for_test=True) + # model definition + model = models.__dict__[model_name]() + + if model_name == "GoogleNet": + out0, out1, out2 = model.net(input=image, class_dim=class_dim) + cost0 = fluid.layers.cross_entropy(input=out0, label=label) + cost1
= fluid.layers.cross_entropy(input=out1, label=label) + cost2 = fluid.layers.cross_entropy(input=out2, label=label) + avg_cost0 = fluid.layers.mean(x=cost0) + avg_cost1 = fluid.layers.mean(x=cost1) + avg_cost2 = fluid.layers.mean(x=cost2) + + avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2 + acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5) + else: + out = model.net(input=image, class_dim=class_dim) + cost = fluid.layers.cross_entropy(input=out, label=label) + + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + test_program = fluid.default_main_program().clone(for_test=True) + + if with_memory_optimization: + fluid.memory_optimize(fluid.default_main_program()) place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - if not os.path.exists(args.model_dir): - raise ValueError("The model path [%s] does not exist." % - (args.model_dir)) - if not os.path.exists(args.test_list): - raise ValueError("The test lists [%s] does not exist." % - (args.test_list)) + if pretrained_model: - def if_exist(var): - return os.path.exists(os.path.join(args.model_dir, var.name)) + def if_exist(var): + return os.path.exists(os.path.join(pretrained_model, var.name)) - fluid.io.load_vars(exe, args.model_dir, predicate=if_exist) + fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) - test_reader = paddle.batch( - reader.test(args.test_list), batch_size=args.batch_size) + val_reader = paddle.batch(reader.val(), batch_size=args.batch_size) feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) - fetch_list = [avg_cost, acc_top1, acc_top5] + fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] test_info = [[], [], []] - for batch_id, data in enumerate(test_reader()): - loss, acc1, acc5 = exe.run(inference_program, - feed=feeder.feed(data), - fetch_list=fetch_list) - test_info[0].append(loss[0]) - test_info[1].append(acc1[0]) - test_info[2].append(acc5[0]) - if batch_id % 1 == 0: - print("Test {0}, loss {1}, acc1 {2}, acc5 {3}" - .format(batch_id, loss[0], acc1[0], acc5[0])) + cnt = 0 + for batch_id, data in enumerate(val_reader()): + t1 = time.time() + loss, acc1, acc5 = exe.run(test_program, + fetch_list=fetch_list, + feed=feeder.feed(data)) + t2 = time.time() + period = t2 - t1 + loss = np.mean(loss) + acc1 = np.mean(acc1) + acc5 = np.mean(acc5) + test_info[0].append(loss * len(data)) + test_info[1].append(acc1 * len(data)) + test_info[2].append(acc5 * len(data)) + cnt += len(data) + if batch_id % 10 == 0: + print("Testbatch {0},loss {1}, " + "acc1 {2},acc5 {3},time {4}".format(batch_id, \ + loss, acc1, acc5, \ + "%2.2f sec" % period)) sys.stdout.flush() - test_loss = np.array(test_info[0]).mean() - test_acc1 = np.array(test_info[1]).mean() - test_acc5 = np.array(test_info[2]).mean() + test_loss = np.sum(test_info[0]) / cnt + test_acc1 = np.sum(test_info[1]) / cnt + test_acc5 = np.sum(test_info[2]) / cnt - print("Test loss {0}, acc1 {1}, acc5 {2}".format(test_loss, test_acc1, - test_acc5)) + print("Test_loss {0}, test_acc1 {1}, test_acc5 {2}".format( + test_loss, test_acc1, test_acc5)) sys.stdout.flush() -if __name__ == '__main__': +def main(): args = parser.parse_args() print_arguments(args) eval(args) + + +if __name__ == '__main__': + main() diff --git a/fluid/image_classification/images/curve.jpg 
b/fluid/image_classification/images/curve.jpg new file mode 100644 index 0000000000000000000000000000000000000000..15694c2962ae3a4a93cbb6f0f8b07c39e4db79ae Binary files /dev/null and b/fluid/image_classification/images/curve.jpg differ diff --git a/fluid/image_classification/inception_v4.py b/fluid/image_classification/inception_v4.py deleted file mode 100644 index 3410bd6d2ffde3981ce5a6b8cdb36f5415e14221..0000000000000000000000000000000000000000 --- a/fluid/image_classification/inception_v4.py +++ /dev/null @@ -1,467 +0,0 @@ -import os -import paddle.fluid as fluid - - -def inception_v4(img, class_dim): - - tmp = stem(input=img) - for i in range(1): - tmp = inception_A(input=tmp, depth=i) - tmp = reduction_A(input=tmp) - - for i in range(7): - tmp = inception_B(input=tmp, depth=i) - reduction_B(input=tmp) - - for i in range(3): - tmp = inception_C(input=tmp, depth=i) - - pool = fluid.layers.pool2d( - pool_type='avg', input=tmp, pool_size=7, pool_stride=1) - dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) - fc = fluid.layers.fc(input=dropout, size=class_dim, act='softmax') - out = fluid.layers.softmax(input=fc) - return out - - -def conv_bn_layer(name, - input, - num_filters, - filter_size, - padding=0, - stride=1, - groups=1, - act=None): - conv = fluid.layers.conv2d( - name=name, - input=input, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(name=name + '_norm', input=conv, act=act) - - -def stem(input): - conv0 = conv_bn_layer( - name='stem_conv_0', - input=input, - num_filters=32, - filter_size=3, - padding=1, - stride=2) - conv1 = conv_bn_layer( - name='stem_conv_1', - input=conv0, - num_filters=32, - filter_size=3, - padding=1) - conv2 = conv_bn_layer( - name='stem_conv_2', - input=conv1, - num_filters=64, - filter_size=3, - padding=1) - - def block0(input): - pool0 = fluid.layers.pool2d( - input=input, - pool_size=3, - pool_stride=2, - pool_type='max', - pool_padding=1) - conv0 = conv_bn_layer( - name='stem_block0_conv', - input=input, - num_filters=96, - filter_size=3, - stride=2, - padding=1) - return fluid.layers.concat(input=[pool0, conv0], axis=1) - - def block1(input): - l_conv0 = conv_bn_layer( - name='stem_block1_l_conv0', - input=input, - num_filters=64, - filter_size=1, - stride=1, - padding=0) - l_conv1 = conv_bn_layer( - name='stem_block1_l_conv1', - input=l_conv0, - num_filters=96, - filter_size=3, - stride=1, - padding=1) - r_conv0 = conv_bn_layer( - name='stem_block1_r_conv0', - input=input, - num_filters=64, - filter_size=1, - stride=1, - padding=0) - r_conv1 = conv_bn_layer( - name='stem_block1_r_conv1', - input=r_conv0, - num_filters=64, - filter_size=(7, 1), - stride=1, - padding=(3, 0)) - r_conv2 = conv_bn_layer( - name='stem_block1_r_conv2', - input=r_conv1, - num_filters=64, - filter_size=(1, 7), - stride=1, - padding=(0, 3)) - r_conv3 = conv_bn_layer( - name='stem_block1_r_conv3', - input=r_conv2, - num_filters=96, - filter_size=3, - stride=1, - padding=1) - return fluid.layers.concat(input=[l_conv1, r_conv3], axis=1) - - def block2(input): - conv0 = conv_bn_layer( - name='stem_block2_conv', - input=input, - num_filters=192, - filter_size=3, - stride=2, - padding=1) - pool0 = fluid.layers.pool2d( - input=input, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - return fluid.layers.concat(input=[conv0, pool0], axis=1) - - conv3 = block0(conv2) - conv4 = block1(conv3) - conv5 = block2(conv4) - return conv5 - - 
-def inception_A(input, depth): - b0_pool0 = fluid.layers.pool2d( - name='inceptA{0}_branch0_pool0'.format(depth), - input=input, - pool_size=3, - pool_stride=1, - pool_padding=1, - pool_type='avg') - b0_conv0 = conv_bn_layer( - name='inceptA{0}_branch0_conv0'.format(depth), - input=b0_pool0, - num_filters=96, - filter_size=1, - stride=1, - padding=0) - b1_conv0 = conv_bn_layer( - name='inceptA{0}_branch1_conv0'.format(depth), - input=input, - num_filters=96, - filter_size=1, - stride=1, - padding=0) - b2_conv0 = conv_bn_layer( - name='inceptA{0}_branch2_conv0'.format(depth), - input=input, - num_filters=64, - filter_size=1, - stride=1, - padding=0) - b2_conv1 = conv_bn_layer( - name='inceptA{0}_branch2_conv1'.format(depth), - input=b2_conv0, - num_filters=96, - filter_size=3, - stride=1, - padding=1) - b3_conv0 = conv_bn_layer( - name='inceptA{0}_branch3_conv0'.format(depth), - input=input, - num_filters=64, - filter_size=1, - stride=1, - padding=0) - b3_conv1 = conv_bn_layer( - name='inceptA{0}_branch3_conv1'.format(depth), - input=b3_conv0, - num_filters=96, - filter_size=3, - stride=1, - padding=1) - b3_conv2 = conv_bn_layer( - name='inceptA{0}_branch3_conv2'.format(depth), - input=b3_conv1, - num_filters=96, - filter_size=3, - stride=1, - padding=1) - return fluid.layers.concat( - input=[b0_conv0, b1_conv0, b2_conv1, b3_conv2], axis=1) - - -def reduction_A(input): - b0_pool0 = fluid.layers.pool2d( - name='ReductA_branch0_pool0', - input=input, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - b1_conv0 = conv_bn_layer( - name='ReductA_branch1_conv0', - input=input, - num_filters=384, - filter_size=3, - stride=2, - padding=1) - b2_conv0 = conv_bn_layer( - name='ReductA_branch2_conv0', - input=input, - num_filters=192, - filter_size=1, - stride=1, - padding=0) - b2_conv1 = conv_bn_layer( - name='ReductA_branch2_conv1', - input=b2_conv0, - num_filters=224, - filter_size=3, - stride=1, - padding=1) - b2_conv2 = conv_bn_layer( - name='ReductA_branch2_conv2', - input=b2_conv1, - num_filters=256, - filter_size=3, - stride=2, - padding=1) - return fluid.layers.concat(input=[b0_pool0, b1_conv0, b2_conv2], axis=1) - - -def inception_B(input, depth): - b0_pool0 = fluid.layers.pool2d( - name='inceptB{0}_branch0_pool0'.format(depth), - input=input, - pool_size=3, - pool_stride=1, - pool_padding=1, - pool_type='avg') - b0_conv0 = conv_bn_layer( - name='inceptB{0}_branch0_conv0'.format(depth), - input=b0_pool0, - num_filters=128, - filter_size=1, - stride=1, - padding=0) - b1_conv0 = conv_bn_layer( - name='inceptB{0}_branch1_conv0'.format(depth), - input=input, - num_filters=384, - filter_size=1, - stride=1, - padding=0) - b2_conv0 = conv_bn_layer( - name='inceptB{0}_branch2_conv0'.format(depth), - input=input, - num_filters=192, - filter_size=1, - stride=1, - padding=0) - b2_conv1 = conv_bn_layer( - name='inceptB{0}_branch2_conv1'.format(depth), - input=b2_conv0, - num_filters=224, - filter_size=(1, 7), - stride=1, - padding=(0, 3)) - b2_conv2 = conv_bn_layer( - name='inceptB{0}_branch2_conv2'.format(depth), - input=b2_conv1, - num_filters=256, - filter_size=(7, 1), - stride=1, - padding=(3, 0)) - b3_conv0 = conv_bn_layer( - name='inceptB{0}_branch3_conv0'.format(depth), - input=input, - num_filters=192, - filter_size=1, - stride=1, - padding=0) - b3_conv1 = conv_bn_layer( - name='inceptB{0}_branch3_conv1'.format(depth), - input=b3_conv0, - num_filters=192, - filter_size=(1, 7), - stride=1, - padding=(0, 3)) - b3_conv2 = conv_bn_layer( - 
name='inceptB{0}_branch3_conv2'.format(depth), - input=b3_conv1, - num_filters=224, - filter_size=(7, 1), - stride=1, - padding=(3, 0)) - b3_conv3 = conv_bn_layer( - name='inceptB{0}_branch3_conv3'.format(depth), - input=b3_conv2, - num_filters=224, - filter_size=(1, 7), - stride=1, - padding=(0, 3)) - b3_conv4 = conv_bn_layer( - name='inceptB{0}_branch3_conv4'.format(depth), - input=b3_conv3, - num_filters=256, - filter_size=(7, 1), - stride=1, - padding=(3, 0)) - return fluid.layers.concat( - input=[b0_conv0, b1_conv0, b2_conv2, b3_conv4], axis=1) - - -def reduction_B(input): - b0_pool0 = fluid.layers.pool2d( - name='ReductB_branch0_pool0', - input=input, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - b1_conv0 = conv_bn_layer( - name='ReductB_branch1_conv0', - input=input, - num_filters=192, - filter_size=1, - stride=1, - padding=0) - b1_conv1 = conv_bn_layer( - name='ReductB_branch1_conv1', - input=b1_conv0, - num_filters=192, - filter_size=3, - stride=2, - padding=1) - b2_conv0 = conv_bn_layer( - name='ReductB_branch2_conv0', - input=input, - num_filters=256, - filter_size=1, - stride=1, - padding=0) - b2_conv1 = conv_bn_layer( - name='ReductB_branch2_conv1', - input=b2_conv0, - num_filters=256, - filter_size=(1, 7), - stride=1, - padding=(0, 3)) - b2_conv2 = conv_bn_layer( - name='ReductB_branch2_conv2', - input=b2_conv1, - num_filters=320, - filter_size=(7, 1), - stride=1, - padding=(3, 0)) - b2_conv3 = conv_bn_layer( - name='ReductB_branch2_conv3', - input=b2_conv2, - num_filters=320, - filter_size=3, - stride=2, - padding=1) - return fluid.layers.concat(input=[b0_pool0, b1_conv1, b2_conv3], axis=1) - - -def inception_C(input, depth): - b0_pool0 = fluid.layers.pool2d( - name='inceptC{0}_branch0_pool0'.format(depth), - input=input, - pool_size=3, - pool_stride=1, - pool_padding=1, - pool_type='avg') - b0_conv0 = conv_bn_layer( - name='inceptC{0}_branch0_conv0'.format(depth), - input=b0_pool0, - num_filters=256, - filter_size=1, - stride=1, - padding=0) - b1_conv0 = conv_bn_layer( - name='inceptC{0}_branch1_conv0'.format(depth), - input=input, - num_filters=256, - filter_size=1, - stride=1, - padding=0) - b2_conv0 = conv_bn_layer( - name='inceptC{0}_branch2_conv0'.format(depth), - input=input, - num_filters=384, - filter_size=1, - stride=1, - padding=0) - b2_conv1 = conv_bn_layer( - name='inceptC{0}_branch2_conv1'.format(depth), - input=b2_conv0, - num_filters=256, - filter_size=(1, 3), - stride=1, - padding=(0, 1)) - b2_conv2 = conv_bn_layer( - name='inceptC{0}_branch2_conv2'.format(depth), - input=b2_conv0, - num_filters=256, - filter_size=(3, 1), - stride=1, - padding=(1, 0)) - b3_conv0 = conv_bn_layer( - name='inceptC{0}_branch3_conv0'.format(depth), - input=input, - num_filters=384, - filter_size=1, - stride=1, - padding=0) - b3_conv1 = conv_bn_layer( - name='inceptC{0}_branch3_conv1'.format(depth), - input=b3_conv0, - num_filters=448, - filter_size=(1, 3), - stride=1, - padding=(0, 1)) - b3_conv2 = conv_bn_layer( - name='inceptC{0}_branch3_conv2'.format(depth), - input=b3_conv1, - num_filters=512, - filter_size=(3, 1), - stride=1, - padding=(1, 0)) - b3_conv3 = conv_bn_layer( - name='inceptC{0}_branch3_conv3'.format(depth), - input=b3_conv2, - num_filters=256, - filter_size=(3, 1), - stride=1, - padding=(1, 0)) - b3_conv4 = conv_bn_layer( - name='inceptC{0}_branch3_conv4'.format(depth), - input=b3_conv2, - num_filters=256, - filter_size=(1, 3), - stride=1, - padding=(0, 1)) - return fluid.layers.concat( - input=[b0_conv0, b1_conv0, b2_conv1, b2_conv2, 
b3_conv3, b3_conv4], - axis=1) diff --git a/fluid/image_classification/infer.py b/fluid/image_classification/infer.py index f2a6ca3c703398ad21089ae78bd79ff2e4b5dee1..a835926da248d926de36d1c46bc3f7df7265b1d0 100644 --- a/fluid/image_classification/infer.py +++ b/fluid/image_classification/infer.py @@ -1,69 +1,92 @@ import os -import sys import numpy as np -import argparse -import functools - +import time +import sys import paddle import paddle.fluid as fluid -from utility import add_arguments, print_arguments -from se_resnext import SE_ResNeXt +import models import reader +import argparse +import functools +from models.learning_rate import cosine_decay +from utility import add_arguments, print_arguments +import math parser = argparse.ArgumentParser(description=__doc__) -add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('batch_size', int, 1, "Minibatch size.") -add_arg('use_gpu', bool, True, "Whether to use GPU or not.") -add_arg('test_list', str, '', "The testing data lists.") -add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.") -add_arg('model_dir', str, '', "The model path.") +add_arg = functools.partial(add_arguments, argparser=parser) +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('use_gpu', bool, True, "Whether to use GPU or not.") +add_arg('class_dim', int, 1000, "Class number.") +add_arg('image_shape', str, "3,224,224", "Input image size.") +add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.") +add_arg('pretrained_model', str, None, "The path to a pretrained model.") +add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.") # yapf: enable +model_list = [m for m in dir(models) if "__" not in m] + def infer(args): - class_dim = 1000 - image_shape = [3, 224, 224] + # parameters from arguments + class_dim = args.class_dim + model_name = args.model + pretrained_model = args.pretrained_model + with_memory_optimization = args.with_mem_opt + image_shape = [int(m) for m in args.image_shape.split(",")] + + assert model_name in model_list, "{} is not in the model list: {}".format(args.model, + model_list) + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') - out = SE_ResNeXt(input=image, class_dim=class_dim, layers=args.num_layers) - out = fluid.layers.softmax(input=out) - inference_program = fluid.default_main_program().clone(for_test=True) + # model definition + model = models.__dict__[model_name]() + + if model_name == "GoogleNet": + out, _, _ = model.net(input=image, class_dim=class_dim) + else: + out = model.net(input=image, class_dim=class_dim) + + test_program = fluid.default_main_program().clone(for_test=True) + + if with_memory_optimization: + fluid.memory_optimize(fluid.default_main_program()) place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - if not os.path.exists(args.model_dir): - raise ValueError("The model path [%s] does not exist." % - (args.model_dir)) - if not os.path.exists(args.test_list): - raise ValueError("The test lists [%s] does not exist."
% - (args.test_list)) + if pretrained_model: - def if_exist(var): - return os.path.exists(os.path.join(args.model_dir, var.name)) + def if_exist(var): + return os.path.exists(os.path.join(pretrained_model, var.name)) - fluid.io.load_vars(exe, args.model_dir, predicate=if_exist) + fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) - test_reader = paddle.batch( - reader.infer(args.test_list), batch_size=args.batch_size) + test_batch_size = 1 + test_reader = paddle.batch(reader.test(), batch_size=test_batch_size) feeder = fluid.DataFeeder(place=place, feed_list=[image]) - fetch_list = [out] + fetch_list = [out.name] TOPK = 1 for batch_id, data in enumerate(test_reader()): - result = exe.run(inference_program, - feed=feeder.feed(data), - fetch_list=fetch_list) - result = result[0] - pred_label = np.argsort(result)[::-1][0][0] - print("Test {0}-score {1}, class {2}: " - .format(batch_id, result[0][pred_label], pred_label)) + result = exe.run(test_program, + fetch_list=fetch_list, + feed=feeder.feed(data)) + result = result[0][0] + pred_label = np.argsort(result)[::-1][:TOPK] + print("Test-{0}-score: {1}, class {2}" + .format(batch_id, result[pred_label], pred_label)) sys.stdout.flush() -if __name__ == '__main__': +def main(): args = parser.parse_args() print_arguments(args) infer(args) + + +if __name__ == '__main__': + main() diff --git a/fluid/image_classification/mobilenet.py b/fluid/image_classification/mobilenet.py deleted file mode 100644 index edee1bf07532bb680c512dffe852ae3968257e9b..0000000000000000000000000000000000000000 --- a/fluid/image_classification/mobilenet.py +++ /dev/null @@ -1,155 +0,0 @@ -import os - -import paddle.v2 as paddle -import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA -from paddle.fluid.param_attr import ParamAttr - -parameter_attr = ParamAttr(initializer=MSRA()) - - -def conv_bn_layer(input, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - act='relu', - use_cudnn=True): - conv = fluid.layers.conv2d( - input=input, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=parameter_attr, - bias_attr=False) - return fluid.layers.batch_norm(input=conv, act=act) - - -def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride, - scale): - """ - """ - depthwise_conv = conv_bn_layer( - input=input, - filter_size=3, - num_filters=int(num_filters1 * scale), - stride=stride, - padding=1, - num_groups=int(num_groups * scale), - use_cudnn=False) - - pointwise_conv = conv_bn_layer( - input=depthwise_conv, - filter_size=1, - num_filters=int(num_filters2 * scale), - stride=1, - padding=0) - return pointwise_conv - - -def mobile_net(img, class_dim, scale=1.0): - - # conv1: 112x112 - tmp = conv_bn_layer( - img, - filter_size=3, - channels=3, - num_filters=int(32 * scale), - stride=2, - padding=1) - - # 56x56 - tmp = depthwise_separable( - tmp, - num_filters1=32, - num_filters2=64, - num_groups=32, - stride=1, - scale=scale) - - tmp = depthwise_separable( - tmp, - num_filters1=64, - num_filters2=128, - num_groups=64, - stride=2, - scale=scale) - - # 28x28 - tmp = depthwise_separable( - tmp, - num_filters1=128, - num_filters2=128, - num_groups=128, - stride=1, - scale=scale) - - tmp = depthwise_separable( - tmp, - num_filters1=128, - num_filters2=256, - num_groups=128, - stride=2, - scale=scale) - - # 14x14 - tmp = depthwise_separable( - tmp, - num_filters1=256, - num_filters2=256, - 
num_groups=256, - stride=1, - scale=scale) - - tmp = depthwise_separable( - tmp, - num_filters1=256, - num_filters2=512, - num_groups=256, - stride=2, - scale=scale) - - # 14x14 - for i in range(5): - tmp = depthwise_separable( - tmp, - num_filters1=512, - num_filters2=512, - num_groups=512, - stride=1, - scale=scale) - # 7x7 - tmp = depthwise_separable( - tmp, - num_filters1=512, - num_filters2=1024, - num_groups=512, - stride=2, - scale=scale) - - tmp = depthwise_separable( - tmp, - num_filters1=1024, - num_filters2=1024, - num_groups=1024, - stride=1, - scale=scale) - - tmp = fluid.layers.pool2d( - input=tmp, - pool_size=0, - pool_stride=1, - pool_type='avg', - global_pooling=True) - - tmp = fluid.layers.fc(input=tmp, - size=class_dim, - act='softmax', - param_attr=parameter_attr) - return tmp diff --git a/fluid/image_classification/models/__init__.py b/fluid/image_classification/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..34134fd01e8e5db99abdc375bc1aa94dcfe8b567 --- /dev/null +++ b/fluid/image_classification/models/__init__.py @@ -0,0 +1,8 @@ +from .alexnet import AlexNet +from .mobilenet import MobileNet +from .googlenet import GoogleNet +from .vgg import VGG11, VGG13, VGG16, VGG19 +from .resnet import ResNet50, ResNet101, ResNet152 +from .inception_v4 import InceptionV4 +from .se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_32x4d +from .dpn import DPN68, DPN92, DPN98, DPN107, DPN131 diff --git a/fluid/image_classification/models/alexnet.py b/fluid/image_classification/models/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b090f6bddbd8a73025fecde30c1296e078fdb222 --- /dev/null +++ b/fluid/image_classification/models/alexnet.py @@ -0,0 +1,147 @@ +import paddle +import paddle.fluid as fluid +import math + +__all__ = ['AlexNet'] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [40, 70, 100], + "steps": [0.01, 0.001, 0.0001, 0.00001] + } +} + + +class AlexNet(): + def __init__(self): + self.params = train_parameters + + def net(self, input, class_dim=1000): + stdv = 1.0 / math.sqrt(input.shape[1] * 11 * 11) + conv1 = fluid.layers.conv2d( + input=input, + num_filters=64, + filter_size=11, + stride=4, + padding=2, + groups=1, + act='relu', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + pool1 = fluid.layers.pool2d( + input=conv1, + pool_size=3, + pool_stride=2, + pool_padding=0, + pool_type='max') + + stdv = 1.0 / math.sqrt(pool1.shape[1] * 5 * 5) + conv2 = fluid.layers.conv2d( + input=pool1, + num_filters=192, + filter_size=5, + stride=1, + padding=2, + groups=1, + act='relu', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + pool2 = fluid.layers.pool2d( + input=conv2, + pool_size=3, + pool_stride=2, + pool_padding=0, + pool_type='max') + + stdv = 1.0 / math.sqrt(pool2.shape[1] * 3 * 3) + conv3 = fluid.layers.conv2d( + input=pool2, + num_filters=384, + filter_size=3, + stride=1, + padding=1, + groups=1, + act='relu', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + 
param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + stdv = 1.0 / math.sqrt(conv3.shape[1] * 3 * 3) + conv4 = fluid.layers.conv2d( + input=conv3, + num_filters=256, + filter_size=3, + stride=1, + padding=1, + groups=1, + act='relu', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + stdv = 1.0 / math.sqrt(conv4.shape[1] * 3 * 3) + conv5 = fluid.layers.conv2d( + input=conv4, + num_filters=256, + filter_size=3, + stride=1, + padding=1, + groups=1, + act='relu', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + pool5 = fluid.layers.pool2d( + input=conv5, + pool_size=3, + pool_stride=2, + pool_padding=0, + pool_type='max') + + drop6 = fluid.layers.dropout(x=pool5, dropout_prob=0.5) + + stdv = 1.0 / math.sqrt(drop6.shape[1] * drop6.shape[2] * + drop6.shape[3] * 1.0) + fc6 = fluid.layers.fc( + input=drop6, + size=4096, + act='relu', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + drop7 = fluid.layers.dropout(x=fc6, dropout_prob=0.5) + + stdv = 1.0 / math.sqrt(drop7.shape[1] * 1.0) + fc7 = fluid.layers.fc( + input=drop7, + size=4096, + act='relu', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + stdv = 1.0 / math.sqrt(fc7.shape[1] * 1.0) + out = fluid.layers.fc( + input=fc7, + size=class_dim, + act='softmax', + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + return out diff --git a/fluid/image_classification/models/dpn.py b/fluid/image_classification/models/dpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8f0680aad09025ba61aa352c9bc16766798e89d9 --- /dev/null +++ b/fluid/image_classification/models/dpn.py @@ -0,0 +1,281 @@ +import os +import numpy as np +import time +import sys +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers.control_flow as control_flow +import paddle.fluid.layers.nn as nn +import paddle.fluid.layers.tensor as tensor +import math + +__all__ = ["DPN", "DPN68", "DPN92", "DPN98", "DPN107", "DPN131"] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class DPN(object): + def __init__(self, layers=68): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + # get network args + args = self.get_net_args(self.layers) + bws = args['bw'] + inc_sec = args['inc_sec'] + rs = args['bw'] + k_r = args['k_r'] + k_sec = args['k_sec'] + G = args['G'] + init_num_filter = args['init_num_filter'] + init_filter_size = args['init_filter_size'] + init_padding = args['init_padding'] + + ## define Dual Path Network + + # conv1 + conv1_x_1 = fluid.layers.conv2d( + input=input, + num_filters=init_num_filter, + 
filter_size=init_filter_size, + stride=2, + padding=init_padding, + groups=1, + act=None, + bias_attr=False) + conv1_x_1 = fluid.layers.batch_norm( + input=conv1_x_1, act='relu', is_test=False) + convX_x_x = fluid.layers.pool2d( + input=conv1_x_1, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + #conv2 - conv5 + for gc in range(4): + bw = bws[gc] + inc = inc_sec[gc] + R = (k_r * bw) / rs[gc] + if gc == 0: + _type1 = 'proj' + _type2 = 'normal' + else: + _type1 = 'down' + _type2 = 'normal' + convX_x_x = self.dual_path_factory(convX_x_x, R, R, bw, inc, G, + _type1) + for i_ly in range(2, k_sec[gc] + 1): + convX_x_x = self.dual_path_factory(convX_x_x, R, R, bw, inc, G, + _type2) + + conv5_x_x = fluid.layers.concat(convX_x_x, axis=1) + conv5_x_x = fluid.layers.batch_norm( + input=conv5_x_x, act='relu', is_test=False) + pool5 = fluid.layers.pool2d( + input=conv5_x_x, + pool_size=7, + pool_stride=1, + pool_padding=0, + pool_type='avg') + + #stdv = 1.0 / math.sqrt(pool5.shape[1] * 1.0) + stdv = 0.01 + param_attr = fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)) + fc6 = fluid.layers.fc(input=pool5, + size=class_dim, + act='softmax', + param_attr=param_attr) + + return fc6 + + def get_net_args(self, layers): + if layers == 68: + k_r = 128 + G = 32 + k_sec = [3, 4, 12, 3] + inc_sec = [16, 32, 32, 64] + bw = [64, 128, 256, 512] + r = [64, 64, 64, 64] + init_num_filter = 10 + init_filter_size = 3 + init_padding = 1 + elif layers == 92: + k_r = 96 + G = 32 + k_sec = [3, 4, 20, 3] + inc_sec = [16, 32, 24, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 64 + init_filter_size = 7 + init_padding = 3 + elif layers == 98: + k_r = 160 + G = 40 + k_sec = [3, 6, 20, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 96 + init_filter_size = 7 + init_padding = 3 + elif layers == 107: + k_r = 200 + G = 50 + k_sec = [4, 8, 20, 3] + inc_sec = [20, 64, 64, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + elif layers == 131: + k_r = 160 + G = 40 + k_sec = [4, 8, 28, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + else: + raise NotImplementedError + net_arg = { + 'k_r': k_r, + 'G': G, + 'k_sec': k_sec, + 'inc_sec': inc_sec, + 'bw': bw, + 'r': r + } + net_arg['init_num_filter'] = init_num_filter + net_arg['init_filter_size'] = init_filter_size + net_arg['init_padding'] = init_padding + + return net_arg + + def dual_path_factory(self, + data, + num_1x1_a, + num_3x3_b, + num_1x1_c, + inc, + G, + _type='normal'): + kw = 3 + kh = 3 + pw = (kw - 1) / 2 + ph = (kh - 1) / 2 + + # type + if _type is 'proj': + key_stride = 1 + has_proj = True + if _type is 'down': + key_stride = 2 + has_proj = True + if _type is 'normal': + key_stride = 1 + has_proj = False + + # PROJ + if type(data) is list: + data_in = fluid.layers.concat([data[0], data[1]], axis=1) + else: + data_in = data + + if has_proj: + c1x1_w = self.bn_ac_conv( + data=data_in, + num_filter=(num_1x1_c + 2 * inc), + kernel=(1, 1), + pad=(0, 0), + stride=(key_stride, key_stride)) + data_o1, data_o2 = fluid.layers.split( + c1x1_w, num_or_sections=[num_1x1_c, 2 * inc], dim=1) + else: + data_o1 = data[0] + data_o2 = data[1] + + # MAIN + c1x1_a = self.bn_ac_conv( + data=data_in, num_filter=num_1x1_a, kernel=(1, 1), pad=(0, 0)) + c3x3_b = 
self.bn_ac_conv( + data=c1x1_a, + num_filter=num_3x3_b, + kernel=(kw, kh), + pad=(pw, ph), + stride=(key_stride, key_stride), + num_group=G) + c1x1_c = self.bn_ac_conv( + data=c3x3_b, + num_filter=(num_1x1_c + inc), + kernel=(1, 1), + pad=(0, 0)) + + c1x1_c1, c1x1_c2 = fluid.layers.split( + c1x1_c, num_or_sections=[num_1x1_c, inc], dim=1) + + # OUTPUTS + summ = fluid.layers.elementwise_add(x=data_o1, y=c1x1_c1) + dense = fluid.layers.concat([data_o2, c1x1_c2], axis=1) + + return [summ, dense] + + def bn_ac_conv(self, + data, + num_filter, + kernel, + pad, + stride=(1, 1), + num_group=1): + bn_ac = fluid.layers.batch_norm(input=data, act='relu', is_test=False) + bn_ac_conv = fluid.layers.conv2d( + input=bn_ac, + num_filters=num_filter, + filter_size=kernel, + stride=stride, + padding=pad, + groups=num_group, + act=None, + bias_attr=False) + return bn_ac_conv + + +def DPN68(): + model = DPN(layers=68) + return model + + +def DPN92(): + model = DPN(layers=92) + return model + + +def DPN98(): + model = DPN(layers=98) + return model + + +def DPN107(): + model = DPN(layers=107) + return model + + +def DPN131(): + model = DPN(layers=131) + return model diff --git a/fluid/image_classification/models/googlenet.py b/fluid/image_classification/models/googlenet.py new file mode 100644 index 0000000000000000000000000000000000000000..5f8043a2b69a8883f6662185921009e6b977ded9 --- /dev/null +++ b/fluid/image_classification/models/googlenet.py @@ -0,0 +1,164 @@ +import paddle +import paddle.fluid as fluid + +__all__ = ['GoogleNet'] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class GoogleNet(): + def __init__(self): + self.params = train_parameters + + def conv_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + channels = input.shape[1] + stdv = (3.0 / (filter_size**2 * channels))**0.5 + param_attr = fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)) + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=act, + param_attr=param_attr, + bias_attr=False) + return conv + + def xavier(self, channels, filter_size): + stdv = (3.0 / (filter_size**2 * channels))**0.5 + param_attr = fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)) + return param_attr + + def inception(self, name, input, channels, filter1, filter3R, filter3, + filter5R, filter5, proj): + conv1 = self.conv_layer( + input=input, num_filters=filter1, filter_size=1, stride=1, act=None) + conv3r = self.conv_layer( + input=input, + num_filters=filter3R, + filter_size=1, + stride=1, + act=None) + conv3 = self.conv_layer( + input=conv3r, + num_filters=filter3, + filter_size=3, + stride=1, + act=None) + conv5r = self.conv_layer( + input=input, + num_filters=filter5R, + filter_size=1, + stride=1, + act=None) + conv5 = self.conv_layer( + input=conv5r, + num_filters=filter5, + filter_size=5, + stride=1, + act=None) + pool = fluid.layers.pool2d( + input=input, + pool_size=3, + pool_stride=1, + pool_padding=1, + pool_type='max') + convprj = fluid.layers.conv2d( + input=pool, filter_size=1, num_filters=proj, stride=1, padding=0) + cat = fluid.layers.concat(input=[conv1, conv3, conv5, convprj], axis=1) + cat = 
fluid.layers.relu(cat) + return cat + + def net(self, input, class_dim=1000): + conv = self.conv_layer( + input=input, num_filters=64, filter_size=7, stride=2, act=None) + pool = fluid.layers.pool2d( + input=conv, pool_size=3, pool_type='max', pool_stride=2) + + conv = self.conv_layer( + input=pool, num_filters=64, filter_size=1, stride=1, act=None) + conv = self.conv_layer( + input=conv, num_filters=192, filter_size=3, stride=1, act=None) + pool = fluid.layers.pool2d( + input=conv, pool_size=3, pool_type='max', pool_stride=2) + + ince3a = self.inception("ince3a", pool, 192, 64, 96, 128, 16, 32, 32) + ince3b = self.inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, + 64) + pool3 = fluid.layers.pool2d( + input=ince3b, pool_size=3, pool_type='max', pool_stride=2) + + ince4a = self.inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64) + ince4b = self.inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, + 64) + ince4c = self.inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, + 64) + ince4d = self.inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, + 64) + ince4e = self.inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, + 128) + pool4 = fluid.layers.pool2d( + input=ince4e, pool_size=3, pool_type='max', pool_stride=2) + + ince5a = self.inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, + 128) + ince5b = self.inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, + 128) + pool5 = fluid.layers.pool2d( + input=ince5b, pool_size=7, pool_type='avg', pool_stride=7) + dropout = fluid.layers.dropout(x=pool5, dropout_prob=0.4) + out = fluid.layers.fc(input=dropout, + size=class_dim, + act='softmax', + param_attr=self.xavier(1024, 1)) + + pool_o1 = fluid.layers.pool2d( + input=ince4a, pool_size=5, pool_type='avg', pool_stride=3) + conv_o1 = self.conv_layer( + input=pool_o1, num_filters=128, filter_size=1, stride=1, act=None) + fc_o1 = fluid.layers.fc(input=conv_o1, + size=1024, + act='relu', + param_attr=self.xavier(2048, 1)) + dropout_o1 = fluid.layers.dropout(x=fc_o1, dropout_prob=0.7) + out1 = fluid.layers.fc(input=dropout_o1, + size=class_dim, + act='softmax', + param_attr=self.xavier(1024, 1)) + + pool_o2 = fluid.layers.pool2d( + input=ince4d, pool_size=5, pool_type='avg', pool_stride=3) + conv_o2 = self.conv_layer( + input=pool_o2, num_filters=128, filter_size=1, stride=1, act=None) + fc_o2 = fluid.layers.fc(input=conv_o2, + size=1024, + act='relu', + param_attr=self.xavier(2048, 1)) + dropout_o2 = fluid.layers.dropout(x=fc_o2, dropout_prob=0.7) + out2 = fluid.layers.fc(input=dropout_o2, + size=class_dim, + act='softmax', + param_attr=self.xavier(1024, 1)) + + # last fc layer is "out" + return out, out1, out2 diff --git a/fluid/image_classification/models/inception_v4.py b/fluid/image_classification/models/inception_v4.py new file mode 100644 index 0000000000000000000000000000000000000000..51c7266f69571b4f38d5a68b4de7838d72afc2b9 --- /dev/null +++ b/fluid/image_classification/models/inception_v4.py @@ -0,0 +1,204 @@ +import paddle +import paddle.fluid as fluid +import math + +__all__ = ['InceptionV4'] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class InceptionV4(): + def __init__(self): + self.params = train_parameters + + def net(self, input, class_dim=1000): + x = self.inception_stem(input) + + for i in range(4): + x = 
self.inceptionA(x) + x = self.reductionA(x) + + for i in range(7): + x = self.inceptionB(x) + x = self.reductionB(x) + + for i in range(3): + x = self.inceptionC(x) + + pool = fluid.layers.pool2d( + input=x, pool_size=8, pool_type='avg', global_pooling=True) + + drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) + + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc( + input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + return out + + def conv_bn_layer(self, + data, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act='relu'): + conv = fluid.layers.conv2d( + input=data, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def inception_stem(self, data): + conv = self.conv_bn_layer(data, 32, 3, stride=2, act='relu') + conv = self.conv_bn_layer(conv, 32, 3, act='relu') + conv = self.conv_bn_layer(conv, 64, 3, padding=1, act='relu') + + pool1 = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_type='max') + conv2 = self.conv_bn_layer(conv, 96, 3, stride=2, act='relu') + concat = fluid.layers.concat([pool1, conv2], axis=1) + + conv1 = self.conv_bn_layer(concat, 64, 1, act='relu') + conv1 = self.conv_bn_layer(conv1, 96, 3, act='relu') + + conv2 = self.conv_bn_layer(concat, 64, 1, act='relu') + conv2 = self.conv_bn_layer( + conv2, 64, (7, 1), padding=(3, 0), act='relu') + conv2 = self.conv_bn_layer( + conv2, 64, (1, 7), padding=(0, 3), act='relu') + conv2 = self.conv_bn_layer(conv2, 96, 3, act='relu') + + concat = fluid.layers.concat([conv1, conv2], axis=1) + + conv1 = self.conv_bn_layer(concat, 192, 3, stride=2, act='relu') + pool1 = fluid.layers.pool2d( + input=concat, pool_size=3, pool_stride=2, pool_type='max') + + concat = fluid.layers.concat([conv1, pool1], axis=1) + + return concat + + def inceptionA(self, data): + pool1 = fluid.layers.pool2d( + input=data, pool_size=3, pool_padding=1, pool_type='avg') + conv1 = self.conv_bn_layer(pool1, 96, 1, act='relu') + + conv2 = self.conv_bn_layer(data, 96, 1, act='relu') + + conv3 = self.conv_bn_layer(data, 64, 1, act='relu') + conv3 = self.conv_bn_layer(conv3, 96, 3, padding=1, act='relu') + + conv4 = self.conv_bn_layer(data, 64, 1, act='relu') + conv4 = self.conv_bn_layer(conv4, 96, 3, padding=1, act='relu') + conv4 = self.conv_bn_layer(conv4, 96, 3, padding=1, act='relu') + + concat = fluid.layers.concat([conv1, conv2, conv3, conv4], axis=1) + + return concat + + def reductionA(self, data): + pool1 = fluid.layers.pool2d( + input=data, pool_size=3, pool_stride=2, pool_type='max') + + conv2 = self.conv_bn_layer(data, 384, 3, stride=2, act='relu') + + conv3 = self.conv_bn_layer(data, 192, 1, act='relu') + conv3 = self.conv_bn_layer(conv3, 224, 3, padding=1, act='relu') + conv3 = self.conv_bn_layer(conv3, 256, 3, stride=2, act='relu') + + concat = fluid.layers.concat([pool1, conv2, conv3], axis=1) + + return concat + + def inceptionB(self, data): + pool1 = fluid.layers.pool2d( + input=data, pool_size=3, pool_padding=1, pool_type='avg') + conv1 = self.conv_bn_layer(pool1, 128, 1, act='relu') + + conv2 = self.conv_bn_layer(data, 384, 1, act='relu') + + conv3 = self.conv_bn_layer(data, 192, 1, act='relu') + conv3 = self.conv_bn_layer( + conv3, 224, (1, 7), padding=(0, 3), act='relu') + conv3 = self.conv_bn_layer( + conv3, 256, (7, 1), padding=(3, 0), act='relu') + 
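+        # The (1, 7)/(7, 1) pairs factorize a 7x7 convolution into two
+        # asymmetric ones: mapping C_in to C_out channels through C_mid costs
+        # 7*C_in*C_mid + 7*C_mid*C_out weights instead of 49*C_in*C_out,
+        # while covering the same receptive field. The fourth branch below
+        # applies the same factorization twice.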
+ conv4 = self.conv_bn_layer(data, 192, 1, act='relu') + conv4 = self.conv_bn_layer( + conv4, 192, (1, 7), padding=(0, 3), act='relu') + conv4 = self.conv_bn_layer( + conv4, 224, (7, 1), padding=(3, 0), act='relu') + conv4 = self.conv_bn_layer( + conv4, 224, (1, 7), padding=(0, 3), act='relu') + conv4 = self.conv_bn_layer( + conv4, 256, (7, 1), padding=(3, 0), act='relu') + + concat = fluid.layers.concat([conv1, conv2, conv3, conv4], axis=1) + + return concat + + def reductionB(self, data): + pool1 = fluid.layers.pool2d( + input=data, pool_size=3, pool_stride=2, pool_type='max') + + conv2 = self.conv_bn_layer(data, 192, 1, act='relu') + conv2 = self.conv_bn_layer(conv2, 192, 3, stride=2, act='relu') + + conv3 = self.conv_bn_layer(data, 256, 1, act='relu') + conv3 = self.conv_bn_layer( + conv3, 256, (1, 7), padding=(0, 3), act='relu') + conv3 = self.conv_bn_layer( + conv3, 320, (7, 1), padding=(3, 0), act='relu') + conv3 = self.conv_bn_layer(conv3, 320, 3, stride=2, act='relu') + + concat = fluid.layers.concat([pool1, conv2, conv3], axis=1) + + return concat + + def inceptionC(self, data): + pool1 = fluid.layers.pool2d( + input=data, pool_size=3, pool_padding=1, pool_type='avg') + conv1 = self.conv_bn_layer(pool1, 256, 1, act='relu') + + conv2 = self.conv_bn_layer(data, 256, 1, act='relu') + + conv3 = self.conv_bn_layer(data, 384, 1, act='relu') + conv3_1 = self.conv_bn_layer( + conv3, 256, (1, 3), padding=(0, 1), act='relu') + conv3_2 = self.conv_bn_layer( + conv3, 256, (3, 1), padding=(1, 0), act='relu') + + conv4 = self.conv_bn_layer(data, 384, 1, act='relu') + conv4 = self.conv_bn_layer( + conv4, 448, (1, 3), padding=(0, 1), act='relu') + conv4 = self.conv_bn_layer( + conv4, 512, (3, 1), padding=(1, 0), act='relu') + conv4_1 = self.conv_bn_layer( + conv4, 256, (1, 3), padding=(0, 1), act='relu') + conv4_2 = self.conv_bn_layer( + conv4, 256, (3, 1), padding=(1, 0), act='relu') + + concat = fluid.layers.concat( + [conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2], axis=1) + + return concat diff --git a/fluid/image_classification/models/learning_rate.py b/fluid/image_classification/models/learning_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c137e6db8f5ecaac7f7b43d6537e16d2ae03c8 --- /dev/null +++ b/fluid/image_classification/models/learning_rate.py @@ -0,0 +1,19 @@ +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers.ops as ops +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +import math + + +def cosine_decay(learning_rate, step_each_epoch, epochs=120): + """Applies cosine decay to the learning rate. 
+ decayed_lr = learning_rate * (math.cos(epoch * (math.pi / epochs)) + 1) / 2 + """ + global_step = _decay_step_counter() + + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * \ + (ops.cos(epoch * (math.pi / epochs)) + 1)/2 + return decayed_lr diff --git a/fluid/image_classification/models/mobilenet.py b/fluid/image_classification/models/mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..bae564fc31f19b4b9abb9c0e7c3c1488ab7b9219 --- /dev/null +++ b/fluid/image_classification/models/mobilenet.py @@ -0,0 +1,164 @@ +import paddle.fluid as fluid +from paddle.fluid.initializer import MSRA +from paddle.fluid.param_attr import ParamAttr + +__all__ = ['MobileNet'] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class MobileNet(): + def __init__(self): + self.params = train_parameters + + def net(self, input, class_dim=1000, scale=1.0): + # conv1: 112x112 + input = self.conv_bn_layer( + input, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + # 56x56 + input = self.depthwise_separable( + input, + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + + input = self.depthwise_separable( + input, + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=2, + scale=scale) + + # 28x28 + input = self.depthwise_separable( + input, + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + + input = self.depthwise_separable( + input, + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=2, + scale=scale) + + # 14x14 + input = self.depthwise_separable( + input, + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + + input = self.depthwise_separable( + input, + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=2, + scale=scale) + + # 14x14 + for i in range(5): + input = self.depthwise_separable( + input, + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + scale=scale) + # 7x7 + input = self.depthwise_separable( + input, + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=2, + scale=scale) + + input = self.depthwise_separable( + input, + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=1, + scale=scale) + + input = fluid.layers.pool2d( + input=input, + pool_size=0, + pool_stride=1, + pool_type='avg', + global_pooling=True) + + output = fluid.layers.fc(input=input, + size=class_dim, + act='softmax', + param_attr=ParamAttr(initializer=MSRA())) + return output + + def conv_bn_layer(self, + input, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='relu', + use_cudnn=True): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + act=None, + use_cudnn=use_cudnn, + param_attr=ParamAttr(initializer=MSRA()), + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def depthwise_separable(self, input, num_filters1, num_filters2, num_groups, + stride, scale): + depthwise_conv = self.conv_bn_layer( + input=input, + filter_size=3, + num_filters=int(num_filters1 * scale), + stride=stride, + padding=1, + num_groups=int(num_groups * scale), + use_cudnn=False) +
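+        # Depthwise separable convolution: the 3x3 depthwise step above uses
+        # groups equal to its channel count, so each filter sees a single input
+        # channel; the 1x1 pointwise step below then mixes information across
+        # channels. Relative to a dense 3x3 convolution this costs roughly a
+        # factor of (1/num_filters2 + 1/9) in multiply-accumulates, the estimate
+        # from the MobileNet paper. use_cudnn is disabled for the depthwise
+        # step, presumably to route this grouped case to the plain implementation.
+        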
pointwise_conv = self.conv_bn_layer( + input=depthwise_conv, + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + return pointwise_conv diff --git a/fluid/image_classification/models/resnet.py b/fluid/image_classification/models/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b3a6956222173aba8d7350e30e6c5344a7317682 --- /dev/null +++ b/fluid/image_classification/models/resnet.py @@ -0,0 +1,120 @@ +import paddle +import paddle.fluid as fluid +import math + +__all__ = ["ResNet", "ResNet50", "ResNet101", "ResNet152"] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class ResNet(): + def __init__(self, layers=50): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = fluid.layers.fc(input=pool, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride) + else: + return input + + def bottleneck_block(self, input, num_filters, stride): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None) + + short = self.shortcut(input, num_filters * 4, stride) + + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def ResNet50(): + model = ResNet(layers=50) + return model + + +def ResNet101(): + model = ResNet(layers=101) + return model + + +def ResNet152(): + model = ResNet(layers=152) + return model diff --git a/fluid/image_classification/models/se_resnext.py b/fluid/image_classification/models/se_resnext.py new file mode 100644 index 
0000000000000000000000000000000000000000..2cef2ef6bd09b3d46ac7533496b0d14e3513a5f8 --- /dev/null +++ b/fluid/image_classification/models/se_resnext.py @@ -0,0 +1,195 @@ +import paddle +import paddle.fluid as fluid +import math + +__all__ = [ + "SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d", + "SE_ResNeXt152_32x4d" +] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class SE_ResNeXt(): + def __init__(self, layers=50): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + conv = self.conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = self.conv_bn_layer( + input=conv, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + drop = fluid.layers.dropout(x=pool, dropout_prob=0.5) + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + filter_size = 1 + return self.conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + def bottleneck_block(self, input, num_filters, stride, cardinality, + reduction_ratio): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = self.squeeze_excitation( + input=conv2, + num_channels=num_filters * 2, + 
reduction_ratio=reduction_ratio) + + short = self.shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def squeeze_excitation(self, input, num_channels, reduction_ratio): + pool = fluid.layers.pool2d( + input=input, pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform( + -stdv, stdv))) + stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform( + -stdv, stdv))) + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def SE_ResNeXt50_32x4d(): + model = SE_ResNeXt(layers=50) + return model + + +def SE_ResNeXt101_32x4d(): + model = SE_ResNeXt(layers=101) + return model + + +def SE_ResNeXt152_32x4d(): + model = SE_ResNeXt(layers=152) + return model diff --git a/fluid/image_classification/models/vgg.py b/fluid/image_classification/models/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6f8bd171902d0ea1d27a00a66dce2722f745c5 --- /dev/null +++ b/fluid/image_classification/models/vgg.py @@ -0,0 +1,107 @@ +import paddle +import paddle.fluid as fluid + +__all__ = ["VGGNet", "VGG11", "VGG13", "VGG16", "VGG19"] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class VGGNet(): + def __init__(self, layers=16): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + vgg_spec = { + 11: ([1, 1, 2, 2, 2]), + 13: ([2, 2, 2, 2, 2]), + 16: ([2, 2, 3, 3, 3]), + 19: ([2, 2, 4, 4, 4]) + } + assert layers in vgg_spec.keys(), \ + "supported layers are {} but input layer is {}".format(vgg_spec.keys(), layers) + + nums = vgg_spec[layers] + conv1 = self.conv_block(input, 64, nums[0]) + conv2 = self.conv_block(conv1, 128, nums[1]) + conv3 = self.conv_block(conv2, 256, nums[2]) + conv4 = self.conv_block(conv3, 512, nums[3]) + conv5 = self.conv_block(conv4, 512, nums[4]) + + fc_dim = 4096 + fc1 = fluid.layers.fc( + input=conv5, + size=fc_dim, + act='relu', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Normal(scale=0.005)), + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + fc1 = fluid.layers.dropout(x=fc1, dropout_prob=0.5) + fc2 = fluid.layers.fc( + input=fc1, + size=fc_dim, + act='relu', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Normal(scale=0.005)), + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + fc2 = fluid.layers.dropout(x=fc2, dropout_prob=0.5) + out = fluid.layers.fc( + input=fc2, + size=class_dim, + act='softmax', + 
param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Normal(scale=0.005)), + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + return out + + def conv_block(self, input, num_filter, groups): + conv = input + for i in range(groups): + conv = fluid.layers.conv2d( + input=conv, + num_filters=num_filter, + filter_size=3, + stride=1, + padding=1, + act='relu', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Normal(scale=0.01)), + bias_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.0))) + return fluid.layers.pool2d( + input=conv, pool_size=2, pool_type='max', pool_stride=2) + + +def VGG11(): + model = VGGNet(layers=11) + return model + + +def VGG13(): + model = VGGNet(layers=13) + return model + + +def VGG16(): + model = VGGNet(layers=16) + return model + + +def VGG19(): + model = VGGNet(layers=19) + return model diff --git a/fluid/image_classification/reader.py b/fluid/image_classification/reader.py index a5a8f45dc75695253ae012f26bc39c73fec68775..b503b67ce09fba80bc49a07665ba0290e75f1ed1 100644 --- a/fluid/image_classification/reader.py +++ b/fluid/image_classification/reader.py @@ -11,7 +11,7 @@ random.seed(0) DATA_DIM = 224 THREAD = 8 -BUF_SIZE = 1024 +BUF_SIZE = 102400 DATA_DIR = 'data/ILSVRC2012' TRAIN_LIST = 'data/ILSVRC2012/train_list.txt' @@ -105,7 +105,7 @@ def process_image(sample, mode, color_jitter, rotate): if rotate: img = rotate_image(img) img = random_crop(img, DATA_DIM) else: - img = resize_short(img, DATA_DIM) + img = resize_short(img, target_size=256) img = crop_image(img, target_size=DATA_DIM, center=True) if mode == 'train': if color_jitter: @@ -120,9 +120,9 @@ def process_image(sample, mode, color_jitter, rotate): img -= img_mean img /= img_std - if mode == 'train' or mode == 'test': + if mode == 'train' or mode == 'val': return img, sample[1] - elif mode == 'infer': + elif mode == 'test': return [img] @@ -137,11 +137,11 @@ def _reader_creator(file_list, if shuffle: random.shuffle(lines) for line in lines: - if mode == 'train' or mode == 'test': + if mode == 'train' or mode == 'val': img_path, label = line.split() img_path = os.path.join(DATA_DIR, img_path) yield img_path, int(label) - elif mode == 'infer': + elif mode == 'test': img_path = os.path.join(DATA_DIR, line) yield [img_path] @@ -156,9 +156,9 @@ def train(file_list=TRAIN_LIST): file_list, 'train', shuffle=True, color_jitter=False, rotate=False) -def test(file_list=TEST_LIST): - return _reader_creator(file_list, 'test', shuffle=False) +def val(file_list=TEST_LIST): + return _reader_creator(file_list, 'val', shuffle=False) -def infer(file_list): - return _reader_creator(file_list, 'infer', shuffle=False) +def test(file_list): + return _reader_creator(file_list, 'test', shuffle=False) diff --git a/fluid/image_classification/se_resnext.py b/fluid/image_classification/se_resnext.py deleted file mode 100644 index ad533c756fd3ef92f2a75519949ba28da32704a0..0000000000000000000000000000000000000000 --- a/fluid/image_classification/se_resnext.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import numpy as np -import time -import sys -import paddle -import paddle.fluid as fluid -import reader -import paddle.fluid.layers.control_flow as control_flow -import paddle.fluid.layers.nn as nn -import paddle.fluid.layers.tensor as tensor -import math - - -def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, - act=None): - conv = fluid.layers.conv2d( - input=input, - num_filters=num_filters, - 
filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) / 2, - groups=groups, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv, act=act) - - -def squeeze_excitation(input, num_channels, reduction_ratio): - pool = fluid.layers.pool2d( - input=input, pool_size=0, pool_type='avg', global_pooling=True) - stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - squeeze = fluid.layers.fc(input=pool, - size=num_channels / reduction_ratio, - act='relu', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, - stdv))) - stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) - excitation = fluid.layers.fc(input=squeeze, - size=num_channels, - act='sigmoid', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform( - -stdv, stdv))) - scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) - return scale - - -def shortcut(input, ch_out, stride): - ch_in = input.shape[1] - if ch_in != ch_out or stride != 1: - filter_size = 1 - return conv_bn_layer(input, ch_out, filter_size, stride) - else: - return input - - -def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): - conv0 = conv_bn_layer( - input=input, num_filters=num_filters, filter_size=1, act='relu') - conv1 = conv_bn_layer( - input=conv0, - num_filters=num_filters, - filter_size=3, - stride=stride, - groups=cardinality, - act='relu') - conv2 = conv_bn_layer( - input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) - scale = squeeze_excitation( - input=conv2, - num_channels=num_filters * 2, - reduction_ratio=reduction_ratio) - - short = shortcut(input, num_filters * 2, stride) - - return fluid.layers.elementwise_add(x=short, y=scale, act='relu') - - -def SE_ResNeXt(input, class_dim, infer=False, layers=50): - supported_layers = [50, 152] - if layers not in supported_layers: - print("supported layers are", supported_layers, \ - "but input layer is ", layers) - exit() - if layers == 50: - cardinality = 32 - reduction_ratio = 16 - depth = [3, 4, 6, 3] - num_filters = [128, 256, 512, 1024] - - conv = conv_bn_layer( - input=input, num_filters=64, filter_size=7, stride=2, act='relu') - conv = fluid.layers.pool2d( - input=conv, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - elif layers == 152: - cardinality = 64 - reduction_ratio = 16 - depth = [3, 8, 36, 3] - num_filters = [128, 256, 512, 1024] - - conv = conv_bn_layer( - input=input, num_filters=64, filter_size=3, stride=2, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=64, filter_size=3, stride=1, act='relu') - conv = conv_bn_layer( - input=conv, num_filters=128, filter_size=3, stride=1, act='relu') - conv = fluid.layers.pool2d( - input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ - pool_type='max') - - for block in range(len(depth)): - for i in range(depth[block]): - conv = bottleneck_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1, - cardinality=cardinality, - reduction_ratio=reduction_ratio) - - pool = fluid.layers.pool2d( - input=conv, pool_size=7, pool_type='avg', global_pooling=True) - if not infer: - drop = fluid.layers.dropout(x=pool, dropout_prob=0.5) - else: - drop = pool - stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) - out = fluid.layers.fc(input=drop, - size=class_dim, - act='softmax', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, - stdv))) - return out diff --git a/fluid/image_classification/train.py 
b/fluid/image_classification/train.py index 84e05ecd77e038756503d61fa7dd3b081b682004..74588e21c93e40ee7f5bcde7d6cbbc7c873278ba 100644 --- a/fluid/image_classification/train.py +++ b/fluid/image_classification/train.py @@ -4,278 +4,145 @@ import time import sys import paddle import paddle.fluid as fluid -from se_resnext import SE_ResNeXt -from mobilenet import mobile_net -from inception_v4 import inception_v4 +import models import reader import argparse import functools -import paddle.fluid.layers.ops as ops +from models.learning_rate import cosine_decay from utility import add_arguments, print_arguments -from paddle.fluid.initializer import init_on_cpu -from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter import math parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) -add_arg('batch_size', int, 256, "Minibatch size.") -add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.") -add_arg('with_mem_opt', bool, True, - "Whether to use memory optimization or not.") -add_arg('parallel_exe', bool, True, - "Whether to use ParallelExecutor to train or not.") -add_arg('init_model', str, None, "Whether to use initialized model.") -add_arg('pretrained_model', str, None, "Whether to use pretrained model.") -add_arg('lr_strategy', str, "cosine_decay", - "Set the learning rate decay strategy.") -add_arg('model', str, "se_resnext", "Set the network to use.") - - -def cosine_decay(learning_rate, step_each_epoch, epochs=120): - """Applies cosine decay to the learning rate. - lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) - """ - global_step = _decay_step_counter() - - with init_on_cpu(): - epoch = ops.floor(global_step / step_each_epoch) - decayed_lr = learning_rate * \ - (ops.cos(epoch * (math.pi / epochs)) + 1)/2 - return decayed_lr - - -def train_parallel_do(args, - learning_rate, - batch_size, - num_passes, - init_model=None, - pretrained_model=None, - model_save_dir='model', - parallel=True, - use_nccl=True, - lr_strategy=None, - layers=50): - class_dim = 1000 - image_shape = [3, 224, 224] - - image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - if parallel: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) - - with pd.do(): - image_ = pd.read_input(image) - label_ = pd.read_input(label) - if args.model is 'se_resnext': - out = SE_ResNeXt( - input=image_, class_dim=class_dim, layers=layers) - elif args.model is 'mobile_net': - out = mobile_net(img=image_, class_dim=class_dim) - else: - out = inception_v4(img=image_, class_dim=class_dim) - - cost = fluid.layers.cross_entropy(input=out, label=label_) - avg_cost = fluid.layers.mean(x=cost) - acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5) - pd.write_output(avg_cost) - pd.write_output(acc_top1) - pd.write_output(acc_top5) - - avg_cost, acc_top1, acc_top5 = pd() - avg_cost = fluid.layers.mean(x=avg_cost) - acc_top1 = fluid.layers.mean(x=acc_top1) - acc_top5 = fluid.layers.mean(x=acc_top5) - else: - if args.model is 'se_resnext': - out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) - elif args.model is 'mobile_net': - out = mobile_net(img=image, class_dim=class_dim) +# yapf: disable +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('use_gpu', bool, True, "Whether to use GPU or not.") +add_arg('total_images', int, 
1281167, "Training image number.") +add_arg('num_epochs', int, 120, "number of epochs.") +add_arg('class_dim', int, 1000, "Class number.") +add_arg('image_shape', str, "3,224,224", "input image size") +add_arg('model_save_dir', str, "output", "model save directory") +add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.") +add_arg('pretrained_model', str, None, "Whether to use pretrained model.") +add_arg('checkpoint', str, None, "Whether to resume checkpoint.") +add_arg('lr', float, 0.1, "set learning rate.") +add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.") +add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.") +# yapf: enable + +model_list = [m for m in dir(models) if "__" not in m] + + +def optimizer_setting(params): + ls = params["learning_strategy"] + + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 1281167 else: - out = inception_v4(img=image, class_dim=class_dim) - - cost = fluid.layers.cross_entropy(input=out, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + total_images = params["total_images"] - inference_program = fluid.default_main_program().clone(for_test=True) + batch_size = ls["batch_size"] + step = int(total_images / batch_size + 1) - if "piecewise_decay" in lr_strategy: - bd = lr_strategy["piecewise_decay"]["bd"] - lr = lr_strategy["piecewise_decay"]["lr"] + bd = [step * e for e in ls["epochs"]] + base_lr = params["lr"] + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( learning_rate=fluid.layers.piecewise_decay( boundaries=bd, values=lr), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) - elif "cosine_decay" in lr_strategy: - step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"] - epochs = lr_strategy["cosine_decay"]["epochs"] + elif ls["name"] == "cosine_decay": + if "total_images" not in params: + total_images = 1281167 + else: + total_images = params["total_images"] + + batch_size = ls["batch_size"] + step = int(total_images / batch_size + 1) + + lr = params["lr"] + num_epochs = params["num_epochs"] + optimizer = fluid.optimizer.Momentum( learning_rate=cosine_decay( - learning_rate=learning_rate, - step_each_epoch=step_each_epoch, - epochs=epochs), + learning_rate=lr, step_each_epoch=step, epochs=num_epochs), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) else: + lr = params["lr"] optimizer = fluid.optimizer.Momentum( - learning_rate=learning_rate, + learning_rate=lr, momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) - opts = optimizer.minimize(avg_cost) - if args.with_mem_opt: - fluid.memory_optimize(fluid.default_main_program()) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - if init_model is not None: - fluid.io.load_persistables(exe, init_model) - - if pretrained_model: - - def if_exist(var): - return os.path.exists(os.path.join(pretrained_model, var.name)) - - fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) - - train_reader = paddle.batch(reader.train(), batch_size=batch_size) - test_reader = paddle.batch(reader.test(), batch_size=batch_size) - feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) - - for pass_id in range(num_passes): - train_info = [[], [], []] - test_info = [[], [], []] - for batch_id, data in 
enumerate(train_reader()): - t1 = time.time() - loss, acc1, acc5 = exe.run( - fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost, acc_top1, acc_top5]) - t2 = time.time() - period = t2 - t1 - train_info[0].append(loss[0]) - train_info[1].append(acc1[0]) - train_info[2].append(acc5[0]) - if batch_id % 10 == 0: - print("Pass {0}, trainbatch {1}, loss {2}, \ - acc1 {3}, acc5 {4} time {5}" - .format(pass_id, \ - batch_id, loss[0], acc1[0], acc5[0], \ - "%2.2f sec" % period)) - sys.stdout.flush() - - train_loss = np.array(train_info[0]).mean() - train_acc1 = np.array(train_info[1]).mean() - train_acc5 = np.array(train_info[2]).mean() - for data in test_reader(): - t1 = time.time() - loss, acc1, acc5 = exe.run( - inference_program, - feed=feeder.feed(data), - fetch_list=[avg_cost, acc_top1, acc_top5]) - t2 = time.time() - period = t2 - t1 - test_info[0].append(loss[0]) - test_info[1].append(acc1[0]) - test_info[2].append(acc5[0]) - if batch_id % 10 == 0: - print("Pass {0},testbatch {1},loss {2}, \ - acc1 {3},acc5 {4},time {5}" - .format(pass_id, \ - batch_id, loss[0], acc1[0], acc5[0], \ - "%2.2f sec" % period)) - sys.stdout.flush() - - test_loss = np.array(test_info[0]).mean() - test_acc1 = np.array(test_info[1]).mean() - test_acc5 = np.array(test_info[2]).mean() + return optimizer - print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ - test_loss {4}, test_acc1 {5}, test_acc5 {6}" - .format(pass_id, \ - train_loss, train_acc1, train_acc5, test_loss, test_acc1, \ - test_acc5)) - sys.stdout.flush() - - model_path = os.path.join(model_save_dir + '/' + args.model, - str(pass_id)) - if not os.path.isdir(model_path): - os.makedirs(model_path) - fluid.io.save_persistables(exe, model_path) +def train(args): + # parameters from arguments + class_dim = args.class_dim + model_name = args.model + checkpoint = args.checkpoint + pretrained_model = args.pretrained_model + with_memory_optimization = args.with_mem_opt + model_save_dir = args.model_save_dir + image_shape = [int(m) for m in args.image_shape.split(",")] -def train_parallel_exe(args, - learning_rate, - batch_size, - num_passes, - init_model=None, - pretrained_model=None, - model_save_dir='model', - parallel=True, - use_nccl=True, - lr_strategy=None, - layers=50): - class_dim = 1000 - image_shape = [3, 224, 224] + assert model_name in model_list, "{} is not in lists: {}".format(args.model, + model_list) image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - if args.model is 'se_resnext': - out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) - elif args.model is 'mobile_net': - out = mobile_net(img=image, class_dim=class_dim) + + # model definition + model = models.__dict__[model_name]() + + if model_name == "GoogleNet": + out0, out1, out2 = model.net(input=image, class_dim=class_dim) + cost0 = fluid.layers.cross_entropy(input=out0, label=label) + cost1 = fluid.layers.cross_entropy(input=out1, label=label) + cost2 = fluid.layers.cross_entropy(input=out2, label=label) + avg_cost0 = fluid.layers.mean(x=cost0) + avg_cost1 = fluid.layers.mean(x=cost1) + avg_cost2 = fluid.layers.mean(x=cost2) + + avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2 + acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5) else: - out = inception_v4(img=image, class_dim=class_dim) + out = model.net(input=image, class_dim=class_dim) + cost = 
fluid.layers.cross_entropy(input=out, label=label) - cost = fluid.layers.cross_entropy(input=out, label=label) - acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) test_program = fluid.default_main_program().clone(for_test=True) - if "piecewise_decay" in lr_strategy: - bd = lr_strategy["piecewise_decay"]["bd"] - lr = lr_strategy["piecewise_decay"]["lr"] - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - elif "cosine_decay" in lr_strategy: - step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"] - epochs = lr_strategy["cosine_decay"]["epochs"] - optimizer = fluid.optimizer.Momentum( - learning_rate=cosine_decay( - learning_rate=learning_rate, - step_each_epoch=step_each_epoch, - epochs=epochs), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - else: - optimizer = fluid.optimizer.Momentum( - learning_rate=learning_rate, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) + # parameters from model and arguments + params = model.params + params["total_images"] = args.total_images + params["lr"] = args.lr + params["num_epochs"] = args.num_epochs + params["learning_strategy"]["batch_size"] = args.batch_size + params["learning_strategy"]["name"] = args.lr_strategy + # initialize optimizer + optimizer = optimizer_setting(params) opts = optimizer.minimize(avg_cost) - if args.with_mem_opt: + if with_memory_optimization: fluid.memory_optimize(fluid.default_main_program()) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - if init_model is not None: - fluid.io.load_persistables(exe, init_model) + if checkpoint is not None: + fluid.io.load_persistables(exe, checkpoint) if pretrained_model: @@ -284,18 +151,17 @@ def train_parallel_exe(args, fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) - train_reader = paddle.batch(reader.train(), batch_size=batch_size) - test_reader = paddle.batch(reader.test(), batch_size=batch_size) - + train_batch_size = args.batch_size + test_batch_size = 16 + train_reader = paddle.batch(reader.train(), batch_size=train_batch_size) + test_reader = paddle.batch(reader.val(), batch_size=test_batch_size) feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) - test_exe = fluid.ParallelExecutor( - use_cuda=True, main_program=test_program, share_vars_from=train_exe) fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] - for pass_id in range(num_passes): + for pass_id in range(params["num_epochs"]): train_info = [[], [], []] test_info = [[], [], []] for batch_id, data in enumerate(train_reader()): @@ -320,81 +186,51 @@ def train_parallel_exe(args, train_loss = np.array(train_info[0]).mean() train_acc1 = np.array(train_info[1]).mean() train_acc5 = np.array(train_info[2]).mean() - for data in test_reader(): + cnt = 0 + for test_batch_id, data in enumerate(test_reader()): t1 = time.time() - loss, acc1, acc5 = test_exe.run(fetch_list, feed=feeder.feed(data)) + loss, acc1, acc5 = exe.run(test_program, + fetch_list=fetch_list, + 
feed=feeder.feed(data)) t2 = time.time() period = t2 - t1 - loss = np.mean(np.array(loss)) - acc1 = np.mean(np.array(acc1)) - acc5 = np.mean(np.array(acc5)) - test_info[0].append(loss) - test_info[1].append(acc1) - test_info[2].append(acc5) - if batch_id % 10 == 0: + loss = np.mean(loss) + acc1 = np.mean(acc1) + acc5 = np.mean(acc5) + test_info[0].append(loss * len(data)) + test_info[1].append(acc1 * len(data)) + test_info[2].append(acc5 * len(data)) + cnt += len(data) + if test_batch_id % 10 == 0: print("Pass {0},testbatch {1},loss {2}, \ acc1 {3},acc5 {4},time {5}" .format(pass_id, \ - batch_id, loss, acc1, acc5, \ + test_batch_id, loss, acc1, acc5, \ "%2.2f sec" % period)) sys.stdout.flush() - test_loss = np.array(test_info[0]).mean() - test_acc1 = np.array(test_info[1]).mean() - test_acc5 = np.array(test_info[2]).mean() + test_loss = np.sum(test_info[0]) / cnt + test_acc1 = np.sum(test_info[1]) / cnt + test_acc5 = np.sum(test_info[2]) / cnt - print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ - test_loss {4}, test_acc1 {5}, test_acc5 {6}" - .format(pass_id, \ + print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, " + "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(pass_id, \ train_loss, train_acc1, train_acc5, test_loss, test_acc1, \ test_acc5)) sys.stdout.flush() - model_path = os.path.join(model_save_dir + '/' + args.model, + model_path = os.path.join(model_save_dir + '/' + model_name, str(pass_id)) if not os.path.isdir(model_path): os.makedirs(model_path) fluid.io.save_persistables(exe, model_path) -if __name__ == '__main__': +def main(): args = parser.parse_args() print_arguments(args) + train(args) - total_images = 1281167 - batch_size = args.batch_size - step = int(total_images / batch_size + 1) - num_epochs = 120 - - learning_rate_mode = args.lr_strategy - lr_strategy = {} - if learning_rate_mode == "piecewise_decay": - epoch_points = [30, 60, 90] - bd = [e * step for e in epoch_points] - lr = [0.1, 0.01, 0.001, 0.0001] - lr_strategy[learning_rate_mode] = {"bd": bd, "lr": lr} - elif learning_rate_mode == "cosine_decay": - lr_strategy[learning_rate_mode] = { - "step_each_epoch": step, - "epochs": num_epochs - } - else: - lr_strategy = None - - use_nccl = True - # layers: 50, 152 - layers = args.num_layers - method = train_parallel_exe if args.parallel_exe else train_parallel_do - init_model = args.init_model if args.init_model else None - pretrained_model = args.pretrained_model if args.pretrained_model else None - method( - args, - learning_rate=0.1, - batch_size=batch_size, - num_passes=num_epochs, - init_model=init_model, - pretrained_model=pretrained_model, - parallel=True, - use_nccl=True, - lr_strategy=lr_strategy, - layers=layers) + +if __name__ == '__main__': + main() diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py index 8ab9efce1a275ea9539b05c0b959dee42d83c759..8ab6b2de4ff7a44e807002c79b8be46f4a912920 100644 --- a/fluid/neural_machine_translation/transformer/config.py +++ b/fluid/neural_machine_translation/transformer/config.py @@ -1,8 +1,10 @@ class TrainTaskConfig(object): + # only support GPU currently use_gpu = True # the epoch number to train. pass_num = 30 # the number of sequences contained in a mini-batch. + # deprecated, set batch_size in args. batch_size = 32 # the hyper parameters for Adam optimizer. 
# This static learning_rate will be multiplied to the LearningRateScheduler @@ -13,8 +15,6 @@ class TrainTaskConfig(object): eps = 1e-9 # the parameters for learning rate scheduling. warmup_steps = 4000 - # the flag indicating to use average loss or sum loss when training. - use_avg_cost = True # the weight used to mix up the ground-truth distribution and the fixed # uniform distribution in label smoothing when training. # Set this as zero if label smoothing is not wanted. @@ -38,22 +38,20 @@ class InferTaskConfig(object): batch_size = 10 # the parameters for beam search. beam_size = 5 - max_length = 30 + max_length = 256 # the number of decoded sentences to output. n_best = 1 # the flags indicating whether to output the special tokens. output_bos = False output_eos = False - output_unk = False + output_unk = True # the directory for loading the trained model. model_path = "trained_models/pass_1.infer.model" class ModelHyperParams(object): - # This model directly uses paddle.dataset.wmt16 in which , and - # token has alreay been added. As for the token, any token - # included in dict can be used to pad, since the paddings' loss will be - # masked out and make no effect on parameter gradients. + # These following five vocabularies related configurations will be set + # automatically according to the passed vocabulary path and special tokens. # size of source word dictionary. src_vocab_size = 10000 # size of target word dictionay @@ -68,13 +66,13 @@ class ModelHyperParams(object): # The size of position encoding table should at least plus 1, since the # sinusoid position encoding starts from 1 and 0 can be used as the padding # token for position encoding. - max_length = 50 + max_length = 256 # the dimension for word embeddings, which is also the last dimension of # the input and output of multi-head attention, position-wise feed-forward # networks, encoder and decoder. d_model = 512 # size of the hidden layer in position-wise feed-forward networks. - d_inner_hid = 1024 + d_inner_hid = 2048 # the dimension that keys are projected to for dot-product attention. d_key = 64 # the dimension that values are projected to for dot-product attention. @@ -85,6 +83,9 @@ class ModelHyperParams(object): n_layer = 6 # dropout rate used by all dropout layers. dropout = 0.1 + # the flag indicating whether to share embedding and softmax weights. + # vocabularies in source and target should be same for weight sharing. + weight_sharing = True def merge_cfg_from_list(cfg_list, g_cfgs): @@ -97,7 +98,7 @@ def merge_cfg_from_list(cfg_list, g_cfgs): if hasattr(g_cfg, key): try: value = eval(value) - except SyntaxError: # for file path + except Exception: # for file path pass setattr(g_cfg, key, value) break @@ -172,6 +173,10 @@ input_descs = { "lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1L), "float32"], } +# Names of word embedding table which might be reused for weight sharing. +word_emb_param_names = ( + "src_word_emb_table", + "trg_word_emb_table", ) # Names of position encoding table which will be initialized externally. 
pos_enc_param_names = ( "src_pos_enc_table", diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index e8f7f47dd5c0dc4937b73bd1693b2fd14fb8d55c..c3d1f0af5d319f838e968930ec7d4083baeaab6f 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -308,7 +308,7 @@ def infer(args): ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout) + ModelHyperParams.dropout, ModelHyperParams.weight_sharing) decoder_program = fluid.Program() with fluid.program_guard(main_program=decoder_program): @@ -317,7 +317,7 @@ def infer(args): ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout) + ModelHyperParams.dropout, ModelHyperParams.weight_sharing) # Load model parameters of encoder and decoder separately from the saved # transformer model. @@ -359,6 +359,7 @@ def infer(args): start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], + max_length=ModelHyperParams.max_length, clip_last_batch=False) trg_idx2word = test_data.load_dict( diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py index 9c5d8adc312d48eb7c232789e590755e1b349d3a..7756d633fb05d27904f84dc9c41e25643c17eb04 100644 --- a/fluid/neural_machine_translation/transformer/model.py +++ b/fluid/neural_machine_translation/transformer/model.py @@ -46,26 +46,14 @@ def multi_head_attention(queries, """ q = layers.fc(input=queries, size=d_key * n_head, - param_attr=fluid.initializer.Xavier( - uniform=False, - fan_in=d_model * d_key, - fan_out=n_head * d_key), bias_attr=False, num_flatten_dims=2) k = layers.fc(input=keys, size=d_key * n_head, - param_attr=fluid.initializer.Xavier( - uniform=False, - fan_in=d_model * d_key, - fan_out=n_head * d_key), bias_attr=False, num_flatten_dims=2) v = layers.fc(input=values, size=d_value * n_head, - param_attr=fluid.initializer.Xavier( - uniform=False, - fan_in=d_model * d_value, - fan_out=n_head * d_value), bias_attr=False, num_flatten_dims=2) return q, k, v @@ -84,7 +72,7 @@ def multi_head_attention(queries, # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. reshaped = layers.reshape( - x=x, shape=[0, -1, n_head, hidden_size // n_head]) + x=x, shape=[0, 0, n_head, hidden_size // n_head]) # permuate the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] @@ -104,7 +92,7 @@ def multi_head_attention(queries, # size of the input as the output dimension size. return layers.reshape( x=trans_x, - shape=map(int, [0, -1, trans_x.shape[2] * trans_x.shape[3]])) + shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])) def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): """ @@ -140,7 +128,6 @@ def multi_head_attention(queries, # Project back to the model size. 
proj_out = layers.fc(input=out, size=d_model, - param_attr=fluid.initializer.Xavier(uniform=False), bias_attr=False, num_flatten_dims=2) return proj_out @@ -155,14 +142,8 @@ def positionwise_feed_forward(x, d_inner_hid, d_hid): hidden = layers.fc(input=x, size=d_inner_hid, num_flatten_dims=2, - param_attr=fluid.initializer.Uniform( - low=-(d_hid**-0.5), high=(d_hid**-0.5)), act="relu") - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.initializer.Uniform( - low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5))) + out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2) return out @@ -200,6 +181,7 @@ def prepare_encoder(src_word, src_max_len, dropout_rate=0., src_data_shape=None, + word_emb_param_name=None, pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: @@ -209,7 +191,10 @@ def prepare_encoder(src_word, src_word_emb = layers.embedding( src_word, size=[src_vocab_size, src_emb_dim], - param_attr=fluid.initializer.Normal(0., 1.)) + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) src_pos_enc = layers.embedding( src_pos, size=[src_max_len, src_emb_dim], @@ -415,7 +400,12 @@ def transformer( d_model, d_inner_hid, dropout_rate, + weight_sharing, label_smooth_eps, ): + if weight_sharing: + assert src_vocab_size == trg_vocab_size, ( + "Vocabularies in source and target should be same for weight sharing." + ) enc_inputs = make_all_inputs(encoder_data_input_fields + encoder_util_input_fields) @@ -429,6 +419,7 @@ def transformer( d_model, d_inner_hid, dropout_rate, + weight_sharing, enc_inputs, ) dec_inputs = make_all_inputs(decoder_data_input_fields[:-1] + @@ -444,6 +435,7 @@ def transformer( d_model, d_inner_hid, dropout_rate, + weight_sharing, dec_inputs, enc_output, ) @@ -459,7 +451,6 @@ def transformer( logits=predict, label=label, soft_label=True if label_smooth_eps else False) - # cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) weighted_cost = cost * weights sum_cost = layers.reduce_sum(weighted_cost) token_num = layers.reduce_sum(weights) @@ -476,6 +467,7 @@ def wrap_encoder(src_vocab_size, d_model, d_inner_hid, dropout_rate, + weight_sharing, enc_inputs=None): """ The wrapper assembles together all needed layers for the encoder. @@ -497,7 +489,8 @@ def wrap_encoder(src_vocab_size, d_model, max_length, dropout_rate, - src_data_shape, ) + src_data_shape, + word_emb_param_name=word_emb_param_names[0]) enc_output = encoder( enc_input, src_slf_attn_bias, @@ -522,6 +515,7 @@ def wrap_decoder(trg_vocab_size, d_model, d_inner_hid, dropout_rate, + weight_sharing, dec_inputs=None, enc_output=None): """ @@ -547,7 +541,9 @@ def wrap_decoder(trg_vocab_size, d_model, max_length, dropout_rate, - trg_data_shape, ) + trg_data_shape, + word_emb_param_name=word_emb_param_names[0] + if weight_sharing else word_emb_param_names[1]) dec_output = decoder( dec_input, enc_output, @@ -565,11 +561,20 @@ def wrap_decoder(trg_vocab_size, src_attn_pre_softmax_shape, src_attn_post_softmax_shape, ) # Return logits for training and probs for inference. 
- predict = layers.reshape( - x=layers.fc(input=dec_output, - size=trg_vocab_size, - bias_attr=False, - num_flatten_dims=2), - shape=[-1, trg_vocab_size], - act="softmax" if dec_inputs is None else None) + if weight_sharing: + predict = layers.reshape( + x=layers.matmul( + x=dec_output, + y=fluid.get_var(word_emb_param_names[0]), + transpose_y=True), + shape=[-1, trg_vocab_size], + act="softmax" if dec_inputs is None else None) + else: + predict = layers.reshape( + x=layers.fc(input=dec_output, + size=trg_vocab_size, + bias_attr=False, + num_flatten_dims=2), + shape=[-1, trg_vocab_size], + act="softmax" if dec_inputs is None else None) return predict diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index bf9edb52bedf065242d4f49391302ba988d7dcac..e3c9b62d068b7cbf0433328d1fcb559a4e659166 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -43,9 +43,11 @@ def parse_args(): parser.add_argument( "--batch_size", type=int, - default=2000, + default=2048, help="The number of sequences contained in a mini-batch, or the maximum " - "number of tokens (include paddings) contained in a mini-batch.") + "number of tokens (include paddings) contained in a mini-batch. Note " + "that this represents the number on single device and the actual batch " + "size for multi-devices will multiply the device number.") parser.add_argument( "--pool_size", type=int, @@ -203,50 +205,50 @@ def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx, [num_token], dtype="float32") -def train(args): - dev_count = fluid.core.get_cuda_device_count() +def read_multiple(reader, count, clip_last=True): + """ + Stack data from reader for multi-devices. + """ - def read_multiple(reader, - count=dev_count if args.use_token_batch else 1, - clip_last=True): - """ - Stack data from reader for multi-devices. - """ - - def __impl__(): - res = [] - for item in reader(): - res.append(item) - if len(res) == count: - yield res - res = [] + def __impl__(): + res = [] + for item in reader(): + res.append(item) if len(res) == count: yield res - elif not clip_last: - data = [] - for item in res: - data += item - if len(data) > count: - inst_num_per_part = len(data) // count - yield [ - data[inst_num_per_part * i:inst_num_per_part * (i + 1)] - for i in range(count) - ] - - return __impl__ - - def split_data(data, num_part=dev_count): - """ - Split data for each device. - """ - if len(data) == num_part: - return data - data = data[0] - inst_num_per_part = len(data) // num_part - return [ - data[inst_num_per_part * i:inst_num_per_part * (i + 1)] - for i in range(num_part) - ] + res = [] + if len(res) == count: + yield res + elif not clip_last: + data = [] + for item in res: + data += item + if len(data) > count: + inst_num_per_part = len(data) // count + yield [ + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(count) + ] + + return __impl__ + + +def split_data(data, num_part): + """ + Split data for each device. 
+ """ + if len(data) == num_part: + return data + data = data[0] + inst_num_per_part = len(data) // num_part + return [ + data[inst_num_per_part * i:inst_num_per_part * (i + 1)] + for i in range(num_part) + ] + + +def train(args): + dev_count = fluid.core.get_cuda_device_count() sum_cost, avg_cost, predict, token_num = transformer( ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, @@ -254,7 +256,7 @@ def train(args): ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, - TrainTaskConfig.label_smooth_eps) + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, @@ -288,9 +290,12 @@ def train(args): start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], + max_length=ModelHyperParams.max_length, clip_last_batch=False) + train_data = read_multiple( + reader=train_data.batch_generator, + count=dev_count if args.use_token_batch else 1) - train_data = read_multiple(reader=train_data.batch_generator) build_strategy = fluid.BuildStrategy() # Since the token number differs among devices, customize gradient scale to # use token average cost among multi-devices. and the gradient scale is @@ -303,9 +308,11 @@ def train(args): def test_context(): # Context to do validation. - test_program = fluid.default_main_program().clone() - with fluid.program_guard(test_program): - test_program = fluid.io.get_inference_program([avg_cost]) + test_program = fluid.default_main_program().clone(for_test=True) + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe) val_data = reader.DataReader( src_vocab_fpath=args.src_vocab_fpath, @@ -319,22 +326,22 @@ def train(args): start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], + max_length=ModelHyperParams.max_length, clip_last_batch=False, shuffle=False, shuffle_batch=False) - test_exe = fluid.ParallelExecutor( - use_cuda=TrainTaskConfig.use_gpu, - main_program=test_program, - share_vars_from=train_exe) - def test(exe=test_exe): test_total_cost = 0 test_total_token = 0 - test_data = read_multiple(reader=val_data.batch_generator) + test_data = read_multiple( + reader=val_data.batch_generator, + count=dev_count if args.use_token_batch else 1) for batch_id, data in enumerate(test_data()): feed_list = [] - for place_id, data_buffer in enumerate(split_data(data)): + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): data_input_dict, util_input_dict, _ = prepare_batch_input( data_buffer, data_input_names, util_input_names, ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, @@ -367,7 +374,9 @@ def train(args): feed_list = [] total_num_token = 0 lr_rate = lr_scheduler.update_learning_rate() - for place_id, data_buffer in enumerate(split_data(data)): + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): data_input_dict, util_input_dict, num_token = prepare_batch_input( data_buffer, data_input_names, util_input_names, ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, @@ -377,17 +386,14 @@ def train(args): dict(data_input_dict.items() + util_input_dict.items() + {lr_scheduler.learning_rate.name: lr_rate}.items())) - if not init: + if not init: # init the position encoding table for pos_enc_param_name in pos_enc_param_names: pos_enc = 
position_encoding_init( ModelHyperParams.max_length + 1, ModelHyperParams.d_model) feed_list[place_id][pos_enc_param_name] = pos_enc for feed_dict in feed_list: - feed_dict[ - sum_cost.name + - "@GRAD"] = 1. / total_num_token if TrainTaskConfig.use_avg_cost else np.asarray( - [1.], dtype="float32") + feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name], feed=feed_list) sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) diff --git a/fluid/object_detection/README.md b/fluid/object_detection/README.md index 5f018ab33c9e947dc3fd843403a3277e879cf28f..ec93f153e085401fd9d89b257b5ba45a700db08c 100644 --- a/fluid/object_detection/README.md +++ b/fluid/object_detection/README.md @@ -1,20 +1,33 @@ -The minimum PaddlePaddle version needed for the code sample in this directory is the lastest develop branch. If you are on a version of PaddlePaddle earlier than this, [please update your installation](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html). +The minimum PaddlePaddle version needed for the code sample in this directory is the latest develop branch. If you are on a version of PaddlePaddle earlier than this, [please update your installation](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html). --- ## SSD Object Detection +## Table of Contents +- [Introduction](#introduction) +- [Data Preparation](#data-preparation) +- [Train](#train) +- [Evaluate](#evaluate) +- [Infer and Visualize](#infer-and-visualize) +- [Released Model](#released-model) + ### Introduction -[Single Shot MultiBox Detector (SSD)](https://arxiv.org/abs/1512.02325) framework for object detection is based on a feed-forward convolutional network. The early network is a standard convolutional architecture for image classification, such as VGG, ResNet, or MobileNet, which is also called base network. In this tutorial we used [MobileNet](https://arxiv.org/abs/1704.04861). +The [Single Shot MultiBox Detector (SSD)](https://arxiv.org/abs/1512.02325) framework for object detection can be categorized as a single stage detector. A single stage detector simplifies object detection as a regression problem, directly predicting the bounding boxes and class probabilities without a region proposal stage. SSD further improves on this by producing these predictions at different scales from different layers, as shown below. Predictions are made at six levels, on feature maps of six different scales, and each feature map has two 3x3 convolutional layers which predict the category and a shape offset relative to the prior box (also called anchor), respectively. Thus, we get 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732 detections per class.
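As a quick sanity check on the prior-box arithmetic above, here is a minimal Python sketch; the feature map sizes and per-location box counts are taken from the paragraph above, and the snippet is purely illustrative rather than code from this repository:

```python
# Prior-box count for SSD300: six feature maps of decreasing resolution,
# with 4 or 6 prior boxes predicted at each spatial location.
feature_map_sizes = [38, 19, 10, 5, 3, 1]
boxes_per_location = [4, 6, 6, 6, 4, 4]

total = sum(size * size * boxes
            for size, boxes in zip(feature_map_sizes, boxes_per_location))
print(total)  # 8732 detections per class
```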

+[Figure: The Single Shot MultiBox Detector (SSD)]
+SSD plugs readily into a wide variety of standard convolutional networks, such as VGG, ResNet, or MobileNet; this network is also called the base network or backbone. In this tutorial we used [MobileNet](https://arxiv.org/abs/1704.04861). + ### Data Preparation You can use [PASCAL VOC dataset](http://host.robots.ox.ac.uk/pascal/VOC/) or [MS-COCO dataset](http://cocodataset.org/#download). -#### PASCAL VOC Dataset - -If you want to train model on PASCAL VOC dataset, please download datset at first, skip this step if you already have one. +If you want to train a model on the PASCAL VOC dataset, please download the dataset first; skip this step if you already have one. ```bash cd data/pascalvoc @@ -23,9 +36,7 @@ cd data/pascalvoc The command `download.sh` also will create training and testing file lists. -#### MS-COCO Dataset - -If you want to train model on MS-COCO dataset, please download datset at first, skip this step if you already have one. +If you want to train a model on the MS-COCO dataset, please download the dataset first; skip this step if you already have one. ``` cd data/coco @@ -36,45 +47,52 @@ cd data/coco #### Download the Pre-trained Model. -We provide two pre-trained models. The one is MobileNet-v1 SSD trained on COCO dataset, but removed the convolutional predictors for COCO dataset. This model can be used to initialize the models when training other dataset, like PASCAL VOC. Then other pre-trained model is MobileNet v1 trained on ImageNet 2012 dataset, but removed the last weights and bias in Fully-Connected layer. +We provide two pre-trained models. The first is MobileNet-v1 SSD trained on the COCO dataset, with the COCO-specific convolutional predictors removed. This model can be used to initialize models when training on other datasets, like PASCAL VOC. The other pre-trained model is MobileNet-v1 trained on the ImageNet 2012 dataset, with the last weights and bias of the Fully-Connected layer removed. -Declaration: the MobileNet-v1 SSD model is converted by [TensorFlow model](https://github.com/tensorflow/models/blob/f87a58cd96d45de73c9a8330a06b2ab56749a7fa/research/object_detection/g3doc/detection_model_zoo.md). The MobileNet v1 model is converted [Caffe](https://github.com/shicai/MobileNet-Caffe). +Declaration: the MobileNet-v1 SSD model is converted from the [TensorFlow model](https://github.com/tensorflow/models/blob/f87a58cd96d45de73c9a8330a06b2ab56749a7fa/research/object_detection/g3doc/detection_model_zoo.md). The MobileNet-v1 model is converted from [Caffe](https://github.com/shicai/MobileNet-Caffe). +We will release our own pre-trained models soon. - Download MobileNet-v1 SSD: - ``` + ```bash ./pretrained/download_coco.sh ``` - Download MobileNet-v1: - ``` + ```bash ./pretrained/download_imagenet.sh ``` #### Train on PASCAL VOC - - Train on one device (/GPU). - ```python - env CUDA_VISIBLE_DEVICES=0 python -u train.py --parallel=False --dataset='pascalvoc' --pretrained_model='pretrained/ssd_mobilenet_v1_coco/' - ``` - - Train on multi devices (/GPUs). - ```python - env CUDA_VISIBLE_DEVICES=0,1 python -u train.py --batch_size=64 --dataset='pascalvoc' --pretrained_model='pretrained/ssd_mobilenet_v1_coco/' +`train.py` is the main caller of the training module. Examples of usage are shown below. + ```bash + python -u train.py --batch_size=64 --dataset='pascalvoc' --pretrained_model='pretrained/ssd_mobilenet_v1_coco/' ``` + - Set ```export CUDA_VISIBLE_DEVICES=0,1``` to specify the GPUs you want to use. 
+ - Set ```--dataset='coco2014'``` or ```--dataset='coco2017'``` to train the model on the MS-COCO dataset. + - For more help on arguments: -#### Train on MS-COCO - - Train on one device (/GPU). - ```python - env CUDA_VISIBLE_DEVICES=0 python -u train.py --parallel=False --dataset='coco2014' --pretrained_model='pretrained/mobilenet_v1_imagenet/' - ``` - - Train on multi devices (/GPUs). - ```python - env CUDA_VISIBLE_DEVICES=0,1 python -u train.py --batch_size=64 --dataset='coco2014' --pretrained_model='pretrained/mobilenet_v1_imagenet/' + ```bash + python train.py --help ``` -TBD +The data reader is defined in `reader.py`. All images will be resized to 300x300. In the training stage, images are randomly distorted, expanded, cropped, and flipped: + - distort: distort brightness, contrast, saturation, and hue. + - expand: put the original image into a larger expanded image which is initialized using the image mean. + - crop: crop the image with respect to different scales, aspect ratios, and overlaps. + - flip: flip horizontally. + +We use the RMSProp optimizer with mini-batch size 64 to train the MobileNet-SSD. The initial learning rate is 0.001, decayed at epochs 40, 60, 80, and 100 with multipliers 0.5, 0.25, 0.1, and 0.01, respectively. The weight decay is 0.00005. After 120 epochs we achieve 73.32% mAP under the 11point metric. ### Evaluate -You can evaluate your trained model in different metric like 11point, integral on both PASCAL VOC and COCO dataset. Moreover, we provide eval_coco_map.py which uses a COCO-specific mAP metric defined by [COCO committee](http://cocodataset.org/#detections-eval). To use this eval_coco_map.py, [cocoapi](https://github.com/cocodataset/cocoapi) is needed. +You can evaluate your trained model with different metrics, such as 11point and integral, on both the PASCAL VOC and COCO datasets; the 11point metric is sketched below. Note that we set the default test list to the dataset's test/val list; you can use your own test list by setting the ```--test_list``` argument. + +`eval.py` is the main entry point of the evaluation module. Examples of usage are shown below. ```bash python eval.py --dataset='pascalvoc' --model_dir='train_pascal_model/best_model' --data_dir='data/pascalvoc' --test_list='test.txt' --ap_version='11point' --nms_threshold=0.45 ``` + +You can set ```--dataset``` to ```coco2014``` or ```coco2017``` to evaluate on the COCO dataset. Moreover, we provide `eval_coco_map.py`, which uses a COCO-specific mAP metric defined by the [COCO committee](http://cocodataset.org/#detections-eval). To use eval_coco_map.py, [cocoapi](https://github.com/cocodataset/cocoapi) is needed. Install the cocoapi: ``` # COCOAPI=/path/to/clone/cocoapi @@ -86,44 +104,25 @@ make install # not to install the COCO API into global site-packages python2 setup.py install --user ``` -Note we set the defualt test list to the dataset's test/val list, you can use your own test list by setting test_list args.
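+
+The 11point metric referenced above is the PASCAL VOC 2007 style of interpolated average precision: the mean of the best achievable precision at the eleven recall thresholds 0.0, 0.1, ..., 1.0. A minimal sketch follows (this is not the implementation inside `fluid.evaluator.DetectionMAP`; the function name and inputs are illustrative):
+
+```python
+import numpy as np
+
+def eleven_point_ap(recalls, precisions):
+    # Average of the maximum precision at the 11 recall thresholds.
+    recalls, precisions = np.asarray(recalls), np.asarray(precisions)
+    ap = 0.0
+    for threshold in np.linspace(0.0, 1.0, 11):
+        above = precisions[recalls >= threshold]
+        ap += (above.max() if above.size else 0.0) / 11.0
+    return ap
+```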
- -#### Evaluate on PASCAL VOC -```python -env CUDA_VISIBLE_DEVICES=0 python eval.py --dataset='pascalvoc' --model_dir='train_pascal_model/90' --data_dir='data/pascalvoc' --test_list='test.txt' --ap_version='11point' -``` - -#### Evaluate on MS-COCO -```python -env CUDA_VISIBLE_DEVICES=0 python eval.py --dataset='coco2014' --nms_threshold=0.5 --model_dir='train_coco_model/40' --test_list='annotations/instances_minival2014.json' --ap_version='integral' -env CUDA_VISIBLE_DEVICES=0 python eval_coco_map.py --dataset='coco2017' --nms_threshold=0.5 --model_dir='train_coco_model/40' --test_list='annotations/instances_minival2017.json' -``` - -TBD ### Infer and Visualize - -```python -env CUDA_VISIBLE_DEVICES=0 python infer.py --dataset='coco' --nms_threshold=0.5 --model_dir='train_coco_model/20' --image_path='./data/coco/val2014/COCO_val2014_000000000139.jpg' +`infer.py` is the main entry point of the inference module. Examples of usage are shown below. ```bash +python infer.py --dataset='pascalvoc' --nms_threshold=0.45 --model_dir='train_pascal_model/best_model' --image_path='./data/pascalvoc/VOCdevkit/VOC2007/JPEGImages/009963.jpg' ``` -Below is the examples after running python infer.py to inference and visualize the model result. +Below are examples of running inference and visualizing the model results.
-MobileNet-SSD300x300 Visualization Examples
+<p align="center">
+<img src="images/009943.jpg"/>
+<img src="images/009956.jpg"/>
+<img src="images/009960.jpg"/>
+<img src="images/009962.jpg"/>
+MobileNet-v1-SSD 300x300 Visualization Examples
+</p>
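+
+The detector outputs box coordinates normalized to [0, 1], so `infer.py` scales them by the image size before drawing. A minimal sketch of that de-normalization and drawing step (PIL-based, mirroring the `draw_bounding_box_on_image` change later in this patch; the helper name and arguments are illustrative):
+
+```python
+from PIL import Image, ImageDraw
+
+def draw_one_box(image_path, box, out_path):
+    # box is (xmin, ymin, xmax, ymax) in normalized [0, 1] coordinates.
+    image = Image.open(image_path)
+    draw = ImageDraw.Draw(image)
+    im_width, im_height = image.size
+    xmin, ymin, xmax, ymax = box
+    left, right = xmin * im_width, xmax * im_width
+    top, bottom = ymin * im_height, ymax * im_height
+    draw.line(
+        [(left, top), (left, bottom), (right, bottom), (right, top),
+         (left, top)],
+        width=4,
+        fill='red')
+    image.save(out_path)
+```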
-TBD ### Released Model | Model | Pre-trained Model | Training data | Test data | mAP | |:------------------------:|:------------------:|:----------------:|:------------:|:----:| -|MobileNet-v1-SSD 300x300 | COCO MobileNet SSD | VOC07+12 trainval| VOC07 test | xx% | -|MobileNet-v1-SSD 300x300 | ImageNet MobileNet | VOC07+12 trainval| VOC07 test | xx% | -|MobileNet-v1-SSD 300x300 | ImageNet MobileNet | MS-COCO trainval | MS-COCO test | xx% | - -TBD +|[MobileNet-v1-SSD 300x300](http://paddlemodels.bj.bcebos.com/ssd_mobilenet_v1_pascalvoc.tar.gz) | COCO MobileNet SSD | VOC07+12 trainval| VOC07 test | 73.32% | diff --git a/fluid/object_detection/README_cn.md b/fluid/object_detection/README_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..57c4e275d8d7bcc7c38d17af09c6b84329df9b68 --- /dev/null +++ b/fluid/object_detection/README_cn.md @@ -0,0 +1,127 @@ +The program examples in this directory require the latest develop branch of PaddlePaddle. If your installed version of PaddlePaddle is older than this requirement, please update it following the instructions in the [installation document](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html). + +--- + +## SSD Object Detection + +## Table of Contents +- [Introduction](#introduction) +- [Data Preparation](#data-preparation) +- [Train](#train) +- [Evaluate](#evaluate) +- [Infer and Visualize](#infer-and-visualize) +- [Released Model](#released-model) + +### Introduction + +[Single Shot MultiBox Detector (SSD)](https://arxiv.org/abs/1512.02325) is a single-stage object detector. Unlike two-stage detection methods, a single-stage detector performs no region proposal; it regresses the bounding boxes and class probabilities of objects directly from the feature maps. SSD applies this single-stage idea and improves on it: objects of each scale are detected on a feature map of the corresponding scale. As shown in the figure below, SSD makes predictions at six levels, on feature maps of six different scales. At each level, two 3x3 convolutional layers regress the object categories and the bounding-box offsets, respectively. Thus, for each class, the six levels of SSD produce a total of 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732 detections. +
+<p align="center">
+<img src="images/SSD_paper_figure.jpg"/><br />
+The SSD object detection model
+</p>
+ +SSD can be conveniently plugged into any standard convolutional network, such as VGG, ResNet, or MobileNet; these networks are called the base network of the detector. In this example we use [MobileNet](https://arxiv.org/abs/1704.04861). + + ### Data Preparation + +You can use the [PASCAL VOC dataset](http://host.robots.ox.ac.uk/pascal/VOC/) or the [MS-COCO dataset](http://cocodataset.org/#download). + +If you want to train on the PASCAL VOC dataset, please download it first with the commands below. + +```bash +cd data/pascalvoc +./download.sh +``` + +The `download.sh` command will also create the training and testing file lists. + +If you want to train on the MS-COCO dataset, please download it first with the commands below. + +``` +cd data/coco +./download.sh +``` + +### Train + +#### Download the Pre-trained Model + +We provide two pre-trained models. The first is a MobileNet-v1 SSD pre-trained on the COCO dataset, whose prediction heads we removed so that it can be trained on datasets other than COCO. The second is a MobileNet-v1 pre-trained on the ImageNet 2012 dataset, whose last fully-connected layer we likewise removed for object detection training. + +Declaration: the MobileNet-v1 SSD model is converted from the [TensorFlow model](https://github.com/tensorflow/models/blob/f87a58cd96d45de73c9a8330a06b2ab56749a7fa/research/object_detection/g3doc/detection_model_zoo.md). The MobileNet-v1 model is converted from [Caffe](https://github.com/shicai/MobileNet-Caffe). We will also release our own pre-trained models soon. + + - Download MobileNet-v1 SSD: + ```bash + ./pretrained/download_coco.sh + ``` + - Download MobileNet-v1: + ```bash + ./pretrained/download_imagenet.sh + ``` + +#### Train on PASCAL VOC + +`train.py` is the main entry point of the training module; an example invocation: + ```bash + python -u train.py --batch_size=64 --dataset='pascalvoc' --pretrained_model='pretrained/ssd_mobilenet_v1_coco/' + ``` + - Set ```export CUDA_VISIBLE_DEVICES=0,1``` to specify which GPUs to use. + - Set ```--dataset='coco2014'``` or ```--dataset='coco2017'``` to train on the MS-COCO dataset. + - For more optional arguments, see: + + ```bash + python train.py --help + ``` + +The data reading behavior is defined in `reader.py`; all images are resized to 300x300. During training, the data undergoes both image augmentation and label augmentation: image augmentation includes random distortion, expansion, and flipping of the image itself, while label augmentation includes random cropping: + - distort: perturb the image brightness, contrast, saturation, and hue. + - expand: place the original image into a larger canvas filled with the pixel mean (which is subtracted again later in the mean-subtraction step), then crop, resize, and flip this canvas. + - flip: flip horizontally. + - crop: generate candidate crop boxes from scale and aspect-ratio parameters, then keep the crops whose intersection-over-union (IoU) with the annotated boxes meets the requirement; an IoU sketch follows the visualization figure below. + +We use the RMSProp optimizer to train MobileNet-SSD with a batch size of 64 and a weight decay of 0.00005. The initial learning rate is 0.001 and is decayed at epochs 40, 60, 80, and 100 with multipliers 0.5, 0.25, 0.1, and 0.01. After 120 epochs of training, the mAP under the 11point metric is 73.32%. + +### Evaluate + +You can evaluate the trained model on the PASCAL VOC and COCO datasets with metrics such as 11point and integral. Without loss of generality, the sample code uses the corresponding dataset's test list as the default; you can also specify your own test list by setting ```--test_list```. + +`eval.py` is the main entry point of the evaluation module; an example invocation: +```bash +python eval.py --dataset='pascalvoc' --model_dir='train_pascal_model/best_model' --data_dir='data/pascalvoc' --test_list='test.txt' --ap_version='11point' --nms_threshold=0.45 +``` + +You can set ```--dataset``` to ```coco2014``` or ```coco2017``` to evaluate on the COCO dataset. We also provide `eval_coco_map.py` for the [official COCO evaluation](http://cocodataset.org/#detections-eval). To use eval_coco_map.py, first download the [cocoapi](https://github.com/cocodataset/cocoapi): +``` +# COCOAPI=/path/to/clone/cocoapi +git clone https://github.com/cocodataset/cocoapi.git $COCOAPI +cd $COCOAPI/PythonAPI +# Install into global site-packages +make install +# Alternatively, if you do not have permissions or prefer +# not to install the COCO API into global site-packages +python2 setup.py install --user +``` + +### Infer and Visualize + +`infer.py` is the main entry point of the inference and visualization module; an example invocation: +```bash +python infer.py --dataset='pascalvoc' --nms_threshold=0.45 --model_dir='train_pascal_model/best_model' --image_path='./data/pascalvoc/VOCdevkit/VOC2007/JPEGImages/009963.jpg' +``` +The figure below visualizes the model's prediction results: +
+<p align="center">
+<img src="images/009943.jpg"/>
+<img src="images/009956.jpg"/>
+<img src="images/009960.jpg"/>
+<img src="images/009962.jpg"/>
+MobileNet-v1-SSD 300x300 prediction visualization
+</p>
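+
+As referenced in the crop bullet of the training section, candidate crops are kept or rejected by their overlap with the annotated boxes. A minimal sketch of the IoU computation (boxes given as (xmin, ymin, xmax, ymax); the helper name is illustrative, not the exact function in `reader.py`):
+
+```python
+def iou(box_a, box_b):
+    # Intersection-over-union of two (xmin, ymin, xmax, ymax) boxes.
+    ixmin, iymin = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
+    ixmax, iymax = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
+    inter = max(0.0, ixmax - ixmin) * max(0.0, iymax - iymin)
+    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+    union = area_a + area_b - inter
+    return inter / union if union > 0 else 0.0
+```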
+ + +### Released Model + + +| Model | Pre-trained Model | Training data | Test data | mAP | |:------------------------:|:------------------:|:----------------:|:------------:|:----:| +|[MobileNet-v1-SSD 300x300](http://paddlemodels.bj.bcebos.com/ssd_mobilenet_v1_pascalvoc.tar.gz) | COCO MobileNet SSD | VOC07+12 trainval| VOC07 test | 73.32% | diff --git a/fluid/object_detection/eval.py b/fluid/object_detection/eval.py index 627461c3b3846158fbd2dd815feba09ab0967425..59130d9907a1349237c08256214b24f92b8b36c5 100644 --- a/fluid/object_detection/eval.py +++ b/fluid/object_detection/eval.py @@ -64,6 +64,7 @@ def eval(args, data_args, test_list, batch_size, model_dir=None): place=place, feed_list=[image, gt_box, gt_label, difficult]) def test(): + # switch network to test mode (i.e. batch norm test mode) test_program = fluid.default_main_program().clone(for_test=True) with fluid.program_guard(test_program): map_eval = fluid.evaluator.DetectionMAP( @@ -79,12 +80,12 @@ def eval(args, data_args, test_list, batch_size, model_dir=None): _, accum_map = map_eval.get_map_var() map_eval.reset(exe) for batch_id, data in enumerate(test_reader()): - test_map = exe.run(test_program, - feed=feeder.feed(data), - fetch_list=[accum_map]) + test_map, = exe.run(test_program, + feed=feeder.feed(data), + fetch_list=[accum_map]) if batch_id % 20 == 0: - print("Batch {0}, map {1}".format(batch_id, test_map[0])) - print("Test model {0}, map {1}".format(model_dir, test_map[0])) + print("Batch {0}, map {1}".format(batch_id, test_map)) + print("Test model {0}, map {1}".format(model_dir, test_map)) test() @@ -101,9 +102,9 @@ if __name__ == '__main__': raise ValueError("The model path [%s] does not exist." % (args.model_dir)) if 'coco' in args.dataset: - data_dir = './data/coco' + data_dir = 'data/coco' if '2014' in args.dataset: - test_list = 'annotations/instances_minival2014.json' + test_list = 'annotations/instances_val2014.json' elif '2017' in args.dataset: test_list = 'annotations/instances_val2017.json' diff --git a/fluid/object_detection/eval_coco_map.py b/fluid/object_detection/eval_coco_map.py index b9f03a63004341e7081c424c633ac14d3127b7fb..0837f42ad89cda1e6a81825bc0545a11b48c4b3c 100644 --- a/fluid/object_detection/eval_coco_map.py +++ b/fluid/object_detection/eval_coco_map.py @@ -133,7 +133,7 @@ if __name__ == '__main__': data_dir = './data/coco' if '2014' in args.dataset: - test_list = 'annotations/instances_minival2014.json' + test_list = 'annotations/instances_val2014.json' elif '2017' in args.dataset: test_list = 'annotations/instances_val2017.json' diff --git a/fluid/object_detection/images/009943.jpg b/fluid/object_detection/images/009943.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d6262f97052aa7d82068e7d01f4d9982fcf0d3a9 Binary files /dev/null and b/fluid/object_detection/images/009943.jpg differ diff --git a/fluid/object_detection/images/009956.jpg b/fluid/object_detection/images/009956.jpg new file mode 100644 index 0000000000000000000000000000000000000000..320d3e251782e946395e7fcadbef051bc2e94bee Binary files /dev/null and b/fluid/object_detection/images/009956.jpg differ diff --git a/fluid/object_detection/images/009960.jpg b/fluid/object_detection/images/009960.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2f73d3d6f1956b1fa9ae1aba3b5d516a53f26b8f Binary files /dev/null and b/fluid/object_detection/images/009960.jpg differ diff --git a/fluid/object_detection/images/009962.jpg b/fluid/object_detection/images/009962.jpg new file mode 100644 index
0000000000000000000000000000000000000000..182d6677bb80d94c5e7e4db3bf6654d3c064566c Binary files /dev/null and b/fluid/object_detection/images/009962.jpg differ diff --git a/fluid/object_detection/images/SSD_paper_figure.jpg b/fluid/object_detection/images/SSD_paper_figure.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4a180341ceffd6a4c3446994203f1a3adc4c6796 Binary files /dev/null and b/fluid/object_detection/images/SSD_paper_figure.jpg differ diff --git a/fluid/object_detection/infer.py b/fluid/object_detection/infer.py index 698a89ad0b932b50c6ef942c082757db9d290974..9861004127f9d7fcc5cd0881097daa189dd0783f 100644 --- a/fluid/object_detection/infer.py +++ b/fluid/object_detection/infer.py @@ -5,6 +5,7 @@ import argparse import functools from PIL import Image from PIL import ImageDraw +from PIL import ImageFont import paddle import paddle.fluid as fluid @@ -20,7 +21,7 @@ add_arg('use_gpu', bool, True, "Whether use GPU.") add_arg('image_path', str, '', "The image used to inference and visualize.") add_arg('model_dir', str, '', "The model path.") add_arg('nms_threshold', float, 0.45, "NMS threshold.") -add_arg('confs_threshold', float, 0.2, "Confidence threshold to draw bbox.") +add_arg('confs_threshold', float, 0.5, "Confidence threshold to draw bbox.") add_arg('resize_h', int, 300, "The resized image height.") add_arg('resize_w', int, 300, "The resized image height.") add_arg('mean_value_B', float, 127.5, "Mean value for B channel which will be subtracted.") #123.68 @@ -33,8 +34,20 @@ def infer(args, data_args, image_path, model_dir): image_shape = [3, data_args.resize_h, data_args.resize_w] if 'coco' in data_args.dataset: num_classes = 91 + # cocoapi + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + label_fpath = os.path.join(data_dir, label_file) + coco = COCO(label_fpath) + category_ids = coco.getCatIds() + label_list = { + item['id']: item['name'] + for item in coco.loadCats(category_ids) + } + label_list[0] = 'background' elif 'pascalvoc' in data_args.dataset: num_classes = 21 + label_list = data_args.label_list image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) @@ -52,22 +65,21 @@ infer_reader = reader.infer(data_args, image_path) feeder = fluid.DataFeeder(place=place, feed_list=[image]) - def infer(): - data = infer_reader() - nmsed_out_v = exe.run(fluid.default_main_program(), - feed=feeder.feed([[data]]), - fetch_list=[nmsed_out], - return_numpy=False) - nmsed_out_v = np.array(nmsed_out_v[0]) - draw_bounding_box_on_image(image_path, nmsed_out_v, - args.confs_threshold) - for dt in nmsed_out_v: - category_id, score, xmin, ymin, xmax, ymax = dt.tolist() + data = infer_reader() - infer() + # switch network to test mode (i.e.
batch norm test mode) + test_program = fluid.default_main_program().clone(for_test=True) + nmsed_out_v, = exe.run(test_program, + feed=feeder.feed([[data]]), + fetch_list=[nmsed_out], + return_numpy=False) + nmsed_out_v = np.array(nmsed_out_v) + draw_bounding_box_on_image(image_path, nmsed_out_v, args.confs_threshold, + label_list) -def draw_bounding_box_on_image(image_path, nms_out, confs_threshold): +def draw_bounding_box_on_image(image_path, nms_out, confs_threshold, + label_list): image = Image.open(image_path) draw = ImageDraw.Draw(image) im_width, im_height = image.size @@ -85,6 +97,8 @@ def draw_bounding_box_on_image(image_path, nms_out, confs_threshold): (left, top)], width=4, fill='red') + if image.mode == 'RGB': + draw.text((left, top), label_list[int(category_id)], (255, 255, 0)) image_name = image_path.split('/')[-1] print("image with bbox drawed saved as {}".format(image_name)) image.save(image_name) @@ -94,10 +108,20 @@ if __name__ == '__main__': args = parser.parse_args() print_arguments(args) + data_dir = 'data/pascalvoc' + label_file = 'label_list' + + if not os.path.exists(args.model_dir): + raise ValueError("The model path [%s] does not exist." % + (args.model_dir)) + if 'coco' in args.dataset: + data_dir = 'data/coco' + label_file = 'annotations/instances_val2014.json' + data_args = reader.Settings( dataset=args.dataset, - data_dir='', - label_file='', + data_dir=data_dir, + label_file=label_file, resize_h=args.resize_h, resize_w=args.resize_w, mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R], diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py index c81791044920493919d2ce18d56c4eb838ed0de5..c29bd070eda4cf82f5ac36a3eb5699ae13ae86d2 100644 --- a/fluid/object_detection/train.py +++ b/fluid/object_detection/train.py @@ -15,18 +15,17 @@ parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('learning_rate', float, 0.001, "Learning rate.") -add_arg('batch_size', int, 32, "Minibatch size.") +add_arg('batch_size', int, 64, "Minibatch size.") add_arg('num_passes', int, 120, "Epoch number.") add_arg('use_gpu', bool, True, "Whether use GPU.") add_arg('parallel', bool, True, "Parallel.") -add_arg('use_nccl', bool, True, "NCCL.") add_arg('dataset', str, 'pascalvoc', "coco2014, coco2017, and pascalvoc.") add_arg('model_save_dir', str, 'model', "The path to save model.") add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.") add_arg('apply_distort', bool, True, "Whether apply distort.") -add_arg('apply_expand', bool, False, "Whether appley expand.") +add_arg('apply_expand', bool, True, "Whether apply expand.") add_arg('nms_threshold', float, 0.45, "NMS threshold.") -add_arg('ap_version', str, 'integral', "integral, 11point.") +add_arg('ap_version', str, '11point', "integral, 11point.") add_arg('resize_h', int, 300, "The resized image height.") add_arg('resize_w', int, 300, "The resized image height.") add_arg('mean_value_B', float, 127.5, "Mean value for B channel which will be subtracted.") #123.68 @@ -35,141 +34,16 @@ add_arg('mean_value_R', float, 127.5, "Mean value for R channel which will add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample.") #yapf: enable -def parallel_do(args, - train_file_list, - val_file_list, - data_args, - learning_rate, - batch_size, - num_passes, - model_save_dir, - pretrained_model=None): - image_shape = [3, data_args.resize_h,
data_args.resize_w] - if data_args.dataset == 'coco': - num_classes = 81 - elif data_args.dataset == 'pascalvoc': - num_classes = 21 - - image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') - gt_box = fluid.layers.data( - name='gt_box', shape=[4], dtype='float32', lod_level=1) - gt_label = fluid.layers.data( - name='gt_label', shape=[1], dtype='int32', lod_level=1) - difficult = fluid.layers.data( - name='gt_difficult', shape=[1], dtype='int32', lod_level=1) - - if args.parallel: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl) - with pd.do(): - image_ = pd.read_input(image) - gt_box_ = pd.read_input(gt_box) - gt_label_ = pd.read_input(gt_label) - difficult_ = pd.read_input(difficult) - locs, confs, box, box_var = mobile_net(num_classes, image_, - image_shape) - loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box, - box_var) - nmsed_out = fluid.layers.detection_output( - locs, confs, box, box_var, nms_threshold=0.45) - loss = fluid.layers.reduce_sum(loss) - pd.write_output(loss) - pd.write_output(nmsed_out) - - loss, nmsed_out = pd() - loss = fluid.layers.mean(loss) - else: - locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) - nmsed_out = fluid.layers.detection_output( - locs, confs, box, box_var, nms_threshold=0.45) - loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, - box_var) - loss = fluid.layers.reduce_sum(loss) - - test_program = fluid.default_main_program().clone(for_test=True) - with fluid.program_guard(test_program): - map_eval = fluid.evaluator.DetectionMAP( - nmsed_out, - gt_label, - gt_box, - difficult, - num_classes, - overlap_threshold=0.5, - evaluate_difficult=False, - ap_version=args.ap_version) - - if data_args.dataset == 'coco': - # learning rate decay in 12, 19 pass, respectively - if '2014' in train_file_list: - boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19] - elif '2017' in train_file_list: - boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19] - elif data_args.dataset == 'pascalvoc': - boundaries = [40000, 60000] - values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25] - optimizer = fluid.optimizer.RMSProp( - learning_rate=fluid.layers.piecewise_decay(boundaries, values), - regularization=fluid.regularizer.L2Decay(0.00005), ) - - optimizer.minimize(loss) - - place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - if pretrained_model: - def if_exist(var): - return os.path.exists(os.path.join(pretrained_model, var.name)) - fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) - - train_reader = paddle.batch( - reader.train(data_args, train_file_list), batch_size=batch_size) - test_reader = paddle.batch( - reader.test(data_args, val_file_list), batch_size=batch_size) - feeder = fluid.DataFeeder( - place=place, feed_list=[image, gt_box, gt_label, difficult]) - - def test(pass_id): - _, accum_map = map_eval.get_map_var() - map_eval.reset(exe) - test_map = None - for data in test_reader(): - test_map = exe.run(test_program, - feed=feeder.feed(data), - fetch_list=[accum_map]) - print("Pass {0}, test map {1}".format(pass_id, test_map[0])) - - for pass_id in range(num_passes): - start_time = time.time() - prev_start_time = start_time - end_time = 0 - for batch_id, data in enumerate(train_reader()): - prev_start_time = start_time - start_time = time.time() - loss_v = exe.run(fluid.default_main_program(), - 
feed=feeder.feed(data), - fetch_list=[loss]) - end_time = time.time() - if batch_id % 20 == 0: - print("Pass {0}, batch {1}, loss {2}, time {3}".format( - pass_id, batch_id, loss_v[0], start_time - prev_start_time)) - test(pass_id) - - if pass_id % 10 == 0 or pass_id == num_passes - 1: - model_path = os.path.join(model_save_dir, str(pass_id)) - print 'save models to %s' % (model_path) - fluid.io.save_persistables(exe, model_path) -def parallel_exe(args, - train_file_list, - val_file_list, - data_args, - learning_rate, - batch_size, - num_passes, - model_save_dir, - pretrained_model=None): +def train(args, + train_file_list, + val_file_list, + data_args, + learning_rate, + batch_size, + num_passes, + model_save_dir, + pretrained_model=None): image_shape = [3, data_args.resize_h, data_args.resize_w] if 'coco' in data_args.dataset: num_classes = 91 @@ -186,10 +60,6 @@ def parallel_exe(args, name='gt_label', shape=[1], dtype='int32', lod_level=1) difficult = fluid.layers.data( name='gt_difficult', shape=[1], dtype='int32', lod_level=1) - gt_iscrowd = fluid.layers.data( - name='gt_iscrowd', shape=[1], dtype='int32', lod_level=1) - gt_image_info = fluid.layers.data( - name='gt_image_id', shape=[3], dtype='int32', lod_level=1) locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) nmsed_out = fluid.layers.detection_output( @@ -267,15 +137,15 @@ def parallel_exe(args, _, accum_map = map_eval.get_map_var() map_eval.reset(exe) for batch_id, data in enumerate(test_reader()): - test_map = exe.run(test_program, + test_map, = exe.run(test_program, feed=feeder.feed(data), fetch_list=[accum_map]) if batch_id % 20 == 0: - print("Batch {0}, map {1}".format(batch_id, test_map[0])) + print("Batch {0}, map {1}".format(batch_id, test_map)) if test_map[0] > best_map: best_map = test_map[0] save_model('best_model') - print("Pass {0}, test map {1}".format(pass_id, test_map[0])) + print("Pass {0}, test map {1}".format(pass_id, test_map)) return best_map for pass_id in range(num_passes): @@ -285,7 +155,9 @@ def parallel_exe(args, for batch_id, data in enumerate(train_reader()): prev_start_time = start_time start_time = time.time() - if len(data) < devices_num: continue + if len(data) < (devices_num * 2): + print("There are too few samples to train on all devices; skipping this batch.") + continue if args.parallel: loss_v, = train_exe.run(fetch_list=[loss.name], feed=feeder.feed(data)) @@ -314,10 +186,10 @@ if __name__ == '__main__': label_file = 'label_list' model_save_dir = args.model_save_dir if 'coco' in args.dataset: - data_dir = './data/coco' + data_dir = 'data/coco' if '2014' in args.dataset: train_file_list = 'annotations/instances_train2014.json' - val_file_list = 'annotations/instances_minival2014.json' + val_file_list = 'annotations/instances_val2014.json' elif '2017' in args.dataset: train_file_list = 'annotations/instances_train2017.json' val_file_list = 'annotations/instances_val2017.json' @@ -333,8 +205,7 @@ if __name__ == '__main__': apply_expand=args.apply_expand, ap_version = args.ap_version, toy=args.is_toy) - method = parallel_exe - method( + train( args, train_file_list=train_file_list, val_file_list=val_file_list, diff --git a/fluid/text_classification/clouds/scdb_parallel_executor.py b/fluid/text_classification/clouds/scdb_parallel_executor.py index b2531c8b8c5800d8e25b3231277e5a021b8ab688..042f02b53fc1ba9d29563a91d7da3643790a22e9 100644 --- a/fluid/text_classification/clouds/scdb_parallel_executor.py +++ b/fluid/text_classification/clouds/scdb_parallel_executor.py @@ -238,7 +238,7 @@ def
lstm_net(data, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr(learning_rate=emb_lr)) - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) lstm_h, c = fluid.layers.dynamic_lstm( input=fc0, size=hid_dim * 4, is_reverse=False) @@ -273,9 +273,9 @@ def bilstm_net(data, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr(learning_rate=emb_lr)) - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) - rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) lstm_h, c = fluid.layers.dynamic_lstm( input=fc0, size=hid_dim * 4, is_reverse=False) diff --git a/fluid/text_classification/clouds/scdb_single_card.py b/fluid/text_classification/clouds/scdb_single_card.py index 03bed2de3498f0bc4ff14590b47de5eeb1972579..490c4f3791c1566cc67951f5098cbef8ab171b59 100644 --- a/fluid/text_classification/clouds/scdb_single_card.py +++ b/fluid/text_classification/clouds/scdb_single_card.py @@ -238,7 +238,7 @@ def lstm_net(data, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr(learning_rate=emb_lr)) - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) lstm_h, c = fluid.layers.dynamic_lstm( input=fc0, size=hid_dim * 4, is_reverse=False) @@ -273,9 +273,9 @@ def bilstm_net(data, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr(learning_rate=emb_lr)) - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) - rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + rfc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) lstm_h, c = fluid.layers.dynamic_lstm( input=fc0, size=hid_dim * 4, is_reverse=False) diff --git a/fluid/text_classification/nets.py b/fluid/text_classification/nets.py index a21742d22d0bd1676c8c5874899af746b5225636..98028c871a83cafe6d1de7b545f333c4581e0a40 100644 --- a/fluid/text_classification/nets.py +++ b/fluid/text_classification/nets.py @@ -75,7 +75,7 @@ def lstm_net(data, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr(learning_rate=emb_lr)) - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4, act='tanh') + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) lstm_h, c = fluid.layers.dynamic_lstm( input=fc0, size=hid_dim * 4, is_reverse=False)
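A note on the text_classification change above: the `tanh` activation is removed from `fc0` presumably because `fluid.layers.dynamic_lstm` consumes the raw, packed gate projections and applies its own gate activations internally; the width is `hid_dim * 4` because the four LSTM gates (input, forget, cell candidate, output) are projected jointly. A minimal sketch of the resulting pattern (sizes and names are illustrative):

```python
import paddle.fluid as fluid

dict_dim, emb_dim, hid_dim = 10000, 128, 128  # illustrative sizes

data = fluid.layers.data(name='words', shape=[1], dtype='int64', lod_level=1)
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
# No activation here: dynamic_lstm expects the raw 4 * hid_dim gate projection.
fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
lstm_h, c = fluid.layers.dynamic_lstm(input=fc0, size=hid_dim * 4, is_reverse=False)
```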