diff --git a/.travis.yml b/.travis.yml index bf6a41d13c4eabc2d8543ab821ce0ff747a061df..929c847bd36d64e79a199b2634ebf68c3225429b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,7 +34,7 @@ addons: - automake - libtool - ccache - ssh_known_hosts: 52.76.173.135 + ssh_known_hosts: 13.229.163.131 before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..cc31d098328bc237c018ebf8f158bdab5c37bff1 --- /dev/null +++ b/benchmark/fluid/machine_translation.py @@ -0,0 +1,349 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""seq2seq model for fluid.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import distutils.util + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=16, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=2, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--use_gpu", + type=distutils.util.strtobool, + default=True, + help="Whether to use gpu. (default: %(default)d)") +parser.add_argument( + "--max_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. 
" + "(default: %(default)d)") + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. + input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act=None, + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj, + decoder_boot, decoder_size): + def simple_attention(encoder_vec, encoder_proj, decoder_state): + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + concated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concated, + size=1, + act='tanh', + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weigths_reshape = fluid.layers.reshape( + x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weigths_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + encoder_vec = 
rnn.static_input(encoder_vec) + encoder_proj = rnn.static_input(encoder_proj) + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + context = simple_attention(encoder_vec, encoder_proj, hidden_mem) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + if not is_generating: + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector, + encoded_proj, decoder_boot, + decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + + return avg_cost, feeding_list + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + lod_t = core.LoDTensor() + lod_t.set(flattened_data, place) + lod_t.set_lod([lod]) + return lod_t, lod[-1] + + +def lodtensor_to_ndarray(lod_tensor): + dims = lod_tensor.get_dims() + ndarray = np.zeros(shape=dims).astype('float32') + for i in xrange(np.product(dims)): + ndarray.ravel()[i] = lod_tensor.get_float_element(i) + return ndarray + + +def train(): + avg_cost, feeding_list = seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + def do_validation(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0] + trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0] + lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0] + + fetch_outs = exe.run(inference_program, + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += lodtensor_to_ndarray(fetch_outs[0])[0] + count += 1 + + return total_loss / count + + for pass_id in xrange(args.pass_num): + pass_start_time = time.time() + words_seen = 0 + for batch_id, 
data in enumerate(train_batch_generator()): + src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) + words_seen += word_num + trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) + words_seen += word_num + lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) + + fetch_outs = exe.run(framework.default_main_program(), + feed={ + feeding_list[0]: src_seq, + feeding_list[1]: trg_seq, + feeding_list[2]: lbl_seq + }, + fetch_list=[avg_cost]) + + avg_cost_val = np.array(fetch_outs[0]) + print('pass_id=%d, batch_id=%d, train_loss: %f' % + (pass_id, batch_id, avg_cost_val)) + + pass_end_time = time.time() + test_loss = do_validation() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + +def infer(): + pass + + +if __name__ == '__main__': + args = parser.parse_args() + if args.infer_only: + infer() + else: + train() diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..7f7afaeb11447d936b65a1d83701b0176ecbc111 --- /dev/null +++ b/benchmark/fluid/mnist.py @@ -0,0 +1,205 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +SEED = 1 +DTYPE = "float32" + +# random seed must set before configuring the network. 
+# fluid.default_startup_program().random_seed = SEED + + +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + '--iterations', type=int, default=35, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=5, help='The number of passes.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def eval_test(exe, batch_acc, batch_size_tensor, inference_program): + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + test_pass_acc = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), + data)).astype(DTYPE) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([len(y_data), 1]) + + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_pass_acc.add(value=acc, weight=weight) + pass_acc = test_pass_acc.eval() + return pass_acc + + +def run_benchmark(model, args): + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + start_time = time.time() + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + opt = 
fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + opt.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + + accuracy = fluid.average.WeightedAverage() + for pass_id in range(args.pass_num): + accuracy.reset() + pass_start = time.time() + for batch_id, data in enumerate(train_reader()): + img_data = np.array( + map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([len(y_data), 1]) + + start = time.time() + outs = exe.run( + fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor] + ) # The accuracy is the accumulation of batches, but not the current batch. + accuracy.add(value=outs[1], weight=outs[2]) + end = time.time() + loss = np.array(outs[0]) + acc = np.array(outs[1]) + print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % + (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) + + pass_end = time.time() + + train_avg_acc = accuracy.eval() + test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, + inference_program) + + print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" % + (pass_id, train_avg_acc, test_avg_acc, + (pass_end - pass_start) / 1000)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + run_benchmark(cnn_model, args) + else: + run_benchmark(cnn_model, args) diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f0f1db979fa7fb640679beacafd66dfbe1f62ab8 --- /dev/null +++ b/benchmark/fluid/resnet.py @@ -0,0 +1,323 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import functools +import numpy as np +import time + +import cProfile, pstats, StringIO + +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler + + +def parse_args(): + parser = argparse.ArgumentParser('Convolution model benchmark.') + parser.add_argument( + '--model', + type=str, + choices=['resnet_imagenet', 'resnet_cifar10'], + default='resnet_imagenet', + help='The model architecture.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='use real data or fake data') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) 
+ for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'): + + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') + return out + + +def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') + return out + + +def run_benchmark(model, args): + if args.use_cprof: + pr = cProfile.Profile() + pr.enable() + + if args.data_set == "cifar10": + class_dim = 10 + if args.data_format == 'NCHW': + dshape = [3, 32, 32] + else: + dshape = [32, 32, 3] + else: + class_dim = 102 + if args.data_format == 'NCHW': + dshape = [3, 224, 224] + else: + dshape = [224, 224, 3] + + input = fluid.layers.data(name='data', shape=dshape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + predict = model(input, class_dim) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + def test(exe): + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + acc, weight = exe.run(inference_program, + feed={"data": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_accuracy.add(value=acc, weight=weight) + + return 
test_accuracy.eval() + + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + accuracy = fluid.average.WeightedAverage() + if args.use_fake_data: + data = train_reader().next() + image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype( + 'float32') + label = np.array(map(lambda x: x[1], data)).astype('int64') + label = label.reshape([-1, 1]) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + accuracy.reset() + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + if not args.use_fake_data: + image = np.array(map(lambda x: x[0].reshape(dshape), + data)).astype('float32') + label = np.array(map(lambda x: x[1], data)).astype('int64') + label = label.reshape([-1, 1]) + loss, acc, weight = exe.run( + fluid.default_main_program(), + feed={'data': image, + 'label': label}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) + iters += 1 + num_samples += label[0] + accuracy.add(value=acc, weight=weight) + train_losses.append(loss) + train_accs.append(acc) + print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % + (pass_id, iters, loss, acc)) + pass_train_acc = accuracy.eval() + # evaluation + if args.with_test: + pass_test_acc = test(exe) + train_elapsed = time.time() - start_time + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + examples_per_sec = num_samples / train_elapsed + + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + + if args.use_cprof: + pr.disable() + s = StringIO.StringIO() + sortby = 'cumulative' + ps = pstats.Stats(pr, stream=s).sort_stats(sortby) + ps.print_stats() + print(s.getvalue()) + + +if __name__ == '__main__': + model_map = { + 'resnet_imagenet': resnet_imagenet, + 'resnet_cifar10': resnet_cifar10 + } + args = parse_args() + print_arguments(args) + if args.data_format == 'NHWC': + raise ValueError('Only support NCHW data_format now.') + if args.use_nvprof and args.device == 'GPU': + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + run_benchmark(model_map[args.model], args) + else: + run_benchmark(model_map[args.model], args) diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..663e2efd5392a6cd1a71f51fa0d017070b489341 --- /dev/null +++ b/benchmark/fluid/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# This script benchmarking the PaddlePaddle Fluid on +# single thread single GPU. 
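+# Usage sketch (assumed invocation, not stated in the script): run it from the
+# parent benchmark directory so that the fluid/*.py paths below resolve, e.g.
+#   bash fluid/run.sh
+# Each model then writes its log (such as vgg16_gpu_128.log) to that directory.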
+export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib + +# disable openmp and mkl parallel +#https://github.com/PaddlePaddle/Paddle/issues/7199 +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi +# disable multi-gpu if have more than one +export CUDA_VISIBLE_DEVICES=0 +export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH + + +# vgg16 +# cifar10 gpu cifar10 128 +FLAGS_benchmark=true python fluid/vgg.py \ + --device=GPU \ + --batch_size=128 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 > vgg16_gpu_128.log + +# resnet50 +# resnet50 gpu cifar10 128 +FLAGS_benchmark=true python fluid/resnet.py \ + --device=GPU \ + --batch_size=128 \ + --data_set=cifar10 \ + --model=resnet_cifar10 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 > resnet50_gpu_128.log + +# lstm diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..4e063549e0239abf9d946ed8735f0306203509d0 --- /dev/null +++ b/benchmark/fluid/stacked_dynamic_lstm.py @@ -0,0 +1,209 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import cPickle +import os +import random +import time + +import numpy +import paddle.v2 as paddle +import paddle.v2.dataset.imdb as imdb +import paddle.fluid as fluid +from paddle.v2 import batch +import paddle.fluid.profiler as profiler + + +def parse_args(): + parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.") + parser.add_argument( + '--batch_size', + type=int, + default=32, + help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--emb_dim', + type=int, + default=512, + help='Dimension of embedding table. (default: %(default)d)') + parser.add_argument( + '--hidden_dim', + type=int, + default=512, + help='Hidden size of lstm unit. (default: %(default)d)') + parser.add_argument( + '--pass_num', + type=int, + default=100, + help='Epoch number to train. (default: %(default)d)') + parser.add_argument( + '--device', + type=str, + default='CPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--crop_size', + type=int, + default=int(os.environ.get('CROP_SIZE', '1500')), + help='The max sentence length of input. 
Since this model use plain RNN,' + ' Gradient could be explored if sentence is too long') + args = parser.parse_args() + return args + + +word_dict = imdb.word_dict() + + +def crop_sentence(reader, crop_size): + unk_value = word_dict[''] + + def __impl__(): + for item in reader(): + if len([x for x in item[0] if x != unk_value]) < crop_size: + yield item + + return __impl__ + + +def main(): + args = parse_args() + lstm_size = args.hidden_dim + + data = fluid.layers.data( + name="words", shape=[1], lod_level=1, dtype='int64') + sentence = fluid.layers.embedding( + input=data, size=[len(word_dict), args.emb_dim]) + + sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + word = rnn.step_input(sentence) + prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) + prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) + + def gate_common( + ipt, + hidden, + size, ): + gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) + gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) + gate = fluid.layers.sums(input=[gate0, gate1]) + return gate + + forget_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + input_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + output_gate = fluid.layers.sigmoid( + x=gate_common(word, prev_hidden, lstm_size)) + cell_gate = fluid.layers.tanh( + x=gate_common(word, prev_hidden, lstm_size)) + + cell = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul( + x=input_gate, y=cell_gate) + ]) + + hidden = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell)) + + rnn.update_memory(prev_cell, cell) + rnn.update_memory(prev_hidden, hidden) + rnn.output(hidden) + + last = fluid.layers.sequence_pool(rnn(), 'last') + logit = fluid.layers.fc(input=last, size=2, act='softmax') + loss = fluid.layers.cross_entropy( + input=logit, + label=fluid.layers.data( + name='label', shape=[1], dtype='int64')) + loss = fluid.layers.mean(x=loss) + + # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ + shape=[1], dtype='int64'), total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + adam = fluid.optimizer.Adam() + adam.minimize(loss) + + fluid.memory_optimize(fluid.default_main_program()) + + place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + def train_loop(pass_num, crop_size): + with profiler.profiler(args.device, 'total') as prof: + for pass_id in range(pass_num): + train_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.train(word_dict), crop_size), + buf_size=25000), + batch_size=args.batch_size) + word_nums = 0 + pass_start_time = time.time() + for batch_id, data in enumerate(train_reader()): + tensor_words = to_lodtensor([x[0] for x in data], place) + for x in data: + word_nums += len(x[0]) + label = numpy.array([x[1] for x in data]).astype("int64") + label = label.reshape((-1, 1)) + loss_np, acc, weight = exe.run( + fluid.default_main_program(), + feed={"words": tensor_words, + "label": label}, + fetch_list=[loss, batch_acc, batch_size_tensor]) + print("pass_id=%d, 
batch_id=%d, loss=%f, acc=%f" % + (pass_id, batch_id, loss_np, acc)) + + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + words_per_sec = word_nums / time_consumed + print("pass_id=%d, sec/pass: %f, words/s: %f" % + (pass_id, time_consumed, words_per_sec)) + + train_loop(args.pass_num, args.crop_size) + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = numpy.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +if __name__ == '__main__': + main() diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..3bf78e4cf08d43127a05c740fa30ca6d2bc416b0 --- /dev/null +++ b/benchmark/fluid/vgg.py @@ -0,0 +1,220 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in Fluid""" +from __future__ import print_function + +import sys +import time +import numpy as np +import paddle.v2 as paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import argparse +import functools + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--pass_num', type=int, default=50, help="No. 
of passes.") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') +args = parser.parse_args() + + +def vgg16_bn_drop(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) + return fc2 + + +def main(): + if args.data_set == "cifar10": + classdim = 10 + if args.data_format == 'NCHW': + data_shape = [3, 32, 32] + else: + data_shape = [32, 32, 3] + else: + classdim = 102 + if args.data_format == 'NCHW': + data_shape = [3, 224, 224] + else: + data_shape = [224, 224, 3] + + # Input data + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + net = vgg16_bn_drop(images) + predict = fluid.layers.fc(input=net, size=classdim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + # inference program + inference_program = fluid.default_main_program().clone() + with fluid.program_guard(inference_program): + inference_program = fluid.io.get_inference_program( + target_vars=[batch_acc, batch_size_tensor]) + + # Optimization + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + opts = optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # Initialize executor + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) + exe = fluid.Executor(place) + + # Parameter initialization + exe.run(fluid.default_startup_program()) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=args.batch_size) + + # test + def test(exe): + test_accuracy = fluid.average.WeightedAverage() + for batch_id, data in enumerate(test_reader()): + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], 
data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + acc, weight = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size_tensor]) + test_accuracy.add(value=acc, weight=weight) + return test_accuracy.eval() + + iters, num_samples, start_time = 0, 0, time.time() + accuracy = fluid.average.WeightedAverage() + for pass_id in range(args.pass_num): + accuracy.reset() + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + img_data = np.array(map(lambda x: x[0].reshape(data_shape), + data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc, weight = exe.run( + fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost, batch_acc, batch_size_tensor]) + accuracy.add(value=acc, weight=weight) + iters += 1 + num_samples += len(data) + print( + "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss, acc) + ) # The accuracy is the accumulation of batches, but not the current batch. + + pass_train_acc = accuracy.eval() + train_losses.append(loss) + train_accs.append(acc) + # evaluation + if args.with_test: + pass_test_acc = test(exe) + train_elapsed = time.time() - start_time + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == "__main__": + print_arguments() + main() diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644 --- a/doc/v2/faq/build_and_install/index_en.rst +++ b/doc/v2/faq/build_and_install/index_en.rst @@ -1,5 +1,143 @@ -############################ -Install, Build and Unit test -############################ +.. _install_faq: -TBD +############################### +Compile, Install, and Unit Test +############################### + +.. contents:: + +1. Insufficient CUDA driver version +---------------------------------------------------------------- + +Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory. +You can solve the issue by running the following commands: + +.. code-block:: bash + + $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu + +For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation `_ . + + +2. Version mismatch between PythonLibs and PythonInterpreter +---------------------------------------------------------------- + +It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . 
You need to explicitly specify a particular Python version, as follows.
+
+    .. code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should replace ``<exc_path>``, ``<lib_path>`` and ``<inc_path>`` with your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue happens when you run :code:`paddle version` or :code:`cmake ..`
+
+.. code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake` again.
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause of this issue is that pip cannot find a PaddlePaddle installation package that matches your current system. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 systems with Python 2.7 and pip 9.0.1 installed.
+
+You can upgrade pip with the following command:
+
+.. code-block:: bash
+
+    pip install --upgrade pip
+
+If that does not work, run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to list the package suffixes your system supports, and compare them with the suffix of your installation package.
+
+If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version;
+if the system supports :code:`manylinux1_x86_64` and the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+.. code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+Then install the PaddlePaddle Python packages: enter the build directory and run the following commands.
+
+.. code-block:: bash
+
+    pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by running a PaddlePaddle binary built with AVX SIMD instructions (used to speed up CPU computation) on a CPU that does not support AVX. Please choose the correct version.
+
+7. Python unittest fails
+--------------------------------
+
+If the following Python unittest test cases fail:
+
+.. code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs, which may suggest the following:
+
+.. code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove old PaddlePaddle to make a clean environment for the unit tests. 
If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory. Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package. + + +8. Failed to download the MKLML library +---------------------------------------------- + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2 + make[1]: *** waiting for the unfinished jobs.... + +Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully. + +The solution is: manually download and install, the specific steps are as follows. + +.. code-block:: bash + + // 1. enter the directory + cd build/third_party/mklml/src/extern_mklml + + // 2. check the size of the package, normally 75M, if less than 75M, the download fails + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. manually download and unzip and make the download success tag: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. then compile + diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 893cc15f6c8b34fcfc33554f8ef48ffeb00cd75c..569dda17c6e91d5658c4f8b9ba0b8c8fbd966832 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -22,7 +22,7 @@ namespace paddle { namespace framework { namespace details { -struct OpHandleBase; +class OpHandleBase; // VarHandleBase is the var node in the dependency graph. // A variable can only be generated by a single operator. i.e. diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 91f2db9354c2a00ec7e51ea4595c7cfa00da23ea..577eea92d217f9feb54085b4d9f071494c3c5165 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" -#include -#include "ThreadPool.h" +#include +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index c9c2c1bb721f2c527fa52f45cc54883f639f4ef8..9458d56a01df432aea573d796456b9be31350038 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -10,6 +10,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -52,7 +55,7 @@ class SelectedRows { private: // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. - // SelectedRows are simplely concated when adding together. Until a + // SelectedRows are simply concated when adding together. Until a // SelectedRows add a Tensor, will the duplicate rows be handled. 
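  // Illustrative example of the behavior described above (not taken from the
  // header itself): adding a SelectedRows with rows {0, 4, 7} to one with rows
  // {0, 5, 7, 9} yields rows {0, 4, 7, 0, 5, 7, 9} with the value rows stacked
  // in the same order; the duplicate indices (0 and 7) are only accumulated
  // later, when the result is added to a dense Tensor.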
Vector rows_; std::unique_ptr value_{nullptr}; diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index d59411dfb9122537e99f483478fdac06fc8275db..3adeeda90645ca983d9d9229b4cc1c4c90302206 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(serde_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) + cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) endif() diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 6c2b78fb3a72adbaa6123601b0c524ce1c33b50a..ef987d07f08525bff5267cdc2076ae767417e4f1 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -74,8 +74,7 @@ void ProcGetResponse(const VarHandle& var_h, template void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray( - const_cast(reinterpret_cast(slice.begin()))); + proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); ::grpc::ByteBuffer tmp(&slice, 1); result->Swap(&tmp); } @@ -154,9 +153,10 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_); + s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, + &cq_); call->StartCall(); - call->Finish(&s->reply_, &s->status_, (void*)s); + call->Finish(&s->reply_, &s->status_, static_cast(s)); }); req_count_++; diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 0833710b35bf77b8c7fef34bfd3281d94b090014..19bba46e3bd49a689fbe1d0c93efe01806fb0228 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -131,6 +131,49 @@ class RequestGet final : public RequestBase { SimpleBlockQueue* queue_; }; +class RequestPrefetch final : public RequestBase { + public: + explicit RequestPrefetch(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + framework::Executor* executor, + framework::ProgramDesc* program, int blkid) + : RequestBase(service, cq, dev_ctx), + responder_(&ctx_), + scope_(scope), + executor_(executor), + program_(program), + blkid_(blkid) { + int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); + service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, + cq_, this); + } + + virtual ~RequestPrefetch() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // prefetch process... 
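+    // Sketch of the intended flow (an assumption; only the RPC plumbing is
+    // wired up in this patch): deserialize request_, run the prefetch block
+    // blkid_ of program_ through executor_ against scope_, then serialize
+    // the looked-up rows into reply before calling Finish().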
+ ::grpc::ByteBuffer reply; + // TODO(Yancey1989): execute the Block which containers prefetch ops + + VLOG(3) << "RequestPrefetch Process in"; + + responder_.Finish(reply, ::grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + framework::Scope* scope_; + framework::Executor* executor_; + framework::ProgramDesc* program_; + int blkid_; +}; + void AsyncGRPCServer::WaitClientGet(int count) { int fetch_barriers = 0; while (fetch_barriers < count) { @@ -150,6 +193,7 @@ void AsyncGRPCServer::RunSyncUpdate() { cq_send_ = builder.AddCompletionQueue(); cq_get_ = builder.AddCompletionQueue(); + cq_prefetch_ = builder.AddCompletionQueue(); server_ = builder.BuildAndStart(); LOG(INFO) << "Server listening on " << address_ << std::endl; @@ -158,6 +202,8 @@ void AsyncGRPCServer::RunSyncUpdate() { std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); std::function get_register = std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + std::function prefetch_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); t_send_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, @@ -166,17 +212,21 @@ void AsyncGRPCServer::RunSyncUpdate() { t_get_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_get_.get(), "cq_get", get_register))); - + t_prefetch_.reset(new std::thread( + std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), + "cq_prefetch", prefetch_register))); // wait server server_->Wait(); t_send_->join(); t_get_->join(); + t_prefetch_->join(); } void AsyncGRPCServer::ShutdownQueue() { std::unique_lock lock(cq_mutex_); cq_send_->Shutdown(); cq_get_->Shutdown(); + cq_prefetch_->Shutdown(); } // This URL explains why shutdown is complicate: @@ -189,6 +239,7 @@ void AsyncGRPCServer::ShutDown() { void AsyncGRPCServer::TryToRegisterNewSendOne() { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; return; } RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_, @@ -199,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() { void AsyncGRPCServer::TryToRegisterNewGetOne() { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewGetOne"; return; } RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_, @@ -206,6 +258,19 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { VLOG(4) << "Create RequestGet status:" << get->Status(); } +void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne"; + return; + } + RequestPrefetch* prefetch = + new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, + executor_, program_, prefetch_blk_id_); + + VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); +} + // FIXME(typhoonzero): change cq_name to enum. 
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, const std::string& cq_name, @@ -214,11 +279,14 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, void* tag = NULL; bool ok = false; + while (true) { + VLOG(3) << "HandleRequest for " << cq_name << " while in"; if (!cq->Next(&tag, &ok)) { LOG(INFO) << cq_name << " CompletionQueue shutdown!"; break; } + VLOG(3) << "HandleRequest for " << cq_name << " while after Next"; PADDLE_ENFORCE(tag); // FIXME(typhoonzero): de-couple the barriers with recv_op @@ -231,8 +299,8 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << cq_name << " recv no regular event:argument name" - << base->GetReqName(); + LOG(WARNING) << cq_name << " recv no regular event:argument name[" + << base->GetReqName() << "]"; TryToRegisterNewOne(); delete base; continue; diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 0fc9740ccb8652e30ac5287314113e34138ebad4..5b5033018c6aefdc165886fc4e9086ff0a7e9201 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -19,7 +19,9 @@ limitations under the License. */ #include #include "grpc++/grpc++.h" +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" @@ -55,6 +57,12 @@ class AsyncGRPCServer final { void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + void SetProgram(framework::ProgramDesc *program) { program_ = program; } + + void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; } + + void SetExecutor(framework::Executor *executor) { executor_ = executor; } + const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } void Push(const std::string &msg_name) { @@ -69,6 +77,7 @@ class AsyncGRPCServer final { std::function TryToRegisterNewOne); void TryToRegisterNewSendOne(); void TryToRegisterNewGetOne(); + void TryToRegisterNewPrefetchOne(); void ShutdownQueue(); private: @@ -76,6 +85,7 @@ class AsyncGRPCServer final { volatile bool is_shut_down_ = false; std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_; GrpcService::AsyncService service_; std::unique_ptr<::grpc::Server> server_; @@ -86,6 +96,7 @@ class AsyncGRPCServer final { // received variable from RPC, operators fetch variable from this queue. SimpleBlockQueue var_get_queue_; + // client send variable to this queue. 
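+  // (note) Variables received from trainers are pushed into this queue by the
+  // send handler and drained by the owning op through Get() above.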
ReceivedQueue var_recv_queue_; // condition of the sub program @@ -95,6 +106,11 @@ class AsyncGRPCServer final { std::unique_ptr t_send_; std::unique_ptr t_get_; + std::unique_ptr t_prefetch_; + + int prefetch_blk_id_; + framework::ProgramDesc *program_; + framework::Executor *executor_; }; }; // namespace detail diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b89aed0157de8e95564015b3e7f42316a39537f5 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace detail = paddle::operators::detail; + +std::unique_ptr rpc_service_; + +void StartServer(const std::string& endpoint) { + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + rpc_service_->RunSyncUpdate(); +} + +TEST(PREFETCH, CPU) { + // start up a server instance backend + // TODO(Yancey1989): Need to start a server with optimize blocks and + // prefetch blocks. + std::thread server_thread(StartServer, "127.0.0.1:8889"); + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + // create var on local scope + std::string in_var_name("in"); + std::string out_var_name("out"); + auto* in_var = scope.Var(in_var_name); + auto* in_tensor = in_var->GetMutable(); + in_tensor->Resize({10, 10}); + VLOG(3) << "before mutable_data"; + in_tensor->mutable_data(place); + + scope.Var(out_var_name); + + VLOG(3) << "before fetch"; + detail::RPCClient client; + client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, + out_var_name); + client.Wait(); + + rpc_service_->ShutDown(); + server_thread.join(); + rpc_service_.reset(nullptr); +} diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h index acaefd92a26422fb231087ec5a605f06ad474017..e6dab2f5a3a4280f3979417c3ca2d884a0b8ff2f 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -76,10 +76,11 @@ namespace detail { enum class GrpcMethod { kSendVariable, kGetVariable, + kPrefetchVariable, }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kGetVariable) + 1; + static_cast(GrpcMethod::kPrefetchVariable) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kPrefetchVariable: + return "/sendrecv.SendRecvService/PrefetchVariable"; } // Shouldn't be reached. 
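Note: wiring up the new PrefetchVariable RPC means three places have to stay in
sync: the SendRecvService definition in send_recv.proto (next hunk), the
GrpcMethod enum, and the GrpcMethodName() switch above. A minimal sketch of the
resulting dispatch invariant, for illustration only (this loop is not part of
the patch):

    // kGrpcNumMethods is derived from the last enum value, so it grows from 2
    // to 3 with this change, and every method id below it must resolve to a
    // fully qualified gRPC method name.
    for (int i = 0; i < kGrpcNumMethods; ++i) {
      const char* name = GrpcMethodName(static_cast<GrpcMethod>(i));
      (void)name;  // i == 2 now yields "/sendrecv.SendRecvService/PrefetchVariable".
    }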
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 2d33f026e45c51d9a3812b2391381f74d6fddb29..fc12e82a7e6bd10262092d1ca367980df64e91c2 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -21,6 +21,8 @@ service SendRecvService { rpc SendVariable(VariableMessage) returns (VoidMessage) {} // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // Prefetch variable by Ids + rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index c27ea1268321744f6566b8b65e98f0df6d408186..b19add24e2bd325896a96be53d3d9762abfe217c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -13,22 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include -#include - -#include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" -#include "paddle/fluid/string/printf.h" namespace paddle { namespace operators { @@ -111,6 +102,11 @@ class ListenAndServOp : public framework::OperatorBase { framework::Executor executor(dev_place); + // TODO(qiao) set proper fields for table lookup and update + rpc_service_->SetExecutor(&executor); + rpc_service_->SetPrefetchBlkdId(0); + rpc_service_->SetProgram(program); + // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; // Record received sparse variables, so that @@ -173,7 +169,8 @@ class ListenAndServOp : public framework::OperatorBase { } ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); - VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts; + VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts + << "(ms)"; // Reset the received sparse variables, the sum operator would not // sum the input sparse variables which rows is empty at the next diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 50eeadab72e71f39325c5eda69e9a3c3e6517d7d..deabcdc99f819851b2df9bb0c7b05a5b339568f3 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -18,6 +18,22 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +static inline framework::OpKernelType ExpectedKernelType( + const framework::ExecutionContext& ctx) { + auto* table_var = ctx.InputVar("W"); + if (table_var->IsType()) { + return framework::OpKernelType( + framework::ToDataType(table_var->Get().type()), + ctx.device_context()); + } else if (table_var->IsType()) { + return framework::OpKernelType( + framework::ToDataType(table_var->Get().value().type()), + ctx.device_context()); + } else { + PADDLE_THROW("W should be LoDTensor or SelectedRows"); + } +} + class LookupTableOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -51,9 +67,7 @@ class LookupTableOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("W")->type()), - ctx.device_context()); + return ExpectedKernelType(ctx); } }; @@ -84,7 +98,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "If the value is -1, it makes no effect to lookup. " "Otherwise the given value indicates padding the output " "with zeros whenever lookup encounters it in Ids.") - .SetDefault(-1); + .SetDefault(kNoPadding); AddComment(R"DOC( Lookup Table Operator. @@ -124,9 +138,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("W")->type()), - ctx.device_context()); + return ExpectedKernelType(ctx); } }; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index c92ce78eeffb8f1517e61c6d6624d406e04d974d..fff5edda62d4b115605a4cab35ed5457b4db5f21 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -14,6 +14,9 @@ limitations under the License. 
*/ #pragma once +#include +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,16 +28,37 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static constexpr int64_t kNoPadding = -1; + +inline size_t getIndex(const std::vector &rows, int64_t value) { + auto it = std::find(rows.begin(), rows.end(), value); + PADDLE_ENFORCE(it != rows.end(), "id should be in rows"); + return static_cast(std::distance(rows.begin(), it)); +} template class LookupTableKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + auto *ids_var = context.InputVar("Ids"); + Tensor *output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); + + DDim table_dim; - int64_t* ids; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW("table only support LoDTensor and SelectedRows"); + } + + int64_t *ids; int64_t ids_numel; // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type @@ -42,39 +66,50 @@ class LookupTableKernel : public framework::OpKernel { // when Ids's type is SelectedRows, the rows of Ids contains the // ids to be looked up in W. if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); + auto *ids_t = context.Input("Ids"); + ids = const_cast(ids_t->data()); ids_numel = ids_t->numel(); } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); + auto *ids_t = context.Input("Ids"); + ids = const_cast(ids_t->rows().data()); ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_t->dims()[1]}); + output_t->Resize({ids_numel, table_dim[1]}); } else { PADDLE_THROW("Unsupported Variable Type of Ids"); } - int64_t padding_idx = context.Attr("padding_idx"); + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; - int N = table_t->dims()[0]; - int D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); - if (padding_idx == -1) { for (int64_t i = 0; i < ids_numel; ++i) { - PADDLE_ENFORCE_LT(ids[i], N); - PADDLE_ENFORCE_GE(ids[i], 0); - memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } } - } else { + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + for (int64_t 
i = 0; i < ids_numel; ++i) { - if (ids[i] == padding_idx) { - memset(output + i * D, 0, D * sizeof(T)); + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); } else { - PADDLE_ENFORCE_LT(ids[i], N); PADDLE_ENFORCE_GE(ids[i], 0); - memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + auto id_index = getIndex(table_t.rows(), ids[i]); + memcpy(output + i * row_width, table + id_index * row_width, + row_width * sizeof(T)); } } } @@ -84,17 +119,27 @@ class LookupTableKernel : public framework::OpKernel { template class LookupTableGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW("table only support LoDTensor and SelectedRows"); + } + bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); framework::Vector new_rows; @@ -104,31 +149,30 @@ class LookupTableGradKernel : public framework::OpKernel { } d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table->dims()[1]}); + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_dim[0], table_dim[1]}); d_table_value->mutable_data(context.GetPlace()); - d_table->set_height(table->dims()[0]); + d_table->set_height(table_dim[0]); - auto* d_output_data = d_output->data(); - auto* d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table_value->data(); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { - auto* ids = context.Input("Ids"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); - auto* table = context.Input("W"); + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); - int N = table->dims()[0]; + int N = table_dim[0]; int D = d_output->dims()[1]; - auto* d_output_data = d_output->data(); - auto* d_table_data = d_table->mutable_data(context.GetPlace()); + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table->mutable_data(context.GetPlace()); memset(d_table_data, 0, d_table->numel() * sizeof(T)); diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index 
90f6f955cea51ded2dbb2bde459113458d7749a4..20b8a5c98ab16ac8121cb2fd01deb8ecc1966d44 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include #include -#include -#include +#include // NOLINT +#include // NOLINT #include #include "paddle/fluid/framework/init.h" @@ -43,7 +43,7 @@ const f::DDim kDims = {20, 20}; // nccl op common tester, init communicator. class NCCLTester : public ::testing::Test { public: - virtual void SetUp() override { + void SetUp() override { int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) @@ -64,7 +64,7 @@ class NCCLTester : public ::testing::Test { NCCLInitOp(); } - virtual void TearDown() override { + void TearDown() override { for (auto &device_context : dev_ctxs_) { delete device_context; } @@ -137,6 +137,8 @@ class NCCLTester : public ::testing::Test { TEST_F(NCCLTester, ncclInitOp) {} // ncclAllReduceOp with desc +// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367 +/* TEST_F(NCCLTester, ncclAllReduceOp) { std::unique_ptr op2(new f::OpDesc); op2->SetType("ncclAllReduce"); @@ -184,6 +186,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { } } } +*/ // ncclReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { @@ -236,6 +239,8 @@ TEST_F(NCCLTester, ncclReduceOp) { } // ncclBcastOp with desc +// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540 +/* TEST_F(NCCLTester, ncclBcastOp) { std::unique_ptr op2(new f::OpDesc); const int kRoot = 0; @@ -281,3 +286,4 @@ TEST_F(NCCLTester, ncclBcastOp) { ASSERT_NEAR(ct[j], result, 1e-5); } } +*/ diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 141a3eb93555c32efabc2465dc6daadf41c9d659..96c0c1cbe6d588364416925a7ab1bc8f90ac6fd7 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +#include // NOLINT + #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" @@ -20,12 +21,29 @@ namespace paddle { namespace operators { namespace reader { -static constexpr size_t kDoubleBufferSize = 2; +// 'Double buffer' means we shall maintain two batches of input data at the same +// time. So the kCacheSize shoul be at least 2. +static constexpr size_t kCacheSize = 2; +// There will be two bacthes out of the channel during training: +// 1. the one waiting to be sent to the channel +// 2. the one just be received from the channel, which is also being used by +// subsequent operators. 
+// So the channel size should be kChacheSize - 2 +static constexpr size_t kChannelSize = 0; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: struct Item { Item() : ctx_(nullptr) {} + Item(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + } + Item& operator=(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + return *this; + } std::vector payloads_; platform::DeviceContext* ctx_; @@ -34,42 +52,44 @@ class DoubleBufferReader : public framework::DecoratedReader { explicit DoubleBufferReader( ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { - for (size_t i = 0; i < kDoubleBufferSize; ++i) { - if (platform::is_gpu_place(place_)) { #ifdef PADDLE_WITH_CUDA + for (size_t i = 0; i < kCacheSize; ++i) { + if (platform::is_gpu_place(place_)) { ctxs_.emplace_back(new platform::CUDADeviceContext( boost::get(place_))); -#endif } } - - start_thread(); - } - - void start_thread() { - buffer_ = framework::MakeChannel(kDoubleBufferSize); - prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); +#endif + StartPrefetcher(); } + bool HasNext() const override; void ReadNext(std::vector* out) override; void ReInit() override; - ~DoubleBufferReader() { - buffer_->Close(); - prefetcher_.join(); - delete buffer_; + ~DoubleBufferReader() { EndPrefetcher(); } + + private: + void StartPrefetcher() { + channel_ = framework::MakeChannel(kChannelSize); + prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); } - bool HasNext() const override; + void EndPrefetcher() { + channel_->Close(); + if (prefetcher_.joinable()) { + prefetcher_.join(); + } + delete channel_; + channel_ = nullptr; + } - private: void PrefetchThreadFunc(); std::thread prefetcher_; - framework::Channel* buffer_; + framework::Channel* channel_; platform::Place place_; std::vector> ctxs_; - mutable Item local_buffer_; }; class CreateDoubleBufferReaderOp : public framework::OperatorBase { @@ -123,70 +143,70 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; +bool DoubleBufferReader::HasNext() const { + while (!channel_->IsClosed() && !channel_->CanReceive()) { + } + return channel_->CanReceive(); +} + void DoubleBufferReader::ReadNext(std::vector* out) { if (!HasNext()) { PADDLE_THROW("There is no next data!"); } - if (local_buffer_.payloads_.empty()) { - buffer_->Receive(&local_buffer_); - } - *out = local_buffer_.payloads_; - local_buffer_.payloads_.clear(); - if (local_buffer_.ctx_) { - local_buffer_.ctx_->Wait(); + Item batch; + channel_->Receive(&batch); + *out = batch.payloads_; + if (batch.ctx_) { + batch.ctx_->Wait(); } } void DoubleBufferReader::ReInit() { reader_->ReInit(); - buffer_->Close(); - prefetcher_.join(); - delete buffer_; - start_thread(); + EndPrefetcher(); + StartPrefetcher(); } void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; - size_t gpu_ctx_offset = 0; + std::vector> cpu_tensor_cache(kCacheSize); + std::vector> gpu_tensor_cache(kCacheSize); + size_t cached_tensor_id = 0; + while (reader_->HasNext()) { Item batch; - reader_->ReadNext(&batch.payloads_); + auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; + reader_->ReadNext(&cpu_batch); if (platform::is_gpu_place(place_)) { - std::vector gpu_batch; - auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++]; - gpu_ctx_offset %= this->ctxs_.size(); - gpu_batch.resize(batch.payloads_.size()); - for (size_t i = 0; i < 
batch.payloads_.size(); ++i) { - framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx, - &gpu_batch[i]); - gpu_batch[i].set_lod(batch.payloads_[i].lod()); + auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; + auto* gpu_ctx = ctxs_[cached_tensor_id].get(); + gpu_batch.resize(cpu_batch.size()); + for (size_t i = 0; i < cpu_batch.size(); ++i) { + framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); + gpu_batch[i].set_lod(cpu_batch[i].lod()); } - batch.ctx_ = gpu_ctx.get(); - std::swap(gpu_batch, batch.payloads_); + batch.payloads_ = gpu_batch; + batch.ctx_ = gpu_ctx; + } else { + // CPUPlace + batch.payloads_ = cpu_batch; } + ++cached_tensor_id; + cached_tensor_id %= kCacheSize; try { - buffer_->Send(&batch); + channel_->Send(&batch); } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The double buffer channel has been closed. The " "prefetch thread will terminate."; break; } } - buffer_->Close(); + channel_->Close(); VLOG(5) << "Prefetch thread terminates."; } -bool DoubleBufferReader::HasNext() const { - if (local_buffer_.payloads_.empty()) { - bool ok = buffer_->Receive(&local_buffer_); - return ok; - } else { - return true; - } -} - } // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 832509641cc3d5178ff090e05437484d395bfe51..93f9c74b809770136d3d3300e0e0700b1bc0459e 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -14,93 +14,72 @@ limitations under the License. */ #include "paddle/fluid/operators/reshape_op.h" +#include +#include + namespace paddle { namespace operators { -class ReshapeOp : public framework::OperatorWithKernel { - public: - ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - // input check - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); - - auto shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); - auto x_dims = ctx->GetInputDim("X"); - - std::vector neg_dims_idx; - // set some dimension to -1 if it is unknown - const int unknown_size = -1; - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, - "Each dimension of Attr(shape) must be positive or %d.", - unknown_size); - if (shape[i] == unknown_size) { - neg_dims_idx.push_back(i); - PADDLE_ENFORCE(neg_dims_idx.size() <= 1, - "Only one dimension of Attr(shape) can be unknown."); - } - } - - int64_t capacity = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - int64_t in_size = framework::product(x_dims); - if (neg_dims_idx.size() == 1) { - // dim infer - shape[neg_dims_idx[0]] = in_size / (-capacity); - // recalculate capacity - capacity = shape[neg_dims_idx[0]] * (-capacity); - } - // capacity check - PADDLE_ENFORCE(capacity == in_size, - "The size of Input(X) mismatches with Attr(shape)."); - // resize output - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), shape_int64.begin(), - [](int a) { return static_cast(a); }); - auto out_dims = framework::make_ddim(shape_int64); - ctx->SetOutputDim("Out", 
out_dims); - if (shape[0] == x_dims[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } -}; - class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor of reshape operator."); - AddOutput("Out", "The output tensor of reshape operator."); - AddAttr>("shape", - "(vector) " - "Target shape of reshape operator."); + AddInput("X", "(Tensor). The input tensor of reshape operator."); + AddInput("Shape", + "(Tensor, optional). If provided, reshape according to " + "this given shape. That is to say it has a higher priority than " + "the shape attribute, while the shape attribute still should be " + "set correctly to gurantee shape inference in compile time.") + .AsDispensable(); + AddOutput("Out", "(Tensor). The output tensor of reshape operator."); + AddAttr>( + "shape", "(std::vector) Target shape of reshape operator."); AddAttr("inplace", - "Change the source tensor's shape without copy memory.") - .SetDefault(true); + "(default: false) Change the source tensor's shape without " + "memory copy. When Attr(inplace) is set true, the output " + "tensor shares memory with Input(X), otherwise, a new output " + "tensor is created, and its data are copied from Input(x).") + .SetDefault(false); AddComment(R"DOC( Reshape Operator. -Reshape Input(X) into the shape specified by Attr(shape). +Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The +data in Input(X) are unchanged. -An example: -Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] +Examples: -and target shape = [1, 4], the reshape operator will transform -the tensor X into a 2-D tensor: [[1, 2, 3, 4]] +1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X) +into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged. + +2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data +unchanged. In this case, one and only dimension of Attr(shape) can be set to -1, +the value of this dimension is inferred from the total element number of +Input(X) and remaining dimensions. + +3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data +unchanged. In this case, besides -1, 0 means the actual dimension value is going +to be copied from the corresponding dimension of Input(X). + +Note: + +1. One and only one dimension in Attr(shape) can be set -1. In this case, +the actual dimension value will be infered from the total element number of +Input(X) and remaining dimensions. + +2. More than one dimensions in Attr(shape) can be set to 0, which means the real +dimension value will be copied from Input(X) at runtime. Note that the index of +0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape +[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. + +3. 
Input(Shape) has a higher priority than Attr(shape) if it is provided, while +Attr(shape) still should be set correctly to gurantee shape inference in +compile-time. -One dimension in the target shape can be set -1, representing that its -size is unknown. In this case, the real dimension will be infered from -the original shape of Input(X) and other dimensions in the target shape. )DOC"); } }; @@ -119,6 +98,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index eacb0a0cf21a60ffbdef5787434859ac549388bc..807e5ad951b893a4c027a96d743f0606b70cf160 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -14,23 +14,138 @@ limitations under the License. */ #pragma once +#include +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + + if (ctx->HasInput("Shape") && ctx->IsRuntime()) { + // If true, set the shape of Output(Out) according to Input(Shape) in + // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. + ctx->ShareLoD("X", /*->*/ "Out"); + return; + } + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ValidateShape(shape, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + + static framework::DDim ValidateShape(const std::vector shape, + const framework::DDim &in_dims) { + const int64_t in_size = framework::product(in_dims); + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE( + unk_dim_idx == -1, + "Only one input dimension of Attr(shape) can be unknown."); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE( + static_cast(i) < in_dims.size(), + "The index of dimension to copy from input shape must be less " + "than the size of input shape."); + } else { + PADDLE_ENFORCE( + shape[i] > 0, + "Each input dimension of Attr(shape) must not be negtive except " + "one unknown dimension."); + } + + capacity *= (shape[i] ? 
shape[i] : in_dims[i]); + output_shape[i] = + (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + template class ReshapeKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out = ctx.Output("Out"); - auto* in = ctx.Input("X"); + void Compute(const framework::ExecutionContext &ctx) const { + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("X"); + auto *shape_tensor = ctx.Input("Shape"); + + framework::DDim out_dims = out->dims(); + if (shape_tensor) { + auto *shape_data = shape_tensor->data(); + if (platform::is_gpu_place(ctx.GetPlace())) { + framework::Tensor cpu_shape_tensor; + TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), + &cpu_shape_tensor); + shape_data = cpu_shape_tensor.data(); + } + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ReshapeOp::ValidateShape(shape, in->dims()); + } + if (!in->lod().empty()) { + PADDLE_ENFORCE_EQ( + out_dims[0], in->dims()[0], + "Reshape operator cannot reshape an input sequence batch " + "into an output sequence batch that has a different " + "number of time steps. Please consider using " + "sequence_reshape op."); + } + bool inplace = ctx.Attr("inplace"); - auto out_dims = out->dims(); + out->Resize(out_dims); if (!inplace) { out->mutable_data(ctx.GetPlace()); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); + // TensorCopy will resize to in_dims. 
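+      // (note) i.e. after the copy, Out temporarily carries X's dims, so the
+      // target dims computed above (from Attr(shape) or the optional Shape
+      // input) are restored on the next line.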
out->Resize(out_dims); } else { out->ShareDataWith(*in); @@ -42,9 +157,10 @@ class ReshapeKernel : public framework::OpKernel { template class ReshapeGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_x = ctx.Output(framework::GradVarName("X")); + void Compute(const framework::ExecutionContext &ctx) const { + auto *d_out = ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); bool inplace = ctx.Attr("inplace"); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f5ae553c8571e21b351d0f5507afdf1539843a51..d074b0136d77fa5a1ce5c29cd52347d04475b029 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -81,6 +81,7 @@ if (WITH_TESTING) # enable v2 API unittest only when paddle swig api is compiled add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/plot/tests) + add_subdirectory(paddle/v2/reader/tests) endif() endif() add_subdirectory(paddle/fluid/tests) diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 1fdfd49f1c970d89bfde9d12a24076d38c54ba66..3315e826e82a33dfeb9c5223ce196cffb1ae7234 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -37,7 +37,7 @@ __all__ = [ 'cifar', 'movielens', 'conll05', - 'sentiment' + 'sentiment', 'uci_housing', 'wmt14', 'wmt16', diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 24297ffe33bc720ff7b4f2b0dbd82452dc7e0ae2..9311fc9904eb730aa56e94a4e45a1479a67df641 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -276,20 +276,25 @@ class DistributeTranspiler: suff_idx = v.name.find(".trainer_") if suff_idx >= 0: orig_var_name = v.name[:suff_idx] - pserver_program.global_block().create_var( + else: + orig_var_name = v.name + single_trainer_var = pserver_program.global_block().create_var( name=orig_var_name, persistable=True, type=v.type, dtype=v.dtype, shape=v.shape) - for trainer_id in xrange(self.trainers): - var = pserver_program.global_block().create_var( - name="%s.trainer_%d" % (orig_var_name, trainer_id), - persistable=False, - type=v.type, - dtype=v.dtype, - shape=v.shape) - recv_inputs.append(var) + if self.trainers > 1: + for trainer_id in xrange(self.trainers): + var = pserver_program.global_block().create_var( + name="%s.trainer_%d" % (orig_var_name, trainer_id), + persistable=False, + type=v.type, + dtype=v.dtype, + shape=v.shape) + recv_inputs.append(var) + else: + recv_inputs.append(single_trainer_var) # step3 optimize_block = pserver_program.create_block(0) @@ -511,8 +516,11 @@ class DistributeTranspiler: def _append_split_op(self, program, gradblocks): # Split variables that need to be split and append respective ops + add_suffix = False + if self.trainers > 1: + add_suffix = True var_mapping = self._create_vars_from_blocklist( - program, gradblocks, add_trainer_suffix=True) + program, gradblocks, add_trainer_suffix=add_suffix) for varname, splited_vars in var_mapping.iteritems(): # variable that don't need to split have empty splited_vars if len(splited_vars) <= 1: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2612fb1ae41986ae0d5c6e942cc3accebcb00e19..54d0a12bcdbb1b6c13e584dd1a3a5d73cddd4af7 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -48,8 
+48,7 @@ def as_numpy(tensor): assert isinstance(tensor, core.LoDTensor) lod = tensor.lod() if len(lod) > 0: - raise RuntimeError( - "Some of your featched tensors hold LoD information. \ + raise RuntimeError("Some of your fetched tensors hold LoD information. \ They can not be completely cast to Python ndarray. \ Please set the parameter 'return_numpy' as 'False' to \ return LoDTensor itself directly.") @@ -180,60 +179,24 @@ def get_program_cache_key(feed, fetch_list): class Executor(object): - def __init__(self, places): - if not isinstance(places, list) and not isinstance(places, tuple): - places = [places] - - act_places = [] - for each in places: - p = core.Place() - p.set_place(each) - act_places.append(p) - - # TODO(dzhwinter) : only use the first place - self.executor = core.Executor(act_places[0]) - self.places = places + def __init__(self, place): + self.place = place + p = core.Place() + p.set_place(place) + self.executor = core.Executor(p) self.program_caches = dict() - def aslodtensor(self, data): - def accumulate(data): - if not isinstance(data, list): - return 1 - return sum([accumulate(sub) for sub in data]) - - def parselod(data): - seq_lens = [accumulate(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - return lod - - assert len(self.places) != 0 - if not isinstance(data, list): - # pure tensor case - tensor = core.LoDTensor() - tensor.set(data, self.places[0]) - return tensor - else: - raise RuntimeError("Current implementation lacks unittests") - # lodtensor case - lod = [] - if not isinstance(data[0], list): - lod.append(parselod(data)) - flattened_data = np.concatenate(data, axis=0).astype("int64") - else: - while isinstance(data[0], list): - lod.append(parselod(seq)) - flattened_data = [item for seq in data for item in seq] - data = flattened_data - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - tensor = core.LoDTensor() - tensor.set(flattened_data, self.places[0]) - tensor.set_lod(lod) - return tensor + def as_lodtensor(self, data): + if isinstance(data, list): + raise RuntimeError("Some of your feed data hold LoD information. \ + They can not be completely cast from a list of Python \ + ndarray to LoDTensor. 
Please convert data to LoDTensor \ + directly before feeding the data.\ + ") + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, self.place) + return tensor def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) @@ -293,7 +256,7 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.aslodtensor(cur_feed) + cur_feed = self.as_lodtensor(cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3e649dc5fd32c4ed8fa6ad273b7be04d552b51ae..a5938fe494265778ef7032c56a8d6d35acd729c5 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -19,7 +19,6 @@ from layer_function_generator import generate_layer_fn from layer_function_generator import autodoc from ..layer_helper import LayerHelper import tensor -import ops import nn import math @@ -58,7 +57,7 @@ def detection_output(loc, This operation is to get the detection results by performing following two steps: - + 1. Decode input bounding box predictions according to the prior boxes. 2. Get the final detection results by applying multi-class non maximum suppression (NMS). @@ -130,9 +129,9 @@ def detection_output(loc, target_box=loc, code_type='decode_center_size') old_shape = scores.shape - scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) + scores = nn.reshape(x=scores, shape=(-1, old_shape[-1])) scores = nn.softmax(input=scores) - scores = ops.reshape(x=scores, shape=old_shape) + scores = nn.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) scores.stop_gradient = True nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) @@ -463,7 +462,7 @@ def ssd_loss(location, num, num_prior, num_class = confidence.shape def __reshape_to_2d(var): - return ops.reshape(x=var, shape=[-1, var.shape[-1]]) + return nn.reshape(x=var, shape=[-1, var.shape[-1]]) # 1. Find matched boundding box by prior box. # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. @@ -474,7 +473,7 @@ def ssd_loss(location, # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices - gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, )) + gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, )) gt_label.stop_gradient = True target_label, _ = target_assign( gt_label, matched_indices, mismatch_value=background_label) @@ -487,7 +486,7 @@ def ssd_loss(location, conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples - conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior)) + conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior)) conf_loss.stop_gradient = True neg_indices = helper.create_tmp_variable(dtype='int32') dtype = matched_indices.dtype @@ -556,7 +555,7 @@ def ssd_loss(location, # 5.3 Compute overall weighted loss. loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss # reshape to [N, Np], N is the batch size and Np is the prior box number. 
- loss = ops.reshape(x=loss, shape=[-1, num_prior]) + loss = nn.reshape(x=loss, shape=[-1, num_prior]) loss = nn.reduce_sum(loss, dim=1, keep_dim=True) if normalize: normalizer = nn.reduce_sum(target_loc_weight) @@ -709,7 +708,7 @@ def multi_box_head(inputs, new_shape = [ -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)]) ] - out = ops.reshape(x=input, shape=new_shape) + out = nn.reshape(x=input, shape=new_shape) return out def _is_list_or_tuple_(data): @@ -803,7 +802,7 @@ def multi_box_head(inputs, mbox_loc.shape[0], mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4 ] - mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape) + mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape) mbox_locs.append(mbox_loc_flatten) # get conf @@ -819,7 +818,7 @@ def multi_box_head(inputs, conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] * conf_loc.shape[3] / num_classes, num_classes ] - conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape) + conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape) mbox_confs.append(conf_loc_flatten) if len(box_results) == 1: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0332556f62c46b187bd79841e4969d9da08b57a5..e59ee251202d4985f0aa9ce46408fc7f0d197e1d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -73,6 +73,7 @@ __all__ = [ 'smooth_l1', 'one_hot', 'autoincreased_step_counter', + 'reshape', 'lod_reset', 'lrn', ] @@ -3265,6 +3266,8 @@ def one_hot(input, depth): The one-hot tensor or LodTensor, same as input. Examples: + .. code-block:: python + X is a LoDTensor: X.lod = [[0, 1, 4]] X.shape = [4, 1] @@ -3319,6 +3322,101 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): return counter +def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): + """ + Gives a new shape to the input Tensor without changing its data. + + The target shape can be given by :attr:`shape` or :attr:`actual_shape`. + :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor + variable. :attr:`actual_shape` has a higher priority than :attr:`shape` + if it is provided, while :attr:`shape` still should be set correctly to + gurantee shape inference in compile-time. + + Some tricks exist when specifying the target shape. + + 1. -1 means the value of this dimension is inferred from the total element + number of x and remaining dimensions. Thus one and only one dimension can + be set -1. + + 2. 0 means the actual dimension value is going to be copied from the + corresponding dimension of x. The indice of 0s in shape can not exceed + Rank(X). + + Here are some examples to explain it. + + 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [6, 8], the reshape operator will transform x into a 2-D tensor with + shape [6, 8] and leaving x's data unchanged. + + 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + specified is [2, 3, -1, 2], the reshape operator will transform x into a + 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining + dimensions. + + 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor + with shape [2, 4, 3, 2] and leaving x's data unchanged. 
In this case, + besides -1, 0 means the actual dimension value is going to be copied from + the corresponding dimension of x. + + Args: + input(variable): The input tensor. + shape(list): The new shape. At most one dimension of the new shape can + be -1. + actual_shape(variable): An optional input. If provided, reshape + according to this given shape rather than + :attr:`shape` specifying shape. That is to + say :attr:`actual_shape` has a higher priority + than :attr:`shape`. + act (str): The non-linear activation to be applied to output variable. + inplace(bool): If this flag is set true, a new output tensor is created + whose data is copied from input x, otherwise the output + shares data with input without copying. + + Returns(variable): The output tensor. + + Examples: + .. code-block:: python + data = fluid.layers.data( + name='data', shape=[2, 4, 6], dtype='float32') + reshaped = fluid.layers.reshape( + x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True) + """ + + if not (isinstance(shape, list) or isinstance(shape, tuple)): + raise ValueError("Input shape must be a python lsit or tuple.") + + # Validate the shape + unk_dim_idx = -1 + for dim_idx, dim_size in enumerate(shape): + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one dimension in shape can be unknown.") + unk_dim_idx = dim_idx + elif dim_size == 0: + assert dim_idx < len(x.shape), ( + "The indice of 0s in shape can not exceed Rank(X).") + else: + assert dim_size > 0, ( + "Each dimension size given in shape must not be negtive " + "except one unknown dimension.") + + helper = LayerHelper("reshape", **locals()) + reshaped = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type="reshape", + inputs={"X": x, + "Shape": actual_shape} + if isinstance(actual_shape, Variable) else {"X": x}, + attrs={"shape": shape, + "inplace": inplace}, + outputs={"Out": reshaped}) + + return helper.append_activation(reshaped) + + def lod_reset(x, y=None, target_lod=None): """ LoD Reset Operator. 
Set LoD of **x** to a new one specified by **y** or diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 0e5987ee598158d189db8bc956b7e7fea2517554..a9fe25744cc0b385479c9366af1b731ec221dd5a 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -49,7 +49,6 @@ __activations__ = [ __all__ = [ 'mean', 'mul', - 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits', 'elementwise_add', diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8393f7827b1c7d361ebea72f2cfc6033268772f0..299ab8e51f017e1980a8b40e3830fc42b1ff7ccc 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -334,7 +334,7 @@ class OpTest(unittest.TestCase): np.allclose( actual_t, expect_t, atol=atol), "Output (" + out_name + ") has diff at " + str(place) + - str(actual_t) + str(expect_t)) + str(actual_t) + "\n" + str(expect_t)) if isinstance(expect, tuple): self.assertListEqual(actual.lod(), expect[1], "Output (" + out_name + @@ -568,6 +568,6 @@ class OpTest(unittest.TestCase): fetch_list = [g for p, g in param_grad_list] executor = Executor(place) - return map( - np.array, - executor.run(prog, feed_dict, fetch_list, return_numpy=False)) + return map(np.array, + executor.run(prog, feed_dict, fetch_list, + return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index ed920ad388ff0e01887404e70fe82565b4cd28fa..3f739afd2516fdc2bdf3711d4780a1196c6f3f13 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -96,5 +96,47 @@ class TestLookupTableIdsIsSelectedRows(OpTest): self.check_with_place(place) +class TestLookupTableWIsSelectedRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Id Variable + ids_tensor = scope.var('Ids').get_tensor() + ids_array = np.array([[0], [4], [3], [5]]).astype("int64") + ids_tensor.set(ids_array, place) + + # create and initialize W Variable + rows = [0, 1, 2, 3, 4, 5, 6] + row_numel = 12 + + w_selected_rows = scope.var('W').get_selected_rows() + w_selected_rows.set_height(len(rows)) + w_selected_rows.set_rows(rows) + w_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + w_array[i] *= i + ids_tensor = w_selected_rows.get_tensor() + ids_tensor.set(w_array, place) + + # create Out Variable + Out_tensor = scope.var('Out').get_tensor() + + # create and run lookup_table operator + lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') + lookup_table.run(scope, place) + + # get result from Out + result_array = np.array(Out_tensor) + # all(): return True if all elements of the iterable are true (or if the iterable is empty) + for idx, row in enumerate(ids_array): + assert (row[0] == result_array[idx]).all() + + def test_w_is_selected_rows(self): + places = [core.CPUPlace()] + # currently only support CPU + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py old mode 100755 new mode 100644 diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py index 
854238c6279528d8f3adf173140a47e233134f43..2ebceca7e4b7b824194d94180462870e6cfe6d21 100644 --- a/python/paddle/fluid/tests/unittests/test_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_recv_op.py @@ -23,7 +23,7 @@ import time class TestRecvOp(unittest.TestCase): - def test_send(self): + def no_test_send(self): # Run init_serv in a thread place = fluid.CPUPlace() p = Process(target=self.init_serv, args=(place, )) diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 11f35c74d41146118525a5efa6c211d528e255fe..f51b5a7e9907294a5b91c920a363830d8b9a7137 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -14,15 +14,19 @@ import unittest import numpy as np + from op_test import OpTest class TestReshapeOp(OpTest): def setUp(self): + ori_shape = (2, 25) + new_shape = (5, 10) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [10 * 20]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): self.check_output() @@ -31,12 +35,33 @@ class TestReshapeOp(OpTest): self.check_grad(["X"], "Out") -class TestReshapeOpDimInfer(OpTest): +class TestReshapeOpDimInfer1(OpTest): def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [4, -1, 5]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInfer2(OpTest): + def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} def test_check_output(self): self.check_output() @@ -47,10 +72,30 @@ class TestReshapeOpDimInfer(OpTest): class TestReshapeOpInplace(OpTest): def setUp(self): + ori_shape = (2, 25) + new_shape = (5, 10) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInferInplace1(OpTest): + def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [10 * 20], 'inplace': True} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): self.check_output() @@ 
-59,12 +104,38 @@ class TestReshapeOpInplace(OpTest): self.check_grad(["X"], "Out") -class TestReshapeOpDimInferInplace(OpTest): +class TestReshapeOpDimInferInplace2(OpTest): def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpWithInputShape(OpTest): + def setUp(self): + ori_shape = (6, 5) + new_shape = (0, -1, 5) + actual_shape = (2, 3, 5) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [4, -1, 5], 'inplace': True} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = { + "X": np.random.random(ori_shape).astype("float32"), + "Shape": np.array( + actual_shape, dtype="int32") + } + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} def test_check_output(self): self.check_output() @@ -73,5 +144,5 @@ class TestReshapeOpDimInferInplace(OpTest): self.check_grad(["X"], "Out") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py old mode 100755 new mode 100644 diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 02b0d077eefa431baed05c421a367ebe3581626c..df710c33d0c0ca16d358dac1eb42327e9cd4c7ae 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -22,13 +22,17 @@ import data_type import topology import networks import evaluator +from . import dataset +from . import reader from . import plot import attr import op import pooling import inference import networks +import minibatch import plot +import image import paddle.trainer.config_parser as cp __all__ = [ @@ -44,11 +48,14 @@ __all__ = [ 'data_type', 'attr', 'pooling', + 'dataset', + 'reader', 'topology', 'networks', 'infer', 'plot', 'evaluator', + 'image', 'master', ] @@ -146,3 +153,4 @@ def init(**kwargs): infer = inference.infer +batch = minibatch.batch diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..38056fe0a9496bcb5de76634bbab267e324dc2a4 --- /dev/null +++ b/python/paddle/v2/dataset/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Dataset package. 
+""" + +import mnist +import imikolov +import imdb +import cifar +import movielens +import conll05 +import uci_housing +import sentiment +import wmt14 +import wmt16 +import mq2007 +import flowers +import voc2012 + +__all__ = [ + 'mnist', + 'imikolov', + 'imdb', + 'cifar', + 'movielens', + 'conll05', + 'sentiment', + 'uci_housing', + 'wmt14', + 'wmt16', + 'mq2007', + 'flowers', + 'voc2012', +] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py new file mode 100644 index 0000000000000000000000000000000000000000..0a2a1ced11ee5cb2fb407b229ce810d553c2fa46 --- /dev/null +++ b/python/paddle/v2/dataset/cifar.py @@ -0,0 +1,139 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +CIFAR dataset. + +This module will download dataset from +https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into +paddle reader creators. + +The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, +with 6000 images per class. There are 50000 training images and 10000 test +images. + +The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes +containing 600 images each. There are 500 training images and 100 testing +images per class. + +""" + +import cPickle +import itertools +import numpy +import paddle.v2.dataset.common +import tarfile + +__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] + +URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' +CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' +CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' +CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' +CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' + + +def reader_creator(filename, sub_name): + def read_batch(batch): + data = batch['data'] + labels = batch.get('labels', batch.get('fine_labels', None)) + assert labels is not None + for sample, label in itertools.izip(data, labels): + yield (sample / 255.0).astype(numpy.float32), int(label) + + def reader(): + with tarfile.open(filename, mode='r') as f: + names = (each_item.name for each_item in f + if sub_name in each_item.name) + + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + yield item + + return reader + + +def train100(): + """ + CIFAR-100 training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 99]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'train') + + +def test100(): + """ + CIFAR-100 test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'test') + + +def train10(): + """ + CIFAR-10 training set creator. 
+ + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'data_batch') + + +def test10(): + """ + CIFAR-10 test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'test_batch') + + +def fetch(): + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py new file mode 100644 index 0000000000000000000000000000000000000000..c6ff09a1d1e3ca56877e986c3ed3ae9ecd0a7316 --- /dev/null +++ b/python/paddle/v2/dataset/common.py @@ -0,0 +1,236 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import hashlib +import os +import errno +import shutil +import sys +import importlib +import paddle.v2.dataset +import cPickle +import glob +import cPickle as pickle + +__all__ = [ + 'DATA_HOME', + 'download', + 'md5file', + 'split', + 'cluster_files_reader', + 'convert', +] + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') + + +# When running unit tests, there could be multiple processes that +# trying to create DATA_HOME directory simultaneously, so we cannot +# use a if condition to check for the existence of the directory; +# instead, we use the filesystem as the synchronization mechanism by +# catching returned errors. 
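# --- Editor's annotation (illustrative sketch, not part of the patch) -------
# Every dataset module added in this change follows the same convention:
# train()/test() return a *reader creator*, a zero-argument callable that
# yields one sample per iteration, and downloaded archives are cached under
# DATA_HOME (~/.cache/paddle/dataset). A minimal usage sketch for the CIFAR
# readers defined above, assuming the legacy paddle.v2 package is installed;
# the variable names below are illustrative only.
import paddle.v2.dataset.cifar as cifar

reader_creator = cifar.train10()        # downloads and caches the archive on first use
for image, label in reader_creator():   # calling the creator yields one sample at a time
    # each sample is a flattened float32 image scaled to [0, 1] and an int label in [0, 9]
    assert image.shape == (3072,)
    assert 0 <= label <= 9
    break                               # inspect only the first sample
# -----------------------------------------------------------------------------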
+def must_mkdirs(path): + try: + os.makedirs(DATA_HOME) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +must_mkdirs(DATA_HOME) + + +def md5file(fname): + hash_md5 = hashlib.md5() + f = open(fname, "rb") + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + f.close() + return hash_md5.hexdigest() + + +def download(url, module_name, md5sum, save_name=None): + dirname = os.path.join(DATA_HOME, module_name) + if not os.path.exists(dirname): + os.makedirs(dirname) + + filename = os.path.join(dirname, + url.split('/')[-1] + if save_name is None else save_name) + + retry = 0 + retry_limit = 3 + while not (os.path.exists(filename) and md5file(filename) == md5sum): + if os.path.exists(filename): + print "file md5", md5file(filename), md5sum + if retry < retry_limit: + retry += 1 + else: + raise RuntimeError("Cannot download {0} within retry limit {1}". + format(url, retry_limit)) + print "Cache file %s not found, downloading %s" % (filename, url) + r = requests.get(url, stream=True) + total_length = r.headers.get('content-length') + + if total_length is None: + with open(filename, 'w') as f: + shutil.copyfileobj(r.raw, f) + else: + with open(filename, 'w') as f: + dl = 0 + total_length = int(total_length) + for data in r.iter_content(chunk_size=4096): + dl += len(data) + f.write(data) + done = int(50 * dl / total_length) + sys.stdout.write("\r[%s%s]" % ('=' * done, + ' ' * (50 - done))) + sys.stdout.flush() + + return filename + + +def fetch_all(): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "fetch" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)): + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "fetch")() + + +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + +def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): + """ + you can call the function as: + + split(paddle.v2.dataset.cifar.train10(), line_count=1000, + suffix="imikolov-train-%05d.pickle") + + the output files as: + + |-imikolov-train-00000.pickle + |-imikolov-train-00001.pickle + |- ... + |-imikolov-train-00480.pickle + + :param reader: is a reader creator + :param line_count: line count for each file + :param suffix: the suffix for the output files, should contain "%d" + means the id for each file. Default is "%05d.pickle" + :param dumper: is a callable function that dump object to file, this + function will be called as dumper(obj, f) and obj is the object + will be dumped, f is a file object. Default is cPickle.dump. 
+ """ + if not callable(dumper): + raise TypeError("dumper should be callable.") + lines = [] + indx_f = 0 + for i, d in enumerate(reader()): + lines.append(d) + if i >= line_count and i % line_count == 0: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + lines = [] + indx_f += 1 + if lines: + with open(suffix % indx_f, "w") as f: + dumper(lines, f) + + +def cluster_files_reader(files_pattern, + trainer_count, + trainer_id, + loader=cPickle.load): + """ + Create a reader that yield element from the given files, select + a file set according trainer count and trainer_id + + :param files_pattern: the files which generating by split(...) + :param trainer_count: total trainer count + :param trainer_id: the trainer rank id + :param loader: is a callable function that load object from file, this + function will be called as loader(f) and f is a file object. + Default is cPickle.load + """ + + def reader(): + if not callable(loader): + raise TypeError("loader should be callable.") + file_list = glob.glob(files_pattern) + file_list.sort() + my_file_list = [] + for idx, fn in enumerate(file_list): + if idx % trainer_count == trainer_id: + print "append file: %s" % fn + my_file_list.append(fn) + for fn in my_file_list: + with open(fn, "r") as f: + lines = loader(f) + for line in lines: + yield line + + return reader + + +def convert(output_path, reader, line_count, name_prefix): + import recordio + """ + Convert data from reader to recordio format files. + + :param output_path: directory in which output files will be saved. + :param reader: a data reader, from which the convert program will read + data instances. + :param name_prefix: the name prefix of generated files. + :param max_lines_to_shuffle: the max lines numbers to shuffle before + writing. + """ + + assert line_count >= 1 + indx_f = 0 + + def write_data(indx_f, lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: + # FIXME(Yancey1989): + # dumps with protocol: pickle.HIGHEST_PROTOCOL + writer.write(cPickle.dumps(l)) + writer.close() + + lines = [] + for i, d in enumerate(reader()): + lines.append(d) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) + lines = [] + indx_f += 1 + continue + + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py new file mode 100644 index 0000000000000000000000000000000000000000..0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc --- /dev/null +++ b/python/paddle/v2/dataset/conll05.py @@ -0,0 +1,257 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Conll05 dataset. +Paddle semantic role labeling Book and demo use this dataset as an example. +Because Conll05 is not free in public, the default downloaded URL is test set +of Conll05 (which is public). Users can change URL and MD5 to their Conll +dataset. 
And a pre-trained word vector model based on Wikipedia corpus is used +to initialize SRL model. +""" + +import tarfile +import gzip +import itertools +import paddle.v2.dataset.common + +__all__ = ['test, get_dict', 'get_embedding', 'convert'] + +DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' +DATA_MD5 = '387719152ae52d60422c016e92a742fc' +WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' +WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa' +VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt' +VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c' +TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt' +TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751' +EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb' +EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7' + +UNK_IDX = 0 + + +def load_label_dict(filename): + d = dict() + tag_dict = set() + with open(filename, 'r') as f: + for i, line in enumerate(f): + line = line.strip() + if line.startswith("B-"): + tag_dict.add(line[2:]) + elif line.startswith("I-"): + tag_dict.add(line[2:]) + index = 0 + for tag in tag_dict: + d["B-" + tag] = index + index += 1 + d["I-" + tag] = index + index += 1 + d["O"] = index + return d + + +def load_dict(filename): + d = dict() + with open(filename, 'r') as f: + for i, line in enumerate(f): + d[line.strip()] = i + return d + + +def corpus_reader(data_path, words_name, props_name): + """ + Read one corpus. It returns an iterator. Each element of + this iterator is a tuple including sentence and labels. The sentence is + consist of a list of word IDs. The labels include a list of label IDs. + :return: a iterator of data. + :rtype: iterator + """ + + def reader(): + tf = tarfile.open(data_path) + wf = tf.extractfile(words_name) + pf = tf.extractfile(props_name) + with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile( + fileobj=pf) as props_file: + sentences = [] + labels = [] + one_seg = [] + for word, label in itertools.izip(words_file, props_file): + word = word.strip() + label = label.strip().split() + + if len(label) == 0: # end of sentence + for i in xrange(len(one_seg[0])): + a_kind_lable = [x[i] for x in one_seg] + labels.append(a_kind_lable) + + if len(labels) >= 1: + verb_list = [] + for x in labels[0]: + if x != '-': + verb_list.append(x) + + for i, lbl in enumerate(labels[1:]): + cur_tag = 'O' + is_in_bracket = False + lbl_seq = [] + verb_word = '' + for l in lbl: + if l == '*' and is_in_bracket == False: + lbl_seq.append('O') + elif l == '*' and is_in_bracket == True: + lbl_seq.append('I-' + cur_tag) + elif l == '*)': + lbl_seq.append('I-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') != -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = False + elif l.find('(') != -1 and l.find(')') == -1: + cur_tag = l[1:l.find('*')] + lbl_seq.append('B-' + cur_tag) + is_in_bracket = True + else: + raise RuntimeError('Unexpected label: %s' % + l) + + yield sentences, verb_list[i], lbl_seq + + sentences = [] + labels = [] + one_seg = [] + else: + sentences.append(word) + one_seg.append(label) + + pf.close() + wf.close() + tf.close() + + return reader + + +def reader_creator(corpus_reader, + word_dict=None, + predicate_dict=None, + label_dict=None): + def reader(): + for sentence, predicate, labels in corpus_reader(): + + sen_len = len(sentence) + + verb_index = labels.index('B-V') + mark = [0] * 
len(labels) + if verb_index > 0: + mark[verb_index - 1] = 1 + ctx_n1 = sentence[verb_index - 1] + else: + ctx_n1 = 'bos' + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence[verb_index - 2] + else: + ctx_n2 = 'bos' + + mark[verb_index] = 1 + ctx_0 = sentence[verb_index] + + if verb_index < len(labels) - 1: + mark[verb_index + 1] = 1 + ctx_p1 = sentence[verb_index + 1] + else: + ctx_p1 = 'eos' + + if verb_index < len(labels) - 2: + mark[verb_index + 2] = 1 + ctx_p2 = sentence[verb_index + 2] + else: + ctx_p2 = 'eos' + + word_idx = [word_dict.get(w, UNK_IDX) for w in sentence] + + ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len + ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len + ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len + ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len + + pred_idx = [predicate_dict.get(predicate)] * sen_len + label_idx = [label_dict.get(w) for w in labels] + + yield word_idx, ctx_n2_idx, ctx_n1_idx, \ + ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx + + return reader + + +def get_dict(): + """ + Get the word, verb and label dictionary of Wikipedia corpus. + """ + word_dict = load_dict( + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', + WORDDICT_MD5)) + verb_dict = load_dict( + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', + VERBDICT_MD5)) + label_dict = load_label_dict( + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', + TRGDICT_MD5)) + return word_dict, verb_dict, label_dict + + +def get_embedding(): + """ + Get the trained word vector based on Wikipedia corpus. + """ + return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + + +def test(): + """ + Conll05 test set creator. + + Because the training dataset is not free, the test dataset is used for + training. It returns a reader creator, each sample in the reader is nine + features, including sentence sequence, predicate, predicate context, + predicate context flag and tagged sequence. + + :return: Training reader creator + :rtype: callable + """ + word_dict, verb_dict, label_dict = get_dict() + reader = corpus_reader( + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), + words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', + props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') + return reader_creator(reader, word_dict, verb_dict, label_dict) + + +def fetch(): + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py new file mode 100644 index 0000000000000000000000000000000000000000..7bdddeaabec733ef26b3f766c6437f5c53d65044 --- /dev/null +++ b/python/paddle/v2/dataset/flowers.py @@ -0,0 +1,199 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module will download dataset from +http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html +and parse train/test set intopaddle reader creators. + +This set contains images of flowers belonging to 102 different categories. +The images were acquired by searching the web and taking pictures. There are a +minimum of 40 images for each category. + +The database was used in: + +Nilsback, M-E. and Zisserman, A. Automated flower classification over a large + number of classes.Proceedings of the Indian Conference on Computer Vision, +Graphics and Image Processing (2008) +http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. + +""" +import cPickle +import itertools +import functools +from common import download +import tarfile +import scipy.io as scio +from paddle.v2.image import * +from paddle.v2.reader import * +import os +import numpy as np +from multiprocessing import cpu_count +__all__ = ['train', 'test', 'valid'] + +DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' +LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' +SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' +DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118' +LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' +SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' +# In official 'readme', tstid is the flag of test data +# and trnid is the flag of train data. But test data is more than train data. +# So we exchange the train data and test data. +TRAIN_FLAG = 'tstid' +TEST_FLAG = 'trnid' +VALID_FLAG = 'valid' + + +def default_mapper(is_train, sample): + ''' + map image bytes data to type needed by model input layer + ''' + img, label = sample + img = load_image_bytes(img) + img = simple_transform( + img, 256, 224, is_train, mean=[103.94, 116.78, 123.68]) + return img.flatten().astype('float32'), label + + +train_mapper = functools.partial(default_mapper, True) +test_mapper = functools.partial(default_mapper, False) + + +def reader_creator(data_file, + label_file, + setid_file, + dataset_name, + mapper, + buffered_size=1024, + use_xmap=True): + ''' + 1. read images from tar file and + merge images into batch files in 102flowers.tgz_batch/ + 2. 
get a reader to read sample from batch file + + :param data_file: downloaded data file + :type data_file: string + :param label_file: downloaded label file + :type label_file: string + :param setid_file: downloaded setid file containing information + about how to split dataset + :type setid_file: string + :param dataset_name: data set name (tstid|trnid|valid) + :type dataset_name: string + :param mapper: a function to map image bytes data to type + needed by model input layer + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: data reader + :rtype: callable + ''' + labels = scio.loadmat(label_file)['labels'][0] + indexes = scio.loadmat(setid_file)[dataset_name][0] + img2label = {} + for i in indexes: + img = "jpg/image_%05d.jpg" % i + img2label[img] = labels[i - 1] + file_list = batch_images_from_tar(data_file, dataset_name, img2label) + + def reader(): + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + + if use_xmap: + return xmap_readers(mapper, reader, cpu_count(), buffered_size) + else: + return map_readers(mapper, reader) + + +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers training set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: train data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, + buffered_size, use_xmap) + + +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers test set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. + :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, + buffered_size, use_xmap) + + +def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): + ''' + Create flowers validation set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] + translated from original color image by steps: + 1. resize to 256*256 + 2. random crop to 224*224 + 3. flatten + :param mapper: a function to map sample. 
+ :type mapper: callable + :param buffered_size: the size of buffer used to process images + :type buffered_size: int + :return: test data reader + :rtype: callable + ''' + return reader_creator( + download(DATA_URL, 'flowers', DATA_MD5), + download(LABEL_URL, 'flowers', LABEL_MD5), + download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper, + buffered_size, use_xmap) + + +def fetch(): + download(DATA_URL, 'flowers', DATA_MD5) + download(LABEL_URL, 'flowers', LABEL_MD5) + download(SETID_URL, 'flowers', SETID_MD5) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py new file mode 100644 index 0000000000000000000000000000000000000000..37c4296f9bcea7e16daa46f778934331513c30c4 --- /dev/null +++ b/python/paddle/v2/dataset/imdb.py @@ -0,0 +1,148 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +IMDB dataset. + +This module downloads IMDB dataset from +http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set +of 25,000 highly polar movie reviews for training, and 25,000 for testing. +Besides, this module also provides API for building dictionary. +""" + +import paddle.v2.dataset.common +import collections +import tarfile +import re +import string + +__all__ = ['build_dict', 'train', 'test', 'convert'] + +URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' +MD5 = '7c2ac02c03563afcf9b574c7e56c153a' + + +def tokenize(pattern): + """ + Read files that match the given pattern. Tokenize and yield each file. + """ + + with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', + MD5)) as tarf: + # Note that we should use tarfile.next(), which does + # sequential access of member files, other than + # tarfile.extractfile, which does random access and might + # destroy hard disks. + tf = tarf.next() + while tf != None: + if bool(pattern.match(tf.name)): + # newline and punctuations removal and ad-hoc tokenization. + yield tarf.extractfile(tf).read().rstrip("\n\r").translate( + None, string.punctuation).lower().split() + tf = tarf.next() + + +def build_dict(pattern, cutoff): + """ + Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + word_freq = collections.defaultdict(int) + for doc in tokenize(pattern): + for word in doc: + word_freq[word] += 1 + + # Not sure if we should prune less-frequent words here. 
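# NOTE (editor's annotation, not part of the patch): the next few lines build
# the vocabulary. Words whose frequency is strictly greater than `cutoff` are
# kept (so with the cutoff of 150 used by word_dict() below, a word must occur
# at least 151 times), sorted by descending frequency with ties broken
# alphabetically, and assigned zero-based IDs in that order; a final catch-all
# entry -- the one reader_creator() later reads back as UNK -- is appended
# with ID len(words).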
+ word_freq = filter(lambda x: x[1] > cutoff, word_freq.items()) + + dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*dictionary)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + return word_idx + + +def reader_creator(pos_pattern, neg_pattern, word_idx): + UNK = word_idx[''] + INS = [] + + def load(pattern, out, label): + for doc in tokenize(pattern): + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + + def reader(): + for doc, label in INS: + yield doc, label + + return reader + + +def train(word_idx): + """ + IMDB training set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/train/pos/.*\.txt$"), + re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) + + +def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an zero-based ID + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("aclImdb/test/pos/.*\.txt$"), + re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) + + +def word_dict(): + """ + Build a word dictionary from the corpus. + + :return: Word dictionary + :rtype: dict + """ + return build_dict( + re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'imdb', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + w = word_dict() + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py new file mode 100644 index 0000000000000000000000000000000000000000..617c722c4165cdfed9e650fc968d623ef6ed4391 --- /dev/null +++ b/python/paddle/v2/dataset/imikolov.py @@ -0,0 +1,161 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +imikolov's simple dataset. + +This module will download dataset from +http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set +into paddle reader creators. 
+""" +import paddle.v2.dataset.common +import collections +import tarfile + +__all__ = ['train', 'test', 'build_dict', 'convert'] + +URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' +MD5 = '30177ea32e27c525793142b6bf2c8e2d' + + +class DataType(object): + NGRAM = 1 + SEQ = 2 + + +def word_count(f, word_freq=None): + if word_freq is None: + word_freq = collections.defaultdict(int) + + for l in f: + for w in l.strip().split(): + word_freq[w] += 1 + word_freq[''] += 1 + word_freq[''] += 1 + + return word_freq + + +def build_dict(min_word_freq=50): + """ + Build a word dictionary from the corpus, Keys of the dictionary are words, + and values are zero-based IDs of these words. + """ + train_filename = './simple-examples/data/ptb.train.txt' + test_filename = './simple-examples/data/ptb.valid.txt' + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + trainf = tf.extractfile(train_filename) + testf = tf.extractfile(test_filename) + word_freq = word_count(testf, word_count(trainf)) + if '' in word_freq: + # remove for now, since we will set it as last index + del word_freq[''] + + word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items()) + + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) + word_idx = dict(zip(words, xrange(len(words)))) + word_idx[''] = len(words) + + return word_idx + + +def reader_creator(filename, word_idx, n, data_type): + def reader(): + with tarfile.open( + paddle.v2.dataset.common.download( + paddle.v2.dataset.imikolov.URL, 'imikolov', + paddle.v2.dataset.imikolov.MD5)) as tf: + f = tf.extractfile(filename) + + UNK = word_idx[''] + for l in f: + if DataType.NGRAM == data_type: + assert n > -1, 'Invalid gram length' + l = [''] + l.strip().split() + [''] + if len(l) >= n: + l = [word_idx.get(w, UNK) for w in l] + for i in range(n, len(l) + 1): + yield tuple(l[i - n:i]) + elif DataType.SEQ == data_type: + l = l.strip().split() + l = [word_idx.get(w, UNK) for w in l] + src_seq = [word_idx['']] + l + trg_seq = l + [word_idx['']] + if n > 0 and len(src_seq) > n: continue + yield src_seq, trg_seq + else: + assert False, 'Unknow data type' + + return reader + + +def train(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov training set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Training reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n, + data_type) + + +def test(word_idx, n, data_type=DataType.NGRAM): + """ + imikolov test set creator. + + It returns a reader creator, each sample in the reader is a word ID + tuple. 
+ + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size if type is ngram, otherwise max length of sequence + :type n: int + :param data_type: data type (ngram or sequence) + :type data_type: member variable of DataType (NGRAM or SEQ) + :return: Test reader creator + :rtype: callable + """ + return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n, + data_type) + + +def fetch(): + paddle.v2.dataset.common.download(URL, "imikolov", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + N = 5 + word_dict = build_dict() + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 1000, + "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..9f675bed895223e054cd3bb6e504fe1607f19858 --- /dev/null +++ b/python/paddle/v2/dataset/mnist.py @@ -0,0 +1,123 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MNIST dataset. + +This module will download dataset from http://yann.lecun.com/exdb/mnist/ and +parse training set and test set into paddle reader creators. +""" +import paddle.v2.dataset.common +import subprocess +import numpy +import platform +__all__ = ['train', 'test', 'convert'] + +URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' +TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' +TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' +TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' +TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' +TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' +TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' +TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' +TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' + + +def reader_creator(image_filename, label_filename, buffer_size): + def reader(): + if platform.system() == 'Darwin': + zcat_cmd = 'gzcat' + elif platform.system() == 'Linux': + zcat_cmd = 'zcat' + else: + raise NotImplementedError() + + # According to http://stackoverflow.com/a/38061619/724872, we + # cannot use standard package gzip here. + m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE) + m.stdout.read(16) # skip some magic bytes + + l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE) + l.stdout.read(8) # skip some magic bytes + + try: # reader could be break. + while True: + labels = numpy.fromfile( + l.stdout, 'ubyte', count=buffer_size).astype("int") + + if labels.size != buffer_size: + break # numpy.fromfile returns empty slice after EOF. 
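# NOTE (editor's annotation, not part of the patch): this loop streams the two
# gzip pipes in lock-step chunks of `buffer_size` samples -- each pass reads
# buffer_size one-byte labels from the label pipe (above) and
# buffer_size * 28 * 28 pixel bytes from the image pipe (below); a short read
# at EOF triggers the break above. The pixels are rescaled with
# `images / 255.0 * 2.0 - 1.0`, i.e. into [-1, 1], although the train()/test()
# docstrings further down describe the samples as lying in [0, 1]. Also note
# that fetch() below pairs TEST_LABEL_URL with TRAIN_LABEL_MD5, which looks
# like a copy-paste slip.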
+ + images = numpy.fromfile( + m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape( + (buffer_size, 28 * 28)).astype('float32') + + images = images / 255.0 * 2.0 - 1.0 + + for i in xrange(buffer_size): + yield images[i, :], int(labels[i]) + finally: + m.terminate() + l.terminate() + + return reader + + +def train(): + """ + MNIST training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', + TRAIN_IMAGE_MD5), + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', + TRAIN_LABEL_MD5), 100) + + +def test(): + """ + MNIST test set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', + TEST_IMAGE_MD5), + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', + TEST_LABEL_MD5), 100) + + +def fetch(): + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py new file mode 100644 index 0000000000000000000000000000000000000000..5b61a9420af1bb81e1d826f8a7b69f34c306d382 --- /dev/null +++ b/python/paddle/v2/dataset/movielens.py @@ -0,0 +1,262 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Movielens 1-M dataset. + +Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 +movies, which was collected by GroupLens Research. This module will download +Movielens 1-M dataset from +http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training +set and test set into paddle reader creators. + +""" + +import zipfile +import paddle.v2.dataset.common +import re +import random +import functools + +__all__ = [ + 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' +] + +age_table = [1, 18, 25, 35, 45, 50, 56] + +URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' +MD5 = 'c4d9eecfca2ab87c1945afe126590906' + + +class MovieInfo(object): + """ + Movie id, title and categories information are stored in MovieInfo. 
+ """ + + def __init__(self, index, categories, title): + self.index = int(index) + self.categories = categories + self.title = title + + def value(self): + """ + Get information from a movie. + """ + return [ + self.index, [CATEGORIES_DICT[c] for c in self.categories], + [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] + ] + + def __str__(self): + return "" % ( + self.index, self.title, self.categories) + + def __repr__(self): + return self.__str__() + + +class UserInfo(object): + """ + User id, gender, age, and job information are stored in UserInfo. + """ + + def __init__(self, index, gender, age, job_id): + self.index = int(index) + self.is_male = gender == 'M' + self.age = age_table.index(int(age)) + self.job_id = int(job_id) + + def value(self): + """ + Get information from a user. + """ + return [self.index, 0 if self.is_male else 1, self.age, self.job_id] + + def __str__(self): + return "" % ( + self.index, "M" + if self.is_male else "F", age_table[self.age], self.job_id) + + def __repr__(self): + return str(self) + + +MOVIE_INFO = None +MOVIE_TITLE_DICT = None +CATEGORIES_DICT = None +USER_INFO = None + + +def __initialize_meta_info__(): + fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) + global MOVIE_INFO + if MOVIE_INFO is None: + pattern = re.compile(r'^(.*)\((\d+)\)$') + with zipfile.ZipFile(file=fn) as package: + for info in package.infolist(): + assert isinstance(info, zipfile.ZipInfo) + MOVIE_INFO = dict() + title_word_set = set() + categories_set = set() + with package.open('ml-1m/movies.dat') as movie_file: + for i, line in enumerate(movie_file): + movie_id, title, categories = line.strip().split('::') + categories = categories.split('|') + for c in categories: + categories_set.add(c) + title = pattern.match(title).group(1) + MOVIE_INFO[int(movie_id)] = MovieInfo( + index=movie_id, categories=categories, title=title) + for w in title.split(): + title_word_set.add(w.lower()) + + global MOVIE_TITLE_DICT + MOVIE_TITLE_DICT = dict() + for i, w in enumerate(title_word_set): + MOVIE_TITLE_DICT[w] = i + + global CATEGORIES_DICT + CATEGORIES_DICT = dict() + for i, c in enumerate(categories_set): + CATEGORIES_DICT[c] = i + + global USER_INFO + USER_INFO = dict() + with package.open('ml-1m/users.dat') as user_file: + for line in user_file: + uid, gender, age, job, _ = line.strip().split("::") + USER_INFO[int(uid)] = UserInfo( + index=uid, gender=gender, age=age, job_id=job) + return fn + + +def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): + fn = __initialize_meta_info__() + rand = random.Random(x=rand_seed) + with zipfile.ZipFile(file=fn) as package: + with package.open('ml-1m/ratings.dat') as rating: + for line in rating: + if (rand.random() < test_ratio) == is_test: + uid, mov_id, rating, _ = line.strip().split("::") + uid = int(uid) + mov_id = int(mov_id) + rating = float(rating) * 2 - 5.0 + + mov = MOVIE_INFO[mov_id] + usr = USER_INFO[uid] + yield usr.value() + mov.value() + [[rating]] + + +def __reader_creator__(**kwargs): + return lambda: __reader__(**kwargs) + + +train = functools.partial(__reader_creator__, is_test=False) +test = functools.partial(__reader_creator__, is_test=True) + + +def get_movie_title_dict(): + """ + Get movie title dictionary. + """ + __initialize_meta_info__() + return MOVIE_TITLE_DICT + + +def __max_index_info__(a, b): + if a.index > b.index: + return a + else: + return b + + +def max_movie_id(): + """ + Get the maximum value of movie id. 
+ """ + __initialize_meta_info__() + return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index + + +def max_user_id(): + """ + Get the maximum value of user id. + """ + __initialize_meta_info__() + return reduce(__max_index_info__, USER_INFO.viewvalues()).index + + +def __max_job_id_impl__(a, b): + if a.job_id > b.job_id: + return a + else: + return b + + +def max_job_id(): + """ + Get the maximum value of job id. + """ + __initialize_meta_info__() + return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id + + +def movie_categories(): + """ + Get movie categoriges dictionary. + """ + __initialize_meta_info__() + return CATEGORIES_DICT + + +def user_info(): + """ + Get user info dictionary. + """ + __initialize_meta_info__() + return USER_INFO + + +def movie_info(): + """ + Get movie info dictionary. + """ + __initialize_meta_info__() + return MOVIE_INFO + + +def unittest(): + for train_count, _ in enumerate(train()()): + pass + for test_count, _ in enumerate(test()()): + pass + + print train_count, test_count + + +def fetch(): + paddle.v2.dataset.common.download(URL, "movielens", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") + + +if __name__ == '__main__': + unittest() diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py new file mode 100644 index 0000000000000000000000000000000000000000..d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1 --- /dev/null +++ b/python/paddle/v2/dataset/mq2007.py @@ -0,0 +1,333 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +MQ2007 dataset + +MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross +validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set, +validation set and testing set. + +MQ2007 dataset from website +http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators + +""" + +import os +import functools +import rarfile +from common import download +import numpy as np + +# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar" +URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar" +MD5 = "7be1640ae95c6408dab0ae7207bdc706" + + +def __initialize_meta_info__(): + """ + download and extract the MQ2007 dataset + """ + fn = fetch() + rar = rarfile.RarFile(fn) + dirpath = os.path.dirname(fn) + rar.extractall(path=dirpath) + return dirpath + + +class Query(object): + """ + queries used for learning to rank algorithms. 
It is created from relevance scores, query-document feature vectors + + Parameters: + ---------- + query_id : int + query_id in dataset, mapping from query to relevance documents + relevance_score : int + relevance score of query and document pair + feature_vector : array, dense feature + feature in vector format + description : string + comment section in query doc pair data + """ + + def __init__(self, + query_id=-1, + relevance_score=-1, + feature_vector=None, + description=""): + self.query_id = query_id + self.relevance_score = relevance_score + if feature_vector is None: + self.feature_vector = [] + else: + self.feature_vector = feature_vector + self.description = description + + def __str__(self): + string = "%s %s %s" % (str(self.relevance_score), str(self.query_id), + " ".join(str(f) for f in self.feature_vector)) + return string + + # @classmethod + def _parse_(self, text): + """ + parse line into Query + """ + comment_position = text.find('#') + line = text[:comment_position].strip() + self.description = text[comment_position + 1:].strip() + parts = line.split() + if len(parts) != 48: + sys.stdout.write("expect 48 space split parts, get %d" % + (len(parts))) + return None + # format : 0 qid:10 1:0.000272 2:0.000000 .... + self.relevance_score = int(parts[0]) + self.query_id = int(parts[1].split(':')[1]) + for p in parts[2:]: + pair = p.split(':') + self.feature_vector.append(float(pair[1])) + return self + + +class QueryList(object): + """ + group query into list, every item in list is a Query + """ + + def __init__(self, querylist=None): + self.query_id = -1 + if querylist is None: + self.querylist = [] + else: + self.querylist = querylist + for query in self.querylist: + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + + def __iter__(self): + for query in self.querylist: + yield query + + def __len__(self): + return len(self.querylist) + + def __getitem__(self, i): + return self.querylist[i] + + def _correct_ranking_(self): + if self.querylist is None: + return + self.querylist.sort(key=lambda x: x.relevance_score, reverse=True) + + def _add_query(self, query): + if self.query_id == -1: + self.query_id = query.query_id + else: + if self.query_id != query.query_id: + raise ValueError("query in list must be same query_id") + self.querylist.append(query) + + +def gen_plain_txt(querylist): + """ + gen plain text in list for other usage + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + query_id : np.array, shape=(samples_num, ) + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield querylist.query_id, query.relevance_score, np.array( + query.feature_vector) + + +def gen_point(querylist): + """ + gen item in list for point-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + for query in querylist: + yield query.relevance_score, np.array(query.feature_vector) + + +def 
gen_pair(querylist, partial_order="full"): + """ + gen pair for pair-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + pairtial_order : "full" or "neighbour" + there is redudant in all possiable pair combinations, which can be simplifed + gen pairs for neighbour items or the full partial order pairs + + return : + ------ + label : np.array, shape=(1) + query_left : np.array, shape=(1, feature_dimension) + query_right : same as left + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + labels = [] + docpairs = [] + + # C(n,2) + for i in range(len(querylist)): + query_left = querylist[i] + for j in range(i + 1, len(querylist)): + query_right = querylist[j] + if query_left.relevance_score > query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_left.feature_vector), + np.array(query_right.feature_vector) + ]) + elif query_left.relevance_score < query_right.relevance_score: + labels.append([1]) + docpairs.append([ + np.array(query_right.feature_vector), + np.array(query_left.feature_vector) + ]) + for label, pair in zip(labels, docpairs): + yield np.array(label), pair[0], pair[1] + + +def gen_list(querylist): + """ + gen item in list for list-wise learning to rank algorithm + Paramters: + -------- + querylist : querylist, one query match many docment pairs in list, see QueryList + + return : + ------ + label : np.array, shape=(samples_num, ) + querylist : np.array, shape=(samples_num, feature_dimension) + """ + if not isinstance(querylist, QueryList): + querylist = QueryList(querylist) + querylist._correct_ranking_() + relevance_score_list = [[query.relevance_score] for query in querylist] + feature_vector_list = [query.feature_vector for query in querylist] + yield np.array(relevance_score_list), np.array(feature_vector_list) + + +def query_filter(querylists): + """ + filter query get only document with label 0. + label 0, 1, 2 means the relevance score document with query + parameters : + querylist : QueyList list + + return : + querylist : QueyList list + """ + filter_query = [] + for querylist in querylists: + relevance_score_list = [query.relevance_score for query in querylist] + if sum(relevance_score_list) != .0: + filter_query.append(querylist) + return filter_query + + +def load_from_text(filepath, shuffle=False, fill_missing=-1): + """ + parse data file into querys + """ + prev_query_id = -1 + querylists = [] + querylist = None + fn = __initialize_meta_info__() + with open(os.path.join(fn, filepath)) as f: + for line in f: + query = Query() + query = query._parse_(line) + if query == None: + continue + if query.query_id != prev_query_id: + if querylist is not None: + querylists.append(querylist) + querylist = QueryList() + prev_query_id = query.query_id + querylist._add_query(query) + if querylist is not None: + querylists.append(querylist) + return querylists + + +def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1): + """ + Parameters + -------- + filename : string + fill_missing : fill the missing value. 
default in MQ2007 is -1 + + Returns + ------ + yield + label query_left, query_right # format = "pairwise" + label querylist # format = "listwise" + """ + querylists = query_filter( + load_from_text( + filepath, shuffle=shuffle, fill_missing=fill_missing)) + for querylist in querylists: + if format == "plain_txt": + yield next(gen_plain_txt(querylist)) + elif format == "pointwise": + yield next(gen_point(querylist)) + elif format == "pairwise": + for pair in gen_pair(querylist): + yield pair + elif format == "listwise": + yield next(gen_list(querylist)) + + +train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt") +test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt") + + +def fetch(): + return download(URL, "MQ2007", MD5) + + +if __name__ == "__main__": + fetch() + mytest = functools.partial( + __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise") + for label, query in mytest(): + print label, query diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b9757c1a75d215cf8945b5cedbb1239fd43af7 --- /dev/null +++ b/python/paddle/v2/dataset/sentiment.py @@ -0,0 +1,141 @@ +# /usr/bin/env python +# -*- coding:utf-8 -*- + +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The script fetch and preprocess movie_reviews data set that provided by NLTK + +TODO(yuyang18): Complete dataset. +""" + +import collections +from itertools import chain + +import nltk +from nltk.corpus import movie_reviews + +import paddle.v2.dataset.common + +__all__ = ['train', 'test', 'get_word_dict', 'convert'] +NUM_TRAINING_INSTANCES = 1600 +NUM_TOTAL_INSTANCES = 2000 + + +def download_data_if_not_yet(): + """ + Download the data set, if the data set is not download. + """ + try: + # make sure that nltk can find the data + if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) + movie_reviews.categories() + except LookupError: + print "Downloading movie_reviews data set, please wait....." + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + print "Download data set success....." 
+ print "Path is " + nltk.data.find('corpora/movie_reviews').path + + +def get_word_dict(): + """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ + words_freq_sorted = list() + word_freq_dict = collections.defaultdict(int) + download_data_if_not_yet() + + for category in movie_reviews.categories(): + for field in movie_reviews.fileids(category): + for words in movie_reviews.words(field): + word_freq_dict[words] += 1 + words_sort_list = word_freq_dict.items() + words_sort_list.sort(cmp=lambda a, b: b[1] - a[1]) + for index, word in enumerate(words_sort_list): + words_freq_sorted.append((word[0], index)) + return words_freq_sorted + + +def sort_files(): + """ + Sorted the sample for cross reading the sample + :return: + files_list + """ + files_list = list() + neg_file_list = movie_reviews.fileids('neg') + pos_file_list = movie_reviews.fileids('pos') + files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list))) + return files_list + + +def load_sentiment_data(): + """ + Load the data set + :return: + data_set + """ + data_set = list() + download_data_if_not_yet() + words_ids = dict(get_word_dict()) + for sample_file in sort_files(): + words_list = list() + category = 0 if 'neg' in sample_file else 1 + for word in movie_reviews.words(sample_file): + words_list.append(words_ids[word.lower()]) + data_set.append((words_list, category)) + return data_set + + +def reader_creator(data): + """ + Reader creator, generate an iterator for data set + :param data: + train data set or test data set + """ + for each in data: + yield each[0], each[1] + + +def train(): + """ + Default training set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[0:NUM_TRAINING_INSTANCES]) + + +def test(): + """ + Default test set reader creator + """ + data_set = load_sentiment_data() + return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) + + +def fetch(): + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e0e18229da7818be5752ee592e094a00da286ad9 --- /dev/null +++ b/python/paddle/v2/dataset/tests/cifar_test.py @@ -0,0 +1,56 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
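The sentiment module above exposes plain generator-based readers. As a rough usage sketch (assuming only the train()/test() creators shown above and that NLTK's movie_reviews corpus can be fetched), one could sanity-check a sample like this; the variable names are illustrative only:

# Peek at one (word_id_list, label) pair from the training reader above.
import paddle.v2.dataset.sentiment as st

first_ids, first_label = next(st.train())
assert first_label in (0, 1)                    # 0 = 'neg', 1 = 'pos'
assert all(isinstance(word_id, int) for word_id in first_ids)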
+ +import paddle.v2.dataset.cifar +import unittest + + +class TestCIFAR(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + for l in reader(): + self.assertEqual(l[0].size, 3072) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_test10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test10()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 9) + + def test_train10(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train10()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 9) + + def test_test100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.test100()) + self.assertEqual(instances, 10000) + self.assertEqual(max_label_value, 99) + + def test_train100(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.cifar.train100()) + self.assertEqual(instances, 50000) + self.assertEqual(max_label_value, 99) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa194eba38ea70311c4deeac2635dc0a0103576 --- /dev/null +++ b/python/paddle/v2/dataset/tests/common_test.py @@ -0,0 +1,94 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
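Each CIFAR-10 sample checked above is a flattened 3072-float vector plus a label in [0, 9]; since 3072 = 3 x 32 x 32, it can be viewed in the CHW layout that the image utilities later in this patch expect. A small, hypothetical sketch (assuming only the reader behaviour the test above verifies):

import numpy as np
import paddle.v2.dataset.cifar as cifar

reader = cifar.train10()                 # a callable reader, as used in the test above
image_flat, label = next(reader())
image_chw = np.asarray(image_flat).reshape(3, 32, 32)   # channel, height, width
assert image_chw.shape == (3, 32, 32) and 0 <= label <= 9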
+ +import paddle.v2.dataset.common +import unittest +import tempfile +import glob + + +class TestCommon(unittest.TestCase): + def test_md5file(self): + _, temp_path = tempfile.mkstemp() + with open(temp_path, 'w') as f: + f.write("Hello\n") + self.assertEqual('09f7e02f1290be211da707a266f153b3', + paddle.v2.dataset.common.md5file(temp_path)) + + def test_download(self): + yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460' + self.assertEqual( + paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460', + paddle.v2.dataset.common.download( + yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d')) + + def test_split(self): + def test_reader(): + def reader(): + for x in xrange(10): + yield x + + return reader + + _, temp_path = tempfile.mkstemp() + paddle.v2.dataset.common.split( + test_reader(), 4, suffix=temp_path + '/test-%05d.pickle') + files = glob.glob(temp_path + '/test-%05d.pickle') + self.assertEqual(len(files), 3) + + def test_cluster_file_reader(self): + _, temp_path = tempfile.mkstemp() + for x in xrange(5): + with open(temp_path + '/%05d.test' % x) as f: + f.write('%d\n' % x) + reader = paddle.v2.dataset.common.cluster_files_reader( + temp_path + '/*.test', 5, 0) + for idx, e in enumerate(reader()): + self.assertEqual(e, str("0")) + + def test_convert(self): + record_num = 10 + num_shards = 4 + + def test_reader(): + def reader(): + for x in xrange(record_num): + yield x + + return reader + + path = tempfile.mkdtemp() + paddle.v2.dataset.common.convert(path, + test_reader(), num_shards, + 'random_images') + + files = glob.glob(path + '/random_images-*') + self.assertEqual(len(files), num_shards) + + recs = [] + for i in range(0, num_shards): + n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1) + r = recordio.reader(n) + while True: + d = r.read() + if d is None: + break + recs.append(d) + + recs.sort() + self.assertEqual(total, record_num) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ae9a07acc22eb9d3c0cc5ebb07f8f11ed21233 --- /dev/null +++ b/python/paddle/v2/dataset/tests/flowers_test.py @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
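The split/cluster_files_reader pairing exercised above can shard a reader's output into pickle files and then let each trainer read back only its share. A hypothetical sketch; the directory layout, shard size, and sample reader are made up, and only the call signatures used in the tests above are assumed:

import tempfile
import paddle.v2.dataset.common as common


def sample_reader():
    for value in range(20):
        yield value


work_dir = tempfile.mkdtemp()
# Dump the stream into pickle shards of 5 samples each ...
common.split(sample_reader, 5, suffix=work_dir + '/shard-%05d.pickle')
# ... then trainer 0 of 4 re-reads only the shards assigned to it.
my_shard_reader = common.cluster_files_reader(work_dir + '/shard-*.pickle', 4, 0)
for item in my_shard_reader():
    pass  # consume this trainer's slice of the data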
+ +import paddle.v2.dataset.flowers +import unittest + + +class TestFlowers(unittest.TestCase): + def check_reader(self, reader): + sum = 0 + label = 0 + size = 224 * 224 * 3 + for l in reader(): + self.assertEqual(l[0].size, size) + if l[1] > label: + label = l[1] + sum += 1 + return sum, label + + def test_train(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.train()) + self.assertEqual(instances, 6149) + self.assertEqual(max_label_value, 102) + + def test_test(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.test()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + def test_valid(self): + instances, max_label_value = self.check_reader( + paddle.v2.dataset.flowers.valid()) + self.assertEqual(instances, 1020) + self.assertEqual(max_label_value, 102) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c4d82f26895d77d05c6e936bd636b1239e1a0cd8 --- /dev/null +++ b/python/paddle/v2/dataset/tests/imdb_test.py @@ -0,0 +1,57 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imdb +import unittest +import re + +TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$") +TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$") +TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$") + +TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$") +TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$") +TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$") + + +class TestIMDB(unittest.TestCase): + word_idx = None + + def test_build_dict(self): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + self.assertEqual(len(self.word_idx), 7036) + + def check_dataset(self, dataset, expected_size): + if self.word_idx == None: + self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, + 150) + + sum = 0 + for l in dataset(self.word_idx): + self.assertEqual(l[1], sum % 2) + sum += 1 + self.assertEqual(sum, expected_size) + + def test_train(self): + self.check_dataset(paddle.v2.dataset.imdb.train, 25000) + + def test_test(self): + self.check_dataset(paddle.v2.dataset.imdb.test, 25000) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py new file mode 100644 index 0000000000000000000000000000000000000000..714a75d6f1ff31697eec2d893d350a726d6390fe --- /dev/null +++ b/python/paddle/v2/dataset/tests/imikolov_test.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.imikolov +import unittest + +WORD_DICT = paddle.v2.dataset.imikolov.build_dict() + + +class TestMikolov(unittest.TestCase): + def check_reader(self, reader, n): + for l in reader(): + self.assertEqual(len(l), n) + + def test_train(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) + + first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\ + 'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\ + 'rake regatta rubens sim snack-food ssangyong swapo wachter' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.train( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_test(self): + n = 5 + self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) + + first_line = 'consumers may want to move their telephones a little '\ + 'closer to the tv set' + first_line = [ + WORD_DICT.get(ch, WORD_DICT['']) + for ch in first_line.split(' ') + ] + for l in paddle.v2.dataset.imikolov.test( + WORD_DICT, n=-1, + data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + read_line = l[0][1:] + break + self.assertEqual(first_line, read_line) + + def test_total(self): + _, idx = zip(*WORD_DICT.items()) + self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d344cac3e7483a351033570fbec75a4d19f4a55 --- /dev/null +++ b/python/paddle/v2/dataset/tests/mnist_test.py @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
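The imikolov reader checked above produces fixed-length n-grams of word ids. A minimal sketch of that contract, assuming only build_dict() and train() as exercised by the test:

import paddle.v2.dataset.imikolov as imikolov

word_dict = imikolov.build_dict()
five_gram_reader = imikolov.train(word_dict, 5)   # returns a callable reader
first_ngram = next(five_gram_reader())
assert len(first_ngram) == 5    # four context word ids plus the next-word id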
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59847b6c18eadb12123cae824e8bce1051a69d4c
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mq2007_test.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mq2007
+import unittest
+
+
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+                format="pairwise"):
+            self.assertEqual(query_left.shape, (46, ))
+            self.assertEqual(query_right.shape, (46, ))
+
+    def test_listwise(self):
+        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
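The pairwise MQ2007 reader exercised in mq2007_test.py above yields (label, left, right) records in which, per gen_pair earlier in this patch, the left document is the more relevant one and both sides are 46-dimensional feature vectors. A rough consumption sketch, assuming only that behaviour and that the MQ2007 data can be downloaded:

import paddle.v2.dataset.mq2007 as mq2007

for label, left_doc, right_doc in mq2007.test(format="pairwise"):
    assert left_doc.shape == (46, ) and right_doc.shape == (46, )
    assert label[0] == 1         # the left document always outranks the right
    break                        # inspect only the first pair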
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        for idx, each in enumerate(word_dict):
+            self.assertEqual(each, test_word_list[idx])
+        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31e72ebf5eac0508d12783f9ceaa6eef0fa6d353
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef6c3216e7de8d9785a063976e63f88d90b24df
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2.dataset.wmt16 +import unittest + + +class TestWMT16(unittest.TestCase): + def checkout_one_sample(self, sample): + # train data has 3 field: source language word indices, + # target language word indices, and target next word indices. + self.assertEqual(len(sample), 3) + + # test start mark and end mark in source word indices. + self.assertEqual(sample[0][0], 0) + self.assertEqual(sample[0][-1], 1) + + # test start mask in target word indices + self.assertEqual(sample[1][0], 0) + + # test en mask in target next word indices + self.assertEqual(sample[2][-1], 1) + + def test_train(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.train( + src_dict_size=100000, trg_dict_size=100000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_test(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.test( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_val(self): + for idx, sample in enumerate( + paddle.v2.dataset.wmt16.validation( + src_dict_size=1000, trg_dict_size=1000)()): + if idx >= 10: break + self.checkout_one_sample(sample) + + def test_get_dict(self): + dict_size = 1000 + word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True) + self.assertEqual(len(word_dict), dict_size) + self.assertEqual(word_dict[0], "") + self.assertEqual(word_dict[1], "") + self.assertEqual(word_dict[2], "") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py new file mode 100644 index 0000000000000000000000000000000000000000..f10bf7e42a1ead09b3eba0d61e55701215e4360f --- /dev/null +++ b/python/paddle/v2/dataset/uci_housing.py @@ -0,0 +1,134 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +UCI Housing dataset. + +This module will download dataset from +https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and +parse training set and test set into paddle reader creators. 
+""" + +import numpy as np +import os +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = ['train', 'test'] + +URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data' +MD5 = 'd4accdce7a25600298819f8e28e8d593' +feature_names = [ + 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', + 'PTRATIO', 'B', 'LSTAT', 'convert' +] + +UCI_TRAIN_DATA = None +UCI_TEST_DATA = None +URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' +MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b' + + +def feature_range(maximums, minimums): + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + feature_num = len(maximums) + ax.bar(range(feature_num), maximums - minimums, color='r', align='center') + ax.set_title('feature scale') + plt.xticks(range(feature_num), feature_names) + plt.xlim([-1, feature_num]) + fig.set_figheight(6) + fig.set_figwidth(10) + if not os.path.exists('./image'): + os.makedirs('./image') + fig.savefig('image/ranges.png', dpi=48) + plt.close(fig) + + +def load_data(filename, feature_num=14, ratio=0.8): + global UCI_TRAIN_DATA, UCI_TEST_DATA + if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None: + return + + data = np.fromfile(filename, sep=' ') + data = data.reshape(data.shape[0] / feature_num, feature_num) + maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( + axis=0) / data.shape[0] + feature_range(maximums[:-1], minimums[:-1]) + for i in xrange(feature_num - 1): + data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) + offset = int(data.shape[0] * ratio) + UCI_TRAIN_DATA = data[:offset] + UCI_TEST_DATA = data[offset:] + + +def train(): + """ + UCI_HOUSING training set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Training reader creator + :rtype: callable + """ + global UCI_TRAIN_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TRAIN_DATA: + yield d[:-1], d[-1:] + + return reader + + +def test(): + """ + UCI_HOUSING test set creator. + + It returns a reader creator, each sample in the reader is features after + normalization and price number. + + :return: Test reader creator + :rtype: callable + """ + global UCI_TEST_DATA + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + + def reader(): + for d in UCI_TEST_DATA: + yield d[:-1], d[-1:] + + return reader + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', + MD5_MODEL) + with open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def fetch(): + paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py new file mode 100644 index 0000000000000000000000000000000000000000..617e212d67fbe37f9d9663e9c83c62045411fa77 --- /dev/null +++ b/python/paddle/v2/dataset/voc2012.py @@ -0,0 +1,85 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. +""" + +import tarfile +import io +import numpy as np +from paddle.v2.dataset.common import download +from paddle.v2.image import * +from PIL import Image + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py new file mode 100644 index 0000000000000000000000000000000000000000..5104e29051e4480f3a7eb18421f1b519841b009b --- /dev/null +++ b/python/paddle/v2/dataset/wmt14.py @@ -0,0 +1,182 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +WMT14 dataset. +The original WMT14 dataset is too large and a small set of data for set is +provided. 
This module will download dataset from +http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +parse training set and test set into paddle reader creators. + +""" +import tarfile +import gzip + +import paddle.v2.dataset.common +from paddle.v2.parameters import Parameters + +__all__ = [ + 'train', + 'test', + 'get_dict', + 'convert', +] + +URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' + 'cslm_joint_paper/data/dev+test.tgz') +MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' +# this is a small set of data for test. The original data is too large and +# will be add later. +URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/' + 'wmt_shrinked_data/wmt14.tgz') +MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c' +# BLEU of this trained model is 26.92 +URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz' +MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3' + +START = "" +END = "" +UNK = "" +UNK_IDX = 2 + + +def __read_to_dict(tar_file, dict_size): + def __to_dict(fd, size): + out_dict = dict() + for line_count, line in enumerate(fd): + if line_count < size: + out_dict[line.strip()] = line_count + else: + break + return out_dict + + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith("src.dict") + ] + assert len(names) == 1 + src_dict = __to_dict(f.extractfile(names[0]), dict_size) + names = [ + each_item.name for each_item in f + if each_item.name.endswith("trg.dict") + ] + assert len(names) == 1 + trg_dict = __to_dict(f.extractfile(names[0]), dict_size) + return src_dict, trg_dict + + +def reader_creator(tar_file, file_name, dict_size): + def reader(): + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + with tarfile.open(tar_file, mode='r') as f: + names = [ + each_item.name for each_item in f + if each_item.name.endswith(file_name) + ] + for name in names: + for line in f.extractfile(name): + line_split = line.strip().split('\t') + if len(line_split) != 2: + continue + src_seq = line_split[0] # one source sequence + src_words = src_seq.split() + src_ids = [ + src_dict.get(w, UNK_IDX) + for w in [START] + src_words + [END] + ] + + trg_seq = line_split[1] # one target sequence + trg_words = trg_seq.split() + trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words] + + # remove sequence whose length > 80 in training mode + if len(src_ids) > 80 or len(trg_ids) > 80: + continue + trg_ids_next = trg_ids + [trg_dict[END]] + trg_ids = [trg_dict[START]] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(dict_size): + """ + WMT14 training set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. + + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) + + +def test(dict_size): + """ + WMT14 test set creator. + + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. 
+ + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'test/test', dict_size) + + +def gen(dict_size): + return reader_creator( + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'gen/gen', dict_size) + + +def model(): + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + with gzip.open(tar_file, 'r') as f: + parameters = Parameters.from_tar(f) + return parameters + + +def get_dict(dict_size, reverse=True): + # if reverse = False, return dict = {'a':'001', 'b':'002', ...} + # else reverse = true, return dict = {'001':'a', '002':'b', ...} + tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + src_dict, trg_dict = __read_to_dict(tar_file, dict_size) + if reverse: + src_dict = {v: k for k, v in src_dict.items()} + trg_dict = {v: k for k, v in trg_dict.items()} + return src_dict, trg_dict + + +def fetch(): + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + + +def convert(path): + """ + Converts dataset to recordio format + """ + dict_size = 30000 + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py new file mode 100644 index 0000000000000000000000000000000000000000..c8818f715beadd9499ae588f2c19a57fbf26f372 --- /dev/null +++ b/python/paddle/v2/dataset/wmt16.py @@ -0,0 +1,349 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +ACL2016 Multimodal Machine Translation. Please see this website for more +details: http://www.statmt.org/wmt16/multimodal-task.html#task1 + +If you use the dataset created for your task, please cite the following paper: +Multi30K: Multilingual English-German Image Descriptions. + +@article{elliott-EtAl:2016:VL16, + author = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. 
and {Specia}, L.}, + title = {Multi30K: Multilingual English-German Image Descriptions}, + booktitle = {Proceedings of the 6th Workshop on Vision and Language}, + year = {2016}, + pages = {70--74}, + year = 2016 +} +""" + +import os +import tarfile +import gzip +from collections import defaultdict + +import paddle.v2.dataset.common + +__all__ = [ + "train", + "test", + "validation", + "convert", + "fetch", + "get_dict", +] + +DATA_URL = ("http://cloud.dlnel.org/filepub/" + "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed") +DATA_MD5 = "0c38be43600334966403524a40dcd81e" + +TOTAL_EN_WORDS = 11250 +TOTAL_DE_WORDS = 19220 + +START_MARK = "" +END_MARK = "" +UNK_MARK = "" + + +def __build_dict(tar_file, dict_size, save_path, lang): + word_dict = defaultdict(int) + with tarfile.open(tar_file, mode="r") as f: + for line in f.extractfile("wmt16/train"): + line_split = line.strip().split("\t") + if len(line_split) != 2: continue + sen = line_split[0] if lang == "en" else line_split[1] + for w in sen.split(): + word_dict[w] += 1 + + with open(save_path, "w") as fout: + fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)) + for idx, word in enumerate( + sorted( + word_dict.iteritems(), key=lambda x: x[1], reverse=True)): + if idx + 3 == dict_size: break + fout.write("%s\n" % (word[0])) + + +def __load_dict(tar_file, dict_size, lang, reverse=False): + dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + if not os.path.exists(dict_path) or ( + len(open(dict_path, "r").readlines()) != dict_size): + __build_dict(tar_file, dict_size, dict_path, lang) + + word_dict = {} + with open(dict_path, "r") as fdict: + for idx, line in enumerate(fdict): + if reverse: + word_dict[idx] = line.strip() + else: + word_dict[line.strip()] = idx + return word_dict + + +def __get_dict_size(src_dict_size, trg_dict_size, src_lang): + src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else + TOTAL_DE_WORDS)) + trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else + TOTAL_ENG_WORDS)) + return src_dict_size, trg_dict_size + + +def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): + def reader(): + src_dict = __load_dict(tar_file, src_dict_size, src_lang) + trg_dict = __load_dict(tar_file, trg_dict_size, + ("de" if src_lang == "en" else "en")) + + # the indice for start mark, end mark, and unk are the same in source + # language and target language. Here uses the source language + # dictionary to determine their indices. + start_id = src_dict[START_MARK] + end_id = src_dict[END_MARK] + unk_id = src_dict[UNK_MARK] + + src_col = 0 if src_lang == "en" else 1 + trg_col = 1 - src_col + + with tarfile.open(tar_file, mode="r") as f: + for line in f.extractfile(file_name): + line_split = line.strip().split("\t") + if len(line_split) != 2: + continue + src_words = line_split[src_col].split() + src_ids = [start_id] + [ + src_dict.get(w, unk_id) for w in src_words + ] + [end_id] + + trg_words = line_split[trg_col].split() + trg_ids = [trg_dict.get(w, unk_id) for w in trg_words] + + trg_ids_next = trg_ids + [end_id] + trg_ids = [start_id] + trg_ids + + yield src_ids, trg_ids, trg_ids_next + + return reader + + +def train(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 train set reader. + + This function returns the reader for train data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. 
+ + + NOTE: + The original like for training data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The train reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. Only support: " + "en (for English); de(for Germany).") + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/train", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def test(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 test set reader. + + This function returns the reader for test data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. + + NOTE: + The original like for test data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The test reader. + """ + + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") + + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/test", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def validation(src_dict_size, trg_dict_size, src_lang="en"): + """ + WMT16 validation set reader. + + This function returns the reader for validation data. Each sample the reader + returns is made up of three fields: the source language word index sequence, + target language word index sequence and next word index sequence. 
+ + NOTE: + The original like for validation data is: + http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz + + paddle.dataset.wmt16 provides a tokenized version of the original dataset by + using moses's tokenization script: + https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl + + Args: + src_dict_size(int): Size of the source language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + trg_dict_size(int): Size of the target language dictionary. Three + special tokens will be added into the dictionary: + for start mark, for end mark, and for + unknown word. + src_lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + + Returns: + callable: The validation reader. + """ + if src_lang not in ["en", "de"]: + raise ValueError("An error language type. " + "Only support: en (for English); de(for Germany).") + src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size, + src_lang) + + return reader_creator( + tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), + file_name="wmt16/val", + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang) + + +def get_dict(lang, dict_size, reverse=False): + """ + return the word dictionary for the specified language. + + Args: + lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + dict_size(int): Size of the specified language dictionary. + reverse(bool): If reverse is set to False, the returned python + dictionary will use word as key and use index as value. + If reverse is set to True, the returned python + dictionary will use index as key and word as value. + + Returns: + dict: The word dictionary for the specific language. + """ + + if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS) + else: dict_size = min(dict_size, TOTAL_DE_WORDS) + + dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + "wmt16/%s_%d.dict" % (lang, dict_size)) + assert os.path.exists(dict_path), "Word dictionary does not exist. " + "Please invoke paddle.dataset.wmt16.train/test/validation first " + "to build the dictionary." + tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz") + return __load_dict(tar_file, dict_size, lang, reverse) + + +def fetch(): + """download the entire dataset. + """ + paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz") + + +def convert(path, src_dict_size, trg_dict_size, src_lang): + """Converts dataset to recordio format. + """ + + paddle.v2.dataset.common.convert( + path, + train( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_train") + paddle.v2.dataset.common.convert( + path, + test( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_test") + paddle.v2.dataset.common.convert( + path, + validation( + src_dict_size=src_dict_size, + trg_dict_size=trg_dict_size, + src_lang=src_lang), + 1000, + "wmt16_validation") diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py new file mode 100644 index 0000000000000000000000000000000000000000..9235c41e9eb95b25a0dc53a494a203e7a4525981 --- /dev/null +++ b/python/paddle/v2/image.py @@ -0,0 +1,381 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This file contains some common interfaces for image preprocess. +Many users are confused about the image layout. We introduce +the image layout as follows. + +- CHW Layout + + - The abbreviations: C=channel, H=Height, W=Width + - The default layout of image opened by cv2 or PIL is HWC. + PaddlePaddle only supports the CHW layout. And CHW is simply + a transpose of HWC. It must transpose the input image. + +- Color format: RGB or BGR + + OpenCV use BGR color format. PIL use RGB color format. Both + formats can be used for training. Noted that, the format should + be keep consistent between the training and inference peroid. +""" +import numpy as np +try: + import cv2 +except ImportError: + cv2 = None +import os +import tarfile +import cPickle + +__all__ = [ + "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop", + "random_crop", "left_right_flip", "simple_transform", "load_and_transform", + "batch_images_from_tar" +] + + +def batch_images_from_tar(data_file, + dataset_name, + img2label, + num_per_batch=1024): + """ + Read images from tar file and batch them into batch file. + + :param data_file: path of image tar file + :type data_file: string + :param dataset_name: 'train','test' or 'valid' + :type dataset_name: string + :param img2label: a dic with image file name as key + and image's label as value + :type img2label: dic + :param num_per_batch: image number per batch file + :type num_per_batch: int + :return: path of list file containing paths of batch file + :rtype: string + """ + batch_dir = data_file + "_batch" + out_path = "%s/%s" % (batch_dir, dataset_name) + meta_file = "%s/%s.txt" % (batch_dir, dataset_name) + + if os.path.exists(out_path): + return meta_file + else: + os.makedirs(out_path) + + tf = tarfile.open(data_file) + mems = tf.getmembers() + data = [] + labels = [] + file_id = 0 + for mem in mems: + if mem.name in img2label: + data.append(tf.extractfile(mem).read()) + labels.append(img2label[mem.name]) + if len(data) == num_per_batch: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + file_id += 1 + data = [] + labels = [] + if len(data) > 0: + output = {} + output['label'] = labels + output['data'] = data + cPickle.dump( + output, + open('%s/batch_%d' % (out_path, file_id), 'w'), + protocol=cPickle.HIGHEST_PROTOCOL) + + with open(meta_file, 'a') as meta: + for file in os.listdir(out_path): + meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n") + return meta_file + + +def load_image_bytes(bytes, is_color=True): + """ + Load an color or gray image from bytes array. + + Example usage: + + .. code-block:: python + + with open('cat.jpg') as f: + im = load_image_bytes(f.read()) + + :param bytes: the input image bytes array. + :type bytes: str + :param is_color: If set is_color True, it will load and + return a color image. 
Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + flag = 1 if is_color else 0 + file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8) + img = cv2.imdecode(file_bytes, flag) + return img + + +def load_image(file, is_color=True): + """ + Load an color or gray image from the file path. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + + :param file: the input image path. + :type file: string + :param is_color: If set is_color True, it will load and + return a color image. Otherwise, it will + load and return a gray image. + :type is_color: bool + """ + # cv2.IMAGE_COLOR for OpenCV3 + # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version + # cv2.IMAGE_GRAYSCALE for OpenCV3 + # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version + # Here, use constant 1 and 0 + # 1: COLOR, 0: GRAYSCALE + flag = 1 if is_color else 0 + im = cv2.imread(file, flag) + return im + + +def resize_short(im, size): + """ + Resize an image so that the length of shorter edge is size. + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the shorter edge size of image after resizing. + :type size: int + """ + h, w = im.shape[:2] + h_new, w_new = size, size + if h > w: + h_new = size * h / w + else: + w_new = size * w / h + im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + return im + + +def to_chw(im, order=(2, 0, 1)): + """ + Transpose the input image order. The image layout is HWC format + opened by cv2 or PIL. Transpose the input image to CHW layout + according the order (2,0,1). + + Example usage: + + .. code-block:: python + + im = load_image('cat.jpg') + im = resize_short(im, 256) + im = to_chw(im) + + :param im: the input image with HWC layout. + :type im: ndarray + :param order: the transposed order. + :type order: tuple|list + """ + assert len(im.shape) == len(order) + im = im.transpose(order) + return im + + +def center_crop(im, size, is_color=True): + """ + Crop the center of image with size. + + Example usage: + + .. code-block:: python + + im = center_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = (h - size) / 2 + w_start = (w - size) / 2 + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def random_crop(im, size, is_color=True): + """ + Randomly crop input image with size. + + Example usage: + + .. code-block:: python + + im = random_crop(im, 224) + + :param im: the input image with HWC layout. + :type im: ndarray + :param size: the cropping size. + :type size: int + :param is_color: whether the image is color or not. + :type is_color: bool + """ + h, w = im.shape[:2] + h_start = np.random.randint(0, h - size + 1) + w_start = np.random.randint(0, w - size + 1) + h_end, w_end = h_start + size, w_start + size + if is_color: + im = im[h_start:h_end, w_start:w_end, :] + else: + im = im[h_start:h_end, w_start:w_end] + return im + + +def left_right_flip(im, is_color=True): + """ + Flip an image along the horizontal direction. + Return the flipped image. + + Example usage: + + .. 
code-block:: python + + im = left_right_flip(im) + + :param im: input image with HWC layout or HW layout for gray image + :type im: ndarray + :param is_color: whether input image is color or not + :type is_color: bool + """ + if len(im.shape) == 3 and is_color: + return im[:, ::-1, :] + else: + return im[:, ::-1] + + +def simple_transform(im, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Simply data argumentation for training. These operations include + resizing, croping and flipping. + + Example usage: + + .. code-block:: python + + im = simple_transform(im, 256, 224, True) + + :param im: The input image with HWC layout. + :type im: ndarray + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = resize_short(im, resize_size) + if is_train: + im = random_crop(im, crop_size, is_color=is_color) + if np.random.randint(2) == 0: + im = left_right_flip(im, is_color) + else: + im = center_crop(im, crop_size, is_color) + im = center_crop(im, crop_size, is_color=is_color) + if len(im.shape) == 3: + im = to_chw(im) + + im = im.astype('float32') + if mean is not None: + mean = np.array(mean, dtype=np.float32) + # mean value, may be one value per channel + if mean.ndim == 1 and is_color: + mean = mean[:, np.newaxis, np.newaxis] + elif mean.ndim == 1: + mean = mean + else: + # elementwise mean + assert len(mean.shape) == len(im) + im -= mean + + return im + + +def load_and_transform(filename, + resize_size, + crop_size, + is_train, + is_color=True, + mean=None): + """ + Load image from the input file `filename` and transform image for + data argumentation. Please refer to the `simple_transform` interface + for the transform operations. + + Example usage: + + .. code-block:: python + + im = load_and_transform('cat.jpg', 256, 224, True) + + :param filename: The file name of input image. + :type filename: string + :param resize_size: The shorter edge length of the resized image. + :type resize_size: int + :param crop_size: The cropping size. + :type crop_size: int + :param is_train: Whether it is training or not. + :type is_train: bool + :param is_color: whether the image is color or not. + :type is_color: bool + :param mean: the mean values, which can be element-wise mean values or + mean values per channel. + :type mean: numpy array | list + """ + im = load_image(filename, is_color) + im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean) + return im diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py new file mode 100644 index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb --- /dev/null +++ b/python/paddle/v2/minibatch.py @@ -0,0 +1,41 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['batch'] + + +def batch(reader, batch_size): + """ + Create a batched reader. + + :param reader: the data reader to read from. + :type reader: callable + :param batch_size: size of each mini-batch + :type batch_size: int + :return: the batched reader. + :rtype: callable + """ + + def batch_reader(): + r = reader() + b = [] + for instance in r: + b.append(instance) + if len(b) == batch_size: + yield b + b = [] + if b: + yield b + + return batch_reader diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b059735a924d58714cd88a761eb83143f1192d6 --- /dev/null +++ b/python/paddle/v2/reader/__init__.py @@ -0,0 +1,74 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +At training and testing time, PaddlePaddle programs need to read data. To ease +the users' work to write data reading code, we define that + +- A *reader* is a function that reads data (from file, network, random number + generator, etc) and yields data items. +- A *reader creator* is a function that returns a reader function. +- A *reader decorator* is a function, which accepts one or more readers, and + returns a reader. +- A *batch reader* is a function that reads data (from *reader*, file, network, + random number generator, etc) and yields a batch of data items. + +##################### +Data Reader Interface +##################### + +Indeed, *data reader* doesn't have to be a function that reads and yields data +items. It can be any function with no parameter that creates a iterable +(anything can be used in :code:`for x in iterable`)\: + +.. code-block:: python + + iterable = data_reader() + +Element produced from the iterable should be a **single** entry of data, +**not** a mini batch. That entry of data could be a single item, or a tuple of +items. +Item should be of `supported type `_ (e.g., numpy 1d +array of float32, int, list of int) + +An example implementation for single item data reader creator: + +.. code-block:: python + + def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader + +An example implementation for multiple item data reader creator: + +.. code-block:: python + + def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader + + +TODO(yuyang18): Should we add whole design doc here? 
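An example of turning a single-item reader into a *batch reader* with the
:code:`batch` decorator from :code:`paddle.v2.minibatch` (a sketch with a toy
finite reader; real readers usually come from a reader creator as above):

.. code-block:: python

    from paddle.v2.minibatch import batch

    def reader():              # toy reader: yields 10 single data items
        for i in range(10):
            yield i

    batch_reader = batch(reader, 4)
    # list(batch_reader()) == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
    # the trailing partial batch is yielded as well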
+""" + +import decorator +from decorator import * + +import creator + +__all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py new file mode 100644 index 0000000000000000000000000000000000000000..fda5246d74f598200b439774a25e80ec3e504077 --- /dev/null +++ b/python/paddle/v2/reader/creator.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Creator package contains some simple reader creator, which could +be used in user program. +""" + +__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader'] + + +def np_array(x): + """ + Creates a reader that yields elements of x, if it is a + numpy vector. Or rows of x, if it is a numpy matrix. + Or any sub-hyperplane indexed by the highest dimension. + + :param x: the numpy array to create reader from. + :returns: data reader created from x. + """ + + def reader(): + if x.ndim < 1: + yield x + + for e in x: + yield e + + return reader + + +def text_file(path): + """ + Creates a data reader that outputs text line by line from given text file. + Trailing new line ('\\\\n') of each line will be removed. + + :path: path of the text file. + :returns: data reader of text file + """ + + def reader(): + f = open(path, "r") + for l in f: + yield l.rstrip('\n') + f.close() + + return reader + + +def recordio(paths, buf_size=100): + """ + Creates a data reader from given RecordIO file paths separated by ",", + glob pattern is supported. + :path: path of recordio files, can be a string or a string list. + :returns: data reader of recordio files. + """ + + import recordio as rec + import paddle.v2.reader.decorator as dec + import cPickle as pickle + + def reader(): + if isinstance(paths, basestring): + path = paths + else: + path = ",".join(paths) + f = rec.reader(path) + while True: + r = f.read() + if r is None: + break + yield pickle.loads(r) + f.close() + + return dec.buffered(reader, buf_size) + + +pass_num = 0 + + +def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64): + """ + Create a data reader that yield a record one by one from + the paths: + :paths: path of recordio files, can be a string or a string list. + :etcd_endpoints: the endpoints for etcd cluster + :returns: data reader of recordio files. + + .. 
code-block:: python + from paddle.v2.reader.creator import cloud_reader + etcd_endpoints = "http://127.0.0.1:2379" + trainer.train.( + reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints), + ) + """ + import os + import cPickle as pickle + import paddle.v2.master as master + c = master.client(etcd_endpoints, timeout_sec, buf_size) + + if isinstance(paths, basestring): + path = [paths] + else: + path = paths + c.set_dataset(path) + + def reader(): + global pass_num + c.paddle_start_get_records(pass_num) + pass_num += 1 + + while True: + r, e = c.next_record() + if not r: + if e != -2: + print "get record error: ", e + break + yield pickle.loads(r) + + return reader diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..44a6e344630bb35d28ee29078bf8727053a24bef --- /dev/null +++ b/python/paddle/v2/reader/decorator.py @@ -0,0 +1,405 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [ + 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' +] + +from threading import Thread +import subprocess + +from Queue import Queue +import itertools +import random +import zlib + + +def map_readers(func, *readers): + """ + Creates a data reader that outputs return value of function using + output of each data readers as arguments. + + :param func: function to use. The type of func should be (Sample) => Sample + :type: callable + :param readers: readers whose outputs will be used as arguments of func. + :return: the created data reader. + :rtype: callable + """ + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + for e in itertools.imap(func, *rs): + yield e + + return reader + + +def shuffle(reader, buf_size): + """ + Creates a data reader whose data output is shuffled. + + Output from the iterator that created by original reader will be + buffered into shuffle buffer, and then shuffled. The size of shuffle buffer + is determined by argument buf_size. + + :param reader: the original reader whose output will be shuffled. + :type reader: callable + :param buf_size: shuffle buffer size. + :type buf_size: int + + :return: the new reader whose output is shuffled. + :rtype: callable + """ + + def data_reader(): + buf = [] + for e in reader(): + buf.append(e) + if len(buf) >= buf_size: + random.shuffle(buf) + for b in buf: + yield b + buf = [] + + if len(buf) > 0: + random.shuffle(buf) + for b in buf: + yield b + + return data_reader + + +def chain(*readers): + """ + Creates a data reader whose output is the outputs of input data + readers chained together. + + If input readers output following data entries: + [0, 0, 0] + [1, 1, 1] + [2, 2, 2] + The chained reader will output: + [0, 0, 0, 1, 1, 1, 2, 2, 2] + + :param readers: input readers. + :return: the new data reader. 
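A creator and a decorator are typically combined; a minimal sketch using
:code:`text_file` and :code:`shuffle` (the file name 'train.txt' is
illustrative only):

.. code-block:: python

    import paddle.v2.reader as reader
    import paddle.v2.reader.creator as creator

    # read lines from a text file, then shuffle them within a 100-item buffer
    lines = creator.text_file('train.txt')
    shuffled = reader.shuffle(lines, buf_size=100)

    for line in shuffled():
        pass  # lines arrive in a locally shuffled order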
+ :rtype: callable + """ + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + + for e in itertools.chain(*rs): + yield e + + return reader + + +class ComposeNotAligned(ValueError): + pass + + +def compose(*readers, **kwargs): + """ + Creates a data reader whose output is the combination of input readers. + + If input readers output following data entries: + (1, 2) 3 (4, 5) + The composed reader will output: + (1, 2, 3, 4, 5) + + :param readers: readers that will be composed together. + :param check_alignment: if True, will check if input readers are aligned + correctly. If False, will not check alignment and trailing outputs + will be discarded. Defaults to True. + :type check_alignment: bool + + :return: the new data reader. + + :raises ComposeNotAligned: outputs of readers are not aligned. + Will not raise when check_alignment is set to False. + """ + check_alignment = kwargs.pop('check_alignment', True) + + def make_tuple(x): + if isinstance(x, tuple): + return x + else: + return (x, ) + + def reader(): + rs = [] + for r in readers: + rs.append(r()) + if not check_alignment: + for outputs in itertools.izip(*rs): + yield sum(map(make_tuple, outputs), ()) + else: + for outputs in itertools.izip_longest(*rs): + for o in outputs: + if o is None: + # None will be not be present if compose is aligned + raise ComposeNotAligned( + "outputs of readers are not aligned.") + yield sum(map(make_tuple, outputs), ()) + + return reader + + +def buffered(reader, size): + """ + Creates a buffered data reader. + + The buffered data reader will read and save data entries into a + buffer. Reading from the buffered data reader will proceed as long + as the buffer is not empty. + + :param reader: the data reader to read from. + :type reader: callable + :param size: max buffer size. + :type size: int + + :returns: the buffered data reader. + """ + + class EndSignal(): + pass + + end = EndSignal() + + def read_worker(r, q): + for d in r: + q.put(d) + q.put(end) + + def data_reader(): + r = reader() + q = Queue(maxsize=size) + t = Thread( + target=read_worker, args=( + r, + q, )) + t.daemon = True + t.start() + e = q.get() + while e != end: + yield e + e = q.get() + + return data_reader + + +def firstn(reader, n): + """ + Limit the max number of samples that reader could return. + + :param reader: the data reader to read from. + :type reader: callable + :param n: the max number of samples that return. + :type n: int + :return: the decorated reader. + :rtype: callable + """ + + # TODO(yuyang18): Check if just drop the reader, could clean the opened + # resource or not? + + def firstn_reader(): + for i, item in enumerate(reader()): + if i == n: + break + yield item + + return firstn_reader + + +class XmapEndSignal(): + pass + + +def xmap_readers(mapper, reader, process_num, buffer_size, order=False): + """ + Use multiprocess to map samples from reader by a mapper defined by user. + And this function contains a buffered decorator. + :param mapper: a function to map sample. 
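The remaining decorators follow the same reader-in, reader-out pattern; a
short sketch of :code:`compose`, :code:`buffered` and :code:`firstn` with toy
readers (for illustration only):

.. code-block:: python

    import paddle.v2.reader as reader

    def images():              # toy single-item readers
        for i in range(8):
            yield i

    def labels():
        for i in range(8):
            yield i % 2

    # compose pairs aligned items from both readers into tuples (image, label)
    pairs = reader.compose(images, labels)
    # buffered prefetches up to 16 items in a background thread
    prefetched = reader.buffered(pairs, 16)
    # firstn caps the sample count, which is handy for quick smoke tests
    for sample in reader.firstn(prefetched, 4)():
        print(sample)          # (0, 0), (1, 1), (2, 0), (3, 1)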
+ :type mapper: callable + :param reader: the data reader to read from + :type reader: callable + :param process_num: process number to handle original sample + :type process_num: int + :param buffer_size: max buffer size + :type buffer_size: int + :param order: keep the order of reader + :type order: bool + :return: the decarated reader + :rtype: callable + """ + end = XmapEndSignal() + + # define a worker to read samples from reader to in_queue + def read_worker(reader, in_queue): + for i in reader(): + in_queue.put(i) + in_queue.put(end) + + # define a worker to read samples from reader to in_queue with order flag + def order_read_worker(reader, in_queue): + in_order = 0 + for i in reader(): + in_queue.put((in_order, i)) + in_order += 1 + in_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue + def handle_worker(in_queue, out_queue, mapper): + sample = in_queue.get() + while not isinstance(sample, XmapEndSignal): + r = mapper(sample) + out_queue.put(r) + sample = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + # define a worker to handle samples from in_queue by mapper + # and put mapped samples into out_queue by order + def order_handle_worker(in_queue, out_queue, mapper, out_order): + ins = in_queue.get() + while not isinstance(ins, XmapEndSignal): + order, sample = ins + r = mapper(sample) + while order != out_order[0]: + pass + out_queue.put(r) + out_order[0] += 1 + ins = in_queue.get() + in_queue.put(end) + out_queue.put(end) + + def xreader(): + in_queue = Queue(buffer_size) + out_queue = Queue(buffer_size) + out_order = [0] + # start a read worker in a thread + target = order_read_worker if order else read_worker + t = Thread(target=target, args=(reader, in_queue)) + t.daemon = True + t.start() + # start several handle_workers + target = order_handle_worker if order else handle_worker + args = (in_queue, out_queue, mapper, out_order) if order else ( + in_queue, out_queue, mapper) + workers = [] + for i in xrange(process_num): + worker = Thread(target=target, args=args) + worker.daemon = True + workers.append(worker) + for w in workers: + w.start() + + sample = out_queue.get() + while not isinstance(sample, XmapEndSignal): + yield sample + sample = out_queue.get() + finish = 1 + while finish < process_num: + sample = out_queue.get() + if isinstance(sample, XmapEndSignal): + finish += 1 + else: + yield sample + + return xreader + + +def _buf2lines(buf, line_break="\n"): + # FIXME: line_break should be automatically configured. + lines = buf.split(line_break) + return lines[:-1], lines[-1] + + +class PipeReader: + """ + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. + + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: + + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" + + An example: + + .. 
code-block:: python + + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample + """ + + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ + remained = "" + while True: + buff = self.process.stdout.read(self.bufsize) + if buff: + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": + decomp_buff = buff + else: + raise TypeError("file_type %s is not allowed" % + self.file_type) + + if cut_lines: + lines, remained = _buf2lines(''.join( + [remained, decomp_buff]), line_break) + for line in lines: + yield line + else: + yield decomp_buff + else: + break diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..107d5912e1567e0c8721987a281272c7feb51e63 --- /dev/null +++ b/python/paddle/v2/reader/tests/CMakeLists.txt @@ -0,0 +1,2 @@ +py_test(creator_test SRCS creator_test.py) +py_test(decorator_test SRCS decorator_test.py) diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eca2dce114b069bf9b455d77ce670d73b5047fd2 --- /dev/null +++ b/python/paddle/v2/reader/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7fe374e663607607cd0839eb6ca9c70c4d15eef8 --- /dev/null +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -0,0 +1,74 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
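PipeReader can also decompress gzip streams on the fly via
:code:`file_type="gzip"`; a minimal sketch (the command and file name are
illustrative only):

.. code-block:: python

    from paddle.v2.reader.decorator import PipeReader

    # stream a gzip-compressed text file through `cat` and decompress in-process
    pr = PipeReader("cat train_data.txt.gz", file_type="gzip")
    for line in pr.get_line():
        fields = line.split("\t")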
+ +# Copyright PaddlePaddle contributors. All Rights Reservedd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +import numpy as np +import paddle.v2.reader.creator + + +class TestNumpyArray(unittest.TestCase): + def test_numpy_array(self): + l = [[1, 2, 3], [4, 5, 6]] + x = np.array(l, np.int32) + reader = paddle.v2.reader.creator.np_array(x) + for idx, e in enumerate(reader()): + self.assertItemsEqual(e, l[idx]) + + +class TestTextFile(unittest.TestCase): + def test_text_file(self): + path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt") + reader = paddle.v2.reader.creator.text_file(path) + for idx, e in enumerate(reader()): + self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) + + +class TestRecordIO(unittest.TestCase): + def do_test(self, path): + reader = paddle.v2.reader.creator.recordio(path) + idx = 0 + for e in reader(): + if idx == 0: + self.assertEqual(e, (1, 2, 3)) + elif idx == 1: + self.assertEqual(e, (4, 5, 6)) + idx += 1 + self.assertEqual(idx, 2) + + def test_recordIO(self): + self.do_test( + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat")) + self.do_test([ + os.path.join( + os.path.dirname(__file__), "test_reader_recordio.dat") + ]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6b680e39f3fb299a14e7d8162470996d1d16b83d --- /dev/null +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -0,0 +1,178 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import unittest + +import paddle.v2.reader + + +def reader_creator_10(dur): + def reader(): + for i in range(10): + # this invocation helps testing paddle.reader.buffer + time.sleep(dur) + yield i + + return reader + + +class TestMap(unittest.TestCase): + def test_map(self): + d = {"h": 0, "i": 1} + + def tokenize(x): + return d[x] + + def read(): + yield "h" + yield "i" + + r = paddle.v2.reader.map_readers(tokenize, read) + for i, e in enumerate(r()): + self.assertEqual(e, i) + + +class TestBuffered(unittest.TestCase): + def test_read(self): + for size in range(20): + b = paddle.v2.reader.buffered(reader_creator_10(0), size) + c = 0 + for i in b(): + self.assertEqual(i, c) + c += 1 + self.assertEqual(c, 10) + + def test_buffering(self): + # read have 30ms delay. 
+ b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10) + last_time = time.time() + for idx, i in enumerate(b()): + elapsed_time = time.time() - last_time + if i == 0: + time.sleep(0.3) + else: + # read time should be short, meaning already buffered. + self.assertLess(elapsed_time, 0.05) + last_time = time.time() + + +class TestCompose(unittest.TestCase): + def test_compse(self): + reader = paddle.v2.reader.compose( + reader_creator_10(0), reader_creator_10(0)) + for idx, e in enumerate(reader()): + self.assertEqual(e, (idx, idx)) + + def test_compose_not_aligned(self): + total = 0 + reader = paddle.v2.reader.compose( + paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader_creator_10(0)) + with self.assertRaises(paddle.v2.reader.ComposeNotAligned): + for e in reader(): + total += 1 + # expecting 10, not 20 + self.assertEqual(total, 10) + + def test_compose_not_aligned_no_check(self): + total = 0 + reader = paddle.v2.reader.compose( + paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader_creator_10(0), + check_alignment=False) + for e in reader(): + total += 1 + # expecting 10, not 20 + self.assertEqual(total, 10) + + +class TestChain(unittest.TestCase): + def test_chain(self): + c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)) + idx = 0 + for e in c(): + self.assertEqual(e, idx % 10) + idx += 1 + self.assertEqual(idx, 20) + + +class TestShuffle(unittest.TestCase): + def test_shuffle(self): + case = [(0, True), (1, True), (10, False), (100, False)] + a = reader_creator_10(0) + for size, checkEq in case: + s = paddle.v2.reader.shuffle(a, size) + total = 0 + for idx, e in enumerate(s()): + if checkEq: + self.assertEqual(idx, e) + total += 1 + self.assertEqual(total, 10) + + +class TestXmap(unittest.TestCase): + def test_xmap(self): + def mapper(x): + return (x + 1) + + orders = (True, False) + thread_nums = (1, 2, 4, 8, 16) + buffered_size = (1, 2, 4, 8, 16) + for order in orders: + for tNum in thread_nums: + for size in buffered_size: + reader = paddle.v2.reader.xmap_readers(mapper, + reader_creator_10(0), + tNum, size, order) + for n in xrange(3): + result = [] + for i in reader(): + result.append(i) + if not order: + result.sort() + for idx, e in enumerate(result): + self.assertEqual(e, mapper(idx)) + + +class TestPipeReader(unittest.TestCase): + def test_pipe_reader(self): + def example_reader(myfiles): + for f in myfiles: + pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + for l in pr.get_line(): + yield l + + import tempfile + + records = [str(i) for i in xrange(5)] + temp = tempfile.NamedTemporaryFile() + try: + with open(temp.name, 'w') as f: + for r in records: + f.write('%s\n' % r) + + result = [] + for r in example_reader([temp.name]): + result.append(r) + + for idx, e in enumerate(records): + self.assertEqual(e, result[idx]) + finally: + # delete the temporary file + temp.close() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31 --- /dev/null +++ b/python/paddle/v2/reader/tests/test_data_creator.txt @@ -0,0 +1,3 @@ +0 1 +2 3 +4 5 diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat new file mode 100644 index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491 Binary files 
/dev/null and b/python/paddle/v2/reader/tests/test_reader_recordio.dat differ diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat new file mode 100644 index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69 Binary files /dev/null and b/python/paddle/v2/reader/tests/test_recordio_creator.dat differ diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index 46e4feb8e1ce1d12f214f5c49b1b589a46110603..b4333ed530ce464095ec38d72706949cc464fbe4 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -1,4 +1,5 @@ py_test(test_op SRCS test_op.py) +py_test(test_image SRCS test_image.py) py_test(test_layer SRCS test_layer.py) py_test(test_topology SRCS test_topology.py) py_test(test_rnn_layer SRCS test_rnn_layer.py) diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bc1fbbd371216b9904b522ed302700c79d2e4876 Binary files /dev/null and b/python/paddle/v2/tests/cat.jpg differ diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py new file mode 100644 index 0000000000000000000000000000000000000000..c78bbdc40a25878b21ba7e678afedf9d8f0a87cf --- /dev/null +++ b/python/paddle/v2/tests/test_image.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle.v2.image as image + + +class Image(unittest.TestCase): + def test_resize_flip_chw(self): + # resize + im = image.load_image('cat.jpg') + im = image.resize_short(im, 256) + self.assertEqual(256, min(im.shape[:2])) + self.assertEqual(3, im.shape[2]) + + # flip + im = image.left_right_flip(im) + im2 = np.flip(im, 1) + self.assertEqual(im.all(), im2.all()) + + # to_chw + h, w, c = im.shape + im = image.to_chw(im) + self.assertEqual(c, im.shape[0]) + self.assertEqual(h, im.shape[1]) + self.assertEqual(w, im.shape[2]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py index 8320217da2795da756cf12a80f39279182789eef..264442be182ea69c95b39b3bdb4c389d52eff66e 100644 --- a/python/paddle/v2/tests/test_paramconf_order.py +++ b/python/paddle/v2/tests/test_paramconf_order.py @@ -27,7 +27,6 @@ # limitations under the License. 
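Side note on the flip assertion in test_image.py: :code:`im.all()` reduces the
whole array to a single boolean, so the comparison there is weaker than an
element-wise check. A stricter round-trip check would look like this sketch
(assuming 'cat.jpg' is on the current path, as in the test):

.. code-block:: python

    import numpy as np
    import paddle.v2.image as image

    im = image.load_image('cat.jpg')
    flipped = image.left_right_flip(im)
    # flipping twice must recover the original array element for element
    assert np.array_equal(image.left_right_flip(flipped), im)
    # and a single flip must match numpy's reversal along the width axis
    assert np.array_equal(flipped, im[:, ::-1, :])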
import unittest import math -import paddle.dataset as dataset import paddle.v2 as paddle @@ -41,7 +40,7 @@ def wordemb(inlayer): def train(): - word_dict = dataset.imikolov.build_dict() + word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) # Every layer takes integer value of range [0, dict_size) firstword = paddle.layer.data( diff --git a/python/setup.py.in b/python/setup.py.in index d73a3a6a1c41b87efb9600ac59983bd16547ec6a..08a448934d3248b46618acdef9e1894f94a93893 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -77,6 +77,8 @@ if '${WITH_FLUID_ONLY}'== 'OFF': 'paddle.v2', 'paddle.v2.master', 'paddle.v2.plot', + 'paddle.v2.reader', + 'paddle.v2.dataset', 'py_paddle'] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
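With :code:`paddle.v2.reader` and :code:`paddle.v2.dataset` now registered as
packages, the pieces in this change compose into a typical input pipeline.
A rough sketch, assuming the uci_housing dataset module provides a
:code:`train()` reader creator (not shown in this diff):

.. code-block:: python

    import paddle.v2.dataset.uci_housing as uci_housing
    import paddle.v2.minibatch as minibatch
    import paddle.v2.reader as reader

    # shuffle within a 500-item buffer, then group into mini-batches of 128
    train_reader = minibatch.batch(
        reader.shuffle(uci_housing.train(), buf_size=500), batch_size=128)

    for batch_id, data in enumerate(train_reader()):
        pass  # each `data` is a list of up to 128 training samples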