Merge branch 'develop' into optimizer_lib2

baef96e1 · dzhwinter · GitHub · ec65fa83 · 17fe8322 · baef96e1
289 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -50,6 +50,7 @@ before_install:
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
  - pip install rarfile
+  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,7 +126,9 @@ endif(WITH_GPU)
 add_subdirectory(proto)
 add_subdirectory(paddle)
+add_subdirectory(go/master/c)
 add_subdirectory(python)
+add_subdirectory(go/pserver/cclient)
 if(WITH_DOC)
    add_subdirectory(doc)

--- a/Dockerfile
+++ b/Dockerfile
@@ -30,7 +30,8 @@ RUN apt-get update && \
    python-numpy python-matplotlib gcc g++ \
    automake locales clang-format-3.8 swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
-    clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    net-tools && \
    apt-get clean -y
 # Install Go

--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -59,7 +59,7 @@ macro(add_style_check_target TARGET_NAME)
                                "--filter=${STYLE_FILTER}"
                                "--write-success=${CUR_GEN}" ${filename}
                    DEPENDS ${filename}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
            endif()
        endforeach()
    endif()

--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -11,23 +11,16 @@ find_path(CUDNN_INCLUDE_DIR cudnn.h
 get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
-if(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR})
+set(TARGET_ARCH "x86_64")
-    execute_process(
+if(NOT ${CMAKE_SYSTEM_PROCESSOR})
-        COMMAND uname -m COMMAND tr -d '\n'
+    set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
-        OUTPUT_VARIABLE HOST_ARCH
+endif()
-        RESULT_VARIABLE UNAME_RESULT)
-    if(${UNAME_RESULT})
-        set(HOST_ARCH "x86_64")
-    endif(${UNAME_RESULT})
-else(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR})
-    set(HOST_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
-endif(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR})
 list(APPEND CUDNN_CHECK_LIBRARY_DIRS
    ${CUDNN_ROOT}
    ${CUDNN_ROOT}/lib64
    ${CUDNN_ROOT}/lib
-    ${CUDNN_ROOT}/lib/${HOST_ARCH}-linux-gnu
+    ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
    $ENV{CUDNN_ROOT}
    $ENV{CUDNN_ROOT}/lib64
    $ENV{CUDNN_ROOT}/lib

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -24,20 +24,25 @@ IF(NOT ${CBLAS_FOUND})
    SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)
-    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1)
+    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
+    IF(CMAKE_CROSSCOMPILING)
        IF(ANDROID)
            # arm_soft_fp_abi branch of OpenBLAS to support softfp
            #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0 libs)
+            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
        ELSEIF(RPI)
            # use hardfp
            SET(OPENBLAS_COMMIT "v0.2.19")
-        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0 libs)
+            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
+        ENDIF()
    ELSE()
        SET(OPENBLAS_COMMIT "v0.2.19")
-        SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 libs NUM_THREADS=64)
+        SET(OPTIONAL_ARGS "")
+        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+            SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+        ENDIF()
    ENDIF()
    ExternalProject_Add(

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -182,7 +182,7 @@ function(go_library TARGET_NAME)
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
    ${go_library_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS})
  add_library(${TARGET_NAME} STATIC IMPORTED)
  set_property(TARGET ${TARGET_NAME} PROPERTY
@@ -199,7 +199,7 @@ function(go_binary TARGET_NAME)
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    ${go_library_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS})
  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
 endfunction(go_binary)
@@ -213,7 +213,7 @@ function(go_test TARGET_NAME)
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    ${go_test_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)

--- a/demo/image_classification/.gitignore
+++ b/demo/image_classification/.gitignore
-data/cifar-10-batches-py
-data/cifar-out
-cifar_vgg_model/*
-plot.png
-train.log
-image_provider_copy_1.py
-*pyc
-train.list
-test.list
--- a/demo/image_classification/api_v2_resnet.py
+++ b/demo/image_classification/api_v2_resnet.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.v2 as paddle
-__all__ = ['resnet_cifar10']
-def conv_bn_layer(input,
-                  ch_out,
-                  filter_size,
-                  stride,
-                  padding,
-                  active_type=paddle.activation.Relu(),
-                  ch_in=None):
-    tmp = paddle.layer.img_conv(
-        input=input,
-        filter_size=filter_size,
-        num_channels=ch_in,
-        num_filters=ch_out,
-        stride=stride,
-        padding=padding,
-        act=paddle.activation.Linear(),
-        bias_attr=False)
-    return paddle.layer.batch_norm(input=tmp, act=active_type)
-def shortcut(ipt, n_in, n_out, stride):
-    if n_in != n_out:
-        return conv_bn_layer(ipt, n_out, 1, stride, 0,
-                             paddle.activation.Linear())
-    else:
-        return ipt
-def basicblock(ipt, ch_out, stride):
-    ch_in = ch_out * 2
-    tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
-    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())
-    short = shortcut(ipt, ch_in, ch_out, stride)
-    return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())
-def layer_warp(block_func, ipt, features, count, stride):
-    tmp = block_func(ipt, features, stride)
-    for i in range(1, count):
-        tmp = block_func(tmp, features, 1)
-    return tmp
-def resnet_cifar10(ipt, depth=32):
-    # depth should be one of 20, 32, 44, 56, 110, 1202
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
-    nStages = {16, 64, 128}
-    conv1 = conv_bn_layer(
-        ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = paddle.layer.img_pool(
-        input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
-    return pool
--- a/demo/image_classification/api_v2_train.py
+++ b/demo/image_classification/api_v2_train.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-import sys
-import paddle.v2 as paddle
-from api_v2_vgg import vgg_bn_drop
-def main():
-    datadim = 3 * 32 * 32
-    classdim = 10
-    # PaddlePaddle init
-    paddle.init(use_gpu=False, trainer_count=1)
-    image = paddle.layer.data(
-        name="image", type=paddle.data_type.dense_vector(datadim))
-    # Add neural network config
-    # option 1. resnet
-    # net = resnet_cifar10(image, depth=32)
-    # option 2. vgg
-    net = vgg_bn_drop(image)
-    out = paddle.layer.fc(input=net,
-                          size=classdim,
-                          act=paddle.activation.Softmax())
-    lbl = paddle.layer.data(
-        name="label", type=paddle.data_type.integer_value(classdim))
-    cost = paddle.layer.classification_cost(input=out, label=lbl)
-    # Create parameters
-    parameters = paddle.parameters.create(cost)
-    # Create optimizer
-    momentum_optimizer = paddle.optimizer.Momentum(
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
-        learning_rate=0.1 / 128.0,
-        learning_rate_decay_a=0.1,
-        learning_rate_decay_b=50000 * 100,
-        learning_rate_schedule='discexp',
-        batch_size=128)
-    # End batch and end pass event handler
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-        if isinstance(event, paddle.event.EndPass):
-            result = trainer.test(
-                reader=paddle.batch(
-                    paddle.dataset.cifar.test10(), batch_size=128),
-                feeding={'image': 0,
-                         'label': 1})
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-    # Create trainer
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=momentum_optimizer)
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.cifar.train10(), buf_size=50000),
-            batch_size=128),
-        num_passes=5,
-        event_handler=event_handler,
-        feeding={'image': 0,
-                 'label': 1})
-if __name__ == '__main__':
-    main()
--- a/demo/image_classification/api_v2_vgg.py
+++ b/demo/image_classification/api_v2_vgg.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.v2 as paddle
-__all__ = ['vgg_bn_drop']
-def vgg_bn_drop(input):
-    def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
-        return paddle.networks.img_conv_group(
-            input=ipt,
-            num_channels=num_channels,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act=paddle.activation.Relu(),
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type=paddle.pooling.Max())
-    conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-    drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5)
-    fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear())
-    bn = paddle.layer.batch_norm(
-        input=fc1,
-        act=paddle.activation.Relu(),
-        layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())
-    return fc2
--- a/demo/image_classification/data/download_cifar.sh
+++ b/demo/image_classification/data/download_cifar.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
-tar zxf cifar-10-python.tar.gz
-rm cifar-10-python.tar.gz
-rm -rf cifar-out/*
-echo Converting CIFAR data to images.....
-python process_cifar.py ./cifar-10-batches-py ./cifar-out
--- a/demo/image_classification/data/process_cifar.py
+++ b/demo/image_classification/data/process_cifar.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-import sys
-import os
-import PIL.Image as Image
-"""
-  Usage: python process_cifar input_dir output_dir
-"""
-def mkdir_not_exist(path):
-    """
-    Make dir if the path does not exist.
-    path: the path to be created.
-    """
-    if not os.path.exists(path):
-        os.mkdir(path)
-def create_dir_structure(output_dir):
-    """
-    Create the directory structure for the directory.
-    output_dir: the direcotry structure path.
-    """
-    mkdir_not_exist(os.path.join(output_dir))
-    mkdir_not_exist(os.path.join(output_dir, "train"))
-    mkdir_not_exist(os.path.join(output_dir, "test"))
-def convert_batch(batch_path, label_set, label_map, output_dir, data_split):
-    """
-    Convert CIFAR batch to the structure of Paddle format.
-    batch_path: the batch to be converted.
-    label_set: the set of labels.
-    output_dir: the output path.
-    data_split: whether it is training or testing data.
-    """
-    data = np.load(batch_path)
-    for data, label, filename in zip(data['data'], data['labels'],
-                                     data['filenames']):
-        data = data.reshape((3, 32, 32))
-        data = np.transpose(data, (1, 2, 0))
-        label = label_map[label]
-        output_dir_this = os.path.join(output_dir, data_split, str(label))
-        output_filename = os.path.join(output_dir_this, filename)
-        if not label in label_set:
-            label_set[label] = True
-            mkdir_not_exist(output_dir_this)
-        Image.fromarray(data).save(output_filename)
-if __name__ == '__main__':
-    input_dir = sys.argv[1]
-    output_dir = sys.argv[2]
-    num_batch = 5
-    create_dir_structure(output_dir)
-    label_map = {
-        0: "airplane",
-        1: "automobile",
-        2: "bird",
-        3: "cat",
-        4: "deer",
-        5: "dog",
-        6: "frog",
-        7: "horse",
-        8: "ship",
-        9: "truck"
-    }
-    labels = {}
-    for i in range(1, num_batch + 1):
-        convert_batch(
-            os.path.join(input_dir, "data_batch_%d" % i), labels, label_map,
-            output_dir, "train")
-    convert_batch(
-        os.path.join(input_dir, "test_batch"), {}, label_map, output_dir,
-        "test")
--- a/demo/image_classification/image_provider.py
+++ b/demo/image_classification/image_provider.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import io
-import random
-import paddle.utils.image_util as image_util
-from paddle.trainer.PyDataProvider2 import *
-#
-# {'img_size': 32,
-# 'settings': a global object,
-# 'color': True,
-# 'mean_img_size': 32,
-# 'meta': './data/cifar-out/batches/batches.meta',
-# 'num_classes': 10,
-# 'file_list': ('./data/cifar-out/batches/train_batch_000',),
-# 'use_jpeg': True}
-def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
-         is_train, **kwargs):
-    settings.mean_img_size = mean_img_size
-    settings.img_size = img_size
-    settings.num_classes = num_classes
-    settings.color = color
-    settings.is_train = is_train
-    if settings.color:
-        settings.img_raw_size = settings.img_size * settings.img_size * 3
-    else:
-        settings.img_raw_size = settings.img_size * settings.img_size
-    settings.meta_path = meta
-    settings.use_jpeg = use_jpeg
-    settings.img_mean = image_util.load_meta(settings.meta_path,
-                                             settings.mean_img_size,
-                                             settings.img_size, settings.color)
-    settings.logger.info('Image size: %s', settings.img_size)
-    settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = {
-        'image': dense_vector(settings.img_raw_size),
-        'label': integer_value(settings.num_classes)
-    }
-    settings.logger.info('DataProvider Initialization finished')
-@provider(init_hook=hook, min_pool_size=0)
-def processData(settings, file_list):
-    """
-    The main function for loading data.
-    Load the batch, iterate all the images and labels in this batch.
-    file_list: the batch file list.
-    """
-    with open(file_list, 'r') as fdata:
-        lines = [line.strip() for line in fdata]
-        random.shuffle(lines)
-        for file_name in lines:
-            with io.open(file_name.strip(), 'rb') as file:
-                data = cPickle.load(file)
-                indexes = list(range(len(data['images'])))
-                if settings.is_train:
-                    random.shuffle(indexes)
-                for i in indexes:
-                    if settings.use_jpeg == 1:
-                        img = image_util.decode_jpeg(data['images'][i])
-                    else:
-                        img = data['images'][i]
-                    img_feat = image_util.preprocess_img(
-                        img, settings.img_mean, settings.img_size,
-                        settings.is_train, settings.color)
-                    label = data['labels'][i]
-                    yield {
-                        'image': img_feat.astype('float32'),
-                        'label': int(label)
-                    }
--- a/demo/image_classification/image_util.py
+++ b/demo/image_classification/image_util.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-from PIL import Image
-from cStringIO import StringIO
-def resize_image(img, target_size):
-    """
-    Resize an image so that the shorter edge has length target_size.
-    img: the input image to be resized.
-    target_size: the target resized image size.
-    """
-    percent = (target_size / float(min(img.size[0], img.size[1])))
-    resized_size = int(round(img.size[0] * percent)), int(
-        round(img.size[1] * percent))
-    img = img.resize(resized_size, Image.ANTIALIAS)
-    return img
-def flip(im):
-    """
-    Return the flipped image.
-    Flip an image along the horizontal direction.
-    im: input image, (H x W x K) ndarrays 
-    """
-    if len(im.shape) == 3:
-        return im[:, :, ::-1]
-    else:
-        return im[:, ::-1]
-def crop_img(im, inner_size, color=True, test=True):
-    """
-    Return cropped image.
-    The size of the cropped image is inner_size * inner_size.
-    im: (K x H x W) ndarrays
-    inner_size: the cropped image size.
-    color: whether it is color image.
-    test: whether in test mode.
-      If False, does random cropping and flipping.
-      If True, crop the center of images.
-    """
-    if color:
-        height, width = max(inner_size, im.shape[1]), max(inner_size,
-                                                          im.shape[2])
-        padded_im = np.zeros((3, height, width))
-        startY = (height - im.shape[1]) / 2
-        startX = (width - im.shape[2]) / 2
-        endY, endX = startY + im.shape[1], startX + im.shape[2]
-        padded_im[:, startY:endY, startX:endX] = im
-    else:
-        im = im.astype('float32')
-        height, width = max(inner_size, im.shape[0]), max(inner_size,
-                                                          im.shape[1])
-        padded_im = np.zeros((height, width))
-        startY = (height - im.shape[0]) / 2
-        startX = (width - im.shape[1]) / 2
-        endY, endX = startY + im.shape[0], startX + im.shape[1]
-        padded_im[startY:endY, startX:endX] = im
-    if test:
-        startY = (height - inner_size) / 2
-        startX = (width - inner_size) / 2
-    else:
-        startY = np.random.randint(0, height - inner_size + 1)
-        startX = np.random.randint(0, width - inner_size + 1)
-    endY, endX = startY + inner_size, startX + inner_size
-    if color:
-        pic = padded_im[:, startY:endY, startX:endX]
-    else:
-        pic = padded_im[startY:endY, startX:endX]
-    if (not test) and (np.random.randint(2) == 0):
-        pic = flip(pic)
-    return pic
-def decode_jpeg(jpeg_string):
-    np_array = np.array(Image.open(StringIO(jpeg_string)))
-    if len(np_array.shape) == 3:
-        np_array = np.transpose(np_array, (2, 0, 1))
-    return np_array
-def preprocess_img(im, img_mean, crop_size, is_train, color=True):
-    """
-    Does data augmentation for images.
-    If is_train is false, cropping the center region from the image.
-    If is_train is true, randomly crop a region from the image,
-    and randomy does flipping.
-    im: (K x H x W) ndarrays
-    """
-    im = im.astype('float32')
-    test = not is_train
-    pic = crop_img(im, crop_size, color, test)
-    pic -= img_mean
-    return pic.flatten()
-def load_meta(meta_path, mean_img_size, crop_size, color=True):
-    """
-    Return the loaded meta file.
-    Load the meta image, which is the mean of the images in the dataset.
-    The mean image is subtracted from every input image so that the expected mean
-    of each input image is zero.
-    """
-    mean = np.load(meta_path)['data_mean']
-    border = (mean_img_size - crop_size) / 2
-    if color:
-        assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
-        mean = mean.reshape(3, mean_img_size, mean_img_size)
-        mean = mean[:, border:border + crop_size, border:border +
-                    crop_size].astype('float32')
-    else:
-        assert (mean_img_size * mean_img_size == mean.shape[0])
-        mean = mean.reshape(mean_img_size, mean_img_size)
-        mean = mean[border:border + crop_size, border:border +
-                    crop_size].astype('float32')
-    return mean
-def load_image(img_path, is_color=True):
-    """
-    Load image and return. 
-    img_path: image path.
-    is_color: is color image or not.
-    """
-    img = Image.open(img_path)
-    img.load()
-    return img
-def oversample(img, crop_dims):
-    """
-    image : iterable of (H x W x K) ndarrays
-    crop_dims: (height, width) tuple for the crops.
-    Returned data contains ten crops of input image, namely,
-    four corner patches and the center patch as well as their
-    horizontal reflections.
-    """
-    # Dimensions and center.
-    im_shape = np.array(img[0].shape)
-    crop_dims = np.array(crop_dims)
-    im_center = im_shape[:2] / 2.0
-    # Make crop coordinates
-    h_indices = (0, im_shape[0] - crop_dims[0])
-    w_indices = (0, im_shape[1] - crop_dims[1])
-    crops_ix = np.empty((5, 4), dtype=int)
-    curr = 0
-    for i in h_indices:
-        for j in w_indices:
-            crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1])
-            curr += 1
-    crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate(
-        [-crop_dims / 2.0, crop_dims / 2.0])
-    crops_ix = np.tile(crops_ix, (2, 1))
-    # Extract crops
-    crops = np.empty(
-        (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
-        dtype=np.float32)
-    ix = 0
-    for im in img:
-        for crop in crops_ix:
-            crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :]
-            ix += 1
-        crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :]  # flip for mirrors
-    return crops
-class ImageTransformer:
-    def __init__(self,
-                 transpose=None,
-                 channel_swap=None,
-                 mean=None,
-                 is_color=True):
-        self.transpose = transpose
-        self.channel_swap = None
-        self.mean = None
-        self.is_color = is_color
-    def set_transpose(self, order):
-        if self.is_color:
-            assert 3 == len(order)
-        self.transpose = order
-    def set_channel_swap(self, order):
-        if self.is_color:
-            assert 3 == len(order)
-        self.channel_swap = order
-    def set_mean(self, mean):
-        # mean value, may be one value per channel 
-        if mean.ndim == 1:
-            mean = mean[:, np.newaxis, np.newaxis]
-        else:
-            # elementwise mean
-            if self.is_color:
-                assert len(mean.shape) == 3
-        self.mean = mean
-    def transformer(self, data):
-        if self.transpose is not None:
-            data = data.transpose(self.transpose)
-        if self.channel_swap is not None:
-            data = data[self.channel_swap, :, :]
-        if self.mean is not None:
-            data -= self.mean
-        return data
--- a/demo/image_classification/predict.sh
+++ b/demo/image_classification/predict.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-model=cifar_vgg_model/pass-00299/
-image=data/cifar-out/test/airplane/seaplane_s_000978.png
-use_gpu=1
-python prediction.py $model $image $use_gpu
--- a/demo/image_classification/prediction.py
+++ b/demo/image_classification/prediction.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os, sys
-import numpy as np
-import logging
-from PIL import Image
-from optparse import OptionParser
-import paddle.utils.image_util as image_util
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import dense_vector
-from paddle.trainer.config_parser import parse_config
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
-logging.getLogger().setLevel(logging.INFO)
-class ImageClassifier():
-    def __init__(self,
-                 train_conf,
-                 use_gpu=True,
-                 model_dir=None,
-                 resize_dim=None,
-                 crop_dim=None,
-                 mean_file=None,
-                 oversample=False,
-                 is_color=True):
-        """
-        train_conf: network configure.
-        model_dir: string, directory of model.
-        resize_dim: int, resized image size.
-        crop_dim: int, crop size.
-        mean_file: string, image mean file.
-        oversample: bool, oversample means multiple crops, namely five
-                    patches (the four corner patches and the center
-                    patch) as well as their horizontal reflections,
-                    ten crops in all.
-        """
-        self.train_conf = train_conf
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-        self.resize_dim = resize_dim
-        self.crop_dims = [crop_dim, crop_dim]
-        self.oversample = oversample
-        self.is_color = is_color
-        self.transformer = image_util.ImageTransformer(is_color=is_color)
-        self.transformer.set_transpose((2, 0, 1))
-        self.mean_file = mean_file
-        mean = np.load(self.mean_file)['data_mean']
-        mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
-        self.transformer.set_mean(mean)  # mean pixel
-        gpu = 1 if use_gpu else 0
-        conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu)
-        conf = parse_config(train_conf, conf_args)
-        swig_paddle.initPaddle("--use_gpu=%d" % (gpu))
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        assert isinstance(self.network, swig_paddle.GradientMachine)
-        self.network.loadParameters(self.model_dir)
-        data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
-        slots = [dense_vector(data_size)]
-        self.converter = DataProviderConverter(slots)
-    def get_data(self, img_path):
-        """
-        1. load image from img_path.
-        2. resize or oversampling.
-        3. transformer data: transpose, sub mean.
-        return K x H x W ndarray.
-        img_path: image path.
-        """
-        image = image_util.load_image(img_path, self.is_color)
-        if self.oversample:
-            # image_util.resize_image: short side is self.resize_dim
-            image = image_util.resize_image(image, self.resize_dim)
-            image = np.array(image)
-            input = np.zeros(
-                (1, image.shape[0], image.shape[1], 3), dtype=np.float32)
-            input[0] = image.astype(np.float32)
-            input = image_util.oversample(input, self.crop_dims)
-        else:
-            image = image.resize(self.crop_dims, Image.ANTIALIAS)
-            input = np.zeros(
-                (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
-            input[0] = np.array(image).astype(np.float32)
-        data_in = []
-        for img in input:
-            img = self.transformer.transformer(img).flatten()
-            data_in.append([img.tolist()])
-        return data_in
-    def forward(self, input_data):
-        in_arg = self.converter(input_data)
-        return self.network.forwardTest(in_arg)
-    def forward(self, data, output_layer):
-        """
-        input_data: py_paddle input data.
-        output_layer: specify the name of probability, namely the layer with
-                      softmax activation.
-        return: the predicting probability of each label.
-        """
-        input = self.converter(data)
-        self.network.forwardTest(input)
-        output = self.network.getLayerOutputs(output_layer)
-        # For oversampling, average predictions across crops.
-        # If not, the shape of output[name]: (1, class_number),
-        # the mean is also applicable.
-        return output[output_layer]['value'].mean(0)
-    def predict(self, image=None, output_layer=None):
-        assert isinstance(image, basestring)
-        assert isinstance(output_layer, basestring)
-        data = self.get_data(image)
-        prob = self.forward(data, output_layer)
-        lab = np.argsort(-prob)
-        logging.info("Label of %s is: %d", image, lab[0])
-if __name__ == '__main__':
-    image_size = 32
-    crop_size = 32
-    multi_crop = True
-    config = "vgg_16_cifar.py"
-    output_layer = "__fc_layer_1__"
-    mean_path = "data/cifar-out/batches/batches.meta"
-    model_path = sys.argv[1]
-    image = sys.argv[2]
-    use_gpu = bool(int(sys.argv[3]))
-    obj = ImageClassifier(
-        train_conf=config,
-        model_dir=model_path,
-        resize_dim=image_size,
-        crop_dim=crop_size,
-        mean_file=mean_path,
-        use_gpu=use_gpu,
-        oversample=multi_crop)
-    obj.predict(image, output_layer)
--- a/demo/image_classification/preprocess.py
+++ b/demo/image_classification/preprocess.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.utils.preprocess_img import ImageClassificationDatasetCreater
-from optparse import OptionParser
-def option_parser():
-    parser = OptionParser(usage="usage: python preprcoess.py "\
-                          "-i data_dir [options]")
-    parser.add_option(
-        "-i",
-        "--input",
-        action="store",
-        dest="input",
-        help="Input data directory.")
-    parser.add_option(
-        "-s",
-        "--size",
-        action="store",
-        dest="size",
-        help="Processed image size.")
-    parser.add_option(
-        "-c",
-        "--color",
-        action="store",
-        dest="color",
-        help="whether to use color images.")
-    return parser.parse_args()
-if __name__ == '__main__':
-    options, args = option_parser()
-    data_dir = options.input
-    processed_image_size = int(options.size)
-    color = options.color == "1"
-    data_creator = ImageClassificationDatasetCreater(
-        data_dir, processed_image_size, color)
-    data_creator.train_list_name = "train.txt"
-    data_creator.test_list_name = "test.txt"
-    data_creator.num_per_batch = 1000
-    data_creator.overwrite = True
-    data_creator.create_batches()
--- a/demo/image_classification/preprocess.sh
+++ b/demo/image_classification/preprocess.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-data_dir=./data/cifar-out
-python preprocess.py -i $data_dir -s 32 -c 1
-echo "data/cifar-out/batches/train.txt" > train.list
-echo "data/cifar-out/batches/test.txt" > test.list
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-config=vgg_16_cifar.py
-output=./cifar_vgg_model
-log=train.log
-paddle train \
--config=$config \
--dot_period=10 \
--log_period=100 \
--test_all_data_in_one_period=1 \
--use_gpu=1 \
--trainer_count=1 \
--num_passes=300 \
--save_dir=$output \
-2>&1 | tee $log
-paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1
-python -m paddle.utils.plotcurve -i $log > plot.png
--- a/demo/image_classification/vgg_16_cifar.py
+++ b/demo/image_classification/vgg_16_cifar.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-is_predict = get_config_arg("is_predict", bool, False)
-####################Data Configuration ##################
-if not is_predict:
-    data_dir = 'data/cifar-out/batches/'
-    meta_path = data_dir + 'batches.meta'
-    args = {
-        'meta': meta_path,
-        'mean_img_size': 32,
-        'img_size': 32,
-        'num_classes': 10,
-        'use_jpeg': 1,
-        'color': "color"
-    }
-    define_py_data_sources2(
-        train_list="train.list",
-        test_list="train.list",
-        module='image_provider',
-        obj='processData',
-        args=args)
-######################Algorithm Configuration #############
-settings(
-    batch_size=128,
-    learning_rate=0.1 / 128.0,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * 128))
-#######################Network Configuration #############
-data_size = 3 * 32 * 32
-label_size = 10
-img = data_layer(name='image', size=data_size)
-# small_vgg is predefined in trainer_config_helpers.networks
-predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size)
-if not is_predict:
-    lbl = data_layer(name="label", size=label_size)
-    outputs(classification_cost(input=predict, label=lbl))
-else:
-    outputs(predict)
--- a/demo/introduction/.gitignore
+++ b/demo/introduction/.gitignore
-dataprovider.pyc
-empty.list
-train.log
-output
-train.list
--- a/demo/introduction/README.md
+++ b/demo/introduction/README.md
-This folder contains scripts used in PaddlePaddle introduction.
- use `bash train.sh` to train a simple linear regression model
- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3].
--- a/demo/introduction/dataprovider.py
+++ b/demo/introduction/dataprovider.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-import random
-# define data types of input: 2 real numbers
-@provider(
-    input_types={'x': dense_vector(1),
-                 'y': dense_vector(1)}, use_seq=False)
-def process(settings, input_file):
-    for i in xrange(2000):
-        x = random.random()
-        yield {'x': [x], 'y': [2 * x + 0.3]}
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-paddle train \
-    --config=trainer_config.py \
-    --save_dir=./output \
-    --num_passes=30 \
-    2>&1 |tee 'train.log'
-paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-# 1. read data. Suppose you saved above python code as dataprovider.py
-define_py_data_sources2(
-    train_list=['no_matter.txt'],
-    test_list=None,
-    module='dataprovider',
-    obj='process',
-    args={})
-# 2. learning algorithm
-settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-# 3. Network configuration
-x = data_layer(name='x', size=1)
-y = data_layer(name='y', size=1)
-y_predict = fc_layer(
-    input=x,
-    param_attr=ParamAttr(name='w'),
-    size=1,
-    act=LinearActivation(),
-    bias_attr=ParamAttr(name='b'))
-cost = mse_cost(input=y_predict, label=y)
-outputs(cost)
--- a/demo/recommendation/.gitignore
+++ b/demo/recommendation/.gitignore
-log.txt
-data/meta.bin
-data/ml-1m
-data/ratings.dat.train
-data/ratings.dat.test
-data/train.list
-data/test.list
-dataprovider_copy_1.py
-*.pyc
-output
--- a/demo/recommendation/api_train_v2.py
+++ b/demo/recommendation/api_train_v2.py
-import paddle.v2 as paddle
-import cPickle
-import copy
-def main():
-    paddle.init(use_gpu=False)
-    movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()
-    uid = paddle.layer.data(
-        name='user_id',
-        type=paddle.data_type.integer_value(
-            paddle.dataset.movielens.max_user_id() + 1))
-    usr_emb = paddle.layer.embedding(input=uid, size=32)
-    usr_gender_id = paddle.layer.data(
-        name='gender_id', type=paddle.data_type.integer_value(2))
-    usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)
-    usr_age_id = paddle.layer.data(
-        name='age_id',
-        type=paddle.data_type.integer_value(
-            len(paddle.dataset.movielens.age_table)))
-    usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)
-    usr_job_id = paddle.layer.data(
-        name='job_id',
-        type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id(
-        ) + 1))
-    usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)
-    usr_combined_features = paddle.layer.fc(
-        input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],
-        size=200,
-        act=paddle.activation.Tanh())
-    mov_id = paddle.layer.data(
-        name='movie_id',
-        type=paddle.data_type.integer_value(
-            paddle.dataset.movielens.max_movie_id() + 1))
-    mov_emb = paddle.layer.embedding(input=mov_id, size=32)
-    mov_categories = paddle.layer.data(
-        name='category_id',
-        type=paddle.data_type.sparse_binary_vector(
-            len(paddle.dataset.movielens.movie_categories())))
-    mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)
-    mov_title_id = paddle.layer.data(
-        name='movie_title',
-        type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))
-    mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)
-    mov_title_conv = paddle.networks.sequence_conv_pool(
-        input=mov_title_emb, hidden_size=32, context_len=3)
-    mov_combined_features = paddle.layer.fc(
-        input=[mov_emb, mov_categories_hidden, mov_title_conv],
-        size=200,
-        act=paddle.activation.Tanh())
-    inference = paddle.layer.cos_sim(
-        a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
-    cost = paddle.layer.mse_cost(
-        input=inference,
-        label=paddle.layer.data(
-            name='score', type=paddle.data_type.dense_vector(1)))
-    parameters = paddle.parameters.create(cost)
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=paddle.optimizer.Adam(
-                                     learning_rate=1e-4))
-    feeding = {
-        'user_id': 0,
-        'gender_id': 1,
-        'age_id': 2,
-        'job_id': 3,
-        'movie_id': 4,
-        'category_id': 5,
-        'movie_title': 6,
-        'score': 7
-    }
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d Batch %d Cost %.2f" % (
-                    event.pass_id, event.batch_id, event.cost)
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.movielens.train(), buf_size=8192),
-            batch_size=256),
-        event_handler=event_handler,
-        feeding=feeding,
-        num_passes=1)
-    user_id = 234
-    movie_id = 345
-    user = paddle.dataset.movielens.user_info()[user_id]
-    movie = paddle.dataset.movielens.movie_info()[movie_id]
-    feature = user.value() + movie.value()
-    def reader():
-        yield feature
-    infer_dict = copy.copy(feeding)
-    del infer_dict['score']
-    prediction = paddle.infer(
-        output=inference,
-        parameters=parameters,
-        reader=paddle.batch(
-            reader, batch_size=32),
-        feeding=infer_dict)
-    print(prediction + 5) / 2
-if __name__ == '__main__':
-    main()
--- a/demo/recommendation/common_utils.py
+++ b/demo/recommendation/common_utils.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-def meta_to_header(meta, name):
-    metas = meta[name]['__meta__']['raw_meta']
-    for each_meta in metas:
-        slot_name = each_meta.get('name', '%s_id' % name)
-        if each_meta['type'] == 'id':
-            yield slot_name, integer_value(each_meta['max'])
-        elif each_meta['type'] == 'embedding':
-            is_seq = each_meta['seq'] == 'sequence'
-            yield slot_name, integer_value(
-                len(each_meta['dict']),
-                seq_type=SequenceType.SEQUENCE
-                if is_seq else SequenceType.NO_SEQUENCE)
-        elif each_meta['type'] == 'one_hot_dense':
-            yield slot_name, dense_vector(len(each_meta['dict']))
--- a/demo/recommendation/data/config.json
+++ b/demo/recommendation/data/config.json
-{
-  "user": {
-    "file": {
-      "name": "users.dat",
-      "delimiter": "::"
-    },
-    "fields": ["id", "gender", "age", "occupation"]
-  },
-  "movie": {
-    "file": {
-      "name": "movies.dat",
-      "delimiter": "::"
-    },
-    "fields": ["id", "title", "genres"]
-  }
-}
--- a/demo/recommendation/data/config_generator.py
+++ b/demo/recommendation/data/config_generator.py
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-config_generator.py
-Usage:
-    ./config_generator.py <config_file> [--output_format=<output_format>]
-    ./config_generator.py -h | --help
-Options:
-    -h --help                           Show this screen.
-    --output_format=<output_format>     Output Config format(json or yaml) [default: json].
-"""
-import json
-import docopt
-import copy
-DEFAULT_FILE = {"type": "split", "delimiter": ","}
-DEFAULT_FIELD = {
-    "id": {
-        "type": "id"
-    },
-    "gender": {
-        "name": "gender",
-        "type": "embedding",
-        "dict": {
-            "type": "char_based"
-        }
-    },
-    "age": {
-        "name": "age",
-        "type": "embedding",
-        "dict": {
-            "type": "whole_content",
-            "sort": True
-        }
-    },
-    "occupation": {
-        "name": "occupation",
-        "type": "embedding",
-        "dict": {
-            "type": "whole_content",
-            "sort": "true"
-        }
-    },
-    "title": {
-        "regex": {
-            "pattern": r"^(.*)\((\d+)\)$",
-            "group_id": 1,
-            "strip": True
-        },
-        "name": "title",
-        "type": {
-            "name": "embedding",
-            "seq_type": "sequence",
-        },
-        "dict": {
-            "type": "char_based"
-        }
-    },
-    "genres": {
-        "type": "one_hot_dense",
-        "dict": {
-            "type": "split",
-            "delimiter": "|"
-        },
-        "name": "genres"
-    }
-}
-def merge_dict(master_dict, slave_dict):
-    return dict(((k, master_dict.get(k) or slave_dict.get(k))
-                 for k in set(slave_dict) | set(master_dict)))
-def main(filename, fmt):
-    with open(filename, 'r') as f:
-        conf = json.load(f)
-        obj = dict()
-        for k in conf:
-            val = conf[k]
-            file_dict = val['file']
-            file_dict = merge_dict(file_dict, DEFAULT_FILE)
-            fields = []
-            for pos, field_key in enumerate(val['fields']):
-                assert isinstance(field_key, basestring)
-                field = copy.deepcopy(DEFAULT_FIELD[field_key])
-                field['pos'] = pos
-                fields.append(field)
-            obj[k] = {"file": file_dict, "fields": fields}
-    meta = {"meta": obj}
-    # print meta
-    if fmt == 'json':
-        def formatter(x):
-            import json
-            return json.dumps(x, indent=2)
-    elif fmt == 'yaml':
-        def formatter(x):
-            import yaml
-            return yaml.safe_dump(x, default_flow_style=False)
-    else:
-        raise NotImplementedError("Dump format %s is not implemented" % fmt)
-    print formatter(meta)
-if __name__ == '__main__':
-    args = docopt.docopt(__doc__, version="0.1.0")
-    main(args["<config_file>"], args["--output_format"])
--- a/demo/recommendation/data/meta_config.json
+++ b/demo/recommendation/data/meta_config.json
-{
-  "meta": {
-    "movie": {
-      "fields": [
-        {
-          "type": "id", 
-          "pos": 0
-        }, 
-        {
-          "regex": {
-            "pattern": "^(.*)\\((\\d+)\\)$", 
-            "group_id": 1, 
-            "strip": true
-          }, 
-          "type": {
-            "seq_type": "sequence", 
-            "name": "embedding"
-          }, 
-          "dict": {
-            "type": "char_based"
-          }, 
-          "name": "title", 
-          "pos": 1
-        }, 
-        {
-          "type": "one_hot_dense", 
-          "dict": {
-            "delimiter": "|", 
-            "type": "split"
-          }, 
-          "name": "genres", 
-          "pos": 2
-        }
-      ], 
-      "file": {
-        "delimiter": "::", 
-        "type": "split", 
-        "name": "movies.dat"
-      }
-    }, 
-    "user": {
-      "fields": [
-        {
-          "type": "id", 
-          "pos": 0
-        }, 
-        {
-          "type": "embedding", 
-          "dict": {
-            "type": "char_based"
-          }, 
-          "name": "gender", 
-          "pos": 1
-        }, 
-        {
-          "type": "embedding", 
-          "dict": {
-            "sort": true, 
-            "type": "whole_content"
-          }, 
-          "name": "age", 
-          "pos": 2
-        }, 
-        {
-          "type": "embedding", 
-          "dict": {
-            "sort": "true", 
-            "type": "whole_content"
-          }, 
-          "name": "occupation", 
-          "pos": 3
-        }
-      ], 
-      "file": {
-        "delimiter": "::", 
-        "type": "split", 
-        "name": "users.dat"
-      }
-    }
-  }
-}
--- a/demo/recommendation/data/meta_generator.py
+++ b/demo/recommendation/data/meta_generator.py
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Preprocess Movielens dataset, to get movie/user object.
-Usage:
-    ./preprocess.py <dataset_dir> <binary_filename> [--config=<config_file>]
-    ./preprocess.py -h | --help
-Options:
-    -h --help               Show this screen.
-    --version               Show version.
-    --config=<config_file>  Get MetaData config file [default: config.json].
-"""
-import docopt
-import os
-import sys
-import re
-import collections
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-class UniqueIDGenerator(object):
-    def __init__(self):
-        self.pool = collections.defaultdict(self.__next_id__)
-        self.next_id = 0
-    def __next_id__(self):
-        tmp = self.next_id
-        self.next_id += 1
-        return tmp
-    def __call__(self, k):
-        return self.pool[k]
-    def to_list(self):
-        ret_val = [None] * len(self.pool)
-        for k in self.pool.keys():
-            ret_val[self.pool[k]] = k
-        return ret_val
-class SortedIDGenerator(object):
-    def __init__(self):
-        self.__key_set__ = set()
-        self.dict = None
-    def scan(self, key):
-        self.__key_set__.add(key)
-    def finish_scan(self, compare=None, key=None, reverse=False):
-        self.__key_set__ = sorted(
-            list(self.__key_set__), cmp=compare, key=key, reverse=reverse)
-        self.dict = dict()
-        for idx, each_key in enumerate(self.__key_set__):
-            self.dict[each_key] = idx
-    def __call__(self, key):
-        return self.dict[key]
-    def to_list(self):
-        return self.__key_set__
-class SplitFileReader(object):
-    def __init__(self, work_dir, config):
-        assert isinstance(config, dict)
-        self.filename = config['name']
-        self.delimiter = config.get('delimiter', ',')
-        self.work_dir = work_dir
-    def read(self):
-        with open(os.path.join(self.work_dir, self.filename), 'r') as f:
-            for line in f:
-                line = line.strip()
-                if isinstance(self.delimiter, unicode):
-                    self.delimiter = str(self.delimiter)
-                yield line.split(self.delimiter)
-    @staticmethod
-    def create(work_dir, config):
-        assert isinstance(config, dict)
-        if config['type'] == 'split':
-            return SplitFileReader(work_dir, config)
-class IFileReader(object):
-    READERS = [SplitFileReader]
-    def read(self):
-        raise NotImplementedError()
-    @staticmethod
-    def create(work_dir, config):
-        for reader_cls in IFileReader.READERS:
-            val = reader_cls.create(work_dir, config)
-            if val is not None:
-                return val
-class IDFieldParser(object):
-    TYPE = 'id'
-    def __init__(self, config):
-        self.__max_id__ = -sys.maxint - 1
-        self.__min_id__ = sys.maxint
-        self.__id_count__ = 0
-    def scan(self, line):
-        idx = int(line)
-        self.__max_id__ = max(self.__max_id__, idx)
-        self.__min_id__ = min(self.__min_id__, idx)
-        self.__id_count__ += 1
-    def parse(self, line):
-        return int(line)
-    def meta_field(self):
-        return {
-            "is_key": True,
-            'max': self.__max_id__,
-            'min': self.__min_id__,
-            'count': self.__id_count__,
-            'type': 'id'
-        }
-class SplitEmbeddingDict(object):
-    def __init__(self, delimiter):
-        self.__id__ = UniqueIDGenerator()
-        self.delimiter = delimiter
-    def scan(self, multi):
-        for val in multi.split(self.delimiter):
-            self.__id__(val)
-    def parse(self, multi):
-        return map(self.__id__, multi.split(self.delimiter))
-    def meta_field(self):
-        return self.__id__.to_list()
-class EmbeddingFieldParser(object):
-    TYPE = 'embedding'
-    NO_SEQUENCE = "no_sequence"
-    SEQUENCE = "sequence"
-    class CharBasedEmbeddingDict(object):
-        def __init__(self, is_seq=True):
-            self.__id__ = UniqueIDGenerator()
-            self.is_seq = is_seq
-        def scan(self, s):
-            for ch in s:
-                self.__id__(ch)
-        def parse(self, s):
-            return map(self.__id__, s) if self.is_seq else self.__id__(s[0])
-        def meta_field(self):
-            return self.__id__.to_list()
-    class WholeContentDict(object):
-        def __init__(self, need_sort=True):
-            assert need_sort
-            self.__id__ = SortedIDGenerator()
-            self.__has_finished__ = False
-        def scan(self, txt):
-            self.__id__.scan(txt)
-        def meta_field(self):
-            if not self.__has_finished__:
-                self.__id__.finish_scan()
-                self.__has_finished__ = True
-            return self.__id__.to_list()
-        def parse(self, txt):
-            return self.__id__(txt)
-    def __init__(self, config):
-        try:
-            self.seq_type = config['type']['seq_type']
-        except TypeError:
-            self.seq_type = EmbeddingFieldParser.NO_SEQUENCE
-        if config['dict']['type'] == 'char_based':
-            self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict(
-                self.seq_type == EmbeddingFieldParser.SEQUENCE)
-        elif config['dict']['type'] == 'split':
-            self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ','))
-        elif config['dict']['type'] == 'whole_content':
-            self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][
-                'sort'])
-        else:
-            print config
-            assert False
-        self.name = config['name']
-    def scan(self, s):
-        self.dict.scan(s)
-    def meta_field(self):
-        return {
-            'name': self.name,
-            'dict': self.dict.meta_field(),
-            'type': 'embedding',
-            'seq': self.seq_type
-        }
-    def parse(self, s):
-        return self.dict.parse(s)
-class OneHotDenseFieldParser(object):
-    TYPE = 'one_hot_dense'
-    def __init__(self, config):
-        if config['dict']['type'] == 'split':
-            self.dict = SplitEmbeddingDict(config['dict']['delimiter'])
-        self.name = config['name']
-    def scan(self, s):
-        self.dict.scan(s)
-    def meta_field(self):
-        # print self.dict.meta_field()
-        return {
-            'dict': self.dict.meta_field(),
-            'name': self.name,
-            'type': 'one_hot_dense'
-        }
-    def parse(self, s):
-        ids = self.dict.parse(s)
-        retv = [0.0] * len(self.dict.meta_field())
-        for idx in ids:
-            retv[idx] = 1.0
-        # print retv
-        return retv
-class FieldParserFactory(object):
-    PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser]
-    @staticmethod
-    def create(config):
-        if isinstance(config['type'], basestring):
-            config_type = config['type']
-        elif isinstance(config['type'], dict):
-            config_type = config['type']['name']
-        assert config_type is not None
-        for each_parser_cls in FieldParserFactory.PARSERS:
-            if config_type == each_parser_cls.TYPE:
-                return each_parser_cls(config)
-        print config
-class CompositeFieldParser(object):
-    def __init__(self, parser, extractor):
-        self.extractor = extractor
-        self.parser = parser
-    def scan(self, *args, **kwargs):
-        self.parser.scan(self.extractor.extract(*args, **kwargs))
-    def parse(self, *args, **kwargs):
-        return self.parser.parse(self.extractor.extract(*args, **kwargs))
-    def meta_field(self):
-        return self.parser.meta_field()
-class PositionContentExtractor(object):
-    def __init__(self, pos):
-        self.pos = pos
-    def extract(self, line):
-        assert isinstance(line, list)
-        return line[self.pos]
-class RegexPositionContentExtractor(PositionContentExtractor):
-    def __init__(self, pos, pattern, group_id, strip=True):
-        PositionContentExtractor.__init__(self, pos)
-        pattern = pattern.strip()
-        self.pattern = re.compile(pattern)
-        self.group_id = group_id
-        self.strip = strip
-    def extract(self, line):
-        line = PositionContentExtractor.extract(self, line)
-        match = self.pattern.match(line)
-        # print line, self.pattern.pattern, match
-        assert match is not None
-        txt = match.group(self.group_id)
-        if self.strip:
-            txt.strip()
-        return txt
-class ContentExtractorFactory(object):
-    def extract(self, line):
-        pass
-    @staticmethod
-    def create(config):
-        if 'pos' in config:
-            if 'regex' not in config:
-                return PositionContentExtractor(config['pos'])
-            else:
-                extra_args = config['regex']
-                return RegexPositionContentExtractor(
-                    pos=config['pos'], **extra_args)
-class MetaFile(object):
-    def __init__(self, work_dir):
-        self.work_dir = work_dir
-        self.obj = dict()
-    def parse(self, config):
-        config = config['meta']
-        ret_obj = dict()
-        for key in config.keys():
-            val = config[key]
-            assert 'file' in val
-            reader = IFileReader.create(self.work_dir, val['file'])
-            assert reader is not None
-            assert 'fields' in val and isinstance(val['fields'], list)
-            fields_config = val['fields']
-            field_parsers = map(MetaFile.__field_config_mapper__, fields_config)
-            for each_parser in field_parsers:
-                assert each_parser is not None
-            for each_block in reader.read():
-                for each_parser in field_parsers:
-                    each_parser.scan(each_block)
-            metas = map(lambda x: x.meta_field(), field_parsers)
-            # print metas
-            key_index = filter(
-                lambda x: x is not None,
-                map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None,
-                    enumerate(metas)))[0]
-            key_map = []
-            for i in range(min(key_index, len(metas))):
-                key_map.append(i)
-            for i in range(key_index + 1, len(metas)):
-                key_map.append(i)
-            obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}}
-            for each_block in reader.read():
-                idx = field_parsers[key_index].parse(each_block)
-                val = []
-                for i, each_parser in enumerate(field_parsers):
-                    if i != key_index:
-                        val.append(each_parser.parse(each_block))
-                obj[idx] = val
-            ret_obj[key] = obj
-        self.obj = ret_obj
-        return ret_obj
-    @staticmethod
-    def __field_config_mapper__(conf):
-        assert isinstance(conf, dict)
-        extrator = ContentExtractorFactory.create(conf)
-        field_parser = FieldParserFactory.create(conf)
-        assert extrator is not None
-        assert field_parser is not None
-        return CompositeFieldParser(field_parser, extrator)
-    def dump(self, fp):
-        pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL)
-def preprocess(binary_filename, dataset_dir, config, **kwargs):
-    assert isinstance(config, str)
-    with open(config, 'r') as config_file:
-        file_loader = None
-        if config.lower().endswith('.yaml'):
-            import yaml
-            file_loader = yaml
-        elif config.lower().endswith('.json'):
-            import json
-            file_loader = json
-        config = file_loader.load(config_file)
-    meta = MetaFile(dataset_dir)
-    meta.parse(config)
-    with open(binary_filename, 'wb') as outf:
-        meta.dump(outf)
-if __name__ == '__main__':
-    args = docopt.docopt(__doc__, version='0.1.0')
-    kwargs = dict()
-    for key in args.keys():
-        if key != '--help':
-            param_name = key
-            assert isinstance(param_name, str)
-            param_name = param_name.replace('<', '')
-            param_name = param_name.replace('>', '')
-            param_name = param_name.replace('--', '')
-            kwargs[param_name] = args[key]
-    preprocess(**kwargs)
--- a/demo/recommendation/data/ml_data.sh
+++ b/demo/recommendation/data/ml_data.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -ex
-cd "$(dirname "$0")"
-# download the dataset
-wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
-# unzip the dataset
-unzip ml-1m.zip
-# remove the unused zip file
-rm ml-1m.zip
--- a/demo/recommendation/data/split.py
+++ b/demo/recommendation/data/split.py
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Separate movielens 1m dataset to train/test file.
-Usage:
-    ./separate.py <input_file> [--test_ratio=<test_ratio>] [--delimiter=<delimiter>]
-    ./separate.py -h | --help
-Options:
-    -h --help                       Show this screen.
-    --version                       Show version.
-    --test_ratio=<test_ratio>       Test ratio for separate [default: 0.1].
-    --delimiter=<delimiter>         File delimiter [default: ,].
-"""
-import docopt
-import collections
-import random
-def process(test_ratio, input_file, delimiter, **kwargs):
-    test_ratio = float(test_ratio)
-    rating_dict = collections.defaultdict(list)
-    with open(input_file, 'r') as f:
-        for line in f:
-            user_id = int(line.split(delimiter)[0])
-            rating_dict[user_id].append(line.strip())
-    with open(input_file + ".train", 'w') as train_file:
-        with open(input_file + ".test", 'w') as test_file:
-            for k in rating_dict.keys():
-                lines = rating_dict[k]
-                assert isinstance(lines, list)
-                random.shuffle(lines)
-                test_len = int(len(lines) * test_ratio)
-                for line in lines[:test_len]:
-                    print >> test_file, line
-                for line in lines[test_len:]:
-                    print >> train_file, line
-if __name__ == '__main__':
-    args = docopt.docopt(__doc__, version='0.1.0')
-    kwargs = dict()
-    for key in args.keys():
-        if key != '--help':
-            param_name = key
-            assert isinstance(param_name, str)
-            param_name = param_name.replace('<', '')
-            param_name = param_name.replace('>', '')
-            param_name = param_name.replace('--', '')
-            kwargs[param_name] = args[key]
-    process(**kwargs)
--- a/demo/recommendation/dataprovider.py
+++ b/demo/recommendation/dataprovider.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-import common_utils  # parse
-def __list_to_map__(lst):
-    ret_val = dict()
-    for each in lst:
-        k, v = each
-        ret_val[k] = v
-    return ret_val
-def hook(settings, meta, **kwargs):
-    """
-    Init hook is invoked before process data. It will set obj.slots and store
-    data meta.
-    :param obj: global object. It will passed to process routine.
-    :type obj: object
-    :param meta: the meta file object, which passed from trainer_config. Meta
-                 file record movie/user features.
-    :param kwargs: unused other arguments.
-    """
-    del kwargs  # unused kwargs
-    # Header define slots that used for paddle.
-    #    first part is movie features.
-    #    second part is user features.
-    #    final part is rating score.
-    # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
-    settings.movie_names = [h[0] for h in movie_headers]
-    headers = movie_headers
-    user_headers = list(common_utils.meta_to_header(meta, 'user'))
-    settings.user_names = [h[0] for h in user_headers]
-    headers.extend(user_headers)
-    headers.append(("rating", dense_vector(1)))  # Score
-    # slot types.
-    settings.input_types = __list_to_map__(headers)
-    settings.meta = meta
-@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, filename):
-    with open(filename, 'r') as f:
-        for line in f:
-            # Get a rating from file.
-            user_id, movie_id, score = map(int, line.split('::')[:-1])
-            # Scale score to [-5, +5]
-            score = float(score) * 2 - 5.0
-            # Get movie/user features by movie_id, user_id
-            movie_meta = settings.meta['movie'][movie_id]
-            user_meta = settings.meta['user'][user_id]
-            outputs = [('movie_id', movie_id - 1)]
-            # Then add movie features
-            for i, each_meta in enumerate(movie_meta):
-                outputs.append((settings.movie_names[i + 1], each_meta))
-            # Then add user id.
-            outputs.append(('user_id', user_id - 1))
-            # Then add user features.
-            for i, each_meta in enumerate(user_meta):
-                outputs.append((settings.user_names[i + 1], each_meta))
-            # Finally, add score
-            outputs.append(('rating', [score]))
-            # Return data to paddle
-            yield __list_to_map__(outputs)
--- a/demo/recommendation/evaluate.py
+++ b/demo/recommendation/evaluate.py
-#!/usr/bin/python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import re
-import math
-def get_best_pass(log_filename):
-    with open(log_filename, 'r') as f:
-        text = f.read()
-        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
-                             re.S)
-        results = re.findall(pattern, text)
-        sorted_results = sorted(results, key=lambda result: float(result[0]))
-        return sorted_results[0]
-log_filename = sys.argv[1]
-log = get_best_pass(log_filename)
-predict_error = math.sqrt(float(log[0])) / 2
-print 'Best pass is %s, error is %s, which means predict get error as %f' % (
-    log[1], log[0], predict_error)
-evaluate_pass = "output/pass-%s" % log[1]
-print "evaluating from pass %s" % evaluate_pass
--- a/demo/recommendation/evaluate.sh
+++ b/demo/recommendation/evaluate.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | sort | head -n 1
-}
-LOG=`get_best_pass log.txt`
-LOG=(${LOG})
-echo 'Best pass is '${LOG[1]}, ' error is '${LOG[0]}, 'which means predict get error as '`echo ${LOG[0]} | python -c 'import math; print math.sqrt(float(raw_input()))/2'`
-evaluate_pass="output/pass-${LOG[1]}"
-echo 'evaluating from pass '$evaluate_pass
--- a/demo/recommendation/prediction.py
+++ b/demo/recommendation/prediction.py
-#!/bin/env python2
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from py_paddle import swig_paddle, DataProviderConverter
-from common_utils import *
-from paddle.trainer.config_parser import parse_config
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-import sys
-if __name__ == '__main__':
-    model_path = sys.argv[1]
-    swig_paddle.initPaddle('--use_gpu=0')
-    conf = parse_config("trainer_config.py", "is_predict=1")
-    network = swig_paddle.GradientMachine.createFromConfigProto(
-        conf.model_config)
-    assert isinstance(network, swig_paddle.GradientMachine)
-    network.loadParameters(model_path)
-    with open('./data/meta.bin', 'rb') as f:
-        meta = pickle.load(f)
-        headers = [h[1] for h in meta_to_header(meta, 'movie')]
-        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
-        cvt = DataProviderConverter(headers)
-        while True:
-            movie_id = int(raw_input("Input movie_id: "))
-            user_id = int(raw_input("Input user_id: "))
-            movie_meta = meta['movie'][movie_id]  # Query Data From Meta.
-            user_meta = meta['user'][user_id]
-            data = [movie_id - 1]
-            data.extend(movie_meta)
-            data.append(user_id - 1)
-            data.extend(user_meta)
-            print "Prediction Score is %.2f" % (
-                (network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5)
-                / 2)
--- a/demo/recommendation/preprocess.sh
+++ b/demo/recommendation/preprocess.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-UNAME_STR=`uname`
-if [[ ${UNAME_STR} == 'Linux' ]]; then
-	SHUF_PROG='shuf'
-else
-	SHUF_PROG='gshuf'
-fi
-cd "$(dirname "$0")"
-delimiter='::'
-dir=ml-1m
-cd data
-echo 'generate meta config file'
-python config_generator.py config.json > meta_config.json
-echo 'generate meta file'
-python meta_generator.py $dir meta.bin --config=meta_config.json
-echo 'split train/test file'
-python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
-echo 'shuffle train file'
-${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
-cp $dir/ratings.dat.test .
-echo "./data/ratings.dat.train" > train.list
-echo "./data/ratings.dat.test" > test.list
--- a/demo/recommendation/requirements.txt
+++ b/demo/recommendation/requirements.txt
-PyYAML
-docopt
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-paddle train \
-    --config=trainer_config.py \
-    --save_dir=./output \
-    --use_gpu=false \
-    --trainer_count=4\
-    --test_all_data_in_one_period=true \
-    --log_period=100 \
-    --dot_period=1 \
-    --num_passes=50  2>&1 | tee 'log.txt'
-paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
--- a/demo/recommendation/trainer_config.py
+++ b/demo/recommendation/trainer_config.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-is_predict = get_config_arg('is_predict', bool, False)
-META_FILE = 'data/meta.bin'
-with open(META_FILE, 'rb') as f:
-    # load meta file
-    meta = pickle.load(f)
-settings(
-    batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer())
-def construct_feature(name):
-    """
-    Construct movie/user features.
-    This method read from meta data. Then convert feature to neural network due
-    to feature type. The map relation as follow.
-    * id: embedding => fc
-    * embedding:
-        is_sequence:  embedding => context_projection => fc => pool
-        not sequence: embedding => fc
-    * one_hot_dense:  fc => fc
-    Then gather all features vector, and use a fc layer to combined them as
-    return.
-    :param name: 'movie' or 'user'
-    :type name: basestring
-    :return: combined feature output
-    :rtype: LayerOutput
-    """
-    __meta__ = meta[name]['__meta__']['raw_meta']
-    fusion = []
-    for each_meta in __meta__:
-        type_name = each_meta['type']
-        slot_name = each_meta.get('name', '%s_id' % name)
-        if type_name == 'id':
-            slot_dim = each_meta['max']
-            embedding = embedding_layer(
-                input=data_layer(
-                    slot_name, size=slot_dim), size=256)
-            fusion.append(fc_layer(input=embedding, size=256))
-        elif type_name == 'embedding':
-            is_seq = each_meta['seq'] == 'sequence'
-            slot_dim = len(each_meta['dict'])
-            din = data_layer(slot_name, slot_dim)
-            embedding = embedding_layer(input=din, size=256)
-            if is_seq:
-                fusion.append(
-                    text_conv_pool(
-                        input=embedding, context_len=5, hidden_size=256))
-            else:
-                fusion.append(fc_layer(input=embedding, size=256))
-        elif type_name == 'one_hot_dense':
-            slot_dim = len(each_meta['dict'])
-            hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256)
-            fusion.append(fc_layer(input=hidden, size=256))
-    return fc_layer(name="%s_fusion" % name, input=fusion, size=256)
-movie_feature = construct_feature("movie")
-user_feature = construct_feature("user")
-similarity = cos_sim(a=movie_feature, b=user_feature)
-if not is_predict:
-    outputs(mse_cost(input=similarity, label=data_layer('rating', size=1)))
-    define_py_data_sources2(
-        'data/train.list',
-        'data/test.list',
-        module='dataprovider',
-        obj='process',
-        args={'meta': meta})
-else:
-    outputs(similarity)
--- a/demo/semantic_role_labeling/.gitignore
+++ b/demo/semantic_role_labeling/.gitignore
-*.pyc
-train.log
-data/feature
-data/conll05st-release/
-data/src.dict
-data/test.wsj.props
-data/test.wsj.seq_pair
-data/test.wsj.words
-data/tgt.dict
-output
-data/emb
-data/targetDict.txt
-data/verbDict.txt
-data/wordDict.txt
--- a/demo/semantic_role_labeling/api_train_v2.py
+++ b/demo/semantic_role_labeling/api_train_v2.py
-import math
-import numpy as np
-import gzip
-import logging
-import paddle.v2.dataset.conll05 as conll05
-import paddle.v2.evaluator as evaluator
-import paddle.v2 as paddle
-logger = logging.getLogger('paddle')
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_len = len(verb_dict)
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-default_std = 1 / math.sqrt(hidden_dim) / 3.0
-mix_hidden_lr = 1e-3
-def d_type(size):
-    return paddle.data_type.integer_value_sequence(size)
-def db_lstm():
-    #8 features
-    word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
-    predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
-    ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
-    ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
-    ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
-    ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
-    ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
-    mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
-    emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True)
-    std_0 = paddle.attr.Param(initial_std=0.)
-    std_default = paddle.attr.Param(initial_std=default_std)
-    predicate_embedding = paddle.layer.embedding(
-        size=word_dim,
-        input=predicate,
-        param_attr=paddle.attr.Param(
-            name='vemb', initial_std=default_std))
-    mark_embedding = paddle.layer.embedding(
-        size=mark_dim, input=mark, param_attr=std_0)
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        paddle.layer.embedding(
-            size=word_dim, input=x, param_attr=emb_para) for x in word_input
-    ]
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-    hidden_0 = paddle.layer.mixed(
-        size=hidden_dim,
-        bias_attr=std_default,
-        input=[
-            paddle.layer.full_matrix_projection(
-                input=emb, param_attr=std_default) for emb in emb_layers
-        ])
-    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
-    hidden_para_attr = paddle.attr.Param(
-        initial_std=default_std, learning_rate=mix_hidden_lr)
-    lstm_0 = paddle.layer.lstmemory(
-        input=hidden_0,
-        act=paddle.activation.Relu(),
-        gate_act=paddle.activation.Sigmoid(),
-        state_act=paddle.activation.Sigmoid(),
-        bias_attr=std_0,
-        param_attr=lstm_para_attr)
-    #stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-    for i in range(1, depth):
-        mix_hidden = paddle.layer.mixed(
-            size=hidden_dim,
-            bias_attr=std_default,
-            input=[
-                paddle.layer.full_matrix_projection(
-                    input=input_tmp[0], param_attr=hidden_para_attr),
-                paddle.layer.full_matrix_projection(
-                    input=input_tmp[1], param_attr=lstm_para_attr)
-            ])
-        lstm = paddle.layer.lstmemory(
-            input=mix_hidden,
-            act=paddle.activation.Relu(),
-            gate_act=paddle.activation.Sigmoid(),
-            state_act=paddle.activation.Sigmoid(),
-            reverse=((i % 2) == 1),
-            bias_attr=std_0,
-            param_attr=lstm_para_attr)
-        input_tmp = [mix_hidden, lstm]
-    feature_out = paddle.layer.mixed(
-        size=label_dict_len,
-        bias_attr=std_default,
-        input=[
-            paddle.layer.full_matrix_projection(
-                input=input_tmp[0], param_attr=hidden_para_attr),
-            paddle.layer.full_matrix_projection(
-                input=input_tmp[1], param_attr=lstm_para_attr)
-        ], )
-    return feature_out
-def load_parameter(file_name, h, w):
-    with open(file_name, 'rb') as f:
-        f.read(16)  # skip header.
-        return np.fromfile(f, dtype=np.float32).reshape(h, w)
-def train():
-    paddle.init(use_gpu=False, trainer_count=1)
-    # define network topology
-    feature_out = db_lstm()
-    target = paddle.layer.data(name='target', type=d_type(label_dict_len))
-    crf_cost = paddle.layer.crf(size=label_dict_len,
-                                input=feature_out,
-                                label=target,
-                                param_attr=paddle.attr.Param(
-                                    name='crfw',
-                                    initial_std=default_std,
-                                    learning_rate=mix_hidden_lr))
-    crf_dec = paddle.layer.crf_decoding(
-        size=label_dict_len,
-        input=feature_out,
-        label=target,
-        param_attr=paddle.attr.Param(name='crfw'))
-    evaluator.sum(input=crf_dec)
-    # create parameters
-    parameters = paddle.parameters.create(crf_cost)
-    parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
-    # create optimizer
-    optimizer = paddle.optimizer.Momentum(
-        momentum=0,
-        learning_rate=2e-2,
-        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
-        model_average=paddle.optimizer.ModelAverage(
-            average_window=0.5, max_average_window=10000), )
-    trainer = paddle.trainer.SGD(cost=crf_cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 extra_layers=crf_dec)
-    reader = paddle.batch(
-        paddle.reader.shuffle(
-            conll05.test(), buf_size=8192), batch_size=10)
-    feeding = {
-        'word_data': 0,
-        'ctx_n2_data': 1,
-        'ctx_n1_data': 2,
-        'ctx_0_data': 3,
-        'ctx_p1_data': 4,
-        'ctx_p2_data': 5,
-        'verb_data': 6,
-        'mark_data': 7,
-        'target': 8
-    }
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                logger.info("Pass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics))
-            if event.batch_id and event.batch_id % 1000 == 0:
-                result = trainer.test(reader=reader, feeding=feeding)
-                logger.info("\nTest with Pass %d, Batch %d, %s" %
-                            (event.pass_id, event.batch_id, result.metrics))
-        if isinstance(event, paddle.event.EndPass):
-            # save parameters
-            with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
-                parameters.to_tar(f)
-            result = trainer.test(reader=reader, feeding=feeding)
-            logger.info("\nTest with Pass %d, %s" %
-                        (event.pass_id, result.metrics))
-    trainer.train(
-        reader=reader,
-        event_handler=event_handler,
-        num_passes=10,
-        feeding=feeding)
-def infer_a_batch(inferer, test_data, word_dict, pred_dict, label_dict):
-    probs = inferer.infer(input=test_data, field='id')
-    assert len(probs) == sum(len(x[0]) for x in test_data)
-    for idx, test_sample in enumerate(test_data):
-        start_id = 0
-        pred_str = "%s\t" % (pred_dict[test_sample[6][0]])
-        for w, tag in zip(test_sample[0],
-                          probs[start_id:start_id + len(test_sample[0])]):
-            pred_str += "%s[%s] " % (word_dict[w], label_dict[tag])
-        print(pred_str.strip())
-        start_id += len(test_sample[0])
-def infer():
-    label_dict_reverse = dict((value, key)
-                              for key, value in label_dict.iteritems())
-    word_dict_reverse = dict((value, key)
-                             for key, value in word_dict.iteritems())
-    pred_dict_reverse = dict((value, key)
-                             for key, value in verb_dict.iteritems())
-    test_creator = paddle.dataset.conll05.test()
-    paddle.init(use_gpu=False, trainer_count=1)
-    # define network topology
-    feature_out = db_lstm()
-    predict = paddle.layer.crf_decoding(
-        size=label_dict_len,
-        input=feature_out,
-        param_attr=paddle.attr.Param(name='crfw'))
-    test_pass = 0
-    with gzip.open('params_pass_%d.tar.gz' % (test_pass)) as f:
-        parameters = paddle.parameters.Parameters.from_tar(f)
-        inferer = paddle.inference.Inference(
-            output_layer=predict, parameters=parameters)
-        # prepare test data
-        test_data = []
-        test_batch_size = 50
-        for idx, item in enumerate(test_creator()):
-            test_data.append(item[0:8])
-            if idx and (not idx % test_batch_size):
-                infer_a_batch(
-                    inferer,
-                    test_data,
-                    word_dict_reverse,
-                    pred_dict_reverse,
-                    label_dict_reverse, )
-                test_data = []
-        infer_a_batch(
-            inferer,
-            test_data,
-            word_dict_reverse,
-            pred_dict_reverse,
-            label_dict_reverse, )
-        test_data = []
-def main(is_inferring=False):
-    if is_inferring:
-        infer()
-    else:
-        train()
-if __name__ == '__main__':
-    main(is_inferring=False)
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import os
-from optparse import OptionParser
-def extract_dict_features(pair_file, feature_file):
-    with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
-        for line in fin:
-            sentence, predicate, labels = line.strip().split('\t')
-            sentence_list = sentence.split()
-            labels_list = labels.split()
-            verb_index = labels_list.index('B-V')
-            mark = [0] * len(labels_list)
-            if verb_index > 0:
-                mark[verb_index - 1] = 1
-                ctx_n1 = sentence_list[verb_index - 1]
-            else:
-                ctx_n1 = 'bos'
-            if verb_index > 1:
-                mark[verb_index - 2] = 1
-                ctx_n2 = sentence_list[verb_index - 2]
-            else:
-                ctx_n2 = 'bos'
-            mark[verb_index] = 1
-            ctx_0 = sentence_list[verb_index]
-            if verb_index < len(labels_list) - 1:
-                mark[verb_index + 1] = 1
-                ctx_p1 = sentence_list[verb_index + 1]
-            else:
-                ctx_p1 = 'eos'
-            if verb_index < len(labels_list) - 2:
-                mark[verb_index + 2] = 1
-                ctx_p2 = sentence_list[verb_index + 2]
-            else:
-                ctx_p2 = 'eos'
-            feature_str  = sentence + '\t' \
-                           + predicate + '\t' \
-                           + ctx_n2 + '\t' \
-                           + ctx_n1 + '\t' \
-                           + ctx_0 + '\t' \
-                           + ctx_p1 + '\t' \
-                           + ctx_p2 + '\t' \
-                           + ' '.join([str(i) for i in mark]) + '\t' \
-                           + labels
-            feature_out.write(feature_str + '\n')
-if __name__ == '__main__':
-    usage = '-p pair_file -f feature_file'
-    parser = OptionParser(usage)
-    parser.add_option('-p', dest='pair_file', help='the pair file')
-    parser.add_option('-f', dest='feature_file', help='the feature file')
-    (options, args) = parser.parse_args()
-    extract_dict_features(options.pair_file, options.feature_file)
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import os
-from optparse import OptionParser
-def read_labels(props_file):
-    '''
-    a sentence maybe has more than one verb, each verb has its label sequence
-    label[],  is a 3-dimension list. 
-    the first dim is to store all sentence's label seqs, len is the sentence number
-    the second dim is to store all label sequences for one sentences
-    the third dim is to store each label for one word
-    '''
-    labels = []
-    with open(props_file) as fin:
-        label_seqs_for_one_sentences = []
-        one_seg_in_file = []
-        for line in fin:
-            line = line.strip()
-            if line == '':
-                for i in xrange(len(one_seg_in_file[0])):
-                    a_kind_lable = [x[i] for x in one_seg_in_file]
-                    label_seqs_for_one_sentences.append(a_kind_lable)
-                labels.append(label_seqs_for_one_sentences)
-                one_seg_in_file = []
-                label_seqs_for_one_sentences = []
-            else:
-                part = line.split()
-                one_seg_in_file.append(part)
-    return labels
-def read_sentences(words_file):
-    sentences = []
-    with open(words_file) as fin:
-        s = ''
-        for line in fin:
-            line = line.strip()
-            if line == '':
-                sentences.append(s)
-                s = ''
-            else:
-                s += line + ' '
-    return sentences
-def transform_labels(sentences, labels):
-    sen_lab_pair = []
-    for i in xrange(len(sentences)):
-        if len(labels[i]) == 1:
-            continue
-        else:
-            verb_list = []
-            for x in labels[i][0]:
-                if x != '-':
-                    verb_list.append(x)
-            for j in xrange(1, len(labels[i])):
-                label_list = labels[i][j]
-                current_tag = 'O'
-                is_in_bracket = False
-                label_seq = []
-                verb_word = ''
-                for ll in label_list:
-                    if ll == '*' and is_in_bracket == False:
-                        label_seq.append('O')
-                    elif ll == '*' and is_in_bracket == True:
-                        label_seq.append('I-' + current_tag)
-                    elif ll == '*)':
-                        label_seq.append('I-' + current_tag)
-                        is_in_bracket = False
-                    elif ll.find('(') != -1 and ll.find(')') != -1:
-                        current_tag = ll[1:ll.find('*')]
-                        label_seq.append('B-' + current_tag)
-                        is_in_bracket = False
-                    elif ll.find('(') != -1 and ll.find(')') == -1:
-                        current_tag = ll[1:ll.find('*')]
-                        label_seq.append('B-' + current_tag)
-                        is_in_bracket = True
-                    else:
-                        print 'error:', ll
-                sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq))
-    return sen_lab_pair
-def write_file(sen_lab_pair, output_file):
-    with open(output_file, 'w') as fout:
-        for x in sen_lab_pair:
-            sentence = x[0]
-            label_seq = ' '.join(x[2])
-            assert len(sentence.split()) == len(x[2])
-            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')
-if __name__ == '__main__':
-    usage = '-w words_file -p props_file -o output_file'
-    parser = OptionParser(usage)
-    parser.add_option('-w', dest='words_file', help='the words file')
-    parser.add_option('-p', dest='props_file', help='the props file')
-    parser.add_option('-o', dest='output_file', help='the output_file')
-    (options, args) = parser.parse_args()
-    sentences = read_sentences(options.words_file)
-    labels = read_labels(options.props_file)
-    sen_lab_pair = transform_labels(sentences, labels)
-    write_file(sen_lab_pair, options.output_file)
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt 
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt 
-wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
-tar -xzvf conll05st-tests.tar.gz
-rm conll05st-tests.tar.gz
-cp ./conll05st-release/test.wsj/words/test.wsj.words.gz  .
-cp ./conll05st-release/test.wsj/props/test.wsj.props.gz  . 
-gunzip test.wsj.words.gz
-gunzip test.wsj.props.gz
-python extract_pairs.py  -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
-python extract_dict_feature.py -p test.wsj.seq_pair -f feature 
--- a/demo/semantic_role_labeling/data/test.list
+++ b/demo/semantic_role_labeling/data/test.list
-./data/feature
--- a/demo/semantic_role_labeling/data/train.list
+++ b/demo/semantic_role_labeling/data/train.list
-./data/feature
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-UNK_IDX = 0
-def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    settings.predicate_dict = predicate_dict
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)), integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))
-    ]
-def get_batch_size(yeild_data):
-    return len(yeild_data[0])
-@provider(
-    init_hook=hook,
-    should_shuffle=True,
-    calc_batch_size=get_batch_size,
-    can_over_batch_size=True,
-    cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-import os
-import sys
-from paddle.trainer_config_helpers import *
-#file paths
-word_dict_file = './data/wordDict.txt'
-label_dict_file = './data/targetDict.txt'
-predicate_file = './data/verbDict.txt'
-train_list_file = './data/train.list'
-test_list_file = './data/test.list'
-is_test = get_config_arg('is_test', bool, False)
-is_predict = get_config_arg('is_predict', bool, False)
-if not is_predict:
-    #load dictionaries
-    word_dict = dict()
-    label_dict = dict()
-    predicate_dict = dict()
-    with open(word_dict_file, 'r') as f_word, \
-         open(label_dict_file, 'r') as f_label, \
-         open(predicate_file, 'r') as f_pre:
-        for i, line in enumerate(f_word):
-            w = line.strip()
-            word_dict[w] = i
-        for i, line in enumerate(f_label):
-            w = line.strip()
-            label_dict[w] = i
-        for i, line in enumerate(f_pre):
-            w = line.strip()
-            predicate_dict[w] = i
-    if is_test:
-        train_list_file = None
-    #define data provider
-    define_py_data_sources2(
-        train_list=train_list_file,
-        test_list=test_list_file,
-        module='dataprovider',
-        obj='process',
-        args={
-            'word_dict': word_dict,
-            'label_dict': label_dict,
-            'predicate_dict': predicate_dict
-        })
-    word_dict_len = len(word_dict)
-    label_dict_len = len(label_dict)
-    pred_len = len(predicate_dict)
-else:
-    word_dict_len = get_config_arg('dict_len', int)
-    label_dict_len = get_config_arg('label_len', int)
-    pred_len = get_config_arg('pred_len', int)
-############################## Hyper-parameters ##################################
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-########################### Optimizer #######################################
-settings(
-    batch_size=150,
-    learning_method=MomentumOptimizer(momentum=0),
-    learning_rate=2e-2,
-    regularization=L2Regularization(8e-4),
-    is_async=False,
-    model_average=ModelAverage(
-        average_window=0.5, max_average_window=10000), )
-####################################### network ##############################
-#8 features and 1 target
-word = data_layer(name='word_data', size=word_dict_len)
-predicate = data_layer(name='verb_data', size=pred_len)
-ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len)
-ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len)
-ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len)
-ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
-ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
-mark = data_layer(name='mark_data', size=mark_dict_len)
-if not is_predict:
-    target = data_layer(name='target', size=label_dict_len)
-default_std = 1 / math.sqrt(hidden_dim) / 3.0
-emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
-std_0 = ParameterAttribute(initial_std=0.)
-std_default = ParameterAttribute(initial_std=default_std)
-predicate_embedding = embedding_layer(
-    size=word_dim,
-    input=predicate,
-    param_attr=ParameterAttribute(
-        name='vemb', initial_std=default_std))
-mark_embedding = embedding_layer(
-    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
-word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-emb_layers = [
-    embedding_layer(
-        size=word_dim, input=x, param_attr=emb_para) for x in word_input
-]
-emb_layers.append(predicate_embedding)
-emb_layers.append(mark_embedding)
-hidden_0 = mixed_layer(
-    name='hidden0',
-    size=hidden_dim,
-    bias_attr=std_default,
-    input=[
-        full_matrix_projection(
-            input=emb, param_attr=std_default) for emb in emb_layers
-    ])
-mix_hidden_lr = 1e-3
-lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
-hidden_para_attr = ParameterAttribute(
-    initial_std=default_std, learning_rate=mix_hidden_lr)
-lstm_0 = lstmemory(
-    name='lstm0',
-    input=hidden_0,
-    act=ReluActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=SigmoidActivation(),
-    bias_attr=std_0,
-    param_attr=lstm_para_attr)
-#stack L-LSTM and R-LSTM with direct edges
-input_tmp = [hidden_0, lstm_0]
-for i in range(1, depth):
-    mix_hidden = mixed_layer(
-        name='hidden' + str(i),
-        size=hidden_dim,
-        bias_attr=std_default,
-        input=[
-            full_matrix_projection(
-                input=input_tmp[0], param_attr=hidden_para_attr),
-            full_matrix_projection(
-                input=input_tmp[1], param_attr=lstm_para_attr)
-        ])
-    lstm = lstmemory(
-        name='lstm' + str(i),
-        input=mix_hidden,
-        act=ReluActivation(),
-        gate_act=SigmoidActivation(),
-        state_act=SigmoidActivation(),
-        reverse=((i % 2) == 1),
-        bias_attr=std_0,
-        param_attr=lstm_para_attr)
-    input_tmp = [mix_hidden, lstm]
-feature_out = mixed_layer(
-    name='output',
-    size=label_dict_len,
-    bias_attr=std_default,
-    input=[
-        full_matrix_projection(
-            input=input_tmp[0], param_attr=hidden_para_attr),
-        full_matrix_projection(
-            input=input_tmp[1], param_attr=lstm_para_attr)
-    ], )
-if not is_predict:
-    crf_l = crf_layer(
-        name='crf',
-        size=label_dict_len,
-        input=feature_out,
-        label=target,
-        param_attr=ParameterAttribute(
-            name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr))
-    crf_dec_l = crf_decoding_layer(
-        name='crf_dec_l',
-        size=label_dict_len,
-        input=feature_out,
-        label=target,
-        param_attr=ParameterAttribute(name='crfw'))
-    eval = sum_evaluator(input=crf_dec_l)
-    outputs(crf_l)
-else:
-    crf_dec_l = crf_decoding_layer(
-        name='crf_dec_l',
-        size=label_dict_len,
-        input=feature_out,
-        param_attr=ParameterAttribute(name='crfw'))
-    outputs(crf_dec_l)
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import numpy as np
-from optparse import OptionParser
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import integer_value_sequence
-from paddle.trainer.config_parser import parse_config
-"""
-Usage: run following command to show help message.
-  python predict.py -h
-"""
-UNK_IDX = 0
-class Prediction():
-    def __init__(self, train_conf, dict_file, model_dir, label_file,
-                 predicate_dict_file):
-        """
-        train_conf: trainer configure.
-        dict_file: word dictionary file name.
-        model_dir: directory of model.
-        """
-        self.dict = {}
-        self.labels = {}
-        self.predicate_dict = {}
-        self.labels_reverse = {}
-        self.load_dict_label(dict_file, label_file, predicate_dict_file)
-        len_dict = len(self.dict)
-        len_label = len(self.labels)
-        len_pred = len(self.predicate_dict)
-        conf = parse_config(
-            train_conf, 'dict_len=' + str(len_dict) + ',label_len=' +
-            str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True')
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        self.network.loadParameters(model_dir)
-        slots = [
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
-            integer_value_sequence(len_pred), integer_value_sequence(2)
-        ]
-        self.converter = DataProviderConverter(slots)
-    def load_dict_label(self, dict_file, label_file, predicate_dict_file):
-        """
-        Load dictionary from self.dict_file.
-        """
-        for line_count, line in enumerate(open(dict_file, 'r')):
-            self.dict[line.strip()] = line_count
-        for line_count, line in enumerate(open(label_file, 'r')):
-            self.labels[line.strip()] = line_count
-            self.labels_reverse[line_count] = line.strip()
-        for line_count, line in enumerate(open(predicate_dict_file, 'r')):
-            self.predicate_dict[line.strip()] = line_count
-    def get_data(self, data_file):
-        """
-        Get input data of paddle format.
-        """
-        with open(data_file, 'r') as fdata:
-            for line in fdata:
-                sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip(
-                ).split('\t')
-                words = sentence.split()
-                sen_len = len(words)
-                word_slot = [self.dict.get(w, UNK_IDX) for w in words]
-                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)
-                                  ] * sen_len
-                ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
-                ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
-                ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
-                ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len
-                ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len
-                marks = mark.split()
-                mark_slot = [int(w) for w in marks]
-                yield word_slot, ctx_n2_slot, ctx_n1_slot, \
-                      ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot
-    def predict(self, data_file, output_file):
-        """
-        data_file: file name of input data.
-        """
-        input = self.converter(self.get_data(data_file))
-        output = self.network.forwardTest(input)
-        lab = output[0]["id"].tolist()
-        with open(data_file, 'r') as fin, open(output_file, 'w') as fout:
-            index = 0
-            for line in fin:
-                sen = line.split('\t')[0]
-                len_sen = len(sen.split())
-                line_labels = lab[index:index + len_sen]
-                index += len_sen
-                fout.write(sen + '\t' + ' '.join(
-                    [self.labels_reverse[i] for i in line_labels]) + '\n')
-def option_parser():
-    usage = (
-        "python predict.py -c config -w model_dir "
-        "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
-    parser = OptionParser(usage="usage: %s [options]" % usage)
-    parser.add_option(
-        "-c",
-        "--tconf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-d",
-        "--dict",
-        action="store",
-        dest="dict_file",
-        help="dictionary file")
-    parser.add_option(
-        "-l",
-        "--label",
-        action="store",
-        dest="label_file",
-        default=None,
-        help="label file")
-    parser.add_option(
-        "-p",
-        "--predict_dict_file",
-        action="store",
-        dest="predict_dict_file",
-        default=None,
-        help="predict_dict_file")
-    parser.add_option(
-        "-i",
-        "--data",
-        action="store",
-        dest="data_file",
-        help="data file to predict")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-    parser.add_option(
-        "-o",
-        "--output_file",
-        action="store",
-        dest="output_file",
-        default=None,
-        help="output file")
-    return parser.parse_args()
-def main():
-    options, args = option_parser()
-    train_conf = options.train_conf
-    data_file = options.data_file
-    dict_file = options.dict_file
-    model_path = options.model_path
-    label_file = options.label_file
-    predict_dict_file = options.predict_dict_file
-    output_file = options.output_file
-    swig_paddle.initPaddle("--use_gpu=0")
-    predict = Prediction(train_conf, dict_file, model_path, label_file,
-                         predict_dict_file)
-    predict.predict(data_file, output_file)
-if __name__ == '__main__':
-    main()
--- a/demo/semantic_role_labeling/predict.sh
+++ b/demo/semantic_role_labeling/predict.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort -n | head -n 1
-}   
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-best_model_path="output/pass-${LOG[1]}"
-config_file=db_lstm.py
-dict_file=./data/wordDict.txt
-label_file=./data/targetDict.txt 
-predicate_dict_file=./data/verbDict.txt
-input_file=./data/feature
-output_file=predict.res
-python predict.py \
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort -n | head -n 1
-}
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="output/pass-${LOG[1]}"
-echo 'evaluating from pass '$evaluate_pass
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --use_gpu=false \
-  --config_args=is_test=1 \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'test.log'
-paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-  2>&1 | tee 'train.log'
-paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
--- a/demo/sentiment/.gitignore
+++ b/demo/sentiment/.gitignore
-data/aclImdb
-data/imdb
-data/pre-imdb
-data/mosesdecoder-master
-logs/
-model_output
-dataprovider_copy_1.py
-model.list
-test.log
-train.log
-*.pyc
--- a/demo/sentiment/dataprovider.py
+++ b/demo/sentiment/dataprovider.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-def hook(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = [
-        integer_value_sequence(len(settings.word_dict)), integer_value(2)
-    ]
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-@provider(init_hook=hook)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line_count, line in enumerate(fdata):
-            label, comment = line.strip().split('\t\t')
-            label = int(label)
-            words = comment.split()
-            word_slot = [
-                settings.word_dict[w] for w in words if w in settings.word_dict
-            ]
-            if not word_slot:
-                continue
-            yield word_slot, label
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os, sys
-import numpy as np
-from optparse import OptionParser
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import integer_value_sequence
-from paddle.trainer.config_parser import parse_config
-"""
-Usage: run following command to show help message.
-  python predict.py -h
-"""
-class SentimentPrediction():
-    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
-        """
-        train_conf: trainer configure.
-        dict_file: word dictionary file name.
-        model_dir: directory of model.
-        """
-        self.train_conf = train_conf
-        self.dict_file = dict_file
-        self.word_dict = {}
-        self.dict_dim = self.load_dict()
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-        self.label = None
-        if label_file is not None:
-            self.load_label(label_file)
-        conf = parse_config(train_conf, "is_predict=1")
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        self.network.loadParameters(self.model_dir)
-        input_types = [integer_value_sequence(self.dict_dim)]
-        self.converter = DataProviderConverter(input_types)
-    def load_dict(self):
-        """
-        Load dictionary from self.dict_file.
-        """
-        for line_count, line in enumerate(open(self.dict_file, 'r')):
-            self.word_dict[line.strip().split('\t')[0]] = line_count
-        return len(self.word_dict)
-    def load_label(self, label_file):
-        """
-        Load label.
-        """
-        self.label = {}
-        for v in open(label_file, 'r'):
-            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
-    def get_index(self, data):
-        """
-        transform word into integer index according to the dictionary.
-        """
-        words = data.strip().split()
-        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
-        return word_slot
-    def batch_predict(self, data_batch):
-        input = self.converter(data_batch)
-        output = self.network.forwardTest(input)
-        prob = output[0]["value"]
-        labs = np.argsort(-prob)
-        for idx, lab in enumerate(labs):
-            if self.label is None:
-                print("predicting label is %d" % (lab[0]))
-            else:
-                print("predicting label is %s" % (self.label[lab[0]]))
-def option_parser():
-    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
-    parser = OptionParser(usage="usage: %s [options]" % usage)
-    parser.add_option(
-        "-n",
-        "--tconf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-d",
-        "--dict",
-        action="store",
-        dest="dict_file",
-        help="dictionary file")
-    parser.add_option(
-        "-b",
-        "--label",
-        action="store",
-        dest="label",
-        default=None,
-        help="dictionary file")
-    parser.add_option(
-        "-c",
-        "--batch_size",
-        type="int",
-        action="store",
-        dest="batch_size",
-        default=1,
-        help="the batch size for prediction")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-    return parser.parse_args()
-def main():
-    options, args = option_parser()
-    train_conf = options.train_conf
-    batch_size = options.batch_size
-    dict_file = options.dict_file
-    model_path = options.model_path
-    label = options.label
-    swig_paddle.initPaddle("--use_gpu=0")
-    predict = SentimentPrediction(train_conf, dict_file, model_path, label)
-    batch = []
-    for line in sys.stdin:
-        words = predict.get_index(line)
-        if words:
-            batch.append([words])
-        else:
-            print('All the words in [%s] are not in the dictionary.' % line)
-        if len(batch) == batch_size:
-            predict.batch_predict(batch)
-            batch = []
-    if len(batch) > 0:
-        predict.batch_predict(batch)
-if __name__ == '__main__':
-    main()
--- a/demo/sentiment/predict.sh
+++ b/demo/sentiment/predict.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-#Note the default model is pass-00002, you shold make sure the model path
-#exists or change the mode path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=./data/pre-imdb/dict.txt \
-     --batch_size=1
--- a/demo/sentiment/preprocess.py
+++ b/demo/sentiment/preprocess.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-import random
-import operator
-import numpy as np
-from subprocess import Popen, PIPE
-from os.path import join as join_path
-from optparse import OptionParser
-from paddle.utils.preprocess_util import *
-"""
-Usage: run following command to show help message.
-  python preprocess.py -h 
-"""
-def save_dict(dict, filename, is_reverse=True):
-    """
-    Save dictionary into file.
-    dict:   input dictionary.
-    filename: output file name, string.
-    is_reverse: True, descending order by value.
-                False, ascending order by value.
-    """
-    f = open(filename, 'w')
-    for k, v in sorted(dict.items(), key=operator.itemgetter(1),\
-                       reverse=is_reverse):
-        f.write('%s\t%s\n' % (k, v))
-    f.close()
-def tokenize(sentences):
-    """
-    Use tokenizer.perl to tokenize input sentences.
-    tokenizer.perl is tool of Moses.
-    sentences : a list of input sentences.
-    return: a list of processed text.
-    """
-    dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
-    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
-    assert isinstance(sentences, list)
-    text = "\n".join(sentences)
-    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
-    tok_text, _ = tokenizer.communicate(text)
-    toks = tok_text.split('\n')[:-1]
-    return toks
-def read_lines(path):
-    """
-    path: String, file path.
-    return a list of sequence.
-    """
-    seqs = []
-    with open(path, 'r') as f:
-        for line in f.readlines():
-            line = line.strip()
-            if len(line):
-                seqs.append(line)
-    return seqs
-class SentimentDataSetCreate():
-    """
-    A class to process data for sentiment analysis task.
-    """
-    def __init__(self,
-                 data_path,
-                 output_path,
-                 use_okenizer=True,
-                 multi_lines=False):
-        """
-        data_path: string, traing and testing dataset path
-        output_path: string, output path, store processed dataset
-        multi_lines: whether a file has multi lines.
-                     In order to shuffle fully, it needs to read all files into
-                     memory, then shuffle them if one file has multi lines.
-        """
-        self.output_path = output_path
-        self.data_path = data_path
-        self.train_dir = 'train'
-        self.test_dir = 'test'
-        self.train_list = "train.list"
-        self.test_list = "test.list"
-        self.label_list = "labels.list"
-        self.classes_num = 0
-        self.batch_size = 50000
-        self.batch_dir = 'batches'
-        self.dict_file = "dict.txt"
-        self.dict_with_test = False
-        self.dict_size = 0
-        self.word_count = {}
-        self.tokenizer = use_okenizer
-        self.overwrite = False
-        self.multi_lines = multi_lines
-        self.train_dir = join_path(data_path, self.train_dir)
-        self.test_dir = join_path(data_path, self.test_dir)
-        self.train_list = join_path(output_path, self.train_list)
-        self.test_list = join_path(output_path, self.test_list)
-        self.label_list = join_path(output_path, self.label_list)
-        self.dict_file = join_path(output_path, self.dict_file)
-    def data_list(self, path):
-        """
-        create dataset from path
-        path: data path
-        return: data list
-        """
-        label_set = get_label_set_from_dir(path)
-        data = []
-        for lab_name in label_set.keys():
-            file_paths = list_files(join_path(path, lab_name))
-            for p in file_paths:
-                data.append({"label"  : label_set[lab_name],\
-                             "seq_path": p})
-        return data, label_set
-    def create_dict(self, data):
-        """
-        create dict for input data.
-        data: list, [sequence, sequnce, ...]
-        """
-        for seq in data:
-            for w in seq.strip().lower().split():
-                if w not in self.word_count:
-                    self.word_count[w] = 1
-                else:
-                    self.word_count[w] += 1
-    def create_dataset(self):
-        """
-        create file batches and dictionary of train data set.
-        If the self.overwrite is false and train.list already exists in
-        self.output_path, this function will not create and save file
-        batches from the data set path.
-        return: dictionary size, class number.
-        """
-        out_path = self.output_path
-        if out_path and not os.path.exists(out_path):
-            os.makedirs(out_path)
-        # If self.overwrite is false or self.train_list has existed,
-        # it will not process dataset.
-        if not (self.overwrite or not os.path.exists(self.train_list)):
-            print "%s already exists." % self.train_list
-            return
-        # Preprocess train data.
-        train_data, train_lab_set = self.data_list(self.train_dir)
-        print "processing train set..."
-        file_lists = self.save_data(train_data, "train", self.batch_size, True,
-                                    True)
-        save_list(file_lists, self.train_list)
-        # If have test data path, preprocess test data.
-        if os.path.exists(self.test_dir):
-            test_data, test_lab_set = self.data_list(self.test_dir)
-            assert (train_lab_set == test_lab_set)
-            print "processing test set..."
-            file_lists = self.save_data(test_data, "test", self.batch_size,
-                                        False, self.dict_with_test)
-            save_list(file_lists, self.test_list)
-        # save labels set.
-        save_dict(train_lab_set, self.label_list, False)
-        self.classes_num = len(train_lab_set.keys())
-        # save dictionary.
-        save_dict(self.word_count, self.dict_file, True)
-        self.dict_size = len(self.word_count)
-    def save_data(self,
-                  data,
-                  prefix="",
-                  batch_size=50000,
-                  is_shuffle=False,
-                  build_dict=False):
-        """
-        Create batches for a Dataset object.
-        data: the Dataset object to process.
-        prefix: the prefix of each batch.
-        batch_size: number of data in each batch.
-        build_dict: whether to build dictionary for data
-        return: list of batch names
-        """
-        if is_shuffle and self.multi_lines:
-            return self.save_data_multi_lines(data, prefix, batch_size,
-                                              build_dict)
-        if is_shuffle:
-            random.shuffle(data)
-        num_batches = int(math.ceil(len(data) / float(batch_size)))
-        batch_names = []
-        for i in range(num_batches):
-            batch_name = join_path(self.output_path,
-                                   "%s_part_%03d" % (prefix, i))
-            begin = i * batch_size
-            end = min((i + 1) * batch_size, len(data))
-            # read a batch of data
-            label_list, data_list = self.get_data_list(begin, end, data)
-            if build_dict:
-                self.create_dict(data_list)
-            self.save_file(label_list, data_list, batch_name)
-            batch_names.append(batch_name)
-        return batch_names
-    def get_data_list(self, begin, end, data):
-        """
-        begin: int, begining index of data.
-        end: int, ending index of data.
-        data: a list of {"seq_path": seqquence path, "label": label index}
-        return a list of label and a list of sequence.
-        """
-        label_list = []
-        data_list = []
-        for j in range(begin, end):
-            seqs = read_lines(data[j]["seq_path"])
-            lab = int(data[j]["label"])
-            #File may have multiple lines.
-            for seq in seqs:
-                data_list.append(seq)
-                label_list.append(lab)
-        if self.tokenizer:
-            data_list = tokenize(data_list)
-        return label_list, data_list
-    def save_data_multi_lines(self,
-                              data,
-                              prefix="",
-                              batch_size=50000,
-                              build_dict=False):
-        """
-        In order to shuffle fully, there is no need to load all data if
-        each file only contains one sample, it only needs to shuffle list
-        of file name. But one file contains multi lines, each line is one
-        sample. It needs to read all data into memory to shuffle fully.
-        This interface is mainly for data containning multi lines in each
-        file, which consumes more memory if there is a great mount of data.
-        data: the Dataset object to process.
-        prefix: the prefix of each batch.
-        batch_size: number of data in each batch.
-        build_dict: whether to build dictionary for data
-        return: list of batch names
-        """
-        assert self.multi_lines
-        label_list = []
-        data_list = []
-        # read all data
-        label_list, data_list = self.get_data_list(0, len(data), data)
-        if build_dict:
-            self.create_dict(data_list)
-        length = len(label_list)
-        perm_list = np.array([i for i in xrange(length)])
-        random.shuffle(perm_list)
-        num_batches = int(math.ceil(length / float(batch_size)))
-        batch_names = []
-        for i in range(num_batches):
-            batch_name = join_path(self.output_path,
-                                   "%s_part_%03d" % (prefix, i))
-            begin = i * batch_size
-            end = min((i + 1) * batch_size, length)
-            sub_label = [label_list[perm_list[i]] for i in range(begin, end)]
-            sub_data = [data_list[perm_list[i]] for i in range(begin, end)]
-            self.save_file(sub_label, sub_data, batch_name)
-            batch_names.append(batch_name)
-        return batch_names
-    def save_file(self, label_list, data_list, filename):
-        """
-        Save data into file.
-        label_list: a list of int value.
-        data_list: a list of sequnece.
-        filename: output file name.
-        """
-        f = open(filename, 'w')
-        print "saving file: %s" % filename
-        for lab, seq in zip(label_list, data_list):
-            f.write('%s\t\t%s\n' % (lab, seq))
-        f.close()
-def option_parser():
-    parser = OptionParser(usage="usage: python preprcoess.py "\
-                                "-i data_dir [options]")
-    parser.add_option(
-        "-i",
-        "--data",
-        action="store",
-        dest="input",
-        help="Input data directory.")
-    parser.add_option(
-        "-o",
-        "--output",
-        action="store",
-        dest="output",
-        default=None,
-        help="Output directory.")
-    parser.add_option(
-        "-t",
-        "--tokenizer",
-        action="store",
-        dest="use_tokenizer",
-        default=True,
-        help="Whether to use tokenizer.")
-    parser.add_option("-m", "--multi_lines", action="store",
-                      dest="multi_lines", default=False,
-                      help="If input text files have multi lines and they "\
-                           "need to be shuffled, you should set -m True,")
-    return parser.parse_args()
-def main():
-    options, args = option_parser()
-    data_dir = options.input
-    output_dir = options.output
-    use_tokenizer = options.use_tokenizer
-    multi_lines = options.multi_lines
-    if output_dir is None:
-        outname = os.path.basename(options.input)
-        output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname)
-    data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer,
-                                          multi_lines)
-    data_creator.create_dataset()
-if __name__ == '__main__':
-    main()
--- a/demo/sentiment/preprocess.sh
+++ b/demo/sentiment/preprocess.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-echo "Start to preprcess..."
-data_dir="./data/imdb"
-python preprocess.py -i $data_dir
-echo "Done."
--- a/demo/sentiment/sentiment_net.py
+++ b/demo/sentiment/sentiment_net.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from os.path import join as join_path
-from paddle.trainer_config_helpers import *
-def sentiment_data(data_dir=None,
-                   is_test=False,
-                   is_predict=False,
-                   train_list="train.list",
-                   test_list="test.list",
-                   dict_file="dict.txt"):
-    """
-    Predefined data provider for sentiment analysis.
-    is_test: whether this config is used for test.
-    is_predict: whether this config is used for prediction.
-    train_list: text file name, containing a list of training set.
-    test_list: text file name, containing a list of testing set.
-    dict_file: text file name, containing dictionary.
-    """
-    dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines())
-    class_dim = len(open(join_path(data_dir, 'labels.list')).readlines())
-    if is_predict:
-        return dict_dim, class_dim
-    if data_dir is not None:
-        train_list = join_path(data_dir, train_list)
-        test_list = join_path(data_dir, test_list)
-        dict_file = join_path(data_dir, dict_file)
-    train_list = train_list if not is_test else None
-    word_dict = dict()
-    with open(dict_file, 'r') as f:
-        for i, line in enumerate(open(dict_file, 'r')):
-            word_dict[line.split('\t')[0]] = i
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={'dictionary': word_dict})
-    return dict_dim, class_dim
-def bidirectional_lstm_net(input_dim,
-                           class_dim=2,
-                           emb_dim=128,
-                           lstm_dim=128,
-                           is_predict=False):
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-    bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
-    dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
-    output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
-    if not is_predict:
-        lbl = data_layer("label", 1)
-        outputs(classification_cost(input=output, label=lbl))
-    else:
-        outputs(output)
-def stacked_lstm_net(input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3,
-                     is_predict=False):
-    """
-    A Wrapper for sentiment classification task.
-    This network uses bi-directional recurrent network,
-    consisting three LSTM layers. This configure is referred to
-    the paper as following url, but use fewer layrs.
-        http://www.aclweb.org/anthology/P15-1109
-    input_dim: here is word dictionary dimension.
-    class_dim: number of categories.
-    emb_dim: dimension of word embedding.
-    hid_dim: dimension of hidden layer.
-    stacked_num: number of stacked lstm-hidden layer.
-    is_predict: is predicting or not.
-                Some layers is not needed in network when predicting.
-    """
-    hid_lr = 1e-3
-    assert stacked_num % 2 == 1
-    layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-    fc_para_attr = ParameterAttribute(learning_rate=hid_lr)
-    lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.)
-    para_attr = [fc_para_attr, lstm_para_attr]
-    bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.)
-    relu = ReluActivation()
-    linear = LinearActivation()
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-    fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
-    lstm1 = lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
-    inputs = [fc1, lstm1]
-    for i in range(2, stacked_num + 1):
-        fc = fc_layer(
-            input=inputs,
-            size=hid_dim,
-            act=linear,
-            param_attr=para_attr,
-            bias_attr=bias_attr)
-        lstm = lstmemory(
-            input=fc,
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
-        inputs = [fc, lstm]
-    fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
-    lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
-    output = fc_layer(
-        input=[fc_last, lstm_last],
-        size=class_dim,
-        act=SoftmaxActivation(),
-        bias_attr=bias_attr,
-        param_attr=para_attr)
-    if is_predict:
-        outputs(output)
-    else:
-        outputs(classification_cost(input=output, label=data_layer('label', 1)))
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
-  sort -n | head -n 1
-}
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-echo 'evaluating from pass '$evaluate_pass
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=10 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
--- a/demo/sentiment/train_v2.py
+++ b/demo/sentiment/train_v2.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import paddle.v2 as paddle
-def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
-    data = paddle.layer.data("word",
-                             paddle.data_type.integer_value_sequence(input_dim))
-    emb = paddle.layer.embedding(input=data, size=emb_dim)
-    conv_3 = paddle.networks.sequence_conv_pool(
-        input=emb, context_len=3, hidden_size=hid_dim)
-    conv_4 = paddle.networks.sequence_conv_pool(
-        input=emb, context_len=4, hidden_size=hid_dim)
-    output = paddle.layer.fc(input=[conv_3, conv_4],
-                             size=class_dim,
-                             act=paddle.activation.Softmax())
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
-def stacked_lstm_net(input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3):
-    """
-    A Wrapper for sentiment classification task.
-    This network uses bi-directional recurrent network,
-    consisting three LSTM layers. This configure is referred to
-    the paper as following url, but use fewer layrs.
-        http://www.aclweb.org/anthology/P15-1109
-    input_dim: here is word dictionary dimension.
-    class_dim: number of categories.
-    emb_dim: dimension of word embedding.
-    hid_dim: dimension of hidden layer.
-    stacked_num: number of stacked lstm-hidden layer.
-    """
-    assert stacked_num % 2 == 1
-    layer_attr = paddle.attr.Extra(drop_rate=0.5)
-    fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
-    lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
-    para_attr = [fc_para_attr, lstm_para_attr]
-    bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.)
-    relu = paddle.activation.Relu()
-    linear = paddle.activation.Linear()
-    data = paddle.layer.data("word",
-                             paddle.data_type.integer_value_sequence(input_dim))
-    emb = paddle.layer.embedding(input=data, size=emb_dim)
-    fc1 = paddle.layer.fc(input=emb,
-                          size=hid_dim,
-                          act=linear,
-                          bias_attr=bias_attr)
-    lstm1 = paddle.layer.lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
-    inputs = [fc1, lstm1]
-    for i in range(2, stacked_num + 1):
-        fc = paddle.layer.fc(input=inputs,
-                             size=hid_dim,
-                             act=linear,
-                             param_attr=para_attr,
-                             bias_attr=bias_attr)
-        lstm = paddle.layer.lstmemory(
-            input=fc,
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
-        inputs = [fc, lstm]
-    fc_last = paddle.layer.pooling(
-        input=inputs[0], pooling_type=paddle.pooling.Max())
-    lstm_last = paddle.layer.pooling(
-        input=inputs[1], pooling_type=paddle.pooling.Max())
-    output = paddle.layer.fc(input=[fc_last, lstm_last],
-                             size=class_dim,
-                             act=paddle.activation.Softmax(),
-                             bias_attr=bias_attr,
-                             param_attr=para_attr)
-    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
-    cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
-if __name__ == '__main__':
-    # init
-    paddle.init(use_gpu=False)
-    #data
-    print 'load dictionary...'
-    word_dict = paddle.dataset.imdb.word_dict()
-    dict_dim = len(word_dict)
-    class_dim = 2
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=100)
-    test_reader = paddle.batch(
-        lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
-    feeding = {'word': 0, 'label': 1}
-    # network config
-    # Please choose the way to build the network
-    # by uncommenting the corresponding line.
-    cost = convolution_net(dict_dim, class_dim=class_dim)
-    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
-    # create parameters
-    parameters = paddle.parameters.create(cost)
-    # create optimizer
-    adam_optimizer = paddle.optimizer.Adam(
-        learning_rate=2e-3,
-        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
-        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
-    # End batch and end pass event handler
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-        if isinstance(event, paddle.event.EndPass):
-            result = trainer.test(reader=test_reader, feeding=feeding)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-    # create trainer
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=adam_optimizer)
-    trainer.train(
-        reader=train_reader,
-        event_handler=event_handler,
-        feeding=feeding,
-        num_passes=2)
--- a/demo/sentiment/trainer_config.py
+++ b/demo/sentiment/trainer_config.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from sentiment_net import *
-from paddle.trainer_config_helpers import *
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-data_dir = "./data/pre-imdb"
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-################## Algorithm Config #####################
-settings(
-    batch_size=128,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    model_average=ModelAverage(0.5),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-#################### Network Config ######################
-stacked_lstm_net(
-    dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
-# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
--- a/demo/seqToseq/.gitignore
+++ b/demo/seqToseq/.gitignore
-data/wmt14
-data/pre-wmt14
-data/wmt14_model
-data/paraphrase
-data/pre-paraphrase
-data/paraphrase_model
-translation/gen.log
-translation/gen_result
-translation/train.log
-paraphrase/train.log
-dataprovider_copy_1.py
-translation/thirdparty.tgz
-translation/thirdparty/train.conf
-translation/thirdparty/dataprovider.py
-translation/thirdparty/seqToseq_net.py
-translation/thirdparty/*.dict
-*.pyc
--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
-import sys
-import paddle.v2 as paddle
-def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
-    ### Network Architecture
-    word_vector_dim = 512  # dimension of word vector
-    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
-    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-    beam_size = 3
-    max_length = 250
-    #### Encoder
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
-    src_forward = paddle.networks.simple_gru(
-        name='src_forward_gru', input=src_embedding, size=encoder_size)
-    src_backward = paddle.networks.simple_gru(
-        name='src_backward_gru',
-        input=src_embedding,
-        size=encoder_size,
-        reverse=True)
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
-    #### Decoder
-    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
-        encoded_proj += paddle.layer.full_matrix_projection(
-            input=encoded_vector)
-    backward_first = paddle.layer.first_seq(input=src_backward)
-    with paddle.layer.mixed(
-            name="decoder_boot_mixed",
-            size=decoder_size,
-            act=paddle.activation.Tanh()) as decoder_boot:
-        decoder_boot += paddle.layer.full_matrix_projection(
-            input=backward_first)
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-        context = paddle.networks.simple_attention(
-            name="simple_attention",
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-        with paddle.layer.mixed(
-                name="input_recurrent",
-                size=decoder_size * 3,
-                # enable error clipping 
-                layer_attr=paddle.attr.ExtraAttr(
-                    error_clipping_threshold=100.0)) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            # uncomment to enable local threshold for gradient clipping
-            # param_attr=paddle.attr.ParamAttr(gradient_clipping_threshold=9.9),
-            size=decoder_size)
-        with paddle.layer.mixed(
-                name="gru_step_output",
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-        return out
-    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-    if not is_generating:
-        trg_embedding = paddle.layer.embedding(
-            input=paddle.layer.data(
-                name='target_language_word',
-                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-            size=word_vector_dim,
-            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)
-        # For decoder equipped with attention mechanism, in training,
-        # target embeding (the groudtruth) is the data input,
-        # while encoded source sequence is accessed to as an unbounded memory.
-        # Here, the StaticInput defines a read-only memory
-        # for the recurrent_group.
-        decoder = paddle.layer.recurrent_group(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs)
-        lbl = paddle.layer.data(
-            name='target_language_next_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim))
-        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-        return cost
-    else:
-        # In generation, the decoder predicts a next target word based on
-        # the encoded source sequence and the last generated target word.
-        # The encoded source sequence (encoder's output) must be specified by
-        # StaticInput, which is a read-only memory.
-        # Embedding of the last generated word is automatically gotten by
-        # GeneratedInputs, which is initialized by a start mark, such as <s>,
-        # and must be included in generation.
-        trg_embedding = paddle.layer.GeneratedInputV2(
-            size=target_dict_dim,
-            embedding_name='_target_language_embedding',
-            embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
-        beam_gen = paddle.layer.beam_search(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs,
-            bos_id=0,
-            eos_id=1,
-            beam_size=beam_size,
-            max_length=max_length)
-        return beam_gen
-def main():
-    paddle.init(
-        use_gpu=False,
-        trainer_count=1,
-        # log gradient clipping info
-        log_clipping=True,
-        # log error clipping info
-        log_error_clipping=True)
-    is_generating = False
-    # source and target dict dim.
-    dict_size = 30000
-    source_dict_dim = target_dict_dim = dict_size
-    # train the network
-    if not is_generating:
-        cost = seqToseq_net(source_dict_dim, target_dict_dim)
-        parameters = paddle.parameters.create(cost)
-        # define optimize method and trainer
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=5e-5,
-            # uncomment to enable global threshold for gradient clipping
-            # gradient_clipping_threshold=10.0,
-            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-        trainer = paddle.trainer.SGD(cost=cost,
-                                     parameters=parameters,
-                                     update_equation=optimizer)
-        # define data reader
-        wmt14_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
-            batch_size=5)
-        # define event_handler callback
-        def event_handler(event):
-            if isinstance(event, paddle.event.EndIteration):
-                if event.batch_id % 10 == 0:
-                    print "\nPass %d, Batch %d, Cost %f, %s" % (
-                        event.pass_id, event.batch_id, event.cost,
-                        event.metrics)
-                else:
-                    sys.stdout.write('.')
-                    sys.stdout.flush()
-        # start to train
-        trainer.train(
-            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
-    # generate a english sequence to french
-    else:
-        # use the first 3 samples for generation
-        gen_creator = paddle.dataset.wmt14.gen(dict_size)
-        gen_data = []
-        gen_num = 3
-        for item in gen_creator():
-            gen_data.append((item[0], ))
-            if len(gen_data) == gen_num:
-                break
-        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
-        # get the pretrained model, whose bleu = 26.92
-        parameters = paddle.dataset.wmt14.model()
-        # prob is the prediction probabilities, and id is the prediction word. 
-        beam_result = paddle.infer(
-            output_layer=beam_gen,
-            parameters=parameters,
-            input=gen_data,
-            field=['prob', 'id'])
-        # get the dictionary
-        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-        prob = beam_result[0]
-        beam_size = 3
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
-            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
-if __name__ == '__main__':
-    main()
--- a/demo/seqToseq/data/paraphrase_data.sh
+++ b/demo/seqToseq/data/paraphrase_data.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-# download the in-house paraphrase dataset
-wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
-# untar the dataset
-tar -zxvf paraphrase.tar.gz
-rm paraphrase.tar.gz
--- a/demo/seqToseq/data/paraphrase_model.sh
+++ b/demo/seqToseq/data/paraphrase_model.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-dim=32
-pretrained_dir='../../model_zoo/embedding/'
-preModel=$pretrained_dir'model_'$dim'.emb'
-preDict=$pretrained_dir'baidu.dict'
-usrDict_dir='pre-paraphrase/'
-srcDict=$usrDict_dir'src.dict'
-trgDict=$usrDict_dir'trg.dict'
-usrModel_dir='paraphrase_model/'
-mkdir $usrModel_dir
-srcModel=$usrModel_dir'_source_language_embedding'
-trgModel=$usrModel_dir'_target_language_embedding'
-echo 'extract desired parameters based on user dictionary'
-script=$pretrained_dir'extract_para.py'
-python $script --preModel $preModel --preDict $preDict \
-          --usrModel $srcModel --usrDict $srcDict -d $dim
-python $script --preModel $preModel --preDict $preDict \
-          --usrModel $trgModel --usrDict $trgDict -d $dim
--- a/demo/seqToseq/data/wmt14_data.sh
+++ b/demo/seqToseq/data/wmt14_data.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-mkdir wmt14
-cd wmt14
-# download the dataset
-wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz
-wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz
-# untar the dataset
-tar -zxvf bitexts.tgz
-tar -zxvf dev+test.tgz
-gunzip bitexts.selected/*
-mv bitexts.selected train
-rm bitexts.tgz
-rm dev+test.tgz
-# separate the dev and test dataset
-mkdir test gen
-mv dev/ntst1213.* test
-mv dev/ntst14.* gen 
-rm -rf dev
-set +x
-# rename the suffix, .fr->.src, .en->.trg
-for dir in train test gen
-do 
-  filelist=`ls $dir`
-  cd $dir
-  for file in $filelist
-  do 
-    if [ ${file##*.} = "fr" ]; then
-      mv $file ${file/%fr/src}
-    elif [ ${file##*.} = 'en' ]; then
-      mv $file ${file/%en/trg}
-    fi
-  done
-  cd ..
-done
--- a/demo/seqToseq/data/wmt14_model.sh
+++ b/demo/seqToseq/data/wmt14_model.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-# download the pretrained model
-wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
-# untar the model
-tar -zxvf wmt14_model.tar.gz
-rm wmt14_model.tar.gz 
--- a/demo/seqToseq/dataprovider.py
+++ b/demo/seqToseq/dataprovider.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-UNK_IDX = 2
-START = "<s>"
-END = "<e>"
-def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
-         **kwargs):
-    # job_mode = 1: training mode
-    # job_mode = 0: generating mode
-    settings.job_mode = not is_generating
-    def fun(dict_path):
-        out_dict = dict()
-        with open(dict_path, "r") as fin:
-            out_dict = {
-                line.strip(): line_count
-                for line_count, line in enumerate(fin)
-            }
-        return out_dict
-    settings.src_dict = fun(src_dict_path)
-    settings.trg_dict = fun(trg_dict_path)
-    settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
-    if settings.job_mode:
-        settings.slots = {
-            'source_language_word':
-            integer_value_sequence(len(settings.src_dict)),
-            'target_language_word':
-            integer_value_sequence(len(settings.trg_dict)),
-            'target_language_next_word':
-            integer_value_sequence(len(settings.trg_dict))
-        }
-        settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
-    else:
-        settings.slots = {
-            'source_language_word':
-            integer_value_sequence(len(settings.src_dict)),
-            'sent_id':
-            integer_value_sequence(len(open(file_list[0], "r").readlines()))
-        }
-def _get_ids(s, dictionary):
-    words = s.strip().split()
-    return [dictionary[START]] + \
-           [dictionary.get(w, UNK_IDX) for w in words] + \
-           [dictionary[END]]
-@provider(init_hook=hook, pool_size=50000)
-def process(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line_count, line in enumerate(f):
-            line_split = line.strip().split('\t')
-            if settings.job_mode and len(line_split) != 2:
-                continue
-            src_seq = line_split[0]  # one source sequence
-            src_ids = _get_ids(src_seq, settings.src_dict)
-            if settings.job_mode:
-                trg_seq = line_split[1]  # one target sequence
-                trg_words = trg_seq.split()
-                trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
-                # remove sequence whose length > 80 in training mode
-                if len(src_ids) > 80 or len(trg_ids) > 80:
-                    continue
-                trg_ids_next = trg_ids + [settings.trg_dict[END]]
-                trg_ids = [settings.trg_dict[START]] + trg_ids
-                yield {
-                    'source_language_word': src_ids,
-                    'target_language_word': trg_ids,
-                    'target_language_next_word': trg_ids_next
-                }
-            else:
-                yield {'source_language_word': src_ids, 'sent_id': [line_count]}
--- a/demo/seqToseq/paraphrase/train.conf
+++ b/demo/seqToseq/paraphrase/train.conf
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-sys.path.append("..")
-from seqToseq_net import *
-is_generating = False
-### Data Definiation
-train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
-                             is_generating = is_generating)
-### Algorithm Configuration
-settings(
-      learning_method = AdamOptimizer(),
-      batch_size = 50,
-      learning_rate = 5e-4)
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-cd ..
-paddle train \
-    --config='paraphrase/train.conf' \
-    --save_dir='paraphrase/model' \
-    --init_model_path='data/paraphrase_model' \
-    --load_missing_parameter_strategy=rand \
-    --use_gpu=false \
-    --num_passes=16 \
-    --show_parameter_stats_period=100 \
-    --trainer_count=4 \
-    --log_period=10 \
-    --dot_period=5 \
-    2>&1 | tee 'paraphrase/train.log'
-paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
--- a/demo/seqToseq/preprocess.py
+++ b/demo/seqToseq/preprocess.py
-#!/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Example:
-    python preprocess.py -i INPUT [-d DICTSIZE] [-m]
-Options:
-    -h, --help     show this help message and exit
-    -i INPUT       input original dataset path
-    -d DICTSIZE    specified word count of dictionary
-    -m --mergeDict merge source and target dictionary
-"""
-import os
-import sys
-import string
-from optparse import OptionParser
-from paddle.utils.preprocess_util import save_list, DatasetCreater
-class SeqToSeqDatasetCreater(DatasetCreater):
-    """
-    A class to process data for sequence to sequence application.
-    """
-    def __init__(self, data_path, output_path):
-        """
-        data_path: the path to store the train data, test data and gen data
-        output_path: the path to store the processed dataset
-        """
-        DatasetCreater.__init__(self, data_path)
-        self.gen_dir_name = 'gen'
-        self.gen_list_name = 'gen.list'
-        self.output_path = output_path
-    def concat_file(self, file_path, file1, file2, output_path, output):
-        """
-        Concat file1 and file2 to be one output file 
-        The i-th line of output = i-th line of file1 + '\t' + i-th line of file2
-        file_path: the path to store file1 and file2
-        output_path: the path to store output file
-        """
-        file1 = os.path.join(file_path, file1)
-        file2 = os.path.join(file_path, file2)
-        output = os.path.join(output_path, output)
-        if not os.path.exists(output):
-            os.system('paste ' + file1 + ' ' + file2 + ' > ' + output)
-    def cat_file(self, dir_path, suffix, output_path, output):
-        """
-        Cat all the files in dir_path with suffix to be one output file 
-        dir_path: the base directory to store input file
-        suffix: suffix of file name
-        output_path: the path to store output file
-        """
-        cmd = 'cat '
-        file_list = os.listdir(dir_path)
-        file_list.sort()
-        for file in file_list:
-            if file.endswith(suffix):
-                cmd += os.path.join(dir_path, file) + ' '
-        output = os.path.join(output_path, output)
-        if not os.path.exists(output):
-            os.system(cmd + '> ' + output)
-    def build_dict(self, file_path, dict_path, dict_size=-1):
-        """ 
-        Create the dictionary for the file, Note that
-        1. Valid characters include all printable characters
-        2. There is distinction between uppercase and lowercase letters
-        3. There is 3 special token: 
-           <s>: the start of a sequence
-           <e>: the end of a sequence
-           <unk>: a word not included in dictionary
-        file_path: the path to store file 
-        dict_path: the path to store dictionary
-        dict_size: word count of dictionary
-                   if is -1, dictionary will contains all the words in file 
-        """
-        if not os.path.exists(dict_path):
-            dictory = dict()
-            with open(file_path, "r") as fdata:
-                for line in fdata:
-                    line = line.split('\t')
-                    for line_split in line:
-                        words = line_split.strip().split()
-                        for word in words:
-                            if word not in dictory:
-                                dictory[word] = 1
-                            else:
-                                dictory[word] += 1
-            output = open(dict_path, "w+")
-            output.write('<s>\n<e>\n<unk>\n')
-            count = 3
-            for key, value in sorted(
-                    dictory.items(), key=lambda d: d[1], reverse=True):
-                output.write(key + "\n")
-                count += 1
-                if count == dict_size:
-                    break
-            self.dict_size = count
-    def create_dataset(self,
-                       dict_size=-1,
-                       mergeDict=False,
-                       suffixes=['.src', '.trg']):
-        """
-        Create seqToseq dataset 
-        """
-        # dataset_list and dir_list has one-to-one relationship
-        train_dataset = os.path.join(self.data_path, self.train_dir_name)
-        test_dataset = os.path.join(self.data_path, self.test_dir_name)
-        gen_dataset = os.path.join(self.data_path, self.gen_dir_name)
-        dataset_list = [train_dataset, test_dataset, gen_dataset]
-        train_dir = os.path.join(self.output_path, self.train_dir_name)
-        test_dir = os.path.join(self.output_path, self.test_dir_name)
-        gen_dir = os.path.join(self.output_path, self.gen_dir_name)
-        dir_list = [train_dir, test_dir, gen_dir]
-        # create directory
-        for dir in dir_list:
-            if not os.path.exists(dir):
-                os.mkdir(dir)
-        # checkout dataset should be parallel corpora
-        suffix_len = len(suffixes[0])
-        for dataset in dataset_list:
-            file_list = os.listdir(dataset)
-            if len(file_list) % 2 == 1:
-                raise RuntimeError("dataset should be parallel corpora")
-            file_list.sort()
-            for i in range(0, len(file_list), 2):
-                if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
-                    raise RuntimeError(
-                        "source and target file name should be equal")
-        # cat all the files with the same suffix in dataset
-        for suffix in suffixes:
-            for dataset in dataset_list:
-                outname = os.path.basename(dataset) + suffix
-                self.cat_file(dataset, suffix, dataset, outname)
-        # concat parallel corpora and create file.list
-        print 'concat parallel corpora for dataset'
-        id = 0
-        list = ['train.list', 'test.list', 'gen.list']
-        for dataset in dataset_list:
-            outname = os.path.basename(dataset)
-            self.concat_file(dataset, outname + suffixes[0],
-                             outname + suffixes[1], dir_list[id], outname)
-            save_list([os.path.join(dir_list[id], outname)],
-                      os.path.join(self.output_path, list[id]))
-            id += 1
-        # build dictionary for train data
-        dict = ['src.dict', 'trg.dict']
-        dict_path = [
-            os.path.join(self.output_path, dict[0]),
-            os.path.join(self.output_path, dict[1])
-        ]
-        if mergeDict:
-            outname = os.path.join(train_dir, train_dataset.split('/')[-1])
-            print 'build src dictionary for train data'
-            self.build_dict(outname, dict_path[0], dict_size)
-            print 'build trg dictionary for train data'
-            os.system('cp ' + dict_path[0] + ' ' + dict_path[1])
-        else:
-            outname = os.path.join(train_dataset, self.train_dir_name)
-            for id in range(0, 2):
-                suffix = suffixes[id]
-                print 'build ' + suffix[1:] + ' dictionary for train data'
-                self.build_dict(outname + suffix, dict_path[id], dict_size)
-        print 'dictionary size is', self.dict_size
-def main():
-    usage = "usage: \n" \
-            "python %prog -i INPUT [-d DICTSIZE] [-m]"
-    parser = OptionParser(usage)
-    parser.add_option(
-        "-i", action="store", dest="input", help="input original dataset path")
-    parser.add_option(
-        "-d",
-        action="store",
-        dest="dictsize",
-        help="specified word count of dictionary")
-    parser.add_option(
-        "-m",
-        "--mergeDict",
-        action="store_true",
-        dest="mergeDict",
-        help="merge source and target dictionary")
-    (options, args) = parser.parse_args()
-    if options.input[-1] == os.path.sep:
-        options.input = options.input[:-1]
-    outname = os.path.basename(options.input)
-    output_path = os.path.join(os.path.dirname(options.input), 'pre-' + outname)
-    dictsize = int(options.dictsize) if options.dictsize else -1
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-        data_creator = SeqToSeqDatasetCreater(options.input, output_path)
-        data_creator.create_dataset(dictsize, options.mergeDict)
-if __name__ == "__main__":
-    main()
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
-# edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-import os
-from paddle.trainer_config_helpers import *
-def seq_to_seq_data(data_dir,
-                    is_generating,
-                    dict_size=30000,
-                    train_list='train.list',
-                    test_list='test.list',
-                    gen_list='gen.list',
-                    gen_result='gen_result'):
-    """
-    Predefined seqToseq train data provider for application
-    is_generating: whether this config is used for generating
-    dict_size: word count of dictionary
-    train_list: a text file containing a list of training data
-    test_list: a text file containing a list of testing data
-    gen_list: a text file containing a list of generating data
-    gen_result: a text file containing generating result
-    """
-    src_lang_dict = os.path.join(data_dir, 'src.dict')
-    trg_lang_dict = os.path.join(data_dir, 'trg.dict')
-    if is_generating:
-        train_list = None
-        test_list = os.path.join(data_dir, gen_list)
-    else:
-        train_list = os.path.join(data_dir, train_list)
-        test_list = os.path.join(data_dir, test_list)
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={
-            "src_dict_path": src_lang_dict,
-            "trg_dict_path": trg_lang_dict,
-            "is_generating": is_generating
-        })
-    return {
-        "src_dict_path": src_lang_dict,
-        "trg_dict_path": trg_lang_dict,
-        "gen_result": gen_result
-    }
-def gru_encoder_decoder(data_conf,
-                        is_generating,
-                        word_vector_dim=512,
-                        encoder_size=512,
-                        decoder_size=512,
-                        beam_size=3,
-                        max_length=250,
-                        error_clipping=50):
-    """
-    A wrapper for an attention version of GRU Encoder-Decoder network
-    is_generating: whether this config is used for generating
-    encoder_size: dimension of hidden unit in GRU Encoder network
-    decoder_size: dimension of hidden unit in GRU Decoder network
-    word_vector_dim: dimension of word vector
-    beam_size: expand width in beam search
-    max_length: a stop condition of sequence generation
-    """
-    for k, v in data_conf.iteritems():
-        globals()[k] = v
-    source_dict_dim = len(open(src_dict_path, "r").readlines())
-    target_dict_dim = len(open(trg_dict_path, "r").readlines())
-    gen_trans_file = gen_result
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-    src_embedding = embedding_layer(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
-    src_forward = simple_gru(
-        input=src_embedding,
-        size=encoder_size,
-        naive=True,
-        gru_layer_attr=ExtraLayerAttribute(
-            error_clipping_threshold=error_clipping))
-    src_backward = simple_gru(
-        input=src_embedding,
-        size=encoder_size,
-        reverse=True,
-        naive=True,
-        gru_layer_attr=ExtraLayerAttribute(
-            error_clipping_threshold=error_clipping))
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
-    with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(input=encoded_vector)
-    backward_first = first_seq(input=src_backward)
-    with mixed_layer(
-            size=decoder_size,
-            act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(input=backward_first)
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-        decoder_mem = memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-        context = simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem, )
-        with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(input=context)
-            decoder_inputs += full_matrix_projection(input=current_word)
-        gru_step = gru_step_naive_layer(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size,
-            layer_attr=ExtraLayerAttribute(
-                error_clipping_threshold=error_clipping))
-        with mixed_layer(
-                size=target_dict_dim, bias_attr=True,
-                act=SoftmaxActivation()) as out:
-            out += full_matrix_projection(input=gru_step)
-        return out
-    decoder_group_name = "decoder_group"
-    group_inputs = [
-        StaticInput(
-            input=encoded_vector, is_seq=True), StaticInput(
-                input=encoded_proj, is_seq=True)
-    ]
-    if not is_generating:
-        trg_embedding = embedding_layer(
-            input=data_layer(
-                name='target_language_word', size=target_dict_dim),
-            size=word_vector_dim,
-            param_attr=ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)
-        # For decoder equipped with attention mechanism, in training,
-        # target embeding (the groudtruth) is the data input,
-        # while encoded source sequence is accessed to as an unbounded memory.
-        # Here, the StaticInput defines a read-only memory
-        # for the recurrent_group.
-        decoder = recurrent_group(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs)
-        lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
-        cost = classification_cost(input=decoder, label=lbl)
-        outputs(cost)
-    else:
-        # In generation, the decoder predicts a next target word based on
-        # the encoded source sequence and the last generated target word.
-        # The encoded source sequence (encoder's output) must be specified by
-        # StaticInput, which is a read-only memory.
-        # Embedding of the last generated word is automatically gotten by
-        # GeneratedInputs, which is initialized by a start mark, such as <s>,
-        # and must be included in generation.
-        trg_embedding = GeneratedInput(
-            size=target_dict_dim,
-            embedding_name='_target_language_embedding',
-            embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
-        beam_gen = beam_search(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs,
-            bos_id=0,
-            eos_id=1,
-            beam_size=beam_size,
-            max_length=max_length)
-        seqtext_printer_evaluator(
-            input=beam_gen,
-            id_input=data_layer(
-                name="sent_id", size=1),
-            dict_file=trg_dict_path,
-            result_file=gen_trans_file)
-        outputs(beam_gen)
--- a/demo/seqToseq/translation/eval_bleu.sh
+++ b/demo/seqToseq/translation/eval_bleu.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-gen_file=$1
-beam_size=$2
-# find top1 generating result
-top1=$(printf '%s_top1.txt' `basename $gen_file .txt`)
-if [ $beam_size -eq 1 ]; then
-    awk -F "\t" '{sub(" <e>","",$2);sub(" ","",$2);print $2}' $gen_file >$top1
-else
-    awk 'BEGIN{
-        FS="\t";
-        OFS="\t";
-        read_pos = 2} {
-        if (NR == read_pos){
-            sub(" <e>","",$3);
-            sub(" ","",$3);
-            print $3;
-            read_pos += (2 + res_num);
-      }}' res_num=$beam_size $gen_file >$top1
-fi 
-# evalute bleu value
-bleu_script=multi-bleu.perl
-standard_res=../data/wmt14/gen/ntst14.trg
-bleu_res=`perl $bleu_script $standard_res <$top1`
-echo $bleu_res
-rm $top1
--- a/demo/seqToseq/translation/gen.conf
+++ b/demo/seqToseq/translation/gen.conf
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-sys.path.append("..")
-from seqToseq_net import *
-# whether this config is used for generating
-is_generating = True
-### Data Definiation
-gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", 
-                           is_generating = is_generating,
-                           gen_result = "./translation/gen_result")
-### Algorithm Configuration
-settings(
-      learning_method = AdamOptimizer(),
-      batch_size = 1,
-      learning_rate = 0)
-### Network Architecture
-gru_encoder_decoder(gen_conf, is_generating)
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-cd ..
-paddle train \
-    --job=test \
-    --config='translation/gen.conf' \
-    --save_dir='data/wmt14_model' \
-    --use_gpu=false \
-    --num_passes=13 \
-    --test_pass=12 \
-    --trainer_count=1 \
-    2>&1 | tee 'translation/gen.log'
-paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
--- a/demo/seqToseq/translation/moses_bleu.sh
+++ b/demo/seqToseq/translation/moses_bleu.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-echo "Downloading multi-bleu.perl"
-wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate
--- a/demo/seqToseq/translation/train.conf
+++ b/demo/seqToseq/translation/train.conf
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-sys.path.append("..")
-from seqToseq_net import *
-# whether this config is used for generating
-is_generating = False
-### Data Definiation
-data_dir  = "./data/pre-wmt14"
-train_conf = seq_to_seq_data(data_dir = data_dir, 
-                             is_generating = is_generating)
-### Algorithm Configuration
-settings(
-    learning_method = AdamOptimizer(),
-    batch_size = 50,
-    learning_rate = 5e-4)
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating)
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-cd ..
-paddle train \
--config='translation/train.conf' \
--save_dir='translation/model' \
--use_gpu=false \
--num_passes=16 \
--show_parameter_stats_period=100 \
--trainer_count=4 \
--log_period=10 \
--dot_period=5 \
-2>&1 | tee 'translation/train.log'
-paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
--- a/demo/word2vec/api_train_v2.py
+++ b/demo/word2vec/api_train_v2.py
-import gzip
-import math
-import paddle.v2 as paddle
-embsize = 32
-hiddensize = 256
-N = 5
-def wordemb(inlayer):
-    wordemb = paddle.layer.embedding(
-        input=inlayer,
-        size=embsize,
-        param_attr=paddle.attr.Param(
-            name="_proj",
-            initial_std=0.001,
-            learning_rate=1,
-            l2_rate=0,
-            sparse_update=True))
-    return wordemb
-def main():
-    # for local training
-    cluster_train = False
-    if not cluster_train:
-        paddle.init(use_gpu=False, trainer_count=1)
-    else:
-        paddle.init(
-            use_gpu=False,
-            trainer_count=2,
-            port=7164,
-            ports_num=1,
-            ports_num_for_sparse=1,
-            num_gradient_servers=1)
-    word_dict = paddle.dataset.imikolov.build_dict()
-    dict_size = len(word_dict)
-    firstword = paddle.layer.data(
-        name="firstw", type=paddle.data_type.integer_value(dict_size))
-    secondword = paddle.layer.data(
-        name="secondw", type=paddle.data_type.integer_value(dict_size))
-    thirdword = paddle.layer.data(
-        name="thirdw", type=paddle.data_type.integer_value(dict_size))
-    fourthword = paddle.layer.data(
-        name="fourthw", type=paddle.data_type.integer_value(dict_size))
-    nextword = paddle.layer.data(
-        name="fifthw", type=paddle.data_type.integer_value(dict_size))
-    Efirst = wordemb(firstword)
-    Esecond = wordemb(secondword)
-    Ethird = wordemb(thirdword)
-    Efourth = wordemb(fourthword)
-    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
-    hidden1 = paddle.layer.fc(input=contextemb,
-                              size=hiddensize,
-                              act=paddle.activation.Sigmoid(),
-                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
-                              bias_attr=paddle.attr.Param(learning_rate=2),
-                              param_attr=paddle.attr.Param(
-                                  initial_std=1. / math.sqrt(embsize * 8),
-                                  learning_rate=1))
-    predictword = paddle.layer.fc(input=hidden1,
-                                  size=dict_size,
-                                  bias_attr=paddle.attr.Param(learning_rate=2),
-                                  act=paddle.activation.Softmax())
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
-                               'w') as f:
-                    trainer.save_parameter_to_tar(f)
-                result = trainer.test(
-                    paddle.batch(
-                        paddle.dataset.imikolov.test(word_dict, N), 32))
-                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics,
-                    result.metrics)
-    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
-    parameters = paddle.parameters.create(cost)
-    adagrad = paddle.optimizer.AdaGrad(
-        learning_rate=3e-3,
-        regularization=paddle.optimizer.L2Regularization(8e-4))
-    trainer = paddle.trainer.SGD(cost,
-                                 parameters,
-                                 adagrad,
-                                 is_local=not cluster_train)
-    trainer.train(
-        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
-        num_passes=30,
-        event_handler=event_handler)
-if __name__ == '__main__':
-    main()
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -59,6 +59,11 @@ context_projection
 ..  autoclass:: paddle.v2.layer.context_projection
    :noindex:
+row_conv
+--------
+..  autoclass:: paddle.v2.layer.row_conv
+    :noindex:
 Image Pooling Layer
 ===================
@@ -346,6 +351,12 @@ sampling_id
 ..  autoclass:: paddle.v2.layer.sampling_id
    :noindex:
+multiplex
+---------
+..  autoclass:: paddle.v2.layer.multiplex
+    :noindex:
 Slicing and Joining Layers
 ==========================
@@ -441,3 +452,19 @@ eos
 ---
 ..  autoclass:: paddle.v2.layer.eos
    :noindex:
+Miscs
+=====
+dropout
+--------------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+Activation with learnable parameter
+===================================
+prelu
+--------
+..  autoclass:: paddle.v2.layer.prelu
+    :noindex:
--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -125,11 +125,3 @@ simple_attention
    :members: simple_attention
    :noindex:
-Miscs
-=====
-dropout_layer
--------------
-..  automodule:: paddle.v2.networks
-    :members: dropout_layer
-    :noindex:
--- a/doc/design/cluster_train/pserver_client.md
+++ b/doc/design/cluster_train/pserver_client.md
@@ -74,14 +74,25 @@ typedef enum {
 typedef struct {
  char*               name;
  paddle_element_type element_type;
-  void*               content;
+  unsigned char*      content;
  int                 content_len;
 } paddle_parameter, paddle_gradient;
-typedef struct paddle_pserver_client paddle_pserver_client;
+typedef int paddle_pserver_client;
-paddle_pserver_client* paddle_new_pserver_client();
+/**
-void paddle_pserver_client_release(paddle_pserver_client* client);
+ * @brief creates a pserver client that talks to etcd for coordination.
+ */
+paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr);
+/**
+ * @brief creates a pserver client given pserver addresses.
+ *
+ * @param pserver_addrs comma-separated pserver addresses.
+ * @param selected if current pserver client is selected to initialize all parameter servers.
+ */
+paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected);
+void paddle_pserver_client_release(paddle_pserver_client c);
 /**
 * @brief paddle_begin_init_params begins to initialize parameters on
@@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
 * @return 1 if the trainer is selected to initialize parameter
 * servers, otherwise 0.
 */
-int paddle_begin_init_params(paddle_pserver_client* client);
+int paddle_begin_init_params(paddle_pserver_client client);
 /**
 * @brief paddle_init_param initializes the parameter on parameter
@@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client);
 * @paddle_begin_init_param). Or simply exit the program and wait for
 * the cluster management system to restart the trainer.
 */
-int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
+int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
 /**
 * @brief paddle_finish_init_params tells parameter servers client has
@@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con
 * @paddle_begin_init_param). Or simply exit the program and wait for
 * the cluster management system to restart the trainer.
 */
-int paddle_finish_init_params(paddle_pserver_client* client);
+int paddle_finish_init_params(paddle_pserver_client client);
 /**
 * @brief paddle_send_grads sends gradients to parameter servers for
@@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client);
 * @param learning_rate the learning rate for the gradients.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
+int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len);
 /**
 * @brief paddle_get_params gets parameters from parameter servers.
@@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad
 * paddle_get_params will block until parameters are initialized on
 * the parameter servers.
 *
- * @param names the array of names of the parameters to get.
+ * @param dst the destination array of parameter pointers to save to.
- * @param dst the destination array of parameters to save to.
+ * The parameter pointer must be pre-popullated with required parameter name,
+ * and the content of parameter must be pre-allocated of the size of required
+ * parameter on pserver.
 * @param len the length of the names array and the paddle_parameter
 * array.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
 /**
 * @brief paddle_save_model indicates parameters to save the parameter
@@ -154,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_
 * @param path the path to save parameters.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_save_model(paddle_pserver_client* client, const char* path);
+int paddle_save_model(paddle_pserver_client client, const char* path);
 ```
--- a/doc/design/cluster_train/remote_parameter_updater.md
+++ b/doc/design/cluster_train/remote_parameter_updater.md
+# Design Doc: Remote Parameter Updater for Cluster Train
+For an overview of distribute training, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that will use parameter server cclient [The Client Library of Parameter Server Design Doc](pserver_client.md) to manage and update parameters.
+## Parameter Updater
+Parameter Updater is used by trainer to manage and update parameter, there are mainly two kind of parameter updater: local and remote, since this design is for cluster train, we will only discuss remote parameter updater here.
+### Remote Parameter Updater
+Remote Parameter Updater manage parameters through remote parameter server with the client that communicate with pserver([The Client Library of Parameter Server Design Doc](pserver_client.md))
+In PaddlePaddle Python V2 API, trainer is implemented in python, and the trainer will hold a instance of parameter updater and call it's functions directly. In this design, we will also expose the api of RemoteParameterUpdater to python with swig.
+#### Sparse Remote Parameter Updater
+Since we will only implement dense parameter management new, the mechanism for sparse parameter will be discussed in next stage.
+### Interface Design
+TBD
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -22,6 +22,7 @@ To compile the source code, your computer must be equipped with the following de
 - **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
 - **Python**: only support Python 2.7
+- **Go**
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
@@ -107,6 +108,18 @@ As a simple example, consider the following:
    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
    sudo pip install 'protobuf==3.1.0.post1'
+    # Install Go
+    # You can follow https://golang.org/doc/install for a detailed explanation.
+    wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C $HOME -xzf go.tgz && \
+    mkdir $HOME/gopath && \
+    rm go.tgz
+    # Setup environment variables
+    export GOROOT=$HOME/go
+    export GOPATH=$HOME/gopath
+    export PATH=$PATH:$GOROOT/bin
    # install cmake 3.4
    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \

--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -7,4 +7,4 @@
  build_and_install/index_cn.rst
  concepts/use_concepts_cn.rst
- `深度学习入门课程 <http://book.paddlepaddle.org/>`_
+- `深度学习入门课程 <http://book.paddlepaddle.org/index.cn.html>`_
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -6,4 +6,4 @@ GET STARTED
  build_and_install/index_en.rst
- `Deep Learning 101 <http://book.paddlepaddle.org/index.en.html>`_
+- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,6 +4,7 @@ RNN相关模型
 ..  toctree::
  :maxdepth: 1
+  rnn_config_cn.rst
  recurrent_group_cn.md
  hierarchical_layer_cn.rst
  hrnn_rnn_api_compare_cn.rst
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
 RNN Models
 ==========
+..  toctree::
+  :maxdepth: 1
+  rnn_config_en.rst
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -5,36 +5,13 @@ RNN配置
 中配置循环神经网络（RNN）。PaddlePaddle
 高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
-  准备用来学习循环神经网络的序列数据。
 -  配置循环神经网络架构。
 -  使用学习完成的循环神经网络模型生成序列。
 我们将使用 vanilla 循环神经网络和 sequence to sequence
 模型来指导你完成这些步骤。sequence to sequence
-模型的代码可以在\ ``demo / seqToseq``\ 找到。
+模型的代码可以在 `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ 找到。
+wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ 。
-准备序列数据
------------
-PaddlePaddle
-不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
-它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：
-.. code:: python
-    settings.input_types = [
-      integer_value_sequence(len(settings.src_dict)),
-      integer_value_sequence(len(settings.trg_dict)),
-      integer_value_sequence(len(settings.trg_dict))]
-在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：
-.. code:: python
-    yield src_ids, trg_ids, trg_ids_next
-有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
-``demo/seqToseq/dataprovider.py``\ 。
 配置循环神经网络架构
 --------------------
@@ -85,16 +62,16 @@ vanilla
                   act=None,
                   rnn_layer_attr=None):
        def __rnn_step__(ipt):
-           out_mem = memory(name=name, size=size)
+           out_mem = paddle.layer.memory(name=name, size=size)
-           rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
-                                          full_matrix_projection(out_mem)],
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
                                        name = name,
                                        bias_attr = rnn_bias_attr,
                                        act = act,
                                        layer_attr = rnn_layer_attr,
                                        size = size)
           return rnn_out
-        return recurrent_group(name='%s_recurrent_group' % name,
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
                                            step=__rnn_step__,
                                            reverse=reverse,
                                            input=input)
@@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention
 .. code:: python
    # 定义源语句的数据层
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    # 计算每个词的词向量
-    src_embedding = embedding_layer(
+    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    # 应用前向循环神经网络
-    src_forward = grumemory(input=src_embedding, size=encoder_size)
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
    # 应用反向递归神经网络（reverse=True表示反向循环神经网络）
-    src_backward = grumemory(input=src_embedding,
+    src_backward = paddle.networks.simple_gru(
-                              size=encoder_size,
+        input=src_embedding, size=encoder_size, reverse=True)
-                              reverse=True)
    # 将循环神经网络的前向和反向部分混合在一起
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
    # 投射编码向量到 decoder_size
-    encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
+    encoded_proj = paddle.layer.mixed(
-                               size = decoder_size)
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
    # 计算反向RNN的第一个实例
-    backward_first = first_seq(input=src_backward)
+    backward_first = paddle.layer.first_seq(input=src_backward)
    # 投射反向RNN的第一个实例到 decoder size
-    decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义：
 .. code:: python
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    trg_embedding = embedding_layer(
+    group_inputs = [group_input1, group_input2]
-        input=data_layer(name='target_language_word',
+    trg_embedding = paddle.layer.embedding(
-                         size=target_dict_dim),
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
            size=word_vector_dim,
-        param_attr=ParamAttr(name='_target_language_embedding'))
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
    group_inputs.append(trg_embedding)
    # 对于配备有注意力机制的解码器，在训练中，
@@ -185,7 +171,8 @@ Sequence to Sequence Model with Attention
    # StaticInput 意味着不同时间步的输入都是相同的值，
    # 否则它以一个序列输入，不同时间步的输入是不同的。
    # 所有输入序列应该有相同的长度。
-    decoder = recurrent_group(name=decoder_group_name,
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs)
@@ -198,27 +185,32 @@ attention，门控循环单元单步函数和输出函数：
        # 定义解码器的Memory
        # Memory的输出定义在 gru_step 内
        # 注意 gru_step 应该与它的Memory名字相同
-        decoder_mem = memory(name='gru_decoder',
+        decoder_mem = paddle.layer.memory(
-                             size=decoder_size,
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-                             boot_layer=decoder_boot)
        # 计算 attention 加权编码向量
-        context = simple_attention(encoded_sequence=enc_vec,
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)
        # 混合当前词向量和attention加权编码向量
-        decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
+         decoder_inputs = paddle.layer.mixed(
-                                               full_matrix_projection(current_word)],
+            size=decoder_size * 3,
-                                     size = decoder_size * 3)
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
        # 定义门控循环单元循环神经网络单步函数
-        gru_step = gru_step_layer(name='gru_decoder',
+         gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
            size=decoder_size)
        # 定义输出函数
-        out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
+         out = paddle.layer.mixed(
            size=target_dict_dim,
            bias_attr=True,
-                          act=SoftmaxActivation())
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
        return out
 生成序列
@@ -238,28 +230,23 @@ attention，门控循环单元单步函数和输出函数：
   -  ``beam_size``: beam search 算法中的beam大小。
   -  ``max_length``: 生成序列的最大长度。
-  使用 ``seqtext_printer_evaluator``
-   根据索引矩阵和字典打印文本。这个函数需要设置：
-   -  ``id_input``: 数据的整数ID，用于标识生成的文件中的相应输出。
-   -  ``dict_file``: 用于将词ID转换为词的字典文件。
-   -  ``result_file``: 生成结果文件的路径。
 代码如下：
 .. code:: python
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
    # 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
    # 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
    # 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
-    trg_embedding = GeneratedInput(
+    trg_embedding = paddle.layer.GeneratedInput(
            size=target_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)
-    beam_gen = beam_search(name=decoder_group_name,
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs,
            bos_id=0, # Beginnning token.
@@ -267,12 +254,8 @@ attention，门控循环单元单步函数和输出函数：
            beam_size=beam_size,
            max_length=max_length)
-    seqtext_printer_evaluator(input=beam_gen,
+    return beam_gen
-                              id_input=data_layer(name="sent_id", size=1),
-                              dict_file=trg_dict_path,
-                              result_file=gen_trans_file)
-    outputs(beam_gen)
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ 了解更多详细信息。
-完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
+完整的配置文件在 `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ 。
--- a/doc/howto/deep_model/rnn/rnn_config_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -3,34 +3,11 @@ RNN Configuration
 This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to:
- prepare sequence data for learning recurrent neural networks.
 - configure recurrent neural network architecture.
 - generate sequence with learned recurrent neural network models.
-We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`.
+We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ .
+And the data preparation of this model can be found at `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ 
-=====================
-Prepare Sequence Data
-=====================
-PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`:
-.. code-block:: python
-    settings.input_types = [
-      integer_value_sequence(len(settings.src_dict)),
-      integer_value_sequence(len(settings.trg_dict)),
-      integer_value_sequence(len(settings.trg_dict))]
-Then at the :code:`process` function, each :code:`yield` function will return three integer lists. Each integer list is treated as a sequence of integers:
-.. code-block:: python
-    yield src_ids, trg_ids, trg_ids_next
-For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
 ===============================================
 Configure Recurrent Neural Network Architecture
@@ -75,16 +52,16 @@ Its **output function** simply takes :math:`x_t` as the output.
                   act=None,
                   rnn_layer_attr=None):
        def __rnn_step__(ipt):
-           out_mem = memory(name=name, size=size)
+           out_mem = paddle.layer.memory(name=name, size=size)
-           rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
-                                          full_matrix_projection(out_mem)],
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
                                        name = name,
                                        bias_attr = rnn_bias_attr,
                                        act = act,
                                        layer_attr = rnn_layer_attr,
                                        size = size)
           return rnn_out
-        return recurrent_group(name='%s_recurrent_group' % name,
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
                                            step=__rnn_step__,
                                            reverse=reverse,
                                            input=input)
@@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
 .. code-block:: python
    # Define the data layer of the source sentence.
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    # Calculate the word embedding of each word.
-    src_embedding = embedding_layer(
+    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    # Apply forward recurrent neural network.
-    src_forward = grumemory(input=src_embedding, size=encoder_size)
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
    # Apply backward recurrent neural network. reverse=True means backward recurrent neural network.
-    src_backward = grumemory(input=src_embedding,
+    src_backward = paddle.networks.simple_gru(
-                              size=encoder_size,
+        input=src_embedding, size=encoder_size, reverse=True)
-                              reverse=True)
    # Mix the forward and backward parts of the recurrent neural network together.
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
    # Project encoding vector to decoder_size.
-    encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
+    encoded_proj = paddle.layer.mixed(
-                               size = decoder_size)
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
    # Compute the first instance of the backward RNN.
-    backward_first = first_seq(input=src_backward)
+    backward_first = paddle.layer.first_seq(input=src_backward)
    # Project the first instance of backward RNN to decoder size.
-    decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
 The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
 .. code-block:: python
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    trg_embedding = embedding_layer(
+    group_inputs = [group_input1, group_input2]
-        input=data_layer(name='target_language_word',
+    trg_embedding = paddle.layer.embedding(
-                         size=target_dict_dim),
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
            size=word_vector_dim,
-        param_attr=ParamAttr(name='_target_language_embedding'))
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
    group_inputs.append(trg_embedding)
    # For decoder equipped with attention mechanism, in training,
@@ -158,7 +144,8 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
    # StaticInput means the same value is utilized at different time steps.
    # Otherwise, it is a sequence input. Inputs at different time steps are different.
    # All sequence inputs should have the same length.
-    decoder = recurrent_group(name=decoder_group_name,
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs)
@@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th
        # Defines the memory of the decoder.
        # The output of this memory is defined in gru_step.
        # Notice that the name of gru_step should be the same as the name of this memory.
-        decoder_mem = memory(name='gru_decoder',
+        decoder_mem = paddle.layer.memory(
-                             size=decoder_size,
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-                             boot_layer=decoder_boot)
        # Compute attention weighted encoder vector.
-        context = simple_attention(encoded_sequence=enc_vec,
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)
        # Mix the current word embedding and the attention weighted encoder vector.
-        decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
+        decoder_inputs = paddle.layer.mixed(
-                                               full_matrix_projection(current_word)],
+            size=decoder_size * 3,
-                                     size = decoder_size * 3)
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
        # Define Gated recurrent unit recurrent neural network step function.
-        gru_step = gru_step_layer(name='gru_decoder',
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
            size=decoder_size)
        # Defines the output function.
-        out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
+        out = paddle.layer.mixed(
            size=target_dict_dim,
            bias_attr=True,
-                          act=SoftmaxActivation())
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
        return out
@@ -208,30 +200,26 @@ After training the model, we can use it to generate sequences. A common practice
  - :code:`beam_size`: the beam size used in beam search.
  - :code:`max_length`: the maximum length of the generated sentences.
-* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set:
-  - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
-  - :code:`dict_file`: the dictionary file for converting word id to word.
-  - :code:`result_file`: the path of the generation result file.
 The code is listed below:
 .. code-block:: python
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
    # In generation, decoder predicts a next target word based on
    # the encoded source sequence and the last generated target word.
    # The encoded source sequence (encoder's output) must be specified by
    # StaticInput which is a read-only memory.
    # Here, GeneratedInputs automatically fetchs the last generated word,
    # which is initialized by a start mark, such as <s>.
-    trg_embedding = GeneratedInput(
+    trg_embedding = paddle.layer.GeneratedInput(
            size=target_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)
-    beam_gen = beam_search(name=decoder_group_name,
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs,
            bos_id=0, # Beginnning token.
@@ -239,13 +227,9 @@ The code is listed below:
            beam_size=beam_size,
            max_length=max_length)
-    seqtext_printer_evaluator(input=beam_gen,
+    return beam_gen
-                              id_input=data_layer(name="sent_id", size=1),
-                              dict_file=trg_dict_path,
-                              result_file=gen_trans_file)
-    outputs(beam_gen)
-Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
+Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ for more details.
-The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
+The full configuration file is located at `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ .
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -84,7 +84,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker build -t paddle:dev .
 ```
-随后可以用这个开发镜像开build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
+随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
 ```bash
 ➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev

--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -4,9 +4,9 @@ We sincerely appreciate your contributions. You can use fork and pull request
 workflow to merge your code.
 ## Code Requirements
- Your code must be fully documented by
+- Your code comments must be fully documented by
-  [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
+  [Doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
- Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler
+- Make sure the compiler option `WITH_STYLE_CHECK` is on and the compiler
  passes the code style check.
 - All code must have unit test.
 - Pass all unit tests.
@@ -20,32 +20,25 @@ It's just that simple.
 ## Clone
-Paddle is currently using [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
+Clone remote repository.
-The **develop** is the main branch, and other user's branches are feature branches.
-Once you've created a fork, you can use your favorite git client to clone your
+```bash
-repo or just head straight to the command line:
+➜  git clone https://github.com/USERNAME/Paddle
+➜  cd Paddle
-```shell
-# Clone your fork to your local machine
-git clone --branch develop https://github.com/USERNAME/Paddle.git
-```
-If your repository doesn't contain **develop** branch, just create it by your own.
-```shell
-git clone https://github.com/USERNAME/Paddle.git Paddle
-cd Paddle
-git checkout -b develop  # create develop branch.
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
-git pull upstream develop  # update to upstream
 ```
-Then you can start to develop by making a local developement branch
+## Create a local branch
+Paddle is currently using [Git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
-```shell
+All feature and bug fix development work should be done on a new branch, generally create new branch from `develop` branch .
-git checkout -b MY_COOL_STUFF_BRANCH
+```bash
+➜  git checkout -b my-cool-stuff
 ```
+Before the checkout, you need to keep the current branch directory clean, otherwise the untracked file will be brought to the new branch, which can be inspected by `git status`.
 ## Using `pre-commit` hook
 Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git
@@ -58,89 +51,169 @@ To use [pre-commit](http://pre-commit.com/), you should install it by
 `pip install pre-commit`, and currently, Paddle uses `clang-format` to format
 c/cpp sources. Please make sure clang-format 3.8+ installed.
-Then just run `pre-commit install` in your Paddle clone directory. When you
+Install and run it as follow:
-commit your code, the pre-commit hook will check the local code if there is
+```bash
+➜  pip install pre-commit
+➜  pre-commit install
+```
+When you commit your code, the pre-commit hook will check the local code if there is
 anything not suitable to commit, and so on.
+## Start to develop
+In this tutorial, I delete a line in README.md and created a new file.
+We can use `git status` to inspect the changes of current directory, `git diff` to see difference.
+```bash
+➜  git status
+On branch test
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git checkout -- <file>..." to discard changes in working directory)
+	modified:   README.md
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+	test
+no changes added to commit (use "git add" and/or "git commit -a")
+```
+## Build and Test
+We package PaddlePaddle's compile environment into a Docker image, called the develop image named `paddle:dev`, it contains all compiling tools that PaddlePaddle needs. 
+If you want to build the develop image, just run:
+```bash
+➜  docker build -t paddle:dev .
+```
+Then we can use the develop image to build PaddlePaddle source. For example:
+```bash
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+```
+The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+Then we can generate the production image by copying the compiled PaddlePaddle program into the image by
+```bash
+➜  docker build -t paddle:prod -f build/Dockerfile .
+```
+Run unit test finally:
+```bash
+➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+```
+For more details, you can read [this doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
 ## Commit
-Commit your changes by following command lines:
+Next we cancel the changes to the README.md file and then commit our changes by following command lines:
+```bash
+➜  git checkout -- README.md
+➜  git status
+On branch test
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+	test
+nothing added to commit but untracked files present (use "git add" to track)
+➜  git add test
+```
-```shell
+We should write a description of each commit by `git commit` to allow others to know
-# show the working tree status
+the changes in these files.
-git status
-# add modified files
+```bash
-git add xx
+➜  git commit
-env EDITOR=vim git commit  # You can write your comments by vim/nano/emacs.
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 233
 ```
-The first line of commit infomation is the title. The second and later lines
-are the details if any.
 ## Keeping Fork Up to Date
 Before pull your request, you should sync your code from the latest PaddlePaddle.
 To do this, you'll need to add a remote at first:
-```shell
+```bash
-# see the current configured remote repository
+➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
-git remote -v
+➜  git remote
-# add upstream repository
+origin
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git
+upstream
-# verify the new upstream
-git remote -v
 ```
 Update your fork with the latest upstream changes:
-```shell
+```bash
-git pull --rebase upstream develop
+➜  git fetch upstream
+➜  git pull upstream develop
 ```
-If there are no unique commits locally, git will simply perform a fast-forward.
-However, if you have been making changes (in the vast majority of cases you
-probably shouldn't be), you may have to deal with conflicts.
 Now, your local master branch is up-to-date with everything modified upstream.
 ## Push to GitHub
-```shell
+```bash
 # push to your repository in Github
-git push -u origin MY_COOL_STUFF_BRANCH  # create remote branch MY_COOL_STUFF_BRANCH to origin.
+➜  git push origin my-cool-stuff
 ```
-## Pull Request
+## Create an issue and a Pull Request
+Create an Issue to describe the problem and record its number.
 Go to the page for your fork on GitHub, select your development branch,
-and click the **pull request button**.
+and click the `New pull request`.
-## Update your pull request with the lastest version
+<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
-During the code review, your pull request may become stale because new commits in
+Then select the target branch:
-baidu/Paddle. GitHub allows autmotic update if there is no conflict. You can do this
-by clicking the "Update Branch" button in your pull request page. However, in the case
+<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
-of conflict, you need to do the update manually. You need to do the following on
-your local repository:
+We can add `resolve #Issue number` in PR description to close the issue automatically after the PR is merge. More details in <https://help.github.com/articles/closing-issues-via-commit-messages/>.
-```shell
-git checkout MY_COOL_STUFF_BRANCH
+Then wait for review, if there need to modify, refer to the above steps to update the corresponding origin branch.
-git pull upstream develop
-# You may need to resolve the conflict according to the git prompt.
+## Delete origin branch
-# Make and test your code.
-git push origin MY_COOL_STUFF_BRANCH
+After the PR is merge into the main repository, we can delete the remote branch on the PR page.
+<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
+Or just run:
+```bash
+➜  git push origin :my-cool-stuff
 ```
-Now your Pull Request is updated with the latest version.
-## Revise your pull request
+## Delete local branch
-When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull requrest and the old pull request.
+Finally, we delete local branch:
-The possible commands are
+```bash
+➜  git checkout develop 
-```shell
+# delete my-cool-stuff branch
-git checkout MY_COOL_STUFF_BRANCH
+➜  git branch -D my-cool-stuff
-git pull upstream develop   # update local to newest code base.
-# May be some conflicts will occured.
-# And develop your cool stuff
-env EDITOR=vim git commit  # add your revise log
-git push origin MY_COOL_STUFF_BRANCH
 ```
--- a/go/cmake/golang.cmake
+++ b/go/cmake/golang.cmake
@@ -17,7 +17,7 @@ function(GO_LIBRARY NAME BUILD_TYPE)
  endif()
  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
-  file(RELATIVE_PATH rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+  file(RELATIVE_PATH rel ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
  # find Paddle directory.
  get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
@@ -26,25 +26,23 @@ function(GO_LIBRARY NAME BUILD_TYPE)
  # automatically get all dependencies specified in the source code
  # for given target.
-  add_custom_target(goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
+  add_custom_target(${NAME}_goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
  # make a symlink that references Paddle inside $GOPATH, so go get
  # will use the local changes in Paddle rather than checkout Paddle
  # in github.
-  add_custom_target(copyPaddle
+  add_custom_target(${NAME}_copyPaddle
-    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH})
+    COMMAND rm -rf ${PADDLE_IN_GOPATH}/Paddle
-  add_dependencies(goGet copyPaddle)
+    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH}/Paddle)
+  add_dependencies(${NAME}_goGet ${NAME}_copyPaddle)
  add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
     -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
    ${CMAKE_GO_FLAGS} ${GO_SOURCE}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
-  add_dependencies(${NAME} goGet)
+  add_dependencies(${NAME} ${NAME}_goGet)
-  if(NOT BUILD_TYPE STREQUAL "STATIC")
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME} DESTINATION bin)
-  endif()
 endfunction(GO_LIBRARY)
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
 package main
 import (
-	"fmt"
 	"net"
 	"net/http"
 	"net/rpc"
-	"os"
-	"path/filepath"
 	"strconv"
-	"strings"
 	"time"
 	"github.com/namsral/flag"
 	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/recordio"
 )
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
-	dataset := flag.String("training_dataset", "", "dataset: comma separated path to RecordIO paths, supports golb patterns.")
 	faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
 	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
 	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
 	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
 	flag.Parse()
-	if *dataset == "" {
-		panic("no dataset specified.")
-	}
 	if *faultTolerance {
 		panic("fault tolernance not implemented.")
-	}
-	var chunks []master.Chunk
-	var paths []string
-	ss := strings.Split(*dataset, ",")
-	fmt.Println(ss)
-	for _, s := range ss {
-		match, err := filepath.Glob(s)
-		if err != nil {
-			panic(err)
-		}
-		paths = append(paths, match...)
-	}
-	if len(paths) == 0 {
-		panic("no valid datset specified.")
-	}
-	idx := 0
-	for _, path := range paths {
-		f, err := os.Open(path)
-		if err != nil {
-			panic(err)
-		}
-		index, err := recordio.LoadIndex(f)
-		if err != nil {
-			panic(err)
-		}
-		f.Close()
-		count := index.NumChunks()
-		for i := 0; i < count; i++ {
-			chunk := master.Chunk{
-				Idx:   idx,
-				Path:  path,
-				Index: *index.ChunkIndex(i),
-			}
-			chunks = append(chunks, chunk)
-		}
 	}
-	s := master.NewService(chunks, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
+	s := master.NewService(*chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
 	err := rpc.Register(s)
 	if err != nil {
 		panic(err)

--- a/go/pserver/internal/connection/conn.go
+++ b/go/pserver/internal/connection/conn.go
@@ -4,6 +4,8 @@ import (
 	"errors"
 	"net/rpc"
 	"sync"
+	log "github.com/sirupsen/logrus"
 )
 // TODO(helin): add TCP re-connect logic
@@ -21,6 +23,18 @@ func New() *Conn {
 	return c
 }
+// Close closes the connection.
+func (c *Conn) Close() error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if c.client == nil {
+		return nil
+	}
+	return c.client.Close()
+}
 // Connect connects the connection to a address.
 func (c *Conn) Connect(addr string) error {
 	c.mu.Lock()
@@ -50,12 +64,20 @@ func (c *Conn) Connect(addr string) error {
 			c.waitConn = nil
 		}
 	} else {
+		err := client.Close()
+		if err != nil {
+			log.Errorln(err)
+		}
 		return errors.New("client already set from a concurrent goroutine")
 	}
 	return nil
 }
+// TODO(helin): refactor Call to be able to perform given retry
+// policy.
 // Call make a RPC call.
 //
 // Call will be blocked until the connection to remote RPC service

--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
+cmake_minimum_required(VERSION 3.0)
+get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
+project(cxx_go C Go)
+include(golang)
+include(flags)
+set(MASTER_LIB_NAME "paddle_master")
+go_library(${MASTER_LIB_NAME} SHARED)
+if(PROJ_ROOT)
+  add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so
+    COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.h
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.so ${PROJ_ROOT}/python/paddle/v2/master/
+    DEPENDS ${MASTER_LIB_NAME})
+  add_custom_target(paddle_master_shared ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so)
+endif(PROJ_ROOT)
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
+package main
+/*
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#define PADDLE_MASTER_OK    0
+#define PADDLE_MASTER_ERROR -1
+typedef int paddle_master_client;
+*/
+import "C"
+import (
+	"sync"
+	"unsafe"
+	"github.com/PaddlePaddle/Paddle/go/master"
+	log "github.com/sirupsen/logrus"
+)
+var nullPtr = unsafe.Pointer(uintptr(0))
+var mu sync.Mutex
+var handleMap = make(map[C.paddle_master_client]*master.Client)
+var curHandle C.paddle_master_client
+func add(c *master.Client) C.paddle_master_client {
+	mu.Lock()
+	defer mu.Unlock()
+	client := curHandle
+	curHandle++
+	handleMap[client] = c
+	return client
+}
+func get(client C.paddle_master_client) *master.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	return handleMap[client]
+}
+func remove(client C.paddle_master_client) *master.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	h := handleMap[client]
+	delete(handleMap, client)
+	return h
+}
+type addresser string
+func (a addresser) Address() string {
+	return string(a)
+}
+//export paddle_new_master_client
+func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
+	a := C.GoString(addr)
+	c := master.NewClient(addresser(a), bufSize)
+	return add(c)
+}
+//export paddle_release_master_client
+func paddle_release_master_client(client C.paddle_master_client) {
+	remove(client)
+}
+//export paddle_set_dataset
+func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
+	c := get(client)
+	var paths []string
+	for i := 0; i < int(size); i++ {
+		ptr := (**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(path)) + uintptr(i)*unsafe.Sizeof(*path)))
+		str := C.GoString(*ptr)
+		paths = append(paths, str)
+	}
+	err := c.SetDataset(paths)
+	if err != nil {
+		log.Errorln(err)
+		return C.PADDLE_MASTER_ERROR
+	}
+	return C.PADDLE_MASTER_OK
+}
+//export paddle_next_record
+func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
+	c := get(client)
+	r := c.NextRecord()
+	if len(r) == 0 {
+		*record = (*C.uchar)(nullPtr)
+		return 0
+	}
+	size := C.size_t(len(r))
+	*record = (*C.uchar)(C.malloc(size))
+	C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&r[0]), size)
+	return C.int(size)
+}
+//export mem_free
+func mem_free(p unsafe.Pointer) {
+	// "free" may be a better name for this function, but doing so
+	// will cause calling any function of this library from Python
+	// ctypes hanging.
+	C.free(p)
+}
+func main() {}
--- a/go/master/client.go
+++ b/go/master/client.go
+package master
+import (
+	"os"
+	"time"
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/recordio"
+	log "github.com/sirupsen/logrus"
+)
+// Addresser provide the address of the master server.
+type Addresser interface {
+	Address() string
+}
+// Client is the client of the master server.
+type Client struct {
+	conn *connection.Conn
+	ch   chan []byte
+}
+// NewClient creates a new Client.
+//
+// bufSize is the record buffer size. NextRecord will read from this
+// buffer.
+func NewClient(addr Addresser, bufSize int) *Client {
+	c := &Client{}
+	c.conn = connection.New()
+	c.ch = make(chan []byte, bufSize)
+	go c.monitorMaster(addr)
+	go c.getRecords()
+	return c
+}
+func (c *Client) getRecords() {
+	for {
+		t, err := c.getTask()
+		if err != nil {
+			// TODO(helin): wait before move on with next
+			// getTask call.
+			log.Errorln(err)
+			continue
+		}
+		for _, chunk := range t.Chunks {
+			f, err := os.Open(chunk.Path)
+			if err != nil {
+				log.Errorln(err)
+				continue
+			}
+			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
+			for s.Scan() {
+				c.ch <- s.Record()
+			}
+			if s.Err() != nil {
+				log.Errorln(err, chunk.Path)
+			}
+			err = f.Close()
+			if err != nil {
+				log.Errorln(err)
+			}
+		}
+		// We treat a task as finished whenever the last data
+		// instance of the task is read. This is not exactly
+		// correct, but a reasonable approximation.
+		c.taskFinished(t.ID)
+	}
+}
+func (c *Client) monitorMaster(addr Addresser) {
+	lastMaster := ""
+	monitor := func() {
+		// get the lastest address of the master server,
+		// connect to the new address once address changed.
+		curMaster := addr.Address()
+		if curMaster != lastMaster {
+			if curMaster == "" {
+				err := c.conn.Close()
+				if err != nil {
+					log.Errorln(err)
+				}
+			} else {
+				err := c.conn.Connect(curMaster)
+				if err != nil {
+					log.Errorln(err)
+					// connect to addr failed, set
+					// to last known addr in order
+					// to retry next time.
+					curMaster = lastMaster
+				}
+			}
+		}
+		lastMaster = curMaster
+	}
+	monitor()
+	ticker := time.NewTicker(10 * time.Second)
+	for _ = range ticker.C {
+		monitor()
+	}
+}
+// SetDataset set dataset for the master server to dispatch.
+//
+// SetDataset can be call multiple times from different nodes. But
+// only the first call will be honored.
+func (c *Client) SetDataset(globPaths []string) error {
+	return c.conn.Call("Service.SetDataset", globPaths, nil)
+}
+// getTask gets a new task from the master server.
+func (c *Client) getTask() (Task, error) {
+	var t Task
+	err := c.conn.Call("Service.GetTask", 0, &t)
+	return t, err
+}
+// TaskFinished tells the master server a task is finished.
+func (c *Client) taskFinished(taskID int) error {
+	return c.conn.Call("Service.TaskFinished", taskID, nil)
+}
+// NextRecord returns next record in the dataset.
+//
+// NextRecord will block until the next record is available. It is
+// thread-safe.
+func (c *Client) NextRecord() []byte {
+	return <-c.ch
+}
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
+package master
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+	log "github.com/sirupsen/logrus"
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/recordio"
+)
+const (
+	totalTask    = 20
+	chunkPerTask = 10
+)
+func init() {
+	log.SetLevel(log.ErrorLevel)
+}
+type TestAddresser string
+func (a TestAddresser) Address() string {
+	return string(a)
+}
+func TestGetFinishTask(t *testing.T) {
+	const path = "/tmp/master_client_test_0"
+	l, err := net.Listen("tcp", ":0")
+	if err != nil {
+		panic(err)
+	}
+	ss := strings.Split(l.Addr().String(), ":")
+	p, err := strconv.Atoi(ss[len(ss)-1])
+	if err != nil {
+		panic(err)
+	}
+	go func(l net.Listener) {
+		s := NewService(chunkPerTask, time.Second, 1)
+		server := rpc.NewServer()
+		err := server.Register(s)
+		if err != nil {
+			panic(err)
+		}
+		mux := http.NewServeMux()
+		mux.Handle(rpc.DefaultRPCPath, server)
+		err = http.Serve(l, mux)
+		if err != nil {
+			panic(err)
+		}
+	}(l)
+	f, err := os.Create(path)
+	if err != nil {
+		panic(err)
+	}
+	for i := 0; i < totalTask*chunkPerTask; i++ {
+		w := recordio.NewWriter(f, -1, -1)
+		w.Write(nil)
+		// call Close to force RecordIO writing a chunk.
+		w.Close()
+	}
+	f.Close()
+	// Manually intialize client to avoid calling c.getRecords()
+	c := &Client{}
+	c.conn = connection.New()
+	go c.monitorMaster(TestAddresser(fmt.Sprintf(":%d", p)))
+	c.SetDataset([]string{path})
+	checkOnePass := func(i int) {
+		var tasks []Task
+		for idx := 0; idx < totalTask; idx++ {
+			task, err := c.getTask()
+			if err != nil {
+				t.Fatalf("Error: %v, pass: %d\n", err, i)
+			}
+			tasks = append(tasks, task)
+		}
+		_, err = c.getTask()
+		if err == nil {
+			t.Fatalf("Should get error, pass: %d\n", i)
+		}
+		err = c.taskFinished(tasks[0].ID)
+		if err != nil {
+			t.Fatalf("Error: %v, pass: %d\n", err, i)
+		}
+		tasks = tasks[1:]
+		task, err := c.getTask()
+		if err != nil {
+			t.Fatal(err)
+		}
+		tasks = append(tasks, task)
+		for _, task := range tasks {
+			err = c.taskFinished(task.ID)
+			if err != nil {
+				t.Fatalf("Error: %v, pass: %d\n", err, i)
+			}
+		}
+	}
+	for i := 0; i < 10; i++ {
+		checkOnePass(i)
+	}
+}
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
+package master_test
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/PaddlePaddle/recordio"
+)
+func TestNextRecord(t *testing.T) {
+	const (
+		path  = "/tmp/master_client_TestFull"
+		total = 50
+	)
+	l, err := net.Listen("tcp", ":0")
+	if err != nil {
+		panic(err)
+	}
+	ss := strings.Split(l.Addr().String(), ":")
+	p, err := strconv.Atoi(ss[len(ss)-1])
+	if err != nil {
+		panic(err)
+	}
+	go func(l net.Listener) {
+		s := master.NewService(10, time.Second, 1)
+		server := rpc.NewServer()
+		err := server.Register(s)
+		if err != nil {
+			panic(err)
+		}
+		mux := http.NewServeMux()
+		mux.Handle(rpc.DefaultRPCPath, server)
+		err = http.Serve(l, mux)
+		if err != nil {
+			panic(err)
+		}
+	}(l)
+	f, err := os.Create(path)
+	if err != nil {
+		panic(err)
+	}
+	w := recordio.NewWriter(f, -1, -1)
+	for i := 0; i < total; i++ {
+		w.Write([]byte{byte(i)})
+	}
+	w.Close()
+	f.Close()
+	c := master.NewClient(master.TestAddresser(fmt.Sprintf(":%d", p)), 10)
+	c.SetDataset([]string{path})
+	for pass := 0; pass < 50; pass++ {
+		received := make(map[byte]bool)
+		for i := 0; i < total; i++ {
+			r := c.NextRecord()
+			if len(r) != 1 {
+				t.Fatal("Length should be 1.", r)
+			}
+			if received[r[0]] {
+				t.Fatal("Received duplicate.", received, r)
+			}
+			received[r[0]] = true
+		}
+	}
+}
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -2,29 +2,25 @@ package master
 import (
 	"errors"
-	"log"
+	"os"
+	"path/filepath"
 	"sync"
 	"time"
-	"github.com/PaddlePaddle/recordio"
+	log "github.com/sirupsen/logrus"
-)
-const (
-	targetTaskCount = 300
-)
-// errors
+	"github.com/PaddlePaddle/recordio"
-var (
-	ErrNoMoreTask          = errors.New("no more task for current pass")
-	ErrPendingTaskNotFound = errors.New("pending task not found")
 )
 // Service is the master server service.
 type Service struct {
+	chunksPerTask int
 	timeoutDur    time.Duration
 	timeoutMax    int
+	ready         chan struct{}
 	mu         sync.Mutex
+	initDone   bool
 	taskQueues taskQueues
 }
@@ -55,7 +51,6 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	if len(cur.Task.Chunks) > 0 {
 		cur.Task.ID = id
-		id++
 		result = append(result, cur)
 	}
@@ -63,21 +58,21 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 // NewService creates a new service.
-func NewService(chunks []Chunk, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
+func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
 	s := &Service{}
+	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
 	s.timeoutMax = timeoutMax
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
-	s.taskQueues.Todo = partition(chunks, chunksPerTask)
+	s.ready = make(chan struct{})
 	return s
 }
 // Chunk is a chunk of data consisted of several data instances.
 type Chunk struct {
-	Idx   int // index of the chunk within the file
 	Path  string
-	Index recordio.Index // block index
+	Index recordio.Index // chunk index
 }
 // Task is the basic unit of data instances assigned to trainers.
@@ -105,25 +100,87 @@ func (s *Service) snapshot() error {
 	return nil
 }
-// GetTask gets a new task from the service.
+func readChunks(globPaths []string) ([]Chunk, error) {
-func (s *Service) GetTask(dummy int, task *Task) error {
+	var chunks []Chunk
+	var paths []string
+	for _, s := range globPaths {
+		match, err := filepath.Glob(s)
+		if err != nil {
+			return nil, err
+		}
+		paths = append(paths, match...)
+	}
+	if len(paths) == 0 {
+		return nil, errors.New("no valid dataset specified")
+	}
+	for _, path := range paths {
+		f, err := os.Open(path)
+		if err != nil {
+			return nil, err
+		}
+		index, err := recordio.LoadIndex(f)
+		if err != nil {
+			return nil, err
+		}
+		err = f.Close()
+		if err != nil {
+			return nil, err
+		}
+		count := index.NumChunks()
+		for i := 0; i < count; i++ {
+			chunk := Chunk{
+				Path:  path,
+				Index: *index.ChunkIndex(i),
+			}
+			chunks = append(chunks, chunk)
+		}
+	}
+	return chunks, nil
+}
+// SetDataset sets dataset to dispatch for the master server.
+//
+// SetDataset can be call multiple times. But only the first call will
+// be honored.
+func (s *Service) SetDataset(globPaths []string, dummy *int) error {
+	if len(globPaths) == 0 {
+		return errors.New("no dataset specified")
+	}
 	s.mu.Lock()
 	defer s.mu.Unlock()
+	if s.initDone {
+		// Already initialized. All trainer will call
+		// SetDataset, but we only handle the first one. Treat
+		// other calls as successful but do nothing.
+		return nil
+	}
-	if len(s.taskQueues.Todo) == 0 {
+	chunks, err := readChunks(globPaths)
-		return ErrNoMoreTask
+	if err != nil {
+		return err
 	}
-	t := s.taskQueues.Todo[0]
+	s.taskQueues.Todo = partition(chunks, s.chunksPerTask)
-	t.Epoch++
-	s.taskQueues.Todo = s.taskQueues.Todo[1:]
+	err = s.snapshot()
-	s.taskQueues.Pending[t.Task.ID] = t
-	err := s.snapshot()
 	if err != nil {
+		log.Errorln(err)
 		return err
 	}
-	time.AfterFunc(s.timeoutDur, func(taskID int, epoch int) func() {
+	close(s.ready)
+	s.initDone = true
+	return nil
+}
+func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 	return func() {
 		s.mu.Lock()
 		defer s.mu.Unlock()
@@ -142,7 +199,7 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 		defer func() {
 			err := s.snapshot()
 			if err != nil {
-					log.Println(err)
+				log.Errorln(err)
 			}
 		}()
@@ -150,29 +207,108 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 		t.NumTimeout++
 		if t.NumTimeout > s.timeoutMax {
+			log.Warningf("Task %v timed out %d times, discard.\n", t.Task, t.NumTimeout)
 			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
 			return
 		}
+		log.Warningf("Task %v timed out %d times, retry.\n", t.Task, t.NumTimeout)
 		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
 	}
-	}(t.Task.ID, t.Epoch))
+}
+// must be called with lock held.
+func (s *Service) logFields() log.Fields {
+	return log.Fields{
+		"todoLen":    len(s.taskQueues.Todo),
+		"pendingLen": len(s.taskQueues.Pending),
+		"doneLen":    len(s.taskQueues.Done),
+		"failedLen":  len(s.taskQueues.Failed),
+	}
+}
+// GetTask gets a new task from the service.
+func (s *Service) GetTask(dummy int, task *Task) error {
+	select {
+	case <-s.ready:
+	}
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if len(s.taskQueues.Todo) == 0 {
+		if len(s.taskQueues.Done) == 0 {
+			if len(s.taskQueues.Pending) == 0 {
+				err := errors.New("all task failed")
+				log.WithFields(s.logFields()).Warningln("All tasks failed.")
+				return err
+			}
+			// TODO(helin): client need to retry in this
+			// error case. Gotcha: RPC client can't
+			// compare returned error with predefined
+			// errors like io.EOF, because the error
+			// instance deserialized from RPC is a
+			// different instance than the error defined
+			// in package. So we need to figure out a way
+			// for client to check this error correctly.
+			err := errors.New("no more available task")
+			log.WithFields(s.logFields()).Warningln("No more available task.")
+			return err
+		}
+		s.taskQueues.Todo = s.taskQueues.Done
+		s.taskQueues.Done = nil
+		log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
+	}
+	t := s.taskQueues.Todo[0]
+	t.Epoch++
+	s.taskQueues.Todo = s.taskQueues.Todo[1:]
+	s.taskQueues.Pending[t.Task.ID] = t
+	err := s.snapshot()
+	if err != nil {
+		return err
+	}
+	*task = t.Task
+	log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
 	return nil
 }
 // TaskFinished tell the service that a task is finished.
 func (s *Service) TaskFinished(taskID int, dummy *int) error {
+	select {
+	case <-s.ready:
+	}
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	t, ok := s.taskQueues.Pending[taskID]
 	if !ok {
-		return ErrPendingTaskNotFound
+		err := errors.New("pending task not found")
+		log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
+		return err
 	}
 	// task finished, reset timeout
 	t.NumTimeout = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
-	return s.snapshot()
+	log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
+	if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
+		log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.")
+		s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
+		s.taskQueues.Done = nil
+	}
+	err := s.snapshot()
+	if err != nil {
+		log.Errorln(err)
+	}
+	return err
 }
--- a/go/pserver/cclient/CMakeLists.txt
+++ b/go/pserver/cclient/CMakeLists.txt
@@ -9,5 +9,15 @@ project(cxx_go C Go)
 include(golang)
 include(flags)
-go_library(client STATIC)
+go_library(paddle_pserver_cclient STATIC)
+if(PROJ_ROOT)
+  add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a
+          COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.h ${PROJ_ROOT}/paddle/trainer/
+          COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.a ${PROJ_ROOT}/paddle/trainer/
+          WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+          DEPENDS paddle_pserver_cclient)
+  add_custom_target(paddle_pserver_cclient_lib ALL DEPENDS ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a)
+endif(PROJ_ROOT)
 add_subdirectory(test)
--- a/go/pserver/cclient/cclient.go
+++ b/go/pserver/cclient/cclient.go
 package main
 /*
-#include <stdlib.h>
 #include <string.h>
 typedef enum {
  PADDLE_ELEMENT_TYPE_INT32   = 0,
@@ -19,39 +18,27 @@ typedef struct {
  int                 content_len;
 } paddle_parameter, paddle_gradient;
-static inline void paddle_release_param(paddle_parameter* param) {
+typedef int paddle_pserver_client;
-  if (param != NULL) {
+#define PSERVER_ERROR -1
-    if (param->name != NULL) {
+#define PSERVER_OK 0
-      free(param->name);
-    }
-    if (param->content != NULL) {
-      free(param->content);
-    }
-    free(param);
-  }
-}
-typedef int client;
 */
 import "C"
 import (
-	"log"
 	"strings"
 	"sync"
 	"unsafe"
 	"github.com/PaddlePaddle/Paddle/go/pserver"
+	log "github.com/sirupsen/logrus"
 )
 var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
-var handleMap = make(map[C.client]*pserver.Client)
+var handleMap = make(map[C.paddle_pserver_client]*pserver.Client)
-var curHandle C.client
+var curHandle C.paddle_pserver_client
-func add(c *pserver.Client) C.client {
+func add(c *pserver.Client) C.paddle_pserver_client {
 	mu.Lock()
 	defer mu.Unlock()
 	client := curHandle
@@ -60,13 +47,13 @@ func add(c *pserver.Client) C.client {
 	return client
 }
-func get(client C.client) *pserver.Client {
+func get(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	return handleMap[client]
 }
-func remove(client C.client) *pserver.Client {
+func remove(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	h := handleMap[client]
@@ -100,7 +87,7 @@ func (l lister) List() []pserver.Server {
 }
 //export paddle_new_pserver_client
-func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
+func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
 	a := C.GoString(addrs)
 	as := strings.Split(a, ",")
 	servers := make([]pserver.Server, len(as))
@@ -113,27 +100,27 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
 }
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client {
+func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client {
 	// TODO(helin): fault tolerant pserver client using etcd.
 	panic("not implemented.")
 }
 //export paddle_pserver_client_release
-func paddle_pserver_client_release(client C.client) {
+func paddle_pserver_client_release(client C.paddle_pserver_client) {
 	remove(client)
 }
 //export paddle_begin_init_params
-func paddle_begin_init_params(client C.client) C.int {
+func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	if selected := c.BeginInitParams(); selected {
 		return 1
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_init_param
-func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
@@ -143,31 +130,41 @@ func paddle_init_param(client C.client, param C.paddle_parameter, param_config u
 	}
 	c := get(client)
 	err := c.InitParam(pc)
 	if err != nil {
-		log.Println(err)
+		if err.Error() == pserver.AlreadyInitialized {
-		return -1
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.\n", name)
+			return C.PSERVER_OK
+		}
+		log.Errorln(err)
+		return C.PSERVER_ERROR
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_finish_init_params
-func paddle_finish_init_params(client C.client) C.int {
+func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	err := c.FinishInitParams()
 	if err != nil {
-		log.Println(err)
+		if err.Error() == pserver.AlreadyInitialized {
-		return -1
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			return C.PSERVER_OK
 		}
-	return 0
+		log.Errorln(err)
+		return C.PSERVER_ERROR
+	}
+	return C.PSERVER_OK
 }
 //export paddle_send_grads
-func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int {
+func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
 	var gs []pserver.Gradient
 	for i := 0; i < int(total); i++ {
-		grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
+		grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
 		et := pserver.ElementType(grad.element_type)
 		name := C.GoString(grad.name)
 		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
@@ -177,84 +174,82 @@ func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C
 	c := get(client)
 	err := c.SendGrads(gs)
 	if err != nil {
-		log.Println(err)
+		log.Errorln(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_get_params
-func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int {
+func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
 	var ns []string
 	for i := 0; i < int(total); i++ {
-		name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names))))
+		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		ns = append(ns, C.GoString(name))
+		ns = append(ns, C.GoString(param.name))
 	}
 	c := get(client)
 	ps, err := c.GetParams(ns)
 	if err != nil {
-		log.Println(err)
+		log.Errorln(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	for i := 0; i < int(total); i++ {
+	if len(ps) != len(ns) {
-		if i >= len(ps) {
+		pn := make([]string, len(ps))
-			break
+		for i, p := range ps {
+			pn[i] = p.Name
+		}
+		log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
+		return C.PSERVER_ERROR
+	}
+	for i := range ps {
+		if ns[i] != ps[i].Name {
+			pn := make([]string, len(ps))
+			for i, p := range ps {
+				pn[i] = p.Name
+			}
+			log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
+			return C.PSERVER_ERROR
+		}
 	}
+	for i := 0; i < int(total); i++ {
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		nameReady := false
-		contentAllocated := false
 		if unsafe.Pointer(param) == nullPtr {
-			param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param))))
+			log.Errorln("must pre-allocate parameter.")
-		} else {
+			return C.PSERVER_ERROR
-			if unsafe.Pointer(param.name) != nullPtr {
-				if n := C.GoString(param.name); n != p.Name {
-					log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name)
-					C.free(unsafe.Pointer(param.name))
-				} else {
-					nameReady = true
-				}
 		}
 		if unsafe.Pointer(param.content) != nullPtr {
-				if int(param.content_len) == len(p.Content) {
+			if int(param.content_len) != len(p.Content) {
-					contentAllocated = true
+				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
-				} else {
+				return C.PSERVER_ERROR
-					log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content))
-					C.free(unsafe.Pointer(param.content))
-				}
 			}
 		}
-		if !nameReady {
-			param.name = C.CString(p.Name)
-		}
-		if !contentAllocated {
-			param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content))))
-		}
 		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
 		param.content_len = C.int(len(p.Content))
 		param.element_type = C.paddle_element_type(p.ElementType)
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_save_model
-func paddle_save_model(client C.client, path *C.char) C.int {
+func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
 	p := C.GoString(path)
 	c := get(client)
 	err := c.Save(p)
 	if err != nil {
-		log.Println(err)
+		log.Errorln(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	return 0
+	return C.PSERVER_OK
 }
 func main() {} // Required but ignored
--- a/go/pserver/cclient/test/CMakeLists.txt
+++ b/go/pserver/cclient/test/CMakeLists.txt
 cmake_minimum_required(VERSION 3.0)
-include_directories(${CMAKE_BINARY_DIR})
 add_executable(main main.c)
-add_dependencies(main client)
+add_dependencies(main paddle_pserver_cclient)
+add_executable(test_cclient test_cclient.c)
+add_dependencies(test_cclient paddle_pserver_cclient)
 if(APPLE)
  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+else()
+  set(CMAKE_EXE_LINKER_FLAGS "-pthread")
 endif()
-target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
+if(PROJ_ROOT)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR}/..)
+  target_link_libraries(main ${CMAKE_CURRENT_BINARY_DIR}/../libpaddle_pserver_cclient.a pthread)
+  target_link_libraries(test_cclient ${CMAKE_CURRENT_BINARY_DIR}/../libpaddle_pserver_cclient.a pthread)
+else(PROJ_ROOT)
+  include_directories(${CMAKE_BINARY_DIR})
+  target_link_libraries(main ${CMAKE_BINARY_DIR}/libpaddle_pserver_cclient.a pthread)
+  target_link_libraries(test_cclient ${CMAKE_BINARY_DIR}/libpaddle_pserver_cclient.a pthread)
+endif(PROJ_ROOT)
--- a/go/pserver/cclient/test/main.c
+++ b/go/pserver/cclient/test/main.c
 #include <stdio.h>
+#include <stdlib.h>
-#include "libclient.h"
+#include "libpaddle_pserver_cclient.h"
-void fail() {
+// TODO(helin): Fix: gtest using cmake is not working, using this
-  // TODO(helin): fix: gtest using cmake is not working, using this
+// hacky way for now.
-  // hacky way for now.
+#define fail()                                          \
-  printf("test failed.\n");
+  fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \
  exit(-1);
+void sendGrads(paddle_pserver_client c) {
+  unsigned char grad_a[2000] = {2};
+  unsigned char grad_b[3000] = {3};
+  paddle_gradient grad1 = {
+      "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
+  paddle_gradient grad2 = {
+      "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
+  paddle_gradient* grads[2] = {&grad1, &grad2};
+  if (paddle_send_grads(c, grads, 2)) {
+    fail();
+  }
+}
+void getParams(paddle_pserver_client c) {
+  paddle_parameter param_a;
+  paddle_parameter param_b;
+  char name_a[] = "param_a";
+  char name_b[] = "param_b";
+  // Must pre-allocate the prameter content before calling paddle_get_params.
+  unsigned char content_a[2000] = {};
+  unsigned char content_b[3000] = {};
+  param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_a.name = name_a;
+  param_a.content = content_a;
+  param_a.content_len = 2000;
+  param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_b.name = name_b;
+  param_b.content = content_b;
+  param_b.content_len = 3000;
+  paddle_parameter* params[2] = {&param_a, &param_b};
+  if (paddle_get_params(c, params, 2)) {
+    fail();
+  }
 }
 int main() {
  char addr[] = "localhost:3000";
-  client c = paddle_new_pserver_client(addr, 1);
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
 retry:
  if (paddle_begin_init_params(c)) {
    paddle_parameter param;
    char name_a[] = "param_a";
    char name_b[] = "param_b";
-    unsigned char content[] = {0x00, 0x11, 0x22};
+    unsigned char content_a[2000] = {1};
+    unsigned char content_b[3000] = {0};
    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
    param.name = name_a;
-    param.content = content;
+    param.content = content_a;
-    param.content_len = 3;
+    param.content_len = 2000;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    int error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
      goto retry;
    }
-    param.element_type = PADDLE_ELEMENT_TYPE_INT32;
+    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
    param.name = name_b;
-    param.content = content;
+    param.content = content_b;
-    param.content_len = 3;
+    param.content_len = 3000;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
      goto retry;
    }
-    if (paddle_finish_init_params(c) != 0) {
-      goto retry;
-    }
-  } else {
-    fail();
-  }
-  unsigned char content[] = {0x00, 0x11, 0x22};
-  paddle_gradient grads[2] = {
-      {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3},
-      {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}};
-  if (!paddle_send_grads(c, grads, 2)) {
+    error = paddle_finish_init_params(c);
-    fail();
+    if (error != 0) {
+      goto retry;
    }
-  paddle_parameter* params[2] = {NULL, NULL};
-  char* names[] = {"param_a", "param_b"};
-  if (!paddle_get_params(c, names, params, 2)) {
-    fail();
  }
-  // get parameters again by reusing the allocated parameter buffers.
+  int i;
-  if (!paddle_get_params(c, names, params, 2)) {
+  for (i = 0; i < 100; i++) {
-    fail();
+    sendGrads(c);
+    getParams(c);
  }
-  paddle_release_param(params[0]);
+  if (paddle_save_model(c, "/tmp/")) {
-  paddle_release_param(params[1]);
-  if (!paddle_save_model(c, "/tmp/")) {
    fail();
  }

--- a/go/pserver/cclient/test/test_cclient.c
+++ b/go/pserver/cclient/test/test_cclient.c
+#include <stdio.h>
+#include <stdlib.h>
+#include "libpaddle_pserver_cclient.h"
+typedef float real;
+void fail() {
+  // TODO(helin): fix: gtest using cmake is not working, using this
+  // hacky way for now.
+  printf("test failed.\n");
+  exit(-1);
+}
+void print_parameter(paddle_gradient* param) {
+  if (param == NULL) {
+    printf("param is NULL!!\n");
+  } else {
+    printf("==== parameter ====\n");
+    printf("name: %s\n", param->name);
+    printf("content_len: %d\n", param->content_len);
+    printf("content_type: %d\n", param->element_type);
+    int i;
+    for (i = 0; i < param->content_len / (int)sizeof(real); ++i) {
+      printf("%f ", ((float*)param->content)[i]);
+    }
+    printf("\n\n");
+  }
+}
+int main() {
+  char addr[] = "localhost:3000";
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
+  char* names[] = {"param_a", "param_b"};
+retry:
+  printf("init parameter to pserver:\n");
+  real param_content1[] = {0.1, 0.2, 0.3};
+  real param_content2[] = {0.4, 0.5, 0.6};
+  paddle_parameter** params =
+      (paddle_parameter**)malloc(sizeof(paddle_parameter*) * 2);
+  params[0] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+  params[0]->name = names[0];
+  params[0]->content = (unsigned char*)param_content1;
+  params[0]->content_len = 3 * sizeof(real);
+  params[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  params[1] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+  params[1]->name = names[1];
+  params[1]->content = (unsigned char*)param_content2;
+  params[1]->content_len = 3 * sizeof(real);
+  params[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
+  if (paddle_begin_init_params(c)) {
+    if (paddle_init_param(c, *params[0], NULL, 0) != 0) {
+      goto retry;
+    }
+    if (paddle_init_param(c, *params[1], NULL, 0) != 0) {
+      goto retry;
+    }
+    if (paddle_finish_init_params(c) != 0) {
+      goto retry;
+    }
+  } else {
+    fail();
+  }
+  printf("get inited parameters from pserver:\n");
+  // get parameters again by reusing the allocated parameter buffers.
+  if (paddle_get_params(c, params, 2) != 0) {
+    fail();
+  }
+  print_parameter(params[0]);
+  print_parameter(params[1]);
+  printf("send gradient to pserver:\n");
+  real gradient_content1[] = {0.01, 0.02, 0.03};
+  real gradinet_content2[] = {0.04, 0.05, 0.06};
+  paddle_gradient** grads =
+      (paddle_gradient**)malloc(sizeof(paddle_gradient*) * 2);
+  grads[0] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
+  grads[0]->name = names[0];
+  grads[0]->content = (unsigned char*)gradient_content1;
+  grads[0]->content_len = 3 * sizeof(real);
+  grads[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  grads[1] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
+  grads[1]->name = names[1];
+  grads[1]->content = (unsigned char*)gradinet_content2;
+  grads[1]->content_len = 3 * sizeof(real);
+  grads[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
+  printf("print gradient sent to pserver:\n");
+  print_parameter(grads[0]);
+  print_parameter(grads[1]);
+  if (paddle_send_grads(c, grads, 2) != 0) {
+    fail();
+  }
+  printf("get updated parameters from pserver:\n");
+  // get parameters again by reusing the allocated parameter buffers.
+  if (paddle_get_params(c, params, 2) != 0) {
+    fail();
+  }
+  print_parameter(params[0]);
+  print_parameter(params[1]);
+  if (paddle_save_model(c, "/tmp/") != 0) {
+    fail();
+  }
+  return 0;
+}
--- a/demo/mnist/api_train_v2.py
+++ b/demo/mnist/api_train_v2.py
@@ -71,11 +71,6 @@ def main():
    #predict = convolutional_neural_network(images)
    cost = paddle.layer.classification_cost(input=predict, label=label)
-    try:
-        with gzip.open('params.tar.gz', 'r') as f:
-            parameters = paddle.parameters.Parameters.from_tar(f)
-    except IOError:
    parameters = paddle.parameters.create(cost)
    optimizer = paddle.optimizer.Momentum(
@@ -85,7 +80,9 @@ def main():
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
-                                 update_equation=optimizer)
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec="localhost:3000")
    lists = []
@@ -95,9 +92,6 @@ def main():
                print "Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)
-                with gzip.open('params.tar.gz', 'w') as f:
-                    parameters.to_tar(f)
        elif isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=paddle.batch(
                paddle.dataset.mnist.test(), batch_size=128))

--- a/demo/introduction/api_train_v2.py
+++ b/demo/introduction/api_train_v2.py
@@ -24,7 +24,9 @@ def main():
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
-                                 update_equation=optimizer)
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec="localhost:3000")
    # event_handler to print training and testing info
    def event_handler(event):

--- a/go/pserver/client.go
+++ b/go/pserver/client.go
@@ -2,11 +2,11 @@ package pserver
 import (
 	"hash/fnv"
-	"log"
 	"sort"
 	"time"
-	"github.com/PaddlePaddle/Paddle/go/pserver/internal/connection"
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	log "github.com/sirupsen/logrus"
 )
 // TODO(helin): add RPC call retry logic
@@ -47,7 +47,7 @@ func NewClient(l Lister, pserverNum int, sel Selector) *Client {
 // monitorPservers monitors pserver addresses, and updates connection
 // when the address changes.
 func (c *Client) monitorPservers(l Lister, pserverNum int) {
-	knownServers := make([]Server, pserverNum)
+	lastServers := make([]Server, pserverNum)
 	ticker := time.NewTicker(10 * time.Second)
 	monitor := func() {
 		curServers := make([]Server, pserverNum)
@@ -56,25 +56,37 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 			curServers[l.Index] = l
 		}
-		for i := range knownServers {
+		for i := range lastServers {
-			if knownServers[i].Addr != curServers[i].Addr {
+			if lastServers[i].Addr == curServers[i].Addr {
+				continue
+			}
+			if curServers[i].Addr == "" {
+				err := c.pservers[i].Close()
+				if err != nil {
+					log.Errorln(err)
+				}
+				continue
+			}
 			err := c.pservers[i].Connect(curServers[i].Addr)
 			if err != nil {
-					log.Println(err)
+				log.Errorln(err)
 				// connect to addr failed, set
 				// to last known addr in order
 				// to retry next time.
-					curServers[i].Addr = knownServers[i].Addr
+				curServers[i].Addr = lastServers[i].Addr
-				}
 			}
 		}
-		knownServers = curServers
+		lastServers = curServers
 	}
 	monitor()
-	for _ = range ticker.C {
+	for range ticker.C {
 		monitor()
 	}
 }
@@ -93,16 +105,14 @@ func (c *Client) BeginInitParams() bool {
 // InitParam initializes the parameter on parameter servers.
 func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
-	var dummy int
+	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
-	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, &dummy)
 }
 // FinishInitParams tells parameter servers client has sent all
 // parameters to parameter servers as initialization.
 func (c *Client) FinishInitParams() error {
 	for _, p := range c.pservers {
-		var dummy int
+		err := p.Call("Service.FinishInitParams", 0, nil)
-		err := p.Call("Service.FinishInitParams", dummy, &dummy)
 		if err != nil {
 			return err
 		}
@@ -116,8 +126,7 @@ func (c *Client) SendGrads(grads []Gradient) error {
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {
 		go func(g Gradient) {
-			var dummy int
+			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
-			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, &dummy)
 			errCh <- err
 		}(g)
 	}
@@ -196,8 +205,7 @@ func (c *Client) Save(path string) error {
 	errCh := make(chan error, len(c.pservers))
 	for _, p := range c.pservers {
-		var dummy int
+		err := p.Call("Service.Save", path, nil)
-		err := p.Call("Service.Save", path, &dummy)
 		errCh <- err
 	}

--- a/go/pserver/client_test.go
+++ b/go/pserver/client_test.go
@@ -117,7 +117,7 @@ func TestClientFull(t *testing.T) {
 	for i := range params {
 		if names[i] != params[i].Name {
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i])
+			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
 		}
 	}
 }
--- a/go/pserver/optimizer.c
+++ b/go/pserver/optimizer.c
@@ -32,7 +32,13 @@ int update_SGD(void* optimizer,
               const void* gradient,
               int num_bytes) {
  SGD_optimizer* o = (SGD_optimizer*)optimizer;
-  // TODO
+  float* parameter = (float*)buffer;
+  float* grad = (float*)gradient;
+  int i;
+  for (i = 0; i < num_bytes / sizeof(float); ++i) {
+    parameter[i] -= o->learning_rate * grad[i];
+  }
  return 0;
 }

--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -9,8 +9,10 @@ import (
 // ElementType is the type of elements of a Parameter.
 type ElementType int
-var ErrAlreadyInitialized = errors.New("pserver already initialized")
+const (
-var ErrUninitialized = errors.New("pserver not fully initialized")
+	AlreadyInitialized = "pserver already initialized"
+	Uninitialized      = "pserver not fully initialized"
+)
 // Supported element types
 const (
@@ -49,7 +51,7 @@ type Service struct {
 // NewService creates a new service.
 func NewService() *Service {
-	s := &Service{opt: newOptimizer(sgd, 0.01)}
+	s := &Service{opt: newOptimizer(sgd, 0.005)}
 	s.paramMap = make(map[string]Parameter)
 	s.initialized = make(chan struct{})
 	return s
@@ -59,7 +61,7 @@ func NewService() *Service {
 func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
@@ -80,7 +82,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
@@ -94,7 +96,7 @@ func (s *Service) SendGrad(g Gradient, dummy *int) error {
 	select {
 	case <-s.initialized:
 	default:
-		return ErrUninitialized
+		return errors.New(Uninitialized)
 	}
 	s.mu.Lock()

--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -15,8 +15,7 @@ func TestFull(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
+	err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
@@ -25,12 +24,12 @@ func TestFull(t *testing.T) {
 	p1.Name = "param_b"
 	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: nil}, nil)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
 		t.FailNow()
 	}
@@ -46,11 +45,11 @@ func TestFull(t *testing.T) {
 	}
 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
-	err = s.SendGrad(g1, &dummy)
+	err = s.SendGrad(g1, nil)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.SendGrad(g2, &dummy)
+	err = s.SendGrad(g2, nil)
 	if err != nil {
 		t.FailNow()
@@ -74,23 +73,21 @@ func TestFull(t *testing.T) {
 func TestMultipleInit(t *testing.T) {
 	s := pserver.NewService()
-	var dummy int
+	err := s.FinishInitParams(0, nil)
-	err := s.FinishInitParams(0, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
-	if err != pserver.ErrAlreadyInitialized {
+	if err.Error() != pserver.AlreadyInitialized {
 		t.FailNow()
 	}
 }
 func TestUninitialized(t *testing.T) {
 	s := pserver.NewService()
-	var dummy int
+	err := s.SendGrad(pserver.Gradient{}, nil)
-	err := s.SendGrad(pserver.Gradient{}, &dummy)
+	if err.Error() != pserver.Uninitialized {
-	if err != pserver.ErrUninitialized {
 		t.FailNow()
 	}
 }
@@ -98,13 +95,14 @@ func TestUninitialized(t *testing.T) {
 func TestBlockUntilInitialized(t *testing.T) {
 	s := pserver.NewService()
 	ch := make(chan struct{}, 2)
+	errCh := make(chan error, 2)
 	var wg sync.WaitGroup
 	wg.Add(1)
 	go func() {
 		var param pserver.Parameter
 		err := s.GetParam("param_a", &param)
 		if err != nil {
-			t.FailNow()
+			errCh <- err
 		}
 		wg.Done()
 		ch <- struct{}{}
@@ -112,10 +110,9 @@ func TestBlockUntilInitialized(t *testing.T) {
 	wg.Add(1)
 	go func() {
-		var dummy int
+		err := s.Save("", nil)
-		err := s.Save("", &dummy)
 		if err != nil {
-			t.FailNow()
+			errCh <- err
 		}
 		wg.Done()
 		ch <- struct{}{}
@@ -127,6 +124,8 @@ func TestBlockUntilInitialized(t *testing.T) {
 	case <-ch:
 		// some function returned before initialization is completed.
 		t.FailNow()
+	case <-errCh:
+		t.FailNow()
 	default:
 	}
@@ -134,13 +133,12 @@ func TestBlockUntilInitialized(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
+	err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
 		t.FailNow()
 	}

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 add_subdirectory(optimizer)
+add_subdirectory(strings)
 # Do not build go directory until go cmake is working smoothly.
 # if(CMAKE_Go_COMPILER)

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -16,7 +16,7 @@ set(API_HEADER
    Internal.h)
 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp)
+add_dependencies(paddle_api gen_proto_cpp paddle_pserver_cclient_lib)
 INCLUDE(${SWIG_USE_FILE})
 INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
@@ -41,10 +41,11 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
    paddle_network
    paddle_proto
    ${external_project_dependencies}
+    ${RDMA_LIBS}
 )
 IF(APPLE)
-    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
+    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
 ELSE(APPLE)
    SET(START_GROUP "-Xlinker -start-group")
    SET(END_GROUP "-Xlinker -end-group")
@@ -73,6 +74,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
    ${CMAKE_DL_LIBS}
    ${EXTERNAL_LIBS}
    ${CMAKE_THREAD_LIBS_INIT}
+    ${RDMA_LD_FLAGS}
    ${START_END}
 )

--- a/paddle/api/Paddle.i
+++ b/paddle/api/Paddle.i
@@ -179,6 +179,7 @@ namespace std {
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
 %newobject ParameterUpdater::createRemoteUpdater;
+%newobject ParameterUpdater::createNewRemoteUpdater;
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -841,6 +841,8 @@ public:
  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
                                               int passCount,
                                               bool useSparseUpdater);
+  static ParameterUpdater* createNewRemoteUpdater(
+      OptimizationConfig* config, const std::string pserverSpec);
  ~ParameterUpdater();
  /**

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
+#include "paddle/trainer/NewRemoteParameterUpdater.h"
 #include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
@@ -28,6 +29,14 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
  return updater;
 }
+ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
+    OptimizationConfig *config, const std::string pserverSpec) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
+      config->m->getConfig(), pserverSpec));
+  return updater;
+}
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
  auto updater = new ParameterUpdater();

--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@@ -17,10 +17,9 @@ limitations under the License. */
 #include <stdio.h>
 #include "hl_base.h"
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include "hl_neon_matrix_kernel.cuh"
+#ifndef __CUDA_ARCH__
-#else
+#include "hl_cpu_matrix_kernel_detail.cuh"
-#include "hl_sse_matrix_kernel.cuh"
 #endif
 /**
@@ -114,35 +113,6 @@ void hl_cpu_apply_quaternary_op(Op op,
  }
 }
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda,
-                      real *B, int ldb) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
 template <class Agg, class Op, class Saver>
 void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
                          int dimM, int dimN,

--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -13,26 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef HL_SSE_MATRIX_KERNEL_CUH_
+#ifndef HL_MATRIX_KERNEL_DETAIL_CUH_
-#define HL_SSE_MATRIX_KERNEL_CUH_
+#define HL_MATRIX_KERNEL_DETAIL_CUH_
 #include "hl_matrix_type.cuh"
-#define VECTOR_SIZE     16
-#ifndef PADDLE_TYPE_DOUBLE
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      _mm_set_ps1
-#else
-#if   defined(__APPLE__) || defined(__OSX__)
-#define     _mm_set_pd1     _mm_set1_pd
-#endif
-/* number of double in vector */
-#define     VECTOR_LEN      2
-#define     VECTOR_SET      _mm_set_pd1
-#endif
 inline bool hl_check_align(size_t size) {
  return !(size & (VECTOR_SIZE - 1));
 }
@@ -41,27 +26,63 @@ inline bool hl_check_align(void *ptr) {
  return hl_check_align(reinterpret_cast<size_t>(ptr));
 }
-#ifndef PADDLE_TYPE_DOUBLE
+template <class Agg, class Op, class Saver>
-template <class Agg>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-inline real hl_agg_op(Agg agg, vecType mm) {
+                      int dimM, int dimN,
-  __m128 lo = _mm_unpacklo_ps(mm, mm);
+                      real *dst, int ld,
-  __m128 hi = _mm_unpackhi_ps(mm, mm);
+                      real *A, int lda) {
-  __m128 tmp1 = agg.vecOp(lo, hi);
+  for (int i = 0; i < dimM; i++) {
-  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
+    real tmp = agg.init();
-  __m128 ret = agg.vecOp(tmp1, tmp2);
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
+}
-  return _mm_cvtss_f32(ret);
+template <class Agg, class Op, class Saver>
+void hl_matrix_row_op(Agg agg, Op op, Saver sv,
+                      int dimM, int dimN,
+                      real *dst, int ld,
+                      real *A, int lda,
+                      real *B, int ldb) {
+  for (int i = 0; i < dimM; i++) {
+    real tmp = agg.init();
+    for (int j = 0; j < dimN; j++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[i*ld] = sv(dst[i*ld], tmp);
+  }
+}
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
 }
-#else
-template <class Agg>
+template <class Agg, class Op, class Saver>
-inline real hl_agg_op(Agg agg, vecType mm) {
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-  __m128d lo = _mm_unpacklo_pd(mm, mm);
+                         int dimM, int dimN,
-  __m128d hi = _mm_unpackhi_pd(mm, mm);
+                         real *dst,
-  __m128d ret = agg.vecOp(lo, hi);
+                         real *A, int lda,
+                         real *B, int ldb) {
-  return _mm_cvtsd_f64(ret);
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
 }
-#endif
 template <class Agg, class Op, class Saver>
 void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
@@ -118,35 +139,6 @@ void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
  }
 }
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
 /*
 * MaxRow greater than or equal dimN
 * dimN is multiples of VECTOR_LEN
@@ -315,4 +307,4 @@ void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
  }
 }
-#endif /* HL_SSE_MATRIX_KERNEL_CUH_ */
+#endif /* HL_MATRIX_KERNEL_DETAIL_CUH_ */
--- a/paddle/cuda/include/hl_cpu_scalar.cuh
+++ b/paddle/cuda/include/hl_cpu_scalar.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_CPU_SCALAR_CUH_
+#define HL_CPU_SCALAR_CUH_
+#define VECTOR_SIMD false
+#define VECTOR_SET  hl_vec_set
+#ifndef PADDLE_TYPE_DOUBLE
+/* size of float */
+#define VECTOR_SIZE 4
+#else
+/* size of double */
+#define VECTOR_SIZE 8
+#endif
+typedef real vecType;
+/* Consider a real as a vector */
+#define VECTOR_LEN  1
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  return mm;
+}
+INLINE real hl_vec_set(const real r) {
+  return r;
+}
+INLINE real hl_vec_classification_error(const real a,
+                                        const real b,
+                                        const real p,
+                                        const real r) {
+  return ((a > p) == (b > p)) ? 0.0f : 1.0f;
+}
+#endif  // HL_CPU_SCALAR_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_neon.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_neon.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_CPU_SIMD_NEON_CUH_
+#define HL_CPU_SIMD_NEON_CUH_
+#include <arm_neon.h>
+#define VECTOR_SIMD true
+#define VECTOR_SIZE 16
+#define VECTOR_SET  hl_vec_set
+#ifndef PADDLE_TYPE_DOUBLE
+typedef float32x4_t vecType;
+/* number of float in vector */
+#define VECTOR_LEN  4
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  float32x4_t rev = vrev64q_f32(mm);
+  float32x4_t tmp1 = agg.vecOp(rev, rev);
+  float32x2_t lo = vget_high_f32(rev);
+  float32x2_t hi = vget_low_f32(rev);
+  float32x4_t tmp2 = vcombine_f32(hi, lo);
+  float32x4_t ret = agg.vecOp(tmp1, tmp2);
+  return vgetq_lane_f32(ret, 0);
+}
+inline float32x4_t hl_vec_set(const real f) {
+  return vdupq_n_f32(f);
+}
+inline float32x4_t hl_vec_classification_error(const float32x4_t a,
+                                               const float32x4_t b,
+                                               const float32x4_t p,
+                                               const float32x4_t r) {
+  uint32x4_t tmp1 = vcgtq_f32(a, p);
+  uint32x4_t tmp2 = vcgtq_f32(b, p);
+  uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
+  return vcvtq_f32_u32(vandq_u32(tmp3, vcvtq_u32_f32(r)));
+}
+#else
+#ifdef __aarch64__
+typedef float64x2_t vecType;
+/* number of float in vector */
+#define VECTOR_LEN  2
+#define VECTOR_SET  vdupq_n_f64
+#error To be implemented
+#else
+#error NEON instructions does not support double precision
+#endif  // __aarch64__
+#endif
+#endif  // HL_CPU_SIMD_NEON_CUH_
--- a/paddle/cuda/include/hl_cpu_simd_sse.cuh
+++ b/paddle/cuda/include/hl_cpu_simd_sse.cuh
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef HL_CPU_SIMD_SSE_CUH_
+#define HL_CPU_SIMD_SSE_CUH_
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#define VECTOR_SIMD true
+#define VECTOR_SIZE 16
+#define VECTOR_SET  hl_vec_set
+#ifndef PADDLE_TYPE_DOUBLE
+typedef __m128  vecType;
+/* number of float in vector */
+#define VECTOR_LEN  4
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128 lo = _mm_unpacklo_ps(mm, mm);
+  __m128 hi = _mm_unpackhi_ps(mm, mm);
+  __m128 tmp1 = agg.vecOp(lo, hi);
+  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
+  __m128 ret = agg.vecOp(tmp1, tmp2);
+  return _mm_cvtss_f32(ret);
+}
+inline __m128 hl_vec_set(const real f) {
+  return _mm_set_ps1(f);
+}
+inline __m128 hl_vec_classification_error(const __m128 a,
+                                          const __m128 b,
+                                          const __m128 p,
+                                          const __m128 r) {
+  __m128 tmp1 = _mm_cmpgt_ps(a, p);
+  __m128 tmp2 = _mm_cmpgt_ps(b, p);
+  __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
+  return _mm_and_ps(tmp3, r);
+}
+#else
+typedef __m128d vecType;
+/* number of double in vector */
+#define VECTOR_LEN  2
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  __m128d lo = _mm_unpacklo_pd(mm, mm);
+  __m128d hi = _mm_unpackhi_pd(mm, mm);
+  __m128d ret = agg.vecOp(lo, hi);
+  return _mm_cvtsd_f64(ret);
+}
+inline __m128d hl_vec_set(const real d) {
+#if defined(__APPLE__) || defined(__OSX__)
+  return _mm_set1_pd(d);
+#else
+  return _mm_set_pd1(d);
+#endif
+}
+inline __m128d hl_vec_classification_error(const __m128d a,
+                                           const __m128d b,
+                                           const __m128d p,
+                                           const __m128d r) {
+  __m128d tmp1 = _mm_cmpgt_pd(a, p);
+  __m128d tmp2 = _mm_cmpgt_pd(b, p);
+  __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
+  return _mm_and_pd(tmp3, r);
+}
+#endif
+#endif  // HL_CPU_SIMD_SSE_CUH_
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -18,26 +18,6 @@ limitations under the License. */
 #include "hl_matrix_type.cuh"
-#ifdef __CUDA_ARCH__
-/**
- * CUDA kernel inline function
- */
-#define INLINE   __device__ inline
-#else
-/**
- * CPP inline function
- */
-#define INLINE   inline
-#endif
-#ifndef PADDLE_TYPE_DOUBLE
-#define     DEVICE_FMAX     fmaxf
-#define     DEVICE_FMIN     fminf
-#else
-#define     DEVICE_FMAX     fmax
-#define     DEVICE_FMIN     fmin
-#endif
 class BaseOp {
 public:
  static const bool sse = false;
@@ -66,10 +46,8 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
-#elif defined(__ARM__NEON__) || defined(__ARM_NEON)
-#include "hl_matrix_base_neon.cuh"
 #else
-#include "hl_matrix_base_sse.cuh"
+#include "hl_matrix_base_detail.cuh"
 #endif
 namespace aggregate {

--- a/paddle/cuda/include/hl_matrix_base_neon.cuh
+++ b/paddle/cuda/include/hl_matrix_base_neon.cuh
@@ -12,32 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifndef HL_MATRIX_BASE_DETAIL_CUH_
+#define HL_MATRIX_BASE_DETAIL_CUH_
-#ifndef HL_MATRIX_BASE_NEON_CUH_
+#include "hl_matrix_type.cuh"
-#define HL_MATRIX_BASE_NEON_CUH_
+#include "hl_tensor_ops.h"
 namespace aggregate {
 class SSESum {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vaddq_f32(a, b);
+    return hppl::binary::add<vecType>()(a, b);
  }
 };
 class SSEMax {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vmaxq_f32(a, b);
+    return hppl::binary::max<vecType>()(a, b);
  }
 };
 class SSEMin {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vminq_f32(a, b);
+    return hppl::binary::min<vecType>()(a, b);
  }
 };
 }  // namespace aggregate
@@ -46,8 +48,8 @@ namespace base {
 namespace unary {
 class SSEIdentity {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a) const {
+  INLINE vecType vecOp(const vecType a) const {
    return a;
  }
 };
@@ -56,106 +58,96 @@ public:
 namespace binary {
 class SSEAdd {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vaddq_f32(a, b);
+    return hppl::binary::add<vecType>()(a, b);
  }
 };
 class SSEAdd2 {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
  const real p1;
  const real p2;
-  float32x4_t mp1;
+  vecType mp1;
-  float32x4_t mp2;
+  vecType mp2;
 public:
  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    mp1 = vdupq_n_f32(p1);
+    mp1 = hl_vec_set(p1);
-    mp2 = vdupq_n_f32(p2);
+    mp2 = hl_vec_set(p2);
  }
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    float32x4_t tmp1, tmp2;
+    return hppl::binary::add_scale<vecType>(mp1, mp2)(a, b);
-    tmp1 = vmulq_f32(mp1, a);
-    tmp2 = vmulq_f32(mp2, b);
-    return vaddq_f32(tmp1, tmp2);
  }
 };
 class SSESub {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vsubq_f32(a, b);
+    return hppl::binary::sub<vecType>()(a, b);
  }
 };
 class SSEMul {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return vmulq_f32(a, b);
+    return hppl::binary::mul<vecType>()(a, b);
  }
 };
 class SSEDiv {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    float32x4_t tmp;
+    return hppl::binary::div<vecType>()(a, b);
-    tmp = vrecpeq_f32(b);
-    return vmulq_f32(a, tmp);
  }
 };
 class SSESquaredDiff {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    float32x4_t tmp;
+    vecType tmp = hppl::binary::sub<vecType>()(a, b);
-    tmp = vsubq_f32(a, b);
+    return hppl::binary::mul<vecType>()(tmp, tmp);
-    return vmulq_f32(tmp, tmp);
  }
 };
 class SSEFirst {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return a;
  }
 };
 class SSESecond {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
    return b;
  }
 };
 class SSEClassificationError {
 public:
-  static const bool sse = true;
+  static const bool sse = VECTOR_SIMD;
  const real p;
-  float32x4_t mp;
+  vecType mp;
-  uint32x4_t result;
+  vecType result;
 public:
  explicit SSEClassificationError(const real s) : p(s) {
-    mp = vdupq_n_f32(p);
+    mp = hl_vec_set(p);
-    result = vdupq_n_u32(1);
+    result = hl_vec_set(1.0f);
  }
-  // TODO: to be check
+  INLINE vecType vecOp(const vecType a, const vecType b) const {
-  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return hl_vec_classification_error(a, b, mp, result);
-    uint32x4_t tmp1 = vcgtq_f32(a, mp);
-    uint32x4_t tmp2 = vcgtq_f32(b, mp);
-    uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
-    return vcvtq_f32_u32(vandq_u32(tmp3, result));
  }
 };
 }  // namespace binary
 }  // namespace base
-#endif /* HL_MATRIX_BASE_NEON_CUH_ */
+#endif /* HL_MATRIX_BASE_DETAIL_CUH_ */
--- a/paddle/cuda/include/hl_matrix_base_sse.cuh
+++ b/paddle/cuda/include/hl_matrix_base_sse.cuh
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef HL_MATRIX_BASE_SSE_CUH_
-#define HL_MATRIX_BASE_SSE_CUH_
-namespace aggregate {
-class SSESum {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-class SSEMax {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_max_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_max_pd(a, b);
-  }
-};
-class SSEMin {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_min_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_min_pd(a, b);
-  }
-};
-}  // namespace aggregate
-namespace base {
-namespace unary {
-class SSEIdentity {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a) const {
-    return a;
-  }
-  INLINE __m128d vecOp(const __m128d a) const {
-    return a;
-  }
-};
-}  // namespace unary
-namespace binary {
-class SSEAdd {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-class SSEAdd2 {
-public:
-  static const bool sse = true;
-  const real p1;
-  const real p2;
-  union {__m128 f; __m128d d;} mp1;
-  union {__m128 f; __m128d d;} mp2;
-public:
-  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    if (sizeof(real) == sizeof(float)) {
-      mp1.f = _mm_set1_ps(p1);
-      mp2.f = _mm_set1_ps(p2);
-    } else {
-      mp1.d = _mm_set1_pd(p1);
-      mp2.d = _mm_set1_pd(p2);
-    }
-  }
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    __m128 tmp1, tmp2;
-    tmp1 = _mm_mul_ps(mp1.f, a);
-    tmp2 = _mm_mul_ps(mp2.f, b);
-    return _mm_add_ps(tmp1, tmp2);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    __m128d tmp1, tmp2;
-    tmp1 = _mm_mul_pd(mp1.d, a);
-    tmp2 = _mm_mul_pd(mp2.d, b);
-    return _mm_add_pd(tmp1, tmp2);
-  }
-};
-class SSESub {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_sub_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_sub_pd(a, b);
-  }
-};
-class SSEMul {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(a, b);
-  }
-};
-class SSEDiv {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_div_ps(a, b);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_div_pd(a, b);
-  }
-};
-class SSESquaredDiff {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(_mm_sub_ps(a, b), _mm_sub_ps(a, b));
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(_mm_sub_pd(a, b), _mm_sub_pd(a, b));
-  }
-};
-class SSEFirst {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return a;
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return a;
-  }
-};
-class SSESecond {
-public:
-  static const bool sse = true;
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    return b;
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    return b;
-  }
-};
-class SSEClassificationError {
-public:
-  static const bool sse = true;
-  const real p;
-  union {__m128 f; __m128d d;} mp;
-  union {__m128 f; __m128d d;} result;
-public:
-  explicit SSEClassificationError(const real s) : p(s) {
-    if (sizeof(real) == sizeof(float)) {
-      mp.f = _mm_set1_ps(p);
-      result.f = _mm_set1_ps(1.0f);
-    } else {
-      mp.d = _mm_set1_pd(p);
-      result.d = _mm_set1_pd(1.0);
-    }
-  }
-  INLINE __m128 vecOp(const __m128 a, const __m128 b) const {
-    __m128 tmp1 = _mm_cmpgt_ps(a, mp.f);
-    __m128 tmp2 = _mm_cmpgt_ps(b, mp.f);
-    __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
-    return _mm_and_ps(tmp3, result.f);
-  }
-  INLINE __m128d vecOp(const __m128d a, const __m128d b) const {
-    __m128d tmp1 = _mm_cmpgt_pd(a, mp.d);
-    __m128d tmp2 = _mm_cmpgt_pd(b, mp.d);
-    __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
-    return _mm_and_pd(tmp3, result.d);
-  }
-};
-}  // namespace binary
-}  // namespace base
-#endif /* HL_MATRIX_BASE_SSE_CUH_ */
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -17,35 +17,35 @@ limitations under the License. */
 #include "hl_base.h"
-#if defined(__CUDA_ARCH__)
+#ifdef __CUDA_ARCH__
+/**
+ * CUDA kernel inline function
+ */
+#define INLINE   __device__ inline
+#else
+/**
+ * CPP inline function
+ */
+#define INLINE   inline
+#endif
+#ifdef __CUDA_ARCH__
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
 #endif
-#elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
+#elif defined(__SSE3__)
-#include <arm_neon.h>
+#include "hl_cpu_simd_sse.cuh"
-#ifndef PADDLE_TYPE_DOUBLE
+#define PADDLE_USE_SSE3
-typedef float32x4_t  vecType;
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
-#else
+// Currently nvcc does not support neon intrinsic.
-#error NEON instructions does not support double precision
+// TODO: Extract simd intrinsic implementation from .cu files.
-#endif
+#include "hl_cpu_simd_neon.cuh"
+#define PADDLE_USE_NEON
 #else
-#include <mmintrin.h>
+#include "hl_cpu_scalar.cuh"
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef __m128  vecType;
-#else
-typedef __m128d vecType;
-#endif
-#endif
-#ifdef __CUDA_ARCH__
-#define INLINE   __device__ inline
-#else
-#define INLINE   inline
 #endif
 #endif  // HL_MATRIX_TYPE_CUH_
--- a/paddle/cuda/include/hl_neon_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef HL_NEON_MATRIX_KERNEL_CUH_
-#define HL_NEON_MATRIX_KERNEL_CUH_
-#include "hl_matrix_type.cuh"
-#define VECTOR_SIZE     16
-/* number of float in vector */
-#define     VECTOR_LEN      4
-#define     VECTOR_SET      vdupq_n_f32
-inline bool hl_check_align(size_t size) {
-  return !(size & (VECTOR_SIZE - 1));
-}
-inline bool hl_check_align(void *ptr) {
-  return hl_check_align(reinterpret_cast<size_t>(ptr));
-}
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  float32x4_t rev = vrev64q_f32(mm);
-  float32x4_t tmp1 = agg.vecOp(rev, rev);
-  float32x2_t lo = vget_high_f32(rev);
-  float32x2_t hi = vget_low_f32(rev);
-  float32x4_t tmp2 = vcombine_f32(hi, lo);
-  float32x4_t ret = agg.vecOp(tmp1, tmp2);
-  return vgetq_lane_f32(ret, 0);
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-  for (int i = 0; i < dimM; i++, A += lda) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
-      mm = agg.vecOp(mm, op.vecOp(*a));
-    }
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-        tmp = agg(tmp, op(a[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-      dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    vecType *b = (vecType*)(B);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
-        mm = agg.vecOp(mm, op.vecOp(*a, *b));
-    }
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j], b[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-/*
- * MaxRow greater than or equal dimN
- * dimN is multiples of VECTOR_LEN
- * so rem <= MaxRow / VECTOR_LEN
- */
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-    }
-  }
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
-  }
-}
-/*
- * dimN is multiples of VECTOR_LEN
- * dimN greater than Step
- */
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-      }
-    }
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-}
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda,
-                               real *B, int ldb) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    vecType *b = (vecType*)(B + i * ldb);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-    }
-  }
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
-  }
-}
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      vecType *b = (vecType*)(B + i * ldb);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-      }
-    }
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(
-        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
-  }
-}
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-}
-#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -328,6 +328,208 @@ public:
  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
 };
+#ifdef PADDLE_USE_SSE3
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(a, b);
+  }
+};
+template <>
+class add_scale<__m128> {
+private:
+  const __m128 p1;
+  const __m128 p2;
+public:
+  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
+  }
+};
+template <>
+class sub<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_sub_ps(a, b);
+  }
+};
+template <>
+class mul<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_mul_ps(a, b);
+  }
+};
+template <>
+class div<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_div_ps(a, b);
+  }
+};
+template <>
+class min<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_min_ps(a, b);
+  }
+};
+template <>
+class max<__m128> {
+public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_max_ps(a, b);
+  }
+};
+#else
+template <>
+class add<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(a, b);
+  }
+};
+template <>
+class add_scale<__m128d> {
+private:
+  const __m128d p1;
+  const __m128d p2;
+public:
+  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
+  }
+};
+template <>
+class sub<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_sub_pd(a, b);
+  }
+};
+template <>
+class mul<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_mul_pd(a, b);
+  }
+};
+template <>
+class div<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_div_pd(a, b);
+  }
+};
+template <>
+class min<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_min_pd(a, b);
+  }
+};
+template <>
+class max<__m128d> {
+public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_max_pd(a, b);
+  }
+};
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_SSE3
+#ifdef PADDLE_USE_NEON
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+template <>
+class add_scale<float32x4_t> {
+private:
+  const float32x4_t p1;
+  const float32x4_t p2;
+public:
+  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
+      : p1(s1), p2(s2) {}
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
+  }
+};
+template <>
+class sub<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+template <>
+class mul<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+template <>
+class div<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    float32x4_t tmp = vrecpeq_f32(b);
+    return vmulq_f32(a, tmp);
+  }
+};
+template <>
+class min<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+template <>
+class max<float32x4_t> {
+public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+#else
+#error To be implemented
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_NEON
 }  // namespace binary
 }  // namespace hppl

--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -14,8 +14,8 @@ add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function gen_proto_cpp)
-if(WITH_GPU)
 if(WITH_TESTING)
+if(WITH_GPU)
    # TODO:
    # file(GLOB test_files . *OpTest.cpp)
    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
@@ -28,7 +28,10 @@ if(WITH_TESTING)
    add_simple_unittest(PadOpTest)
    add_simple_unittest(MulOpTest)
    add_simple_unittest(CosSimOpTest)
+    add_simple_unittest(RowConvOpTest)
 endif()
+add_simple_unittest(ConvOpTest)
 endif()
 add_style_check_target(paddle_function ${h_files})

--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -28,7 +28,7 @@ void testMatrixProjectionForward(int context_start,
               std::max(0, (int)(context_start + context_length - 1));
  if (pad == 0) is_padding = false;
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
      "ContextProjectionForward",
      FuncConfig()
          .set("context_length", context_length)
@@ -60,7 +60,7 @@ void testMatrixProjectionBackward(int context_start,
               std::max(0, (int)(context_start + context_length - 1));
  if (pad == 0) is_padding = false;
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
      "ContextProjectionBackward",
      FuncConfig()
          .set("context_length", context_length)

--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Function.h"
+namespace paddle {
+/*
+ * \brief Based on the ConvFunctionBase class, the forward calculation,
+ *        backward input calculation and backward filter calculation
+ *        of convolution operations can be implemented.
+ *
+ * Arguments of forward and backward calculation:
+ *   1. Forward calculation of convolution.
+ *      inputs = {INPUT, FILTER}, outputs = {OUTPUT}
+ *      The first and second input arguments are input image and filter data.
+ *      The output argument is output image.
+ *
+ *   2. Backward input calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and filter data.
+ *      The output argument is input grad image.
+ *
+ *   3. Backward filter calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and input image.
+ *      The output argument is filter grad.
+ *
+ * Arguments format of input, filter and output:
+ *   1. Input image, output image, input image gradient, output image gradient
+ *      are all NCHW format. Where N is batch size, C is the number of channels,
+ *      H and W is the height and width of image or image gradient.
+ *
+ *   2. The format of the filter data is MCHW, where M is the number of output
+ *      image channels, C is the number of input image channels,
+ *      H and W is height and width of filter.
+ *
+ *      If `groups` is greater than 1, the filter's data format should be GMCHW,
+ *      where G is the `groups`, and G * M is the number of output image
+ *      channels, G * C is the number of input image channels,
+ *      H and W is height and width of filter.
+ */
+class ConvFunctionBase : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    groups_ = config.get<size_t>("groups");
+    // number of inputs and outputs
+    numInputs_ = 2;
+    numOutputs_ = 1;
+  }
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
+  // input can be INPUT and INPUT_GRAD
+  // filter can be FILTER and FILTER_GRAD
+  // output can be OUTPUT and OUTPUT_GRAD
+  void check(const TensorShape& input,
+             const TensorShape& filter,
+             const TensorShape& output) {
+    // inputs and outputs arguments should be 4-dimensional.
+    CHECK_EQ(input.ndims(), (size_t)4);
+    CHECK_EQ(output.ndims(), (size_t)4);
+    // The batchSize of the input needs to be equal to
+    // the batchSize of the output.
+    CHECK_EQ(input[0], output[0]);
+    if (filter.ndims() == (size_t)4) {
+      // If the filter's dimension is 4, groups convolution is not supported.
+      CHECK_EQ(groups_, (size_t)1);
+      // The input and output channel dimensions are the second and first
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[1]);
+      CHECK_EQ(output[1], filter[0]);
+    } else {
+      // filter argument should be 5-dimensional.
+      CHECK_EQ(filter.ndims(), (size_t)5);
+      // The first dimension of the filter is the size of the group
+      CHECK_EQ(filter[0], groups_);
+      // The input and output channel dimensions are the third and second
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[2] * groups_);
+      CHECK_EQ(output[1], filter[1] * groups_);
+    }
+  }
+protected:
+  size_t getFilterHeight(const TensorShape& filter) const {
+    return filter[filter.ndims() - 2];
+  }
+  size_t getFilterWidth(const TensorShape& filter) const {
+    return filter[filter.ndims() - 1];
+  }
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  /// Group size, refer to grouped convolution in
+  /// Alex Krizhevsky's paper: when group=2, the first half of the
+  /// filters are only connected to the first half of the input channels,
+  /// and the second half only connected to the second half.
+  size_t groups_;
+  inline int strideH() const { return strides_[0]; }
+  inline int strideW() const { return strides_[1]; }
+  inline int paddingH() const { return paddings_[0]; }
+  inline int paddingW() const { return paddings_[1]; }
+  // A temporary memory in convolution calculation.
+  MemoryHandlePtr memory_;
+  template <DeviceType Device>
+  void resizeBuffer(size_t newSize) {
+    if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
+      if (Device == DEVICE_TYPE_CPU) {
+        memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
+      } else {
+        memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
+      }
+    }
+  }
+};
+}  // namespace paddle
--- a/paddle/function/ConvOpTest.cpp
+++ b/paddle/function/ConvOpTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include <memory>
+#include "Function.h"
+#include "FunctionTest.h"
+namespace paddle {
+enum TestType {
+  kForwardTest = 0,
+  kBackwardInputTest = 1,
+  kBackwardFilterTest = 2,
+};
+template <DeviceType DType1, DeviceType DType2>
+class ConvolutionTest {
+public:
+  ConvolutionTest(const std::string& conv1,
+                  const std::string& conv2,
+                  TestType type,
+                  std::string algo = "auto") {
+    for (size_t batchSize : {1, 32}) {
+      for (size_t inputSize : {7, 14, 54}) {
+        for (size_t filterSize : {1, 3, 5}) {
+          for (size_t inputChannels : {3, 64}) {
+            for (size_t outputChannels : {3, 64, 128}) {
+              if (inputChannels < outputChannels) break;
+              for (size_t stride : {1, 2}) {
+                for (size_t padding : {0, 1}) {
+                  if (padding >= filterSize) break;
+                  size_t outputSize =
+                      (inputSize - filterSize + 2 * padding + stride) / stride;
+                  VLOG(3) << " batchSize=" << batchSize
+                          << " inputChannels=" << inputChannels
+                          << " inputHeight=" << inputSize
+                          << " inputWidth=" << inputSize
+                          << " outputChannels=" << outputChannels
+                          << " filterHeight=" << filterSize
+                          << " filterWidth=" << filterSize
+                          << " outputHeight=" << outputSize
+                          << " outputWidth=" << outputSize
+                          << " stride=" << stride << " padding=" << padding;
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
+                  Compare2Function<DType1, DType2> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("groups", (size_t)1)
+                          .set("algo", algo));
+                  TensorShape input{
+                      batchSize, inputChannels, inputSize, inputSize};
+                  TensorShape filter{
+                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape output{
+                      batchSize, outputChannels, outputSize, outputSize};
+                  if (type == kForwardTest) {
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.run();
+                  } else if (type == kBackwardInputTest) {
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
+                    test.run();
+                  } else if (type == kBackwardFilterTest) {
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.run();
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+// Mainly used to test cases where the height and width (input, filter)
+// are not equal.
+template <DeviceType DType1, DeviceType DType2>
+class ConvolutionTest2 {
+public:
+  ConvolutionTest2(const std::string& conv1,
+                   const std::string& conv2,
+                   TestType type,
+                   std::string algo = "auto") {
+    for (size_t batchSize : {16}) {
+      for (size_t inputHeight : {7, 31}) {
+        for (size_t inputWidth : {10, 54}) {
+          for (size_t filterHeight : {1, 5}) {
+            for (size_t filterWidth : {3, 7}) {
+              for (size_t inputChannels : {7}) {
+                for (size_t outputChannels : {32}) {
+                  size_t stride = 1;
+                  size_t padding = 0;
+                  size_t outputHeight =
+                      (inputHeight - filterHeight + 2 * padding + stride) /
+                      stride;
+                  size_t outputWidth =
+                      (inputWidth - filterWidth + 2 * padding + stride) /
+                      stride;
+                  VLOG(3) << " batchSize=" << batchSize
+                          << " inputChannels=" << inputChannels
+                          << " inputHeight=" << inputHeight
+                          << " inputWidth=" << inputWidth
+                          << " outputChannels=" << outputChannels
+                          << " filterHeight=" << filterHeight
+                          << " filterWidth=" << filterWidth
+                          << " outputHeight=" << outputHeight
+                          << " outputWidth=" << outputWidth
+                          << " stride=" << stride << " padding=" << padding;
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
+                  Compare2Function<DType1, DType2> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("groups", (size_t)1)
+                          .set("algo", algo));
+                  TensorShape input{
+                      batchSize, inputChannels, inputHeight, inputWidth};
+                  TensorShape filter{
+                      outputChannels, inputChannels, filterHeight, filterWidth};
+                  TensorShape output{
+                      batchSize, outputChannels, outputHeight, outputWidth};
+                  if (type == kForwardTest) {
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.run();
+                  } else if (type == kBackwardInputTest) {
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
+                    test.run();
+                  } else if (type == kBackwardFilterTest) {
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.run();
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+TEST(Forward, GEMM) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+}
+#ifndef PADDLE_ONLY_CPU
+TEST(Forward, GEMM2) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+}
+TEST(BackwardInput, GEMM) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+}
+TEST(BackwardFilter, GEMM) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+}
+#endif
+}  // namespace paddle
--- a/paddle/function/CosSimOpTest.cpp
+++ b/paddle/function/CosSimOpTest.cpp
@@ -22,7 +22,7 @@ void testCosSimForward(size_t height_x,
                       size_t height_y,
                       size_t width,
                       real scale) {
-  FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
+  CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
  // prepare input arguments
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
@@ -36,7 +36,7 @@ void testCosSimBackward(size_t height_x,
                        size_t height_y,
                        size_t width,
                        real scale) {
-  FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
+  CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
  // prepare input arguments
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));

--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -28,7 +28,7 @@ TEST(CrossMapNormal, real) {
                    << " size=" << size;
            // init Test object
-            FunctionCompare test("CrossMapNormal",
+            CpuGpuFuncCompare test("CrossMapNormal",
                                   FuncConfig()
                                       .set("size", size)
                                       .set("scale", (real)1.5)
@@ -57,7 +57,7 @@ TEST(CrossMapNormalGrad, real) {
                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                    << " size=" << size;
-            FunctionCompare test("CrossMapNormalGrad",
+            CpuGpuFuncCompare test("CrossMapNormalGrad",
                                   FuncConfig()
                                       .set("size", size)
                                       .set("scale", (real)1.5)

--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -22,14 +22,62 @@ namespace paddle {
 typedef std::shared_ptr<BufferArg> BufferArgPtr;
+namespace test {
+template <DeviceType DType>
+struct Allocator;
+template <>
+struct Allocator<DEVICE_TYPE_CPU> {
+  using type = CpuMemoryHandle;
+};
+template <>
+struct Allocator<DEVICE_TYPE_GPU> {
+  using type = GpuMemoryHandle;
+};
+// Copy argument1 to argument2
+template <DeviceType DType1, DeviceType DType2>
+class CopyArgument {
+public:
+  void operator()(const BufferArg& arg1, BufferArg& arg2) {
+    CHECK_EQ(arg1.valueType(), arg2.valueType());
+    CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
+    if (arg1.valueType() == VALUE_TYPE_INT32) {
+      IVectorPtr vector1 =
+          IVector::create((int*)arg1.data(),
+                          arg1.shape().getElements(),
+                          DType1 == DEVICE_TYPE_CPU ? false : true);
+      IVectorPtr vector2 =
+          IVector::create((int*)arg2.data(),
+                          arg2.shape().getElements(),
+                          DType2 == DEVICE_TYPE_CPU ? false : true);
+      vector2->copyFrom(*vector1);
+    } else {
+      VectorPtr vector1 =
+          Vector::create((real*)arg1.data(),
+                         arg1.shape().getElements(),
+                         DType1 == DEVICE_TYPE_CPU ? false : true);
+      VectorPtr vector2 =
+          Vector::create((real*)arg2.data(),
+                         arg2.shape().getElements(),
+                         DType2 == DEVICE_TYPE_CPU ? false : true);
+      vector2->copyFrom(*vector1);
+    }
+  }
+};
+}  // namespace test
 /**
- * \brief A class for comparing CPU and GPU implementations of Function.
+ * \brief A class for comparing two Functions of different implementations.
- *
+ *        For example, can be used to compare the CPU and GPU implementation
+ *        of the function is consistent.
 *
 * Use case:
 *  // Initializes a test object, the corresponding cpu and gpu Function
 *  // are constructed according to FunctionName and FuncConfig.
- *  FunctionCompare test(FunctionName, FuncConfig);
+ *  CpuGpuFuncCompare test(FunctionName, FuncConfig);
 *  // Prepare inputs and outputs arguments.
 *  // Here the input and output can not contain real data,
 *  // only contains the argument type and shape.
@@ -45,28 +93,38 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
 *  // Compares CPU and GPU calculation results for consistency.
 *  test.run();
 */
-class FunctionCompare {
+template <DeviceType DType1, DeviceType DType2>
+class Compare2Function {
 public:
-  FunctionCompare(const std::string& name, const FuncConfig& config)
+  typedef typename test::Allocator<DType1>::type Allocator1;
-      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+  typedef typename test::Allocator<DType2>::type Allocator2;
-        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+  typedef typename Tensor<real, DType1>::Vector Vector1;
-    cpuFunc_->init(config);
+  typedef typename Tensor<real, DType2>::Vector Vector2;
-    gpuFunc_->init(config);
+  typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
+  typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
+  Compare2Function(const std::string& name1,
+                   const std::string& name2,
+                   const FuncConfig& config)
+      : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
+        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
+    function1_->init(config);
+    function2_->init(config);
  }
-  ~FunctionCompare() {}
+  ~Compare2Function() {}
  // input need only contains shape, do not contains data.
  void addInputs(const BufferArg& input) {
    size_t size =
        input.shape().getElements() * sizeOfValuType(input.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+    func1Inputs_.emplace_back(std::make_shared<BufferArg>(
-        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+        func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
-    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+    func2Inputs_.emplace_back(std::make_shared<BufferArg>(
-        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+        func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
  }
  // assume one copy of sequence is shared by different SequenceArgs
@@ -75,62 +133,57 @@ public:
    size_t batchSize = input.shape()[0];
    size_t numSeqs = batchSize / 10 + 1;
    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
-    cpuSeq_ = std::make_shared<SequenceIdArg>(cpuMemory_.back()->getBuf(),
+    seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
                                            TensorShape{numSeqs + 1});
-    gpuSeq_ = std::make_shared<SequenceIdArg>(gpuMemory_.back()->getBuf(),
+    seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
                                            TensorShape{numSeqs + 1});
    /// init sequence Id
-    initArg(*cpuSeq_, batchSize);
+    initArg(*seq1_, batchSize);
-    // todo(tianbing), delete it
+    copyArg_(*seq1_, *seq2_);
-    CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1);
-    CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data());
-    GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data());
-    gpuSeq.copyFrom(cpuSeq);
  }
  void addInputs(const SequenceArg& input) {
    CHECK_EQ(input.shape().ndims(), 2UL);
    size_t batchSize = input.shape()[0];
-    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+    if (!seq1_ || !seq2_) {  // sequence not exist
      addSequence(SequenceIdArg(TensorShape{batchSize}));
    }
    size_t size =
        input.shape().getElements() * sizeOfValuType(input.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
    /// SequenceArg
-    cpuInputs_.emplace_back(
+    func1Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
                                      input.valueType(),
                                      input.shape(),
-                                      *cpuSeq_));
+                                      *seq1_));
-    gpuInputs_.emplace_back(
+    func2Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
                                      input.valueType(),
                                      input.shape(),
-                                      *gpuSeq_));
+                                      *seq2_));
  }
  // output need only contains shape, do not contains data.
  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
    size_t size =
        output.shape().getElements() * sizeOfValuType(output.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-    cpuOutputs_.emplace_back(
+    func1Outputs_.emplace_back(
-        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+        std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
                                    output.valueType(),
                                    output.shape(),
                                    argType));
-    gpuOutputs_.emplace_back(
+    func2Outputs_.emplace_back(
-        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+        std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
                                    output.valueType(),
                                    output.shape(),
                                    argType));
@@ -138,14 +191,14 @@ public:
  /// add and init output sparse matrix
  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
-    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+    sparse1_ = std::make_shared<SparseMatrix1>(
        output.shape()[0],
        output.shape()[1],
        output.nnz(),
        static_cast<SparseValueType>(output.dataType()),
        static_cast<SparseFormat>(output.dataFormat()));
-    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+    sparse2_ = std::make_shared<SparseMatrix2>(
        output.shape()[0],
        output.shape()[1],
        output.nnz(),
@@ -154,52 +207,52 @@ public:
    /// init sparse matrix
    hl_stream_t stream(HPPL_STREAM_1);
-    cpuSparse_->randomizeUniform();
+    sparse1_->randomizeUniform();
-    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    sparse2_->copyFrom(*sparse1_, stream);
    hl_stream_synchronize(stream);
-    cpuOutputs_.emplace_back(
+    func1Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*cpuSparse_, argType));
+        std::make_shared<SparseMatrixArg>(*sparse1_, argType));
-    gpuOutputs_.emplace_back(
+    func2Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*gpuSparse_, argType));
+        std::make_shared<SparseMatrixArg>(*sparse2_, argType));
  }
  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
    CHECK_EQ(output.shape().ndims(), 2UL);
    size_t batchSize = output.shape()[0];
-    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+    if (!seq1_ || !seq2_) {  // sequence not exist
      addSequence(SequenceIdArg(TensorShape{batchSize}));
    }
    size_t size =
        output.shape().getElements() * sizeOfValuType(output.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
    /// SequenceArg
-    cpuOutputs_.emplace_back(
+    func1Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
                                      output.valueType(),
                                      output.shape(),
-                                      *cpuSeq_,
+                                      *seq1_,
                                      argType));
-    gpuOutputs_.emplace_back(
+    func2Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
                                      output.valueType(),
                                      output.shape(),
-                                      *gpuSeq_,
+                                      *seq2_,
                                      argType));
  }
  void addInputs(const SparseMatrixArg& input) {
-    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+    sparse1_ = std::make_shared<SparseMatrix1>(
        input.shape()[0],
        input.shape()[1],
        input.nnz(),
        static_cast<SparseValueType>(input.dataType()),
        static_cast<SparseFormat>(input.dataFormat()));
-    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+    sparse2_ = std::make_shared<SparseMatrix2>(
        input.shape()[0],
        input.shape()[1],
        input.nnz(),
@@ -208,12 +261,12 @@ public:
    /// init sparse matrix
    hl_stream_t stream(HPPL_STREAM_1);
-    cpuSparse_->randomizeUniform();
+    sparse1_->randomizeUniform();
-    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    sparse2_->copyFrom(*sparse1_, stream);
    hl_stream_synchronize(stream);
-    cpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*cpuSparse_));
+    func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
-    gpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*gpuSparse_));
+    func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
  }
  void run() {
@@ -236,27 +289,27 @@ public:
      function->calc(inArgs, outArgs);
    };
-    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
+    callFunction(function1_.get(), func1Inputs_, func1Outputs_);
-    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
+    callFunction(function2_.get(), func2Inputs_, func2Outputs_);
    // check outputs
    compareOutputs();
  }
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+  std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
+  std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
 protected:
  // only init cpu argument, gpu argument copy from cpu argument.
  void initArg(BufferArg& arg) {
-    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
    vector.uniform(0.001, 1);
  }
  void initArg(SequenceArg& arg) {
    /// init only matrix
-    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
    vector.uniform(0.001, 1);
  }
@@ -276,73 +329,72 @@ protected:
  }
  void initInputs() {
-    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+    for (size_t i = 0; i < func1Inputs_.size(); i++) {
-      if (cpuInputs_[i]->isSparseArg()) {
+      if (func1Inputs_[i]->isSparseArg()) {
        continue;  /// sparse matrix already init
      }
-      if (cpuInputs_[i]->isSequenceArg()) {
+      if (func1Inputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*cpuInputs_[i]));
+        initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
      } else {
-        initArg(*cpuInputs_[i]);
+        initArg(*func1Inputs_[i]);
      }
-      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
-      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
-                          (real*)cpuInputs_[i]->data());
-      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
-                          (real*)gpuInputs_[i]->data());
-      gpuVector.copyFrom(cpuVector);
+      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
    }
  }
  void initOutputs() {
-    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
-      if (cpuOutputs_[i]->isSparseArg()) {
+      if (func1Outputs_[i]->isSparseArg()) {
        continue;  /// sparse matrix already init
      }
-      if (cpuOutputs_[i]->isSequenceArg()) {
+      if (func1Outputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*cpuOutputs_[i]));
+        initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
      } else {
-        initArg(*cpuOutputs_[i]);
+        initArg(*func1Outputs_[i]);
      }
-      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
-      CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(),
-                          (real*)cpuOutputs_[i]->data());
-      GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(),
-                          (real*)gpuOutputs_[i]->data());
-      gpuVector.copyFrom(cpuVector);
    }
  }
  void compareOutputs() {
-    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
      // TODO, Need a BufferCheck used to compare the two buffers.
-      const auto cpu = cpuOutputs_[i];
+      const auto cpu = func1Outputs_[i];
-      const auto gpu = gpuOutputs_[i];
+      const auto gpu = func2Outputs_[i];
      CHECK_EQ(cpu->numElements(), gpu->numElements());
-      CpuVector cpuVector(cpu->numElements(), (real*)cpu->data());
+      Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
-      GpuVector gpuVector(gpu->numElements(), (real*)gpu->data());
+      Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
      autotest::TensorCheckErr(cpuVector, gpuVector);
    }
  }
 protected:
-  std::shared_ptr<FunctionBase> cpuFunc_;
+  std::shared_ptr<FunctionBase> function1_;
-  std::shared_ptr<FunctionBase> gpuFunc_;
+  std::shared_ptr<FunctionBase> function2_;
-  std::vector<CpuMemHandlePtr> cpuMemory_;
+  std::vector<std::shared_ptr<Allocator1>> func1Memory_;
-  std::vector<GpuMemHandlePtr> gpuMemory_;
+  std::vector<std::shared_ptr<Allocator2>> func2Memory_;
-  std::vector<BufferArgPtr> cpuInputs_;
+  std::vector<BufferArgPtr> func1Inputs_;
-  std::vector<BufferArgPtr> cpuOutputs_;
+  std::vector<BufferArgPtr> func1Outputs_;
-  std::vector<BufferArgPtr> gpuInputs_;
+  std::vector<BufferArgPtr> func2Inputs_;
-  std::vector<BufferArgPtr> gpuOutputs_;
+  std::vector<BufferArgPtr> func2Outputs_;
-  std::shared_ptr<CpuSparseMatrix> cpuSparse_;
+  std::shared_ptr<SparseMatrix1> sparse1_;
-  std::shared_ptr<GpuSparseMatrix> gpuSparse_;
+  std::shared_ptr<SparseMatrix2> sparse2_;
-  std::shared_ptr<SequenceIdArg> cpuSeq_;
+  std::shared_ptr<SequenceIdArg> seq1_;
-  std::shared_ptr<SequenceIdArg> gpuSeq_;
+  std::shared_ptr<SequenceIdArg> seq2_;
+  test::CopyArgument<DType1, DType2> copyArg_;
+};
+class CpuGpuFuncCompare
+    : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
+public:
+  CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
+      : Compare2Function(name + "-CPU", name + "-GPU", config) {}
+  ~CpuGpuFuncCompare() {}
 };
 }  // namespace paddle
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "GemmConvOp.h"
+#include "GemmFunctor.h"
+#include "paddle/math/MemoryHandle.h"
+namespace paddle {
+/*
+ * imData = [input_channels, input_height, input_width]
+ * colData = [input_channels, filter_height, filter_width,
+ *            output_height, output_width]
+ */
+template <class T>
+class Im2ColFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData) {
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+template <class T>
+class Col2ImFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData) {
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            imData[imRowIdx * inputWidth + imColIdx] +=
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+/*
+ * \brief Forward calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid useing 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    check(input, filter, output);
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
+                  outputHeight * outputWidth;
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    Im2ColFunctor<Device, real> im2col;
+    GemmFunctor<Device, real> gemm;
+    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        im2col(inputData + g * inputOffset,
+               inputChannels / groups_,
+               inputHeight,
+               inputWidth,
+               filterHeight,
+               filterWidth,
+               strideH(),
+               strideW(),
+               paddingH(),
+               paddingW(),
+               outputHeight,
+               outputWidth,
+               colData);
+        int M = outputChannels / groups_;
+        int N = outputHeight * outputWidth;
+        int K = inputChannels / groups_ * filterHeight * filterWidth;
+        gemm(CblasNoTrans,
+             CblasNoTrans,
+             M,
+             N,
+             K,
+             1.0f,
+             filterData + g * filterOffset,
+             K,
+             colData,
+             N,
+             beta,
+             outputData + g * outputOffset,
+             N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+/*
+ * \brief Backward input calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvGradInputFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    check(input, filter, output);
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
+                  outputHeight * outputWidth;
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    Col2ImFunctor<Device, real> col2im;
+    GemmFunctor<Device, real> gemm;
+    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        int K = outputChannels / groups_;
+        int N = outputHeight * outputWidth;
+        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        gemm(CblasTrans,
+             CblasNoTrans,
+             M,
+             N,
+             K,
+             1.0f,
+             filterData + g * filterOffset,
+             M,
+             outputGrad + g * outputOffset,
+             N,
+             0.0f,
+             colData,
+             N);
+        col2im(colData,
+               inputChannels / groups_,
+               inputHeight,
+               inputWidth,
+               filterHeight,
+               filterWidth,
+               strideH(),
+               strideW(),
+               paddingH(),
+               paddingW(),
+               outputHeight,
+               outputWidth,
+               inputGrad + g * inputOffset);
+      }
+      inputGrad += inputChannels * inputHeight * inputWidth;
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+/*
+ * \brief Backward filter calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvGradFilterFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    check(input, filter, output);
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
+                  outputHeight * outputWidth;
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+    Im2ColFunctor<Device, real> im2col;
+    GemmFunctor<Device, real> gemm;
+    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        im2col(inputData + g * inputOffset,
+               inputChannels / groups_,
+               inputHeight,
+               inputWidth,
+               filterHeight,
+               filterWidth,
+               strideH(),
+               strideW(),
+               paddingH(),
+               paddingW(),
+               outputHeight,
+               outputWidth,
+               colData);
+        int M = outputChannels / groups_;
+        int K = outputHeight * outputWidth;
+        int N = inputChannels / groups_ * filterHeight * filterWidth;
+        gemm(CblasNoTrans,
+             CblasTrans,
+             M,
+             N,
+             K,
+             1.0f,
+             outputGrad + g * outputOffset,
+             K,
+             colData,
+             K,
+             i == 0 ? beta : 1.0f,
+             filterGrad + g * filterOffset,
+             N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
+REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
+#endif
+}  // namespace paddle
--- a/paddle/function/GemmConvOp.h
+++ b/paddle/function/GemmConvOp.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ConvOp.h"
+namespace paddle {
+/*
+ * imData = [input_channels, input_height, input_width]
+ * colData = [input_channels, filter_height, filter_width,
+ *            output_height, output_width]
+ */
+template <DeviceType Device, class T>
+class Im2ColFunctor {
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData);
+};
+template <DeviceType Device, class T>
+class Col2ImFunctor {
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData);
+};
+}  // namespace paddle
--- a/paddle/function/GemmConvOpGpu.cu
+++ b/paddle/function/GemmConvOpGpu.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ConvOp.h"
+#include "GemmConvOp.h"
+namespace paddle {
+template<class T>
+__global__
+void im2col(const T* data_im, int numOuts, int height, int width,
+            int blockH, int blockW,
+            int strideH, int strideW,
+            int paddingH, int paddingW,
+            int height_col, int width_col,
+            T* data_col) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;
+    int h_in = h_out * strideH;
+    int w_in = w_out * strideW;
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in+i);
+        int cIdx = int(w_in+j);
+        if ((rIdx-(int)paddingH) >= (int)height ||
+            (rIdx-(int)paddingH) < 0 ||
+            (cIdx-(int)paddingW) >= (int)width ||
+            (cIdx-(int)paddingW) < 0) {
+          *data_col = 0;
+        } else {
+          rIdx = rIdx + channel_in*height - paddingH;
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx* width + cIdx];
+        }
+        data_col += height_col * width_col;
+      }
+    }
+  }
+}
+template <class T>
+class Im2ColFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* colData) {
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 -1) / 1024;
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
+         strideHeight, strideWidth, paddingHeight, paddingWidth,
+         outputHeight, outputWidth, colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+template<class T>
+__global__
+void col2im(size_t n, const T* data_col, size_t height,
+            size_t width, size_t channels,
+            size_t blockH, size_t blockW,
+            size_t strideH, size_t strideW,
+            size_t paddingH, size_t paddingW,
+            size_t height_col, size_t width_col,
+            T* data_im) {
+  size_t index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {
+    T val = 0;
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    if ((w - (int)paddingW) >= 0 &&
+        (w - (int)paddingW) < (width-2 * paddingW) &&
+        (h - (int)paddingH) >= 0 &&
+        (h - paddingH) < (height - 2 * paddingH)) {
+      // compute the start and end of the output
+      int w_col_start =
+        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+      int w_col_end =
+        min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+      int h_col_end = min(int(h / strideH + 1), int(height_col));
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // the col location: [c * width * height + h_out, w_out]
+          int c_col = int(c * blockH* blockW) + \
+            (h - h_col * (int)strideH) * (int)blockW +
+            (w - w_col * (int)strideW);
+          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+        }
+      }
+      h -= paddingH;
+      w -= paddingW;
+      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
+              h*(width-2*paddingW) + w] += val;
+    }
+  }
+}
+template <class T>
+class Col2ImFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* colData,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int outputHeight,
+                  int outputWidth,
+                  T* imData) {
+    size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
+        * (inputWidth + 2*paddingWidth);
+    size_t blocks = (numKernels + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    // To avoid involving atomic operations, we will launch one kernel per
+    // bottom dimension, and then in the kernel add up the top dimensions.
+    col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+             (numKernels,
+              colData,
+              inputHeight + 2*paddingHeight,
+              inputWidth + 2*paddingWidth,
+              inputChannels,
+              filterHeight,
+              filterWidth,
+              strideHeight,
+              strideWidth,
+              paddingHeight,
+              paddingWidth,
+              outputHeight,
+              outputWidth,
+              imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+template class Im2ColFunctor<DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<DEVICE_TYPE_GPU, double>;
+}  // namespace paddle
--- a/paddle/function/GemmFunctor.h
+++ b/paddle/function/GemmFunctor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/math/MathFunctions.h"
+namespace paddle {
+// TODO(hedaoyuan): Since the hl_matrix_mul interface does not conform to the
+// cblas_dgemm interface's parameter format, it is necessary to introduce
+// GemmFunctor as a new interface. Later, when considering the implementation
+// of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
+// interface.
+template <DeviceType Device, class T>
+class GemmFunctor {
+public:
+  void operator()(const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE TransB,
+                  const int M,
+                  const int N,
+                  const int K,
+                  const T alpha,
+                  const T* A,
+                  const int lda,
+                  const T* B,
+                  const int ldb,
+                  const T beta,
+                  T* C,
+                  const int ldc);
+};
+template <class T>
+class GemmFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE TransB,
+                  const int M,
+                  const int N,
+                  const int K,
+                  const T alpha,
+                  const T* A,
+                  const int lda,
+                  const T* B,
+                  const int ldb,
+                  const T beta,
+                  T* C,
+                  const int ldc) {
+    gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+  }
+};
+template <class T>
+class GemmFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE TransB,
+                  const int M,
+                  const int N,
+                  const int K,
+                  const T alpha,
+                  const T* A,
+                  const int lda,
+                  const T* B,
+                  const int ldb,
+                  const T beta,
+                  T* C,
+                  const int ldc) {
+    hl_matrix_mul((T*)A,
+                  transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
+                  (T*)B,
+                  TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+}  // namespace paddle
--- a/paddle/function/MulOpTest.cpp
+++ b/paddle/function/MulOpTest.cpp
@@ -35,7 +35,7 @@ void testFuncDDDMatrix(
  size_t heightC = dimM;
  size_t widthC = dimN;
  // init Test object
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
  // prepare input arguments
  /// matrix A : HA * WA
@@ -81,8 +81,8 @@ void testFuncDSparseDMatrix(
    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
  real scaleT = 1.0;
  // init Test object
-  FunctionCompare test("MulOp",
+  CpuGpuFuncCompare test(
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
  // prepare input arguments
  /// sparse matrix A : M * K
  test.addInputs(SparseMatrixArg(
@@ -126,8 +126,8 @@ void testFuncDDSparseMatrix(
    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
  real scaleT = 1.0;
  // init Test object
-  FunctionCompare test("MulOp",
+  CpuGpuFuncCompare test(
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
  // prepare input arguments
  /// matrix A : M * K
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
@@ -172,8 +172,8 @@ void testFuncSparseDDMatrix(
    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
  real scaleT = 1.0;
  // init Test object
-  FunctionCompare test("MulOp",
+  CpuGpuFuncCompare test(
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
  // prepare input arguments
  /// matrix A : M * K
  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));

--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ConvOp.h"
+namespace paddle {
+/*
+ * The three arguments are stored in memory in row major order.
+ * inputData  = [batchSize, inputChannels, inputHeight, inputWidth]
+ * filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
+ * outputData = [batchSize, outputChannels, outputHeight, outputWidth]
+ */
+template <class T>
+class NaiveConvFunctor {
+public:
+  void operator()(const T* inputData,
+                  size_t batchSize,
+                  size_t inputChannels,
+                  size_t inputHeight,
+                  size_t inputWidth,
+                  const T* filterData,
+                  size_t filterHeight,
+                  size_t filterWidth,
+                  T* outputData,
+                  size_t outputChannels,
+                  size_t outputHeight,
+                  size_t outputWidth,
+                  size_t paddingH,
+                  size_t paddingW,
+                  size_t strideH,
+                  size_t strideW) {
+    for (size_t batch = 0; batch < batchSize; batch++) {
+      for (size_t outC = 0; outC < outputChannels; outC++) {
+        for (size_t outH = 0; outH < outputHeight; outH++) {
+          for (size_t outW = 0; outW < outputWidth; outW++) {
+            const int inStartH = (outH * strideH) - paddingH;
+            const int inStartW = (outW * strideW) - paddingW;
+            T outValue = (T)0;
+            for (size_t inC = 0; inC < inputChannels; inC++) {
+              for (size_t fH = 0; fH < filterHeight; fH++) {
+                for (size_t fW = 0; fW < filterWidth; fW++) {
+                  T inValue;
+                  const int inH = inStartH + fH;
+                  const int inW = inStartW + fW;
+                  if ((inH >= 0 && inH < inputHeight) &&
+                      (inW >= 0 && inW < inputWidth)) {
+                    size_t offsetInput =
+                        batch * inputChannels * inputHeight * inputWidth +
+                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
+                    inValue = inputData[offsetInput];
+                  } else {
+                    inValue = (T)0;
+                  }
+                  size_t offsetFilter =
+                      outC * inputChannels * filterHeight * filterWidth +
+                      inC * filterHeight * filterWidth + fH * filterWidth + fW;
+                  T filterValue = filterData[offsetFilter];
+                  outValue += (inValue * filterValue);
+                }
+              }
+            }
+            size_t offset =
+                batch * outputChannels * outputHeight * outputWidth +
+                outC * outputHeight * outputWidth + outH * outputWidth + outW;
+            outputData[offset] = outValue;
+          }
+        }
+      }
+    }
+  }
+};
+template <DeviceType Device>
+class NaiveConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    check(input, filter, output);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    size_t batchSize = inputs[0].shape()[0];
+    size_t inputChannels = inputs[0].shape()[1];
+    size_t inputHeight = inputs[0].shape()[2];
+    size_t inputWidth = inputs[0].shape()[3];
+    size_t filterHeight = inputs[1].shape()[2];
+    size_t filterWidth = inputs[1].shape()[3];
+    size_t outputChannels = outputs[0].shape()[1];
+    size_t outputHeight = outputs[0].shape()[2];
+    size_t outputWidth = outputs[0].shape()[3];
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    NaiveConvFunctor<real> conv;
+    conv(inputData,
+         batchSize,
+         inputChannels,
+         inputHeight,
+         inputWidth,
+         filterData,
+         filterHeight,
+         filterWidth,
+         outputData,
+         outputChannels,
+         outputHeight,
+         outputWidth,
+         paddingH(),
+         paddingW(),
+         strideH(),
+         strideW());
+  }
+};
+REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
+}  // namespace paddle
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -25,7 +25,7 @@ TEST(Pad, real) {
          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
          for (bool test_grad : {false, true}) {
-            FunctionCompare compare(
+            CpuGpuFuncCompare compare(
                test_grad ? "PadGrad" : "Pad",
                FuncConfig()
                    .set<std::vector<uint32_t>>("channel", {2, 3})

--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "RowConvOp.h"
+#include <iostream>
+#include "paddle/math/Vector.h"
+namespace paddle {
+template <>
+void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                              const CpuMatrix& in,
+                              const CpuMatrix& filter,
+                              const CpuIVector& seq) {
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  for (size_t i = 0; i < numSeq; ++i) {
+    size_t begin = starts[i];
+    size_t end = starts[i + 1];
+    for (size_t j = begin; j < end; ++j) {
+      MatrixPtr x;
+      MatrixPtr w;
+      if ((j + contextLength) < end) {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
+      } else {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
+      }
+      MatrixPtr y = out.subMatrix(j, 1);
+      y->addDotMulVMM(*x, *w);
+    }
+  }
+}
+template <>
+void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
+                                  const CpuMatrix& in,
+                                  const CpuMatrix& filter,
+                                  CpuMatrix& inG,
+                                  CpuMatrix& filterG,
+                                  const CpuIVector& seq) {
+  // gradient w.r.t filter
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  if (filterG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
+        MatrixPtr x =
+            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
+        MatrixPtr dy =
+            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
+        MatrixPtr dw = filterG.subMatrix(j, 1);
+        dw->addDotMulVMM(*dy, *x);
+      }
+    }
+  }
+  // gradient w.r.t input feature
+  if (inG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < steps; ++j) {
+        MatrixPtr dx = inG.subMatrix(begin + j, 1);
+        for (size_t t = 0; t < contextLength; ++t) {
+          if (int(j - t) >= 0) {
+            MatrixPtr dy =
+                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
+            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
+            dx->addDotMul(*dy, *w, 1.0, 1.0);
+          }
+        }
+      }
+    }
+  }
+}
+/**
+ * \brief The row convolution is called lookahead convolution. It is firstly
+ * introduced in deep-speech2 system. The bidirectional RNN that learns
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online and low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connection of row convolution is different form the 1D sequence
+ * convolution. Assumed that, the future context-length is k, that is to say,
+ * it can get the output at timestep t by using the the input feature from t-th
+ * timestep to (t+k)-th timestep. Assumed that the hidden dim of input
+ * activations are d, the activations r_t for the new layer at time-step t are:
+ *
+ *
+ *            -- k + 1
+ *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
+ *            -- j = 1
+ *
+ *
+ * The weight shape is: (k + 1) x d
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[0]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[1] The output activations.
+ *
+ * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in
+ * English
+ *     and Mandarin. https://arxiv.org/abs/1512.02595
+ */
+template <DeviceType Device>
+class RowConvFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // TODO(qingqing): support ASSIGN_TO.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto w = inputs[1];
+    CHECK(in.data() && out.data() && in.getSequenceId().data());
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == out.shape());
+    CHECK_EQ(w.shape()[1], in.shape()[1]);
+    auto outMat = out.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+    RowConv<Device>(outMat, inMat, wMat, seqId);
+  }
+};
+/**
+ * \brief The backward of row convolution function. This function calculated
+ * the gradient w.r.t filter and the gradient w.r.t input activations(or data).
+ *
+ * Argument in this Function:
+ *
+ * \param inputs[0]  The gradient w.r.t output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[0] The gradient w.r.t input activations.
+ * \param outputs[1] The gradient w.r.r filter.
+ *
+ * Abbreviation:
+ * w.r.t: with respect to.
+ */
+template <DeviceType Device>
+class RowConvGradFunc : public FunctionBase {
+  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
+public:
+  void init(const FuncConfig& config) override {}
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
+    const auto w = inputs[2];
+    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto wGrad = outputs[1];
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == inGrad.shape());
+    CHECK(in.shape() == outGrad.shape());
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+    const auto outGMat = outGrad.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    auto inGMat = inGrad.data()
+                      ? inGrad.matrix<Device>()
+                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    auto wGMat = wGrad.data()
+                     ? wGrad.matrix<Device>()
+                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
+  }
+};
+REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
+#endif
+}  // namespace paddle
--- a/paddle/function/RowConvOp.h
+++ b/paddle/function/RowConvOp.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Function.h"
+namespace paddle {
+/**
+ * \brief The forward of row convolution.
+ *
+ * \param[out] out      The output data and shape is h x d. h is the sum of
+ *                      time steps of all samples in one mini-batch.
+ * \param[in]  in       The input data and shape is h x d.
+ * \param[in]  filter   The filter and shape is k x d. The lookahead step
+ *                      number plus one equals k.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConv(typename Tensor<real, DType>::Matrix& out,
+             const typename Tensor<real, DType>::Matrix& in,
+             const typename Tensor<real, DType>::Matrix& filter,
+             const typename Tensor<int, DType>::Vector& seq);
+/**
+ * \brief The backward of row convolution.
+ *
+ * \param[in]  outG     The gradient w.r.t output data.
+ * \param[in]  in       The input data.
+ * \param[in]  filter   The filter.
+ * \param[out] inG      The gradient w.r.t input data.
+ * \param[out] filterG  The gradient w.r.t filter.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConvGrad(const typename Tensor<real, DType>::Matrix& outG,
+                 const typename Tensor<real, DType>::Matrix& in,
+                 const typename Tensor<real, DType>::Matrix& filter,
+                 typename Tensor<real, DType>::Matrix& inG,
+                 typename Tensor<real, DType>::Matrix& filterG,
+                 const typename Tensor<int, DType>::Vector& seq);
+}  // namespace paddle
--- a/paddle/function/RowConvOpGpu.cu
+++ b/paddle/function/RowConvOpGpu.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "hl_base.h"
+#include "RowConvOp.h"
+namespace paddle {
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConv(real* y, const real* x,  const real* w,
+    const int* starts, const int height, const int width,
+    const int numSeq, const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
+  }
+  __syncthreads();
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context; ++t) {
+        if ((start + j + t) < end) {
+          int xoff = off + t * width;
+          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+          sum += sw[t][tidx] * xVal;
+        }
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+__global__ void KeRowConv2(real* y, const real* x,  const real* w,
+    const int* starts, const int height, const int width,
+    const int numSeq, const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      int off = (start + j) * width;
+      real sum = 0;
+      for (int t = 0; t < context && (start + j + t) < end; ++t) {
+        int xoff = off + t * width;
+        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wd * xd;
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+template <>
+void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+  real* y = out.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+  if (contextLength <= 32) {
+    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+      (y, x, w, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+      (y, x, w, starts, height, width, numSeq, contextLength);
+  }
+  CHECK_SYNC("RowConv");
+}
+template<int BLOCK_H, int BLOCK_W, int CONTEXT>
+__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sh_x[BLOCK_W][BLOCK_H];
+  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
+  __shared__ real sh_dw[CONTEXT][BLOCK_W];
+  if (tidy < context) {
+    sh_dw[tidy][tidx] = 0.0;
+  }
+  __syncthreads();
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+      // transpose
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      if (tidy < (context - 1)) {
+        yoff = yoff - context + 1;
+        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+      }
+      __syncthreads();
+      for (int t = 0; t < context; t++) {
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+        if (tidx == 0) {
+          sh_dw[t][tidy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
+    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
+  }
+}
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+      // transpose
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      for (int t = 0; t < context; t++) {
+        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
+        __syncthreads();
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+        if (tidx == 0 && (gidx + tidy) < width) {
+          dw[t*width + gidx + tidy] += val;
+        }
+      }
+    }
+  }
+}
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
+  }
+  __syncthreads();
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        sum += sw[t][tidx] * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wVal * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+template <>
+void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              GpuMatrix& inG,
+                              GpuMatrix& filterG,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+  const real* dy = outG.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+  if (filterG) {
+    dim3 dimBlock(32, 32);
+    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+    real* dw = filterG.getData();
+    if (contextLength <= 32) { 
+      KeRowConvBwWeight<32, 32, 32>
+        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwWeight2<32, 32>
+        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+  if (inG) {
+    real* dx = inG.getData();
+    dim3 dimBlock2(32, 32);
+    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+    if (contextLength <= 64) {
+      KeRowConvBwData<32, 64>
+        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwData2
+        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+  CHECK_SYNC("RowConvGrad");
+}
+}  // namespace paddle
--- a/paddle/function/RowConvOpTest.cpp
+++ b/paddle/function/RowConvOpTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+namespace paddle {
+void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
+  CpuGpuFuncCompare test("RowConv", FuncConfig());
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.run();
+}
+void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
+  CpuGpuFuncCompare test("RowConvGrad", FuncConfig());
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
+                  ADD_TO);
+  test.run();
+}
+TEST(RowConv, real) {
+  for (size_t numSamples : {17, 129, 2020}) {
+    for (size_t dim : {16, 512, 2560}) {
+      for (size_t context : {3, 19, 65}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " context length=" << context;
+        testRowConvFw(numSamples, dim, context);
+        testRowConvBw(numSamples, dim, context);
+      }
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -118,11 +118,7 @@ size_t ConvBaseLayer::calOutputSize() {
    layerSize = outH[0] * outW[0] * size_t(numFilters_);
  };
-  if (isDeconv_) {
-    setLayerSize(outputH_, outputW_, imgSizeH_, imgSizeW_);
-  } else {
  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
-  }
  return layerSize;
 }

--- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -70,14 +70,8 @@ void CudnnConvBaseLayer::forward(PassType passType) {
  if (biases_) {
    REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
    int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-    int outH, outW;
+    int outH = outputH_[0];
-    if (isDeconv_) {
+    int outW = outputW_[0];
-      outH = imgSizeH_[0];
-      outW = imgSizeW_[0];
-    } else {
-      outH = outputH_[0];
-      outW = outputW_[0];
-    }
    hl_tensor_reshape(outputDesc_,
                      batchSize,

--- a/paddle/gserver/layers/DetectionUtil.cpp
+++ b/paddle/gserver/layers/DetectionUtil.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "DetectionUtil.h"
+namespace paddle {
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  bool useGpu = inMatrix.useGpu();
+  if (permMode == kNCHWToNHWC) {
+    size_t inElementCnt = inMatrix.getElementCnt();
+    size_t channels = inElementCnt / (height * width * batchSize);
+    size_t imgSize = height * width;
+    for (size_t i = 0; i < batchSize; ++i) {
+      size_t offset = i * (outTotalSize / batchSize) + outOffset;
+      const MatrixPtr inTmp = Matrix::create(
+          const_cast<real*>(inMatrix.getData()) + i * channels * imgSize,
+          channels,
+          imgSize,
+          false,
+          useGpu);
+      MatrixPtr outTmp =
+          Matrix::create(const_cast<real*>(outMatrix.getData()) + offset,
+                         imgSize,
+                         channels,
+                         false,
+                         useGpu);
+      inTmp->transpose(outTmp, false);
+    }
+    return channels * imgSize;
+  } else {
+    LOG(FATAL) << "Unkown permute mode";
+  }
+}
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t inTotalSize,
+                            size_t inOffset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  bool useGpu = inMatrix.useGpu();
+  if (permMode == kNHWCToNCHW) {
+    size_t outElementCnt = outMatrix.getElementCnt();
+    size_t channels = outElementCnt / (height * width * batchSize);
+    size_t imgSize = height * width;
+    for (size_t i = 0; i < batchSize; ++i) {
+      size_t offset = i * (inTotalSize / batchSize) + inOffset;
+      const MatrixPtr inTmp =
+          Matrix::create(const_cast<real*>(inMatrix.getData()) + offset,
+                         imgSize,
+                         channels,
+                         false,
+                         useGpu);
+      MatrixPtr outTmp = Matrix::create(
+          const_cast<real*>(outMatrix.getData()) + i * channels * imgSize,
+          channels,
+          imgSize,
+          false,
+          useGpu);
+      inTmp->transpose(outTmp, false);
+    }
+    return channels * imgSize;
+  } else {
+    LOG(FATAL) << "Unkown permute mode";
+  }
+}
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) {
+  if (bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
+      bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin) {
+    return 0.0;
+  } else {
+    real interXMin = std::max(bbox1.xMin, bbox2.xMin);
+    real interYMin = std::max(bbox1.yMin, bbox2.yMin);
+    real interXMax = std::min(bbox1.xMax, bbox2.xMax);
+    real interYMax = std::min(bbox1.yMax, bbox2.yMax);
+    real interWidth = interXMax - interXMin;
+    real interHeight = interYMax - interYMin;
+    real interArea = interWidth * interHeight;
+    real bboxArea1 = bbox1.getArea();
+    real bboxArea2 = bbox2.getArea();
+    return interArea / (bboxArea1 + bboxArea2 - interArea);
+  }
+}
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec) {
+  real priorBBoxWidth = priorBBox.getWidth();
+  real priorBBoxHeight = priorBBox.getHeight();
+  real priorBBoxCenterX = priorBBox.getCenterX();
+  real priorBBoxCenterY = priorBBox.getCenterY();
+  real gtBBoxWidth = gtBBox.getWidth();
+  real gtBBoxHeight = gtBBox.getHeight();
+  real gtBBoxCenterX = gtBBox.getCenterX();
+  real gtBBoxCenterY = gtBBox.getCenterY();
+  outVec.clear();
+  outVec.push_back((gtBBoxCenterX - priorBBoxCenterX) / priorBBoxWidth /
+                   priorBBoxVar[0]);
+  outVec.push_back((gtBBoxCenterY - priorBBoxCenterY) / priorBBoxHeight /
+                   priorBBoxVar[1]);
+  outVec.push_back(std::log(std::fabs(gtBBoxWidth / priorBBoxWidth)) /
+                   priorBBoxVar[2]);
+  outVec.push_back(std::log(std::fabs(gtBBoxHeight / priorBBoxHeight)) /
+                   priorBBoxVar[3]);
+}
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData) {
+  real priorBBoxWidth = priorBBox.getWidth();
+  real priorBBoxHeight = priorBBox.getHeight();
+  real priorBBoxCenterX = priorBBox.getCenterX();
+  real priorBBoxCenterY = priorBBox.getCenterY();
+  real decodedBBoxCenterX =
+      priorBBoxVar[0] * locPredData[0] * priorBBoxWidth + priorBBoxCenterX;
+  real decodedBBoxCenterY =
+      priorBBoxVar[1] * locPredData[1] * priorBBoxHeight + priorBBoxCenterY;
+  real decodedBBoxWidth =
+      std::exp(priorBBoxVar[2] * locPredData[2]) * priorBBoxWidth;
+  real decodedBBoxHeight =
+      std::exp(priorBBoxVar[3] * locPredData[3]) * priorBBoxHeight;
+  NormalizedBBox decodedBBox;
+  decodedBBox.xMin = decodedBBoxCenterX - decodedBBoxWidth / 2;
+  decodedBBox.yMin = decodedBBoxCenterY - decodedBBoxHeight / 2;
+  decodedBBox.xMax = decodedBBoxCenterX + decodedBBoxWidth / 2;
+  decodedBBox.yMax = decodedBBoxCenterY + decodedBBoxHeight / 2;
+  return decodedBBox;
+}
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {
+  size_t outOffset = bboxVec.size();
+  bboxVec.resize(bboxVec.size() + numBBoxes);
+  for (size_t i = 0; i < numBBoxes; ++i) {
+    NormalizedBBox bbox;
+    bbox.xMin = *(priorData + i * 8);
+    bbox.yMin = *(priorData + i * 8 + 1);
+    bbox.xMax = *(priorData + i * 8 + 2);
+    bbox.yMax = *(priorData + i * 8 + 3);
+    bboxVec[outOffset + i] = bbox;
+  }
+}
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec) {
+  size_t outOffset = varVec.size();
+  varVec.resize(varVec.size() + num);
+  for (size_t i = 0; i < num; ++i) {
+    vector<real> var;
+    var.push_back(*(priorData + i * 8 + 4));
+    var.push_back(*(priorData + i * 8 + 5));
+    var.push_back(*(priorData + i * 8 + 6));
+    var.push_back(*(priorData + i * 8 + 7));
+    varVec[outOffset + i] = var;
+  }
+}
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {
+  size_t outOffset = bboxVec.size();
+  bboxVec.resize(bboxVec.size() + numBBoxes);
+  for (size_t i = 0; i < numBBoxes; ++i) {
+    NormalizedBBox bbox;
+    bbox.xMin = *(labelData + i * 6 + 1);
+    bbox.yMin = *(labelData + i * 6 + 2);
+    bbox.xMax = *(labelData + i * 6 + 3);
+    bbox.yMax = *(labelData + i * 6 + 4);
+    real isDifficult = *(labelData + i * 6 + 5);
+    if (std::abs(isDifficult - 0.0) < 1e-6)
+      bbox.isDifficult = false;
+    else
+      bbox.isDifficult = true;
+    bboxVec[outOffset + i] = bbox;
+  }
+}
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec) {
+  size_t outOffset = bboxVec.size();
+  labelVec.resize(outOffset + numBBoxes);
+  scoreVec.resize(outOffset + numBBoxes);
+  bboxVec.resize(outOffset + numBBoxes);
+  for (size_t i = 0; i < numBBoxes; ++i) {
+    labelVec[outOffset + i] = *(detectData + i * 7 + 1);
+    scoreVec[outOffset + i] = *(detectData + i * 7 + 2);
+    NormalizedBBox bbox;
+    bbox.xMin = *(detectData + i * 7 + 3);
+    bbox.yMin = *(detectData + i * 7 + 4);
+    bbox.xMax = *(detectData + i * 7 + 5);
+    bbox.yMax = *(detectData + i * 7 + 6);
+    bboxVec[outOffset + i] = bbox;
+  }
+}
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps) {
+  map<size_t, map<size_t, real>> overlaps;
+  size_t numPriors = priorBBoxes.size();
+  size_t numGTs = gtBBoxes.size();
+  matchIndices->clear();
+  matchIndices->resize(numPriors, -1);
+  matchOverlaps->clear();
+  matchOverlaps->resize(numPriors, 0.0);
+  // Store the positive overlap between predictions and ground truth
+  for (size_t i = 0; i < numPriors; ++i) {
+    for (size_t j = 0; j < numGTs; ++j) {
+      real overlap = jaccardOverlap(priorBBoxes[i], gtBBoxes[j]);
+      if (overlap > 1e-6) {
+        (*matchOverlaps)[i] = std::max((*matchOverlaps)[i], overlap);
+        overlaps[i][j] = overlap;
+      }
+    }
+  }
+  // Bipartite matching
+  vector<int> gtPool;
+  for (size_t i = 0; i < numGTs; ++i) {
+    gtPool.push_back(i);
+  }
+  while (gtPool.size() > 0) {
+    // Find the most overlapped gt and corresponding predictions
+    int maxPriorIdx = -1;
+    int maxGTIdx = -1;
+    real maxOverlap = -1.0;
+    for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
+         it != overlaps.end();
+         ++it) {
+      size_t i = it->first;
+      if ((*matchIndices)[i] != -1) {
+        // The prediction already has matched ground truth or is ignored
+        continue;
+      }
+      for (size_t p = 0; p < gtPool.size(); ++p) {
+        int j = gtPool[p];
+        if (it->second.find(j) == it->second.end()) {
+          // No overlap between the i-th prediction and j-th ground truth
+          continue;
+        }
+        // Find the maximum overlapped pair
+        if (it->second[j] > maxOverlap) {
+          maxPriorIdx = (int)i;
+          maxGTIdx = (int)j;
+          maxOverlap = it->second[j];
+        }
+      }
+    }
+    if (maxPriorIdx == -1) {
+      break;
+    } else {
+      (*matchIndices)[maxPriorIdx] = maxGTIdx;
+      (*matchOverlaps)[maxPriorIdx] = maxOverlap;
+      gtPool.erase(std::find(gtPool.begin(), gtPool.end(), maxGTIdx));
+    }
+  }
+  // Get most overlaped for the rest prediction bboxes
+  for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
+       it != overlaps.end();
+       ++it) {
+    size_t i = it->first;
+    if ((*matchIndices)[i] != -1) {
+      // The prediction already has matched ground truth or is ignored
+      continue;
+    }
+    int maxGTIdx = -1;
+    real maxOverlap = -1;
+    for (size_t j = 0; j < numGTs; ++j) {
+      if (it->second.find(j) == it->second.end()) {
+        // No overlap between the i-th prediction and j-th ground truth
+        continue;
+      }
+      // Find the maximum overlapped pair
+      real overlap = it->second[j];
+      if (overlap > maxOverlap && overlap >= overlapThreshold) {
+        maxGTIdx = j;
+        maxOverlap = overlap;
+      }
+    }
+    if (maxGTIdx != -1) {
+      (*matchIndices)[i] = maxGTIdx;
+      (*matchOverlaps)[i] = maxOverlap;
+    }
+  }
+}
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr) {
+  vector<NormalizedBBox> priorBBoxes;  // share same prior bboxes
+  getBBoxFromPriorData(priorValue.getData(), numPriorBBoxes, priorBBoxes);
+  size_t totalPos = 0;
+  size_t totalNeg = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    vector<int> matchIndices;
+    vector<int> negIndices;
+    vector<real> matchOverlaps;
+    matchIndices.resize(numPriorBBoxes, -1);
+    matchOverlaps.resize(numPriorBBoxes, 0.0);
+    size_t numGTBBoxes = 0;
+    if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n];
+    if (!numGTBBoxes) {
+      matchIndicesVecPtr->push_back(matchIndices);
+      negIndicesVecPtr->push_back(negIndices);
+      continue;
+    }
+    vector<NormalizedBBox> gtBBoxes;
+    getBBoxFromLabelData(
+        gtValue.getData() + gtStartPosPtr[n] * 6, numGTBBoxes, gtBBoxes);
+    matchBBox(
+        priorBBoxes, gtBBoxes, overlapThreshold, &matchIndices, &matchOverlaps);
+    size_t numPos = 0;
+    size_t numNeg = 0;
+    for (size_t i = 0; i < matchIndices.size(); ++i)
+      if (matchIndices[i] != -1) ++numPos;
+    totalPos += numPos;
+    vector<pair<real, size_t>> scoresIndices;
+    for (size_t i = 0; i < matchIndices.size(); ++i)
+      if (matchIndices[i] == -1 && matchOverlaps[i] < negOverlapThreshold) {
+        scoresIndices.push_back(std::make_pair(maxConfScore[n][i], i));
+        ++numNeg;
+      }
+    numNeg = std::min(static_cast<size_t>(numPos * negPosRatio), numNeg);
+    std::sort(scoresIndices.begin(),
+              scoresIndices.end(),
+              sortScorePairDescend<size_t>);
+    for (size_t i = 0; i < numNeg; ++i)
+      negIndices.push_back(scoresIndices[i].second);
+    totalNeg += numNeg;
+    matchIndicesVecPtr->push_back(matchIndices);
+    negIndicesVecPtr->push_back(negIndices);
+  }
+  return std::make_pair(totalPos, totalNeg);
+}
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr) {
+  maxConfScoreVecPtr->clear();
+  for (size_t i = 0; i < batchSize; ++i) {
+    vector<real> maxConfScore;
+    for (size_t j = 0; j < numPriorBBoxes; ++j) {
+      int offset = j * numClasses;
+      real maxVal = -FLT_MAX;
+      real maxPosVal = -FLT_MAX;
+      real maxScore = 0.0;
+      for (size_t c = 0; c < numClasses; ++c) {
+        maxVal = std::max<real>(confData[offset + c], maxVal);
+        if (c != backgroundId)
+          maxPosVal = std::max<real>(confData[offset + c], maxPosVal);
+      }
+      real sum = 0.0;
+      for (size_t c = 0; c < numClasses; ++c)
+        sum += std::exp(confData[offset + c] - maxVal);
+      maxScore = std::exp(maxPosVal - maxVal) / sum;
+      maxConfScore.push_back(maxScore);
+    }
+    confData += numPriorBBoxes * numClasses;
+    maxConfScoreVecPtr->push_back(maxConfScore);
+  }
+}
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2) {
+  return pair1.first > pair2.first;
+}
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices) {
+  vector<pair<real, size_t>> scores;
+  for (size_t i = 0; i < numPriorBBoxes; ++i) {
+    size_t confOffset = i * numClasses + classIdx;
+    if (confScoreData[confOffset] > confThreshold)
+      scores.push_back(std::make_pair(confScoreData[confOffset], i));
+  }
+  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
+  if (topK > 0 && topK < scores.size()) scores.resize(topK);
+  while (scores.size() > 0) {
+    const size_t idx = scores.front().second;
+    bool keep = true;
+    for (size_t i = 0; i < indices->size(); ++i) {
+      if (keep) {
+        const size_t savedIdx = (*indices)[i];
+        real overlap = jaccardOverlap(bboxes[idx], bboxes[savedIdx]);
+        keep = overlap <= nmsThreshold;
+      } else {
+        break;
+      }
+    }
+    if (keep) indices->push_back(idx);
+    scores.erase(scores.begin());
+  }
+}
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
+  size_t totalKeepNum = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+    size_t numDetected = 0;
+    map<size_t, vector<size_t>> indices;
+    size_t confOffset = n * numPriorBBoxes * numClasses;
+    for (size_t c = 0; c < numClasses; ++c) {
+      if (c == backgroundId) continue;
+      applyNMSFast(decodedBBoxes,
+                   confData + confOffset,
+                   c,
+                   nmsTopK,
+                   confThreshold,
+                   nmsThreshold,
+                   numPriorBBoxes,
+                   numClasses,
+                   &(indices[c]));
+      numDetected += indices[c].size();
+    }
+    if (keepTopK > 0 && numDetected > keepTopK) {
+      vector<pair<real, pair<size_t, size_t>>> scoreIndexPairs;
+      for (size_t c = 0; c < numClasses; ++c) {
+        const vector<size_t>& labelIndices = indices[c];
+        for (size_t i = 0; i < labelIndices.size(); ++i) {
+          size_t idx = labelIndices[i];
+          scoreIndexPairs.push_back(
+              std::make_pair((confData + confOffset)[idx * numClasses + c],
+                             std::make_pair(c, idx)));
+        }
+      }
+      std::sort(scoreIndexPairs.begin(),
+                scoreIndexPairs.end(),
+                sortScorePairDescend<pair<size_t, size_t>>);
+      scoreIndexPairs.resize(keepTopK);
+      map<size_t, vector<size_t>> newIndices;
+      for (size_t i = 0; i < scoreIndexPairs.size(); ++i) {
+        size_t label = scoreIndexPairs[i].second.first;
+        size_t idx = scoreIndexPairs[i].second.second;
+        newIndices[label].push_back(idx);
+      }
+      allDetectionIndices->push_back(newIndices);
+      totalKeepNum += keepTopK;
+    } else {
+      allDetectionIndices->push_back(indices);
+      totalKeepNum += numDetected;
+    }
+  }
+  return totalKeepNum;
+}
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out) {
+  MatrixPtr outBuffer;
+  Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
+  real* bufferData = outBuffer->getData();
+  size_t count = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();
+         it != allIndices[n].end();
+         ++it) {
+      size_t label = it->first;
+      const vector<size_t>& indices = it->second;
+      const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+      for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        size_t confOffset = n * numPriorBBoxes * numClasses + idx * numClasses;
+        bufferData[count * 7] = n;
+        bufferData[count * 7 + 1] = label;
+        bufferData[count * 7 + 2] = (confData + confOffset)[label];
+        NormalizedBBox clippedBBox = clipBBox(decodedBBoxes[idx]);
+        bufferData[count * 7 + 3] = clippedBBox.xMin;
+        bufferData[count * 7 + 4] = clippedBBox.yMin;
+        bufferData[count * 7 + 5] = clippedBBox.xMax;
+        bufferData[count * 7 + 6] = clippedBBox.yMax;
+        ++count;
+      }
+    }
+  }
+  out.copyFrom(bufferData, numKept * 7);
+}
+NormalizedBBox clipBBox(const NormalizedBBox& bbox) {
+  real realOne = static_cast<real>(1.0);
+  real realZero = static_cast<real>(0.0);
+  NormalizedBBox clippedBBox;
+  clippedBBox.xMin = std::max(std::min(bbox.xMin, realOne), realZero);
+  clippedBBox.yMin = std::max(std::min(bbox.yMin, realOne), realZero);
+  clippedBBox.xMax = std::max(std::min(bbox.xMax, realOne), realZero);
+  clippedBBox.yMax = std::max(std::min(bbox.yMax, realOne), realZero);
+  return clippedBBox;
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/DetectionUtil.h
+++ b/paddle/gserver/layers/DetectionUtil.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <float.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/math/Matrix.h"
+using std::vector;
+using std::pair;
+using std::map;
+namespace paddle {
+template <typename T>
+struct BBoxBase {
+  BBoxBase(T xMin, T yMin, T xMax, T yMax)
+      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
+  BBoxBase() {}
+  T getWidth() const { return xMax - xMin; }
+  T getHeight() const { return yMax - yMin; }
+  T getCenterX() const { return (xMin + xMax) / 2; }
+  T getCenterY() const { return (yMin + yMax) / 2; }
+  T getArea() const { return getWidth() * getHeight(); }
+  // coordinate of bounding box
+  T xMin;
+  T yMin;
+  T xMax;
+  T yMax;
+  // whether difficult object (e.g. object with heavy occlusion is difficult)
+  bool isDifficult;
+};
+struct NormalizedBBox : BBoxBase<real> {
+  NormalizedBBox() : BBoxBase<real>() {}
+};
+enum PermMode { kNCHWToNHWC, kNHWCToNCHW };
+/**
+ * @brief First permute input maxtrix then append to output matrix
+ */
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode);
+/**
+ * @brief First permute input maxtrix then decompose to output
+ */
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t totalSize,
+                            size_t offset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode);
+/**
+ * @brief Compute jaccard overlap between two bboxes.
+ * @param bbox1 The first bbox
+ * @param bbox2 The second bbox
+ */
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
+/**
+ * @brief Compute offset parameters between prior bbox and ground truth bbox
+ * and variances of prior bbox are considered
+ * @param priorBBox Input prior bbox
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param gtBBox Groundtruth bbox
+ * @param outVec Output vector
+ */
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec);
+/**
+ * @brief Decode prior bbox with offset parameters
+ * and variances of prior bbox are considered
+ * @param priorBBox Prior bbox to be decoded
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param locPredData Offset parameters
+ */
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData);
+/**
+ * @brief Extract bboxes from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+/**
+ * @brief Extract labels, scores and bboxes from detection matrix, the layout is
+ * imageId | label | score | xmin | ymin | xmax | ymax
+ * @param detectData Matrix of detection value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param labelVec Label of bbox
+ * @param scoreVec Score of bbox
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec);
+/**
+ * @brief Extract variances from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param num Number to be extracted
+ * @param varVec Append to the vector
+ */
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec);
+/**
+ * @brief Extract bboxes from label matrix, the layout is
+ * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
+ * @param labelData Matrix of label value
+ * @param numBBoxes Number to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+/**
+* @brief Match prior bbox to groundtruth bbox, the strategy is:
+1. Find the most overlaped bbox pair (prior and groundtruth)
+2. For rest of prior bboxes find the most overlaped groundtruth bbox
+* @param priorBBoxes prior bbox
+* @param gtBBoxes groundtruth bbox
+* @param overlapThreshold Low boundary of overlap (judge whether matched)
+* @param matchIndices For each prior bbox, groundtruth bbox index if matched
+otherwise -1
+* @param matchOverlaps For each prior bbox, overap with all groundtruth bboxes
+*/
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps);
+/**
+* @brief Generate positive bboxes and negative bboxes,
+|positive bboxes|/|negative bboxes| is negPosRatio
+* @param priorValue Prior value
+* @param numPriorBBoxes Number of prior bbox
+* @param gtValue Groundtruth value
+* @param gtStartPosPtr Since groundtruth value stored as sequence type,
+this parameter indicates start position of each record
+* @param seqNum Number of sequence
+* @param maxConfScore Classification score for prior bbox, used to mine
+negative examples
+* @param batchSize Image number
+* @param overlapThreshold Low boundary of overap
+* @param negOverlapThreshold Upper boundary of overap (judge negative example)
+* @param negPosRatio Control number of negative bboxes
+* @param matchIndicesVecPtr Save indices of matched prior bbox
+* @param negIndicesVecPtr Save indices of negative prior bbox
+*/
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr);
+/**
+ * @brief Get max confidence score for each prior bbox
+ * @param confData Confidence scores, layout is
+ * class1 score | class2 score | ... | classN score ...
+ * @param batchSize Image number
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Classes number
+ * @param backgroundId Background id
+ * @param maxConfScoreVecPtr Ouput
+ */
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr);
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2);
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2);
+/**
+ * @brief Do NMS for bboxes to remove duplicated bboxes
+ * @param bboxes BBoxes to apply NMS
+ * @param confScoreData Confidence scores
+ * @param classIdx Class to do NMS
+ * @param topK Number to keep
+ * @param confThreshold Low boundary of confidence score
+ * @param nmsThreshold Threshold of overlap
+ * @param numPriorBBoxes Total number of prior bboxes
+ * @param numClasses Total class number
+ * @param indices Indices of high quality bboxes
+ */
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices);
+/**
+ * @brief Get detection results which satify requirements
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param backgroundId Background class
+ * @param batchSize Image number
+ * @param confThreshold Threshold of class confidence
+ * @param nmsTopK Used in NMS operation to keep top k bbox
+ * @param nmsThreshold Used in NMS, threshold of overlap
+ * @param keepTopK How many bboxes keeped in an image
+ * @param allDecodedBBoxes Decoded bboxes for all images
+ * @param allDetectionIndices Save detection bbox indices
+ */
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const size_t confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+/**
+ * @brief Get detection results
+ * @param confData Confidence scores
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param batchSize Image number
+ * @param allIndices Indices of predicted bboxes
+ * @param allDecodedBBoxes BBoxes decoded
+ * @param out Output matrix
+ * image number | label | confidence score | xMin | yMin | xMax | yMax
+ */
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out);
+NormalizedBBox clipBBox(const NormalizedBBox& bbox);
+}  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -22,26 +22,8 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
  /* Initialize the basic convolutional parent class */
  ConvBaseLayer::init(layerMap, parameterMap);
-  /* The class fields channels_ and numFilters_ are the same as in the config
-   * i.e., channels_ is the for the input and numFilters_ is for the output
-   *
-   * But in order for the variables in convTrans having the same semantic
-   * meaning as in conv, we need to swap channels_ and numFilters here for
-   * convTrans, and in other functions too.
-   * */
-  /* Initialize the projection */
  for (auto &inputConfig : config_.inputs()) {
    const ConvConfig &conf = inputConfig.conv_conf();
-    int numFilters = isDeconv_ ? conf.channels() : numFilters_;
-    subM_.push_back(numFilters / conf.groups());
-    subN_.push_back(conf.output_x() *
-                    (conf.has_output_y() ? conf.output_y() : conf.output_x()));
-    int channel = isDeconv_ ? numFilters_ : conf.channels();
-    subK_.push_back(
-        channel * conf.filter_size() *
-        (conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
-        conf.groups());
    /* Consistent caffe mode for multiple input */
    caffeMode_ = conf.caffe_mode();
  }
@@ -54,17 +36,9 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
 size_t ExpandConvBaseLayer::getOutputSize() {
  CHECK_NE(inputLayers_.size(), 0UL);
  size_t layerSize = ConvBaseLayer::calOutputSize();
-  subN_.clear();
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    subN_.push_back(outputH_[i] * outputW_[i]);
-  }
  return layerSize;
 }
-void ExpandConvBaseLayer::resetExpandInput(size_t height, size_t width) {
-  Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_);
-}
 void ExpandConvBaseLayer::addSharedBias() {
  size_t mapW = getOutputSize() / numFilters_;
  size_t mapH = getOutputValue()->getElementCnt() / mapW;
@@ -101,173 +75,6 @@ void ExpandConvBaseLayer::addUnsharedBias() {
  outValue->addBias(*bias, 1.0f);
 }
-void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
-                                         size_t startIdx,
-                                         int inIdx) {
-  int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
-  resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
-  CHECK_EQ(image->getWidth(),
-           static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
-  real *imgData = image->getData() + startIdx * image->getWidth();
-  MatrixPtr imageTmp =
-      Matrix::create(imgData,
-                     1,
-                     imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel,
-                     false,
-                     useGpu_);
-  expandInput_->convExpand(*imageTmp,
-                           imgSizeH_[inIdx],
-                           imgSizeW_[inIdx],
-                           channel,
-                           filterSizeY_[inIdx],
-                           filterSize_[inIdx],
-                           strideY_[inIdx],
-                           stride_[inIdx],
-                           paddingY_[inIdx],
-                           padding_[inIdx],
-                           outputH_[inIdx],
-                           outputW_[inIdx]);
-  imageTmp->clear();
-}
-void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
-                                        MatrixPtr out,
-                                        int inIdx,
-                                        int startIdx) {
-  int subM = subM_[inIdx];
-  int subN = subN_[inIdx];
-  int subK = subK_[inIdx];
-  expandOneFrame(image, startIdx, inIdx);
-  int numFilters = isDeconv_ ? channels_[inIdx] : numFilters_;
-  real *outData = out->getData() + startIdx * subN * numFilters;
-  real *wgtData = weights_[inIdx]->getW()->getData();
-  real *expInData = expandInput_->getData();
-  for (int g = 0; g < groups_[inIdx]; ++g) {
-    MatrixPtr A =
-        Matrix::create(wgtData, subM, subK, false, useGpu_);  // mark transpose
-    MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
-    MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
-    C->mul(*A, *B, 1, 1);
-    A->clear();
-    B->clear();
-    C->clear();
-    wgtData += subK * subM;
-    expInData += subK * subN;
-    outData += subM * subN;
-  }
-}
-void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
-                                    MatrixPtr image,
-                                    int inpIdx) {
-  int channel = isDeconv_ ? numFilters_ : channels_[inpIdx];
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = image->getHeight();
-  /* reset the expand-grad memory */
-  resetExpandInput(subK * groups_[inpIdx], subN);
-  real *localGradData = out->getData();
-  real *tgtGradData = image->getData();
-  for (size_t n = 0; n < batchSize; n++) {
-    real *wgtData = weights_[inpIdx]->getW()->getData();
-    real *expandInData = expandInput_->getData();
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      // create temporary matrix
-      MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
-      C->mul(*A, *B);  // mul
-      // clear the temporary matrix
-      A->clear();
-      B->clear();
-      C->clear();
-      expandInData += subK * subN;
-      localGradData += subM * subN;
-      wgtData += subK * subM;
-    }
-    // shrink one frame outGrad
-    MatrixPtr oneGradTmp = Matrix::create(
-        expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_);
-    MatrixPtr vTmp =
-        Matrix::create(tgtGradData,
-                       1,
-                       imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel,
-                       false,
-                       useGpu_);
-    vTmp->convShrink(*oneGradTmp,
-                     imgSizeH_[inpIdx],
-                     imgSizeW_[inpIdx],
-                     channel,
-                     filterSizeY_[inpIdx],
-                     filterSize_[inpIdx],
-                     strideY_[inpIdx],
-                     stride_[inpIdx],
-                     paddingY_[inpIdx],
-                     padding_[inpIdx],
-                     outputH_[inpIdx],
-                     outputW_[inpIdx],
-                     1.0f,
-                     1.0f);
-    vTmp->clear();
-    oneGradTmp->clear();
-    // move the data-pointer
-    tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel;
-  }
-}
-void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
-                                       MatrixPtr out,
-                                       int inpIdx) {
-  MatrixPtr weightGrad = weights_[inpIdx]->getWGrad();
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = image->getHeight();
-  resetExpandInput(subK * groups_[inpIdx], subN);
-  real *gradData = out->getData();
-  for (size_t n = 0; n < batchSize; n++) {  // frame by frame
-    // expand
-    expandOneFrame(image, n, inpIdx);
-    real *wGradData = weightGrad->getData();
-    real *expandInData = expandInput_->getData();
-    // expand-mul one-group by one
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
-      MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
-      C->mul(*B, *A, 1, 1);
-      A->clear();
-      B->clear();
-      C->clear();
-      gradData += subM * subN;
-      wGradData += subK * subM;
-      expandInData += subK * subN;
-    }
-  }
-}
 void ExpandConvBaseLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
  size_t mapW = getOutputSize() / numFilters_;
  size_t mapH = v->getElementCnt() / mapW;

--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -26,19 +26,6 @@ namespace paddle {
 */
 class ExpandConvBaseLayer : public ConvBaseLayer {
 protected:
-  /// For expand convolution.
-  /// subM_ = numFilters_ / groups_.
-  IntV subM_;
-  /// subN_ = outputH_ * outputW_.
-  IntV subN_;
-  /// subK_ = channels_ * filterPixels_ * groups_.
-  IntV subK_;
-  /*The expandInput_ and transOutValue_ are used for CPU expand conv calc
-   * Expand one sample at a time. shape:
-   * (numChannels * filterPixels_, outputSizeH * outputSizeW)
-   * */
-  MatrixPtr expandInput_;
  /// The transpose of output, which is an auxiliary matrix.
  MatrixPtr transOutValue_;
@@ -52,10 +39,6 @@ public:
            const ParameterMap& parameterMap) override;
  size_t getOutputSize();
-  /**
-   * Create or resize expandInput_.
-   */
-  void resetExpandInput(size_t height, size_t width);
  /**
   * Add shared bias.
@@ -66,20 +49,9 @@ public:
   * Add unshared bias.
   */
  void addUnsharedBias();
-  /**
-   * Expand one input sample.
-   */
-  void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx);
-  /**
-   * Expand one input sample and perform matrix multiplication.
-   */
-  void expandFwdOnce(MatrixPtr image, MatrixPtr out, int inIdx, int startIdx);
  void bpropSharedBias(MatrixPtr biases, MatrixPtr v);
  void bpropBiases(MatrixPtr v);
-  void bpropWeights(MatrixPtr image, MatrixPtr out, int inpIdx);
-  void bpropActs(MatrixPtr image, MatrixPtr out, int inpIdx);
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -18,32 +18,94 @@ limitations under the License. */
 namespace paddle {
+/*
+ * The calculation of the exconvt(convolution transpose (deconv) operation)
+ * is a swap of forward and backward of the calculation of exconv.
+ * */
 REGISTER_LAYER(exconv, ExpandConvLayer);
+REGISTER_LAYER(exconvt, ExpandConvLayer);
 bool ExpandConvLayer::init(const LayerMap &layerMap,
                           const ParameterMap &parameterMap) {
  /* Initialize the basic convolutional parent class */
  ExpandConvBaseLayer::init(layerMap, parameterMap);
+  size_t numInputs = config_.inputs_size();
+  inputShape_.resize(numInputs);
+  filterShape_.resize(numInputs);
+  outputShape_.resize(numInputs);
+  for (int i = 0; i < config_.inputs_size(); i++) {
+    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
+    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    createFunction(forward_,
+                   !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
+                   FuncConfig()
+                       .set("paddings", paddings)
+                       .set("strides", strides)
+                       .set("groups", (size_t)groups_[i]));
+    createFunction(backward_,
+                   !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
+                   FuncConfig()
+                       .set("paddings", paddings)
+                       .set("strides", strides)
+                       .set("groups", (size_t)groups_[i]));
+    createFunction(backward_,
+                   "GemmConvGradFilter",
+                   FuncConfig()
+                       .set("paddings", paddings)
+                       .set("strides", strides)
+                       .set("groups", (size_t)groups_[i]));
+  }
  return true;
 }
+// i is the index of input layers
+#define BACKWARD_INPUT(i, inputs, outputs) \
+  backward_[2 * i]->calc(inputs, outputs)
+#define BACKWARD_FILTER(i, inputs, outputs) \
+  backward_[2 * i + 1]->calc(inputs, outputs)
 void ExpandConvLayer::forward(PassType passType) {
  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
  resetOutput(batchSize, getOutputSize());
-  MatrixPtr image = nullptr;
+  // Calculate the shape of the input, output, and filter.
-  MatrixPtr outV = getOutputValue();
  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    LayerPtr prevLayer = getPrev(i);
+    inputShape_[i] = TensorShape({(size_t)batchSize,
-    image = prevLayer->getOutputValue();
+                                  (size_t)channels_[i],
-    for (size_t off = 0; off < image->getHeight(); off++) {
+                                  (size_t)imgSizeH_[i],
-      REGISTER_TIMER_INFO("expandFwdOnce", getName().c_str());
+                                  (size_t)imgSizeW_[i]});
-      expandFwdOnce(image, outV, i, off);
+    filterShape_[i] =
+        TensorShape({(size_t)groups_[i],
+                     !isDeconv_ ? (size_t)numFilters_ / groups_[i]
+                                : (size_t)channels_[i] / groups_[i],
+                     !isDeconv_ ? (size_t)channels_[i] / groups_[i]
+                                : (size_t)numFilters_ / groups_[i],
+                     (size_t)filterSizeY_[i],
+                     (size_t)filterSize_[i]});
+    outputShape_[i] = TensorShape({(size_t)batchSize,
+                                   (size_t)numFilters_,
+                                   (size_t)outputH_[i],
+                                   (size_t)outputW_[i]});
  }
+  // Calculate the output value.
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getInputValue(i), inputShape_[i]);
+    inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+    outputs.addArg(*getOutputValue(),
+                   outputShape_[i],
+                   !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
+    forward_[i]->calc(inputs, outputs);
  }
  /* add the bias-vector */
  if (biases_.get()) {
    if (sharedBiases_) {
@@ -67,14 +129,30 @@ void ExpandConvLayer::backward(const UpdateCallback &callback) {
    biases_->getParameterPtr()->incUpdate(callback);
  }
+  // Calculate the input grad and filter grad.
  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    /* First, calculate the input layers error */
+    if (getInputGrad(i)) {
-    if (getPrev(i)->getOutputGrad()) {
+      BufferArgs inputs;
-      bpropActs(outGrad, getPrev(i)->getOutputGrad(), i);
+      BufferArgs outputs;
+      inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+      outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
+      BACKWARD_INPUT(i, inputs, outputs);
    }
    if (weights_[i]->getWGrad()) {
-      /* Then, calculate the W-gradient for the current layer */
+      BufferArgs inputs;
-      bpropWeights(getPrev(i)->getOutputValue(), outGrad, i);
+      BufferArgs outputs;
+      if (!isDeconv_) {
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+      } else {
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      }
+      outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
+      BACKWARD_FILTER(i, inputs, outputs);
      /* Increasing the number of gradient */
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }

--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -40,6 +40,11 @@ public:
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;
+protected:
+  std::vector<TensorShape> inputShape_;
+  std::vector<TensorShape> filterShape_;
+  std::vector<TensorShape> outputShape_;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvTransLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "ExpandConvTransLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-/* The implementation of the convTransLayer is basically a swap of forward and
- * backward of the original convLayer.
- * The variable naming follows the convention of the convLayer.
- * */
-namespace paddle {
-REGISTER_LAYER(exconvt, ExpandConvTransLayer);
-bool ExpandConvTransLayer::init(const LayerMap &layerMap,
-                                const ParameterMap &parameterMap) {
-  /* Initialize the basic convolutional parent class */
-  ExpandConvBaseLayer::init(layerMap, parameterMap);
-  return true;
-}
-void ExpandConvTransLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  resetOutput(batchSize, getOutputSize());
-  MatrixPtr output = nullptr;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    LayerPtr prevLayer = getPrev(i);
-    output = prevLayer->getOutputValue();
-    REGISTER_TIMER_INFO("shrinkFwd", getName().c_str());
-    bpropActs(output, getOutputValue(), i);
-  }
-  /* add the bias-vector */
-  if (biases_.get()) {
-    if (sharedBiases_) {
-      addSharedBias();
-    } else {
-      addUnsharedBias();
-    }
-  }
-  /* activation */
-  forwardActivation();
-}
-void ExpandConvTransLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-  MatrixPtr imageGrad = getOutputGrad();
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases(imageGrad);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    /* First, calculate the input layers error */
-    for (size_t off = 0; off < imageGrad->getHeight(); off++) {
-      if (getPrev(i)->getOutputGrad()) {
-        expandFwdOnce(imageGrad, getPrev(i)->getOutputGrad(), i, off);
-      }
-    }
-    if (weights_[i]->getWGrad()) {
-      /* Then, calculate the W-gradient for the current layer */
-      bpropWeights(imageGrad, getPrev(i)->getOutputValue(), i);
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-}  // namespace paddle
--- a/paddle/gserver/layers/RowConvLayer.cpp
+++ b/paddle/gserver/layers/RowConvLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "RowConvLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+REGISTER_LAYER(row_conv, RowConvLayer);
+bool RowConvLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
+  createFunction(forward_, "RowConv", FuncConfig());
+  createFunction(backward_, "RowConvGrad", FuncConfig());
+  return true;
+}
+void RowConvLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = getInputValue(0);
+  size_t height = input->getHeight();
+  size_t width = input->getWidth();
+  CHECK_EQ(width, getSize());
+  resetOutput(height, width);
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  MatrixPtr w = weight_->getW();
+  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
+  MatrixPtr outV = getOutputValue();
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*w, wDims_);
+  outputs.addArg(*getOutputValue(), *startPos, ADD_TO);
+  {
+    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
+    forward_[0]->calc(inputs, outputs);
+  }
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+void RowConvLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), *startPos);
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*weight_->getW(), wDims_);
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr wGrad = weight_->getWGrad();
+  size_t h = getInputValue(0)->getHeight();
+  size_t w = getInputValue(0)->getWidth();
+  outputs.addArg(
+      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
+      *startPos,
+      ADD_TO);
+  outputs.addArg(
+      wGrad ? (*wGrad)
+            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
+      wDims_,
+      ADD_TO);
+  {
+    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
+    backward_[0]->calc(inputs, outputs);
+  }
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -14,31 +14,31 @@ limitations under the License. */
 #pragma once
-#include <vector>
+#include "Layer.h"
-#include "ExpandConvBaseLayer.h"
-#include "paddle/math/Matrix.h"
 namespace paddle {
 /**
- * @brief A subclass of convolution layer.
+ * \brief Row Convolution Layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution transpose (deconv) operation.
- *
- * The config file api is img_conv_layer with flag trans=True.
 */
-class ExpandConvTransLayer : public ExpandConvBaseLayer {
+class RowConvLayer : public Layer {
 public:
-  explicit ExpandConvTransLayer(const LayerConfig& config)
+  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
-      : ExpandConvBaseLayer(config) {}
-  ~ExpandConvTransLayer() {}
+  ~RowConvLayer() {}
  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
-};
+protected:
+  // Row convolution weight, context_lenght_ * fan_out.
+  // fan_out is the size of output feature.
+  std::unique_ptr<Weight> weight_;
+  // The step number to look ahead plus one equals contexLength_.
+  size_t contexLength_;
+  TensorShape wDims_;
+};
 }  // namespace paddle
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"

--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"

--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1705,6 +1705,26 @@ TEST(Layer, TransLayer) {
  }
 }
+TEST(Layer, RowConvLayer) {
+  const int context = 3;
+  const int size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_conv");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  RowConvConfig* conv = input->mutable_row_conv_conf();
+  conv->set_context_length(context);
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
+  }
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -632,7 +632,7 @@ void Argument::printValueString(std::ostream& stream,
                                const std::string& prefix) const {
  std::unordered_map<std::string, std::string> out;
  getValueString(&out);
-  for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) {
+  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
    auto it = out.find(field);
    if (it != out.end()) {
      stream << prefix << field << ":\n" << it->second;

--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -42,7 +42,7 @@ TEST(Argument, poolSequenceWithStride) {
    CHECK_EQ(outStart[3], 4);
    CHECK_EQ(outStart[4], 7);
-    CHECK_EQ(stridePositions->getSize(), 8);
+    CHECK_EQ(stridePositions->getSize(), 8UL);
    auto result = reversed ? strideResultReversed : strideResult;
    for (int i = 0; i < 8; i++) {
      CHECK_EQ(stridePositions->getData()[i], result[i]);

--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -383,20 +383,23 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
  setOption(sockfd);
  /// Now connect to the server
-  int retry_second = 0;
+  int retry_count = 0;
-  int error = 0;
  do {
-    error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr));
+    if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) {
-    if (error == ECONNREFUSED) {
+      break;
+    }
+    if (errno == ECONNREFUSED) {
      LOG(WARNING) << "connection refused by pserver, try again!";
-      if (retry_second++ >= 7) {
+      if (retry_count++ >= 7) {
        LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
      }
      std::this_thread::sleep_for(std::chrono::seconds(1));
    } else {
-      PCHECK(error >= 0) << "ERROR connecting to " << serverAddr;
+      PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
+                         << serverPort << "errorno: " << errno;
    }
-  } while (error == ECONNREFUSED);
+  } while (errno == ECONNREFUSED);
  channel_.reset(new SocketChannel(sockfd, serverAddr));
  tcpRdma_ = F_TCP;

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -58,7 +58,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
    pip uninstall -y py-paddle paddle || true
-    ctest -V
+    ctest --output-on-failure
 fi

--- a/paddle/strings/CMakeLists.txt
+++ b/paddle/strings/CMakeLists.txt
+cc_library(stringpiece SRCS stringpiece.cc)
+cc_test(stringpiece_test SRCS stringpiece_test.cc DEPS stringpiece glog gflags)
--- a/paddle/strings/stringpiece.cc
+++ b/paddle/strings/stringpiece.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+#include "paddle/strings/stringpiece.h"
+#include <string.h>
+#include <algorithm>
+#include <iosfwd>
+#include <stdexcept>
+namespace paddle {
+StringPiece::StringPiece() : data_(NULL), size_(0) {}
+StringPiece::StringPiece(const char* d, size_t n) : data_(d), size_(n) {
+  if (d == NULL && n != 0)
+    throw std::invalid_argument(
+        "StringPiece requires len to be 0 for NULL data");
+}
+StringPiece::StringPiece(const char* s) : data_(s) {
+  size_ = (s == NULL) ? 0 : strlen(s);
+}
+StringPiece::StringPiece(const std::string& s)
+    : data_(s.data()), size_(s.size()) {}
+char StringPiece::operator[](size_t n) const {
+  if (n >= len())
+    throw std::invalid_argument("index out of StringPiece length");
+  return data_[n];
+}
+int Compare(StringPiece a, StringPiece b) {
+  const size_t min_len = (a.len() < b.len()) ? a.len() : b.len();
+  int r = memcmp(a.data(), b.data(), min_len);
+  if (r == 0) {
+    if (a.len() < b.len())
+      return -1;
+    else if (a.len() > b.len())
+      return 1;
+  }
+  return r;
+}
+bool operator==(StringPiece x, StringPiece y) {
+  return ((x.len() == y.len()) &&
+          (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0));
+}
+bool operator!=(StringPiece x, StringPiece y) { return !(x == y); }
+bool operator<(StringPiece x, StringPiece y) { return Compare(x, y) < 0; }
+bool operator>(StringPiece x, StringPiece y) { return Compare(x, y) > 0; }
+bool operator<=(StringPiece x, StringPiece y) { return Compare(x, y) <= 0; }
+bool operator>=(StringPiece x, StringPiece y) { return Compare(x, y) >= 0; }
+bool HasPrefix(StringPiece s, StringPiece x) {
+  return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0));
+}
+bool HasSuffix(StringPiece s, StringPiece x) {
+  return ((s.len() >= x.len()) &&
+          (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0));
+}
+StringPiece SkipPrefix(StringPiece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than StringPiece length");
+  return StringPiece(s.data() + n, s.len() - n);
+}
+StringPiece SkipSuffix(StringPiece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than StringPiece length");
+  return StringPiece(s.data(), s.len() - n);
+}
+StringPiece TrimPrefix(StringPiece s, StringPiece x) {
+  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;
+}
+StringPiece TrimSuffix(StringPiece s, StringPiece x) {
+  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;
+}
+bool Contains(StringPiece s, StringPiece sub) {
+  return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end();
+}
+size_t Index(StringPiece s, StringPiece sub) {
+  auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end());
+  return e != s.end() ? e - s.data() : StringPiece::npos;
+}
+size_t Find(StringPiece s, char c, size_t pos) {
+  if (pos >= s.len()) {
+    return StringPiece::npos;
+  }
+  const char* result =
+      reinterpret_cast<const char*>(memchr(s.data() + pos, c, s.len() - pos));
+  return result != nullptr ? result - s.data() : StringPiece::npos;
+}
+size_t RFind(StringPiece s, char c, size_t pos) {
+  if (s.len() == 0) return StringPiece::npos;
+  for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data();
+       p--) {
+    if (*p == c) {
+      return p - s.data();
+    }
+  }
+  return StringPiece::npos;
+}
+StringPiece SubStr(StringPiece s, size_t pos, size_t n) {
+  if (pos > s.len()) pos = s.len();
+  if (n > s.len() - pos) n = s.len() - pos;
+  return StringPiece(s.data() + pos, n);
+}
+std::ostream& operator<<(std::ostream& o, StringPiece piece) {
+  return o << piece.ToString();
+}
+}  // namespace paddle
--- a/paddle/strings/stringpiece.h
+++ b/paddle/strings/stringpiece.h
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+#pragma once
+#include <ostream>
+#include <string>
+namespace paddle {
+// StringPiece points into a std::string object but doesn't own the
+// string.  It is for efficient access to strings.  Like Go's string
+// type.  Not that StringPiece doesn't mutate the underlying string,
+// so it is thread-safe given that the underlying string doesn't
+// change.  Because StringPiece contains a little data members, and
+// its syntax is simple as it doesn't own/manage the string, it is
+// cheap to construct StringPieces and pass them around.
+class StringPiece {
+public:
+  static const size_t npos = static_cast<size_t>(-1);
+  // We provide non-explicit singleton constructors so users can
+  // pass in a "const char*" or a "string" wherever a "StringPiece"
+  // is expected.  These contructors ensure that if data_ is NULL,
+  // size_ is 0.
+  StringPiece();
+  StringPiece(const char* d, size_t n);
+  StringPiece(const char* d);
+  StringPiece(const std::string& s);
+  const char* data() const { return data_; }
+  size_t len() const { return size_; }
+  char operator[](size_t n) const;
+  // StringPiece doesn't own the string, so both iterator and const
+  // iterator are const char* indeed.
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  iterator begin() const { return data_; }
+  iterator end() const { return data_ + size_; }
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+private:
+  const char* data_;
+  size_t size_;
+  // Intentionally copyable
+};
+int Compare(StringPiece a, StringPiece b);
+bool operator==(StringPiece x, StringPiece y);
+bool operator!=(StringPiece x, StringPiece y);
+bool operator<(StringPiece x, StringPiece y);
+bool operator>(StringPiece x, StringPiece y);
+bool operator<=(StringPiece x, StringPiece y);
+bool operator>=(StringPiece x, StringPiece y);
+bool HasPrefix(StringPiece s, StringPiece prefix);
+bool HasSuffix(StringPiece s, StringPiece suffix);
+StringPiece SkipPrefix(StringPiece s, size_t n);
+StringPiece SkipSuffix(StringPiece s, size_t n);
+// Skip the prefix (or suffix) if it matches with the string.
+StringPiece TrimPrefix(StringPiece s, StringPiece prefix);
+StringPiece TrimSuffix(StringPiece s, StringPiece suffix);
+// Returns if s contains sub.  Any s except for empty s contains an
+// empty sub.
+bool Contains(StringPiece s, StringPiece sub);
+// Return the first occurrence of sub in s, or npos.  If both s and
+// sub is empty, it returns npos; otherwise, if only sub is empty, it
+// returns 0.
+size_t Index(StringPiece s, StringPiece sub);
+// Return the first occurrence of c in s[pos:end], or npos.
+size_t Find(StringPiece s, char c, size_t pos);
+// Search range is [0..pos] inclusive.  If pos == npos, search everything.
+size_t RFind(StringPiece s, char c, size_t pos);
+StringPiece SubStr(StringPiece s, size_t pos, size_t n);
+// allow StringPiece to be logged
+std::ostream& operator<<(std::ostream& o, StringPiece piece);
+}  // namespace paddle
--- a/paddle/strings/stringpiece_test.cc
+++ b/paddle/strings/stringpiece_test.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+#include "paddle/strings/stringpiece.h"
+#include <sstream>
+#include "gtest/gtest.h"
+TEST(StringPiece, Construct) {
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(NULL, s.data());
+    EXPECT_EQ(0U, s.len());
+  }
+  { EXPECT_THROW(paddle::StringPiece s(NULL, 10000U), std::invalid_argument); }
+  {
+    paddle::StringPiece s(NULL);
+    EXPECT_EQ(0U, s.len());
+  }
+  {
+    std::string a;
+    EXPECT_EQ(0U, a.size());
+    paddle::StringPiece s(a);
+    EXPECT_EQ(0U, s.len());
+  }
+}
+TEST(StringPiece, CopyAndAssign) {
+  paddle::StringPiece empty;
+  EXPECT_EQ(0U, empty.len());
+  paddle::StringPiece a("hello");
+  paddle::StringPiece b = a;
+  EXPECT_EQ(b.len(), strlen("hello"));
+  EXPECT_EQ(a, b);
+  std::string storage("hello");
+  paddle::StringPiece c(storage);
+  EXPECT_EQ(a, c);
+  EXPECT_NE(a.data(), c.data());
+}
+TEST(StringPiece, Compare) {
+  {
+    paddle::StringPiece a("hello");
+    paddle::StringPiece b("world");
+    EXPECT_TRUE(a != b);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a < b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_FALSE(a > b);
+    EXPECT_FALSE(a >= b);
+    EXPECT_LT(Compare(a, b), 0);
+    EXPECT_GT(Compare(b, a), 0);
+  }
+  {
+    paddle::StringPiece a, b;
+    EXPECT_TRUE(a == b);
+    EXPECT_FALSE(a != b);
+    EXPECT_FALSE(a < b);
+    EXPECT_FALSE(a > b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_TRUE(a >= b);
+    EXPECT_EQ(0, Compare(a, b));
+    EXPECT_EQ(0, Compare(b, a));
+  }
+}
+TEST(StringPiece, ToString) {
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::StringPiece s(NULL);
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::StringPiece s("hello");
+    EXPECT_EQ(std::string("hello"), s.ToString());
+  }
+}
+TEST(StringPiece, HasPrefixSuffix) {
+  using paddle::HasPrefix;
+  using paddle::HasSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_FALSE(HasPrefix(s, "something"));
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_FALSE(HasSuffix(s, "something"));
+    EXPECT_TRUE(HasSuffix(s, ""));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_TRUE(HasPrefix(s, "a"));
+    EXPECT_TRUE(HasPrefix(s, "ap"));
+    EXPECT_TRUE(HasPrefix(s, "app"));
+    EXPECT_TRUE(HasSuffix(s, ""));
+    EXPECT_TRUE(HasSuffix(s, "p"));
+    EXPECT_TRUE(HasSuffix(s, "pp"));
+    EXPECT_TRUE(HasSuffix(s, "app"));
+  }
+}
+TEST(StringPiece, SkipPrefixSuffix) {
+  using paddle::SkipPrefix;
+  using paddle::SkipSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", SkipPrefix(s, 0));
+    EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument);
+    EXPECT_EQ("", SkipSuffix(s, 0));
+    EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument);
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("app", SkipPrefix(s, 0));
+    EXPECT_EQ("pp", SkipPrefix(s, 1));
+    EXPECT_EQ("p", SkipPrefix(s, 2));
+    EXPECT_EQ("", SkipPrefix(s, 3));
+    EXPECT_THROW(SkipPrefix(s, 4), std::invalid_argument);
+    EXPECT_EQ("app", SkipSuffix(s, 0));
+    EXPECT_EQ("ap", SkipSuffix(s, 1));
+    EXPECT_EQ("a", SkipSuffix(s, 2));
+    EXPECT_EQ("", SkipSuffix(s, 3));
+    EXPECT_THROW(SkipSuffix(s, 4), std::invalid_argument);
+  }
+}
+TEST(StringPiece, TrimPrefixSuffix) {
+  using paddle::TrimPrefix;
+  using paddle::TrimSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", TrimPrefix(s, ""));
+    EXPECT_EQ("", TrimPrefix(s, "something"));
+    EXPECT_EQ("", TrimSuffix(s, ""));
+    EXPECT_EQ("", TrimSuffix(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("app", TrimPrefix(s, ""));
+    EXPECT_EQ("pp", TrimPrefix(s, "a"));
+    EXPECT_EQ("p", TrimPrefix(s, "ap"));
+    EXPECT_EQ("", TrimPrefix(s, "app"));
+    EXPECT_EQ("app", TrimPrefix(s, "something"));
+    EXPECT_EQ("app", TrimSuffix(s, ""));
+    EXPECT_EQ("ap", TrimSuffix(s, "p"));
+    EXPECT_EQ("a", TrimSuffix(s, "pp"));
+    EXPECT_EQ("", TrimSuffix(s, "app"));
+    EXPECT_EQ("app", TrimSuffix(s, "something"));
+  }
+}
+TEST(StringPiece, Contains) {
+  using paddle::Contains;
+  {
+    paddle::StringPiece s;
+    EXPECT_FALSE(Contains(s, ""));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_TRUE(Contains(s, ""));
+    EXPECT_TRUE(Contains(s, "a"));
+    EXPECT_TRUE(Contains(s, "p"));
+    EXPECT_TRUE(Contains(s, "ap"));
+    EXPECT_TRUE(Contains(s, "pp"));
+    EXPECT_TRUE(Contains(s, "app"));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+}
+TEST(StringPiece, Index) {
+  using paddle::Index;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, Index(s, ""));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(0U, Index(s, ""));
+    EXPECT_EQ(0U, Index(s, "a"));
+    EXPECT_EQ(1U, Index(s, "p"));
+    EXPECT_EQ(0U, Index(s, "ap"));
+    EXPECT_EQ(1U, Index(s, "pp"));
+    EXPECT_EQ(0U, Index(s, "app"));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+}
+TEST(StringPiece, Find) {
+  using paddle::Find;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, Find(s, 'a', 0U));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(0U, Find(s, 'a', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 1U));
+    EXPECT_EQ(2U, Find(s, 'p', 2U));
+    EXPECT_EQ(npos, Find(s, 'z', 2U));
+  }
+}
+TEST(StringPiece, RFind) {
+  using paddle::RFind;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, RFind(s, 'a', 0U));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(2U, RFind(s, 'p', 2U));
+    EXPECT_EQ(0U, RFind(s, 'a', 2U));
+    EXPECT_EQ(1U, RFind(s, 'p', 1U));
+    EXPECT_EQ(0U, RFind(s, 'a', 0));
+    EXPECT_EQ(npos, RFind(s, 'z', 2U));
+  }
+}
+TEST(StringPiece, SubStr) {
+  using paddle::SubStr;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 0, 1));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+    EXPECT_EQ("", SubStr(s, 2, 0));
+    EXPECT_EQ("", SubStr(s, 3, 0));
+    EXPECT_EQ("a", SubStr(s, 0, 1));
+    EXPECT_EQ("p", SubStr(s, 1, 1));
+    EXPECT_EQ("p", SubStr(s, 2, 1));
+    EXPECT_EQ("", SubStr(s, 3, 1));
+    EXPECT_EQ("ap", SubStr(s, 0, 2));
+    EXPECT_EQ("pp", SubStr(s, 1, 2));
+    EXPECT_EQ("p", SubStr(s, 2, 2));
+    EXPECT_EQ("", SubStr(s, 3, 2));
+    EXPECT_EQ("app", SubStr(s, 0, 3));
+    EXPECT_EQ("pp", SubStr(s, 1, 3));
+    EXPECT_EQ("p", SubStr(s, 2, 3));
+    EXPECT_EQ("", SubStr(s, 3, 3));
+  }
+}
+TEST(StringPiece, StreamOutput) {
+  using paddle::StringPiece;
+  std::stringstream o;
+  o << StringPiece();
+  EXPECT_EQ("", o.str());
+  o << StringPiece("hello");
+  EXPECT_EQ("hello", o.str());
+  o << StringPiece();
+  EXPECT_EQ("hello", o.str());
+}
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -4,6 +4,7 @@ set(TRAINER_SOURCES
        ParameterUpdater.cpp
        ParamUtil.cpp
        RemoteParameterUpdater.cpp
+        NewRemoteParameterUpdater.cpp
        Tester.cpp
        Trainer.cpp
        TrainerInternal.cpp
@@ -16,6 +17,7 @@ set(TRAINER_HEADERS
        ParameterUpdater.h
        ParamUtil.h
        RemoteParameterUpdater.h
+        NewRemoteParameterUpdater.h
        Tester.h
        TesterConfig.h
        Trainer.h
@@ -32,7 +34,7 @@ add_style_check_target(paddle_trainer_lib
 add_style_check_target(paddle_trainer_lib
    ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
-    gen_proto_cpp)
+    gen_proto_cpp paddle_pserver_cclient_lib)
 macro(add_paddle_exe TARGET_NAME)
  add_executable(${TARGET_NAME} ${ARGN})
@@ -56,3 +58,10 @@ install(TARGETS paddle_trainer paddle_merge_model
 set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
 set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+if(APPLE)
+  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+endif()
+target_link_libraries(paddle_trainer ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a)
+target_link_libraries(paddle_trainer_lib ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a)
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "NewRemoteParameterUpdater.h"
+#include "Trainer.h"
+#include "paddle/utils/Stat.h"
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
+namespace paddle {
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config, const std::string pserverSpec)
+    : parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec) {}
+void NewRemoteParameterUpdater::init(
+    const std::vector<ParameterPtr> &parameters) {
+  ParameterUpdater::init(parameters);
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_VALUE)->zeroMem();
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+  // create parameter server client.
+  parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                               FLAGS_trainer_id == 0);
+  // init new parameter and gradient.
+  newParameters_ = initNewParameter(PARAMETER_VALUE);
+  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
+  // init parameter, one trainer will get the opportunity to int parameter and
+  // send them to parameter server. Others will get the initialized parameter
+  // from parameter server
+  if (paddle_begin_init_params(parameterClient_)) {
+    LOG(INFO) << "paddle_begin_init_params start";
+    for (int i = 0; i < parameterSize(); ++i) {
+      auto paramConfig = parameters_[i]->getConfig();
+      std::string bytes = paramConfig.SerializeAsString();
+      const char *array = bytes.data();
+      int size = (int)bytes.size();
+      paddle_init_param(
+          parameterClient_, *newParameters_[i], (void *)array, size);
+    }
+    paddle_finish_init_params(parameterClient_);
+    LOG(INFO) << "paddle_begin_init_params done";
+  } else {
+    paddle_get_params(parameterClient_, newParameters_, parameterSize());
+  }
+  LOG(INFO) << "NewRemoteParameterUpdater initialized";
+}
+void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
+void NewRemoteParameterUpdater::finishBatch(real cost) {
+  // send gradient to parameter server.
+  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
+  // get the updated parameter from parameterClient.
+  paddle_get_params(parameterClient_, newParameters_, parameterSize());
+  // clear gradient after update parameter.
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+}
+void NewRemoteParameterUpdater::startPass() {}
+bool NewRemoteParameterUpdater::finishPass() { return true; }
+}
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <functional>
+#include <thread>
+#include "ParameterUpdater.h"
+#include "libpaddle_pserver_cclient.h"
+#include "paddle/pserver/ParameterClient2.h"
+#include "paddle/utils/Queue.h"
+#include "paddle/utils/Util.h"
+namespace paddle {
+/**
+ * New remote parameter updater for dense parameters that use cclient of go.
+ */
+class NewRemoteParameterUpdater : public ParameterUpdater {
+public:
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec);
+  ~NewRemoteParameterUpdater() {
+    releaseNewParameter(newParameters_);
+    releaseNewParameter(newGradients_);
+    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
+  }
+  /**
+   * initialize the internal parameter client and itself.
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+  /**
+   * @brief start batch
+   *
+   * @note  one batch training exhibits stateful feature to help
+   *        to do performance tuning, sgd optimization if necessary.
+   */
+  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
+  /**
+   * send parameters to pservers and get returned parameters
+   * from all pservers if necessary.
+   */
+  virtual void finishBatch(real cost);
+  virtual void startPass();
+  virtual bool finishPass();
+protected:
+  /**
+   * work need to do after finishBatch
+   */
+  virtual void updateImpl(Parameter* para);
+private:
+  int parameterSize() { return (int)parameters_.size(); }
+  /**
+   * init parameter of go paddle pserver cclient.
+   * @param new_params
+   * @param type
+   */
+  paddle_parameter** initNewParameter(ParameterType type) {
+    paddle_parameter** new_params =
+        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
+    for (int i = 0; i < parameterSize(); ++i) {
+      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+      memset(new_params[i], 0, sizeof(paddle_parameter));
+    }
+    for (int i = 0; i < parameterSize(); ++i) {
+      ParameterPtr param = parameters_[i];
+      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+      new_params[i]->name = (char*)param->getName().c_str();
+      new_params[i]->content =
+          (unsigned char*)(param->getBuf(type).get()->getData());
+      new_params[i]->content_len =
+          (int)param->getBuf(type).get()->getSize() * sizeof(real);
+    }
+    return new_params;
+  }
+  void releaseNewParameter(paddle_parameter** newParams) {
+    if (newParams != nullptr) {
+      for (int i = 0; i < parameterSize(); ++i) {
+        free(newParams[i]);
+      }
+      free(newParams);
+    }
+  }
+protected:
+  /// internal parameter client object for exchanging data with pserver
+  paddle_pserver_client parameterClient_;
+  /// the parameters for new pserver client
+  paddle_parameter** newParameters_;
+  /// the gradinets for new pserver client
+  paddle_parameter** newGradients_;
+  /// the specification of parameter server "host1:port,host1:port"
+  std::string pserverSpec_;
+};
+}  // namespace paddle
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -194,6 +194,10 @@ message MaxOutConfig {
  required uint32 groups = 2;
 }
+message RowConvConfig {
+  required uint32 context_length = 1;
+}
 message ProjectionConfig {
  required string type = 1;
  required string name = 2;
@@ -279,6 +283,7 @@ message LayerInputConfig {
  optional SppConfig spp_conf = 12;
  optional PriorBoxConfig priorbox_conf = 13;
  optional PadConfig pad_conf = 14;
+  optional RowConvConfig row_conv_conf = 15;
 }
 message LayerConfig {

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -18,7 +18,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} paddle_master_shared)
 add_custom_target(paddle_python ALL DEPENDS
    ${OUTPUT_DIR}/.timestamp)

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -73,7 +73,6 @@ To use this from paddle_trainer, paddle_trainer should be called with
 --config_args=extension_module_name=[MODULE_NAME]
 '''
 import copy
 import logging
 import os
@@ -127,6 +126,7 @@ def init_config_environment(
        g_config=TrainerConfig(),
        g_layer_map={},
        g_parameter_map={},
+        g_parameter_initializer_map={},
        g_extended_config_funcs={},
        # store command args of paddle_trainer
@@ -440,8 +440,7 @@ def model_type(name):
 @config_class
 class Bias(Cfg):
-    def __init__(
+    def __init__(self,
-            self,
                 parameter_name=None,
                 learning_rate=None,
                 momentum=None,
@@ -455,7 +454,8 @@ class Bias(Cfg):
                 sparse_remote_update=None,
                 gradient_clipping_threshold=None,
                 is_static=None,
-            is_shared=None, ):
+                 is_shared=None,
+                 initializer=None):
        self.add_keys(locals())
@@ -466,6 +466,7 @@ class Input(Cfg):
            self,
            input_layer_name,
            parameter_name=None,
+            initializer=None,
            learning_rate=None,
            momentum=None,
            decay_rate=None,
@@ -522,6 +523,7 @@ class Projection(Input):
            initial_std=None,
            initial_strategy=None,
            initial_smart=None,
+            initializer=None,
            num_batches_regularization=None,
            sparse_remote_update=None,
            sparse_update=None,
@@ -1480,7 +1482,8 @@ class LayerBase(object):
                    gradient_clipping_threshold=bias.
                    gradient_clipping_threshold,
                    is_static=bias.is_static,
-                    is_shared=bias.is_shared, )
+                    is_shared=bias.is_shared,
+                    initializer=bias.initializer)
            if for_self:
                self.config.bias_parameter_name = bias.parameter_name
            else:
@@ -1537,7 +1540,8 @@ class LayerBase(object):
            format=format,
            is_static=input_config.is_static,
            is_shared=input_config.is_shared,
-            update_hooks=input_config.update_hooks)
+            update_hooks=input_config.update_hooks,
+            initializer=input_config.initializer)
    def set_layer_size(self, size):
        if self.config.size == 0:
@@ -1731,9 +1735,10 @@ class ParameterReluLayer(LayerBase):
    def __init__(self, name, inputs, partial_sum=1, **args):
        super(ParameterReluLayer, self).__init__(
            name, self.layer_type, 0, inputs=inputs, **args)
-        config_assert(len(self.inputs) == 1)
-        config_assert(self.input_layer.size % partial_sum == 0)
        input_layer = self.get_input_layer(0)
+        config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
+        config_assert(input_layer.size % partial_sum == 0,
+                      "a wrong setting for partial_sum")
        self.set_layer_size(input_layer.size)
        self.create_input_parameter(0, input_layer.size / partial_sum)
@@ -2081,6 +2086,23 @@ class MaxOutLayer(LayerBase):
                           g_layer_map[input_layer.name].width, out_channels)
+@config_layer('row_conv')
+class RowConvLayer(LayerBase):
+    def __init__(self, name, inputs, context_length, **xargs):
+        super(RowConvLayer, self).__init__(
+            name, 'maxout', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'TransLayer must have one and only one input')
+        input_layer = self.get_input_layer(0)
+        row_conv_conf = self.config.inputs[0].row_conv_conf
+        row_conv_conf.context_length = context_length
+        self.set_layer_size(input_layer.size)
+        psize = context_length * input_layer.size
+        dims = [context_length, input_layer.size]
+        self.create_input_parameter(0, psize, dims)
 # key: cost type
 # value: cost class
 g_cost_map = {}
@@ -3204,7 +3226,8 @@ def Parameter(name,
              need_compact=None,
              is_static=None,
              is_shared=None,
-              update_hooks=None):
+              update_hooks=None,
+              initializer=None):
    config_assert(name not in g_parameter_map,
                  'Duplicated parameter name: ' + name)
@@ -3292,6 +3315,11 @@ def Parameter(name,
            para.update_hooks.extend(update_hooks)
    g_parameter_map[name] = para
+    if initializer is not None:
+        config_assert(
+            callable(initializer),
+            "parameter initializer should be a callable object")
+        g_parameter_initializer_map[name] = initializer
 @config_func
@@ -3546,11 +3574,7 @@ def update_g_config():
    return g_config
-def begin_parse(config_arg_str=''):
+def begin_parse():
-    '''
-    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
-    passed to config script as a dictionary CONFIG_ARGS
-    '''
    init_config_environment()
    for hook in _parse_config_hooks:
        hook()
@@ -3568,8 +3592,12 @@ def begin_parse(config_arg_str=''):
 def parse_config(trainer_config, config_arg_str):
-    begin_parse(config_arg_str)
+    '''
+    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
+    passed to config script as a dictionary CONFIG_ARGS
+    '''
+    begin_parse()
    config_args = {}
    if config_arg_str:

--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -95,6 +95,10 @@ class ParameterAttribute(object):
    :param sparse_update: Enable sparse update for this parameter. It will
                          enable both local and remote sparse update.
    :type sparse_update: bool
+    :param initializer: If not None, it should be a callable object which accepts
+                        a parameter name and returns numpy array for the initial
+                        value of the parameter
+    :param initializer: callable object
    """
    def __init__(self,
@@ -109,7 +113,8 @@ class ParameterAttribute(object):
                 learning_rate=None,
                 momentum=None,
                 gradient_clipping_threshold=None,
-                 sparse_update=False):
+                 sparse_update=False,
+                 initializer=None):
        self.attr = {}
        if is_static:
@@ -161,6 +166,8 @@ class ParameterAttribute(object):
                is_compatible_with(gradient_clipping_threshold, float):
            self.attr['gradient_clipping_threshold'] = \
                gradient_clipping_threshold
+        if initializer is not None:
+            self.attr['initializer'] = initializer
    def set_default_parameter_name(self, name):
        """

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -31,31 +31,31 @@ except ImportError:
 import copy
 __all__ = [
-    "full_matrix_projection",
+    'full_matrix_projection',
-    "AggregateLevel",
+    'AggregateLevel',
-    "ExpandLevel",
+    'ExpandLevel',
-    "identity_projection",
+    'identity_projection',
-    "dotmul_projection",
+    'dotmul_projection',
-    "dotmul_operator",
+    'dotmul_operator',
-    "repeat_layer",
+    'repeat_layer',
-    "seq_reshape_layer",
+    'seq_reshape_layer',
-    "table_projection",
+    'table_projection',
-    "mixed_layer",
+    'mixed_layer',
-    "data_layer",
+    'data_layer',
-    "embedding_layer",
+    'embedding_layer',
-    "fc_layer",
+    'fc_layer',
-    "grumemory",
+    'grumemory',
-    "pooling_layer",
+    'pooling_layer',
-    "lstmemory",
+    'lstmemory',
-    "last_seq",
+    'last_seq',
-    "first_seq",
+    'first_seq',
-    "cos_sim",
+    'cos_sim',
-    "hsigmoid",
+    'hsigmoid',
-    "conv_projection",
+    'conv_projection',
-    "mse_cost",
+    'mse_cost',
-    "regression_cost",
+    'regression_cost',
    'classification_cost',
-    "LayerOutput",
+    'LayerOutput',
    'img_conv_layer',
    'img_pool_layer',
    'batch_norm_layer',
@@ -111,6 +111,7 @@ __all__ = [
    'block_expand_layer',
    'maxout_layer',
    'out_prod_layer',
+    'printer_layer',
    'print_layer',
    'priorbox_layer',
    'cross_channel_norm_layer',
@@ -120,6 +121,9 @@ __all__ = [
    'smooth_l1_cost',
    'layer_support',
    'multiplex_layer',
+    'row_conv_layer',
+    'dropout_layer',
+    'prelu_layer',
 ]
@@ -128,26 +132,26 @@ class LayerType(object):
    Layer type enumerations.
    """
-    DATA = "data"
+    DATA = 'data'
-    MIXED_LAYER = "mixed"
+    MIXED_LAYER = 'mixed'
-    LSTMEMORY = "lstmemory"
+    LSTMEMORY = 'lstmemory'
-    GRUMEMORY = "gated_recurrent"
+    GRUMEMORY = 'gated_recurrent'
-    SEQUENCE_LAST_INSTANCE = "seqlastins"
+    SEQUENCE_LAST_INSTANCE = 'seqlastins'
-    SEQUENCE_FIRST_INSTANCE = "seqfirstins"
+    SEQUENCE_FIRST_INSTANCE = 'seqfirstins'
-    SEQUENCE_RESHAPE = "seqreshape"
+    SEQUENCE_RESHAPE = 'seqreshape'
-    POOLING_MAX = "max"
+    POOLING_MAX = 'max'
    POOLING_AVG = 'average'
-    FC_LAYER = "fc"
+    FC_LAYER = 'fc'
    COST = 'cost'
    COSINE_SIM_VEC = 'cos_vm'
    COSINE_SIM = 'cos'
    HSIGMOID = 'hsigmoid'
-    CONV_LAYER = "conv"
+    CONV_LAYER = 'conv'
-    CONVTRANS_LAYER = "convt"
+    CONVTRANS_LAYER = 'convt'
-    EXCONV_LAYER = "exconv"
+    EXCONV_LAYER = 'exconv'
-    EXCONVTRANS_LAYER = "exconvt"
+    EXCONVTRANS_LAYER = 'exconvt'
-    CUDNNCONV_LAYER = "cudnn_conv"
+    CUDNNCONV_LAYER = 'cudnn_conv'
-    POOL_LAYER = "pool"
+    POOL_LAYER = 'pool'
    BATCH_NORM_LAYER = 'batch_norm'
    NORM_LAYER = 'norm'
    SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
@@ -187,25 +191,28 @@ class LayerType(object):
    SPP_LAYER = "spp"
    PAD_LAYER = "pad"
    MULTIPLEX_LAYER = "multiplex"
+    ROW_CONV_LAYER = "row_conv"
-    PRINT_LAYER = "print"
+    PRINT_LAYER = 'print'
-    PRIORBOX_LAYER = "priorbox"
+    PRIORBOX_LAYER = 'priorbox'
-    CTC_LAYER = "ctc"
+    CTC_LAYER = 'ctc'
-    WARP_CTC_LAYER = "warp_ctc"
+    WARP_CTC_LAYER = 'warp_ctc'
-    CRF_LAYER = "crf"
+    CRF_LAYER = 'crf'
-    CRF_DECODING_LAYER = "crf_decoding"
+    CRF_DECODING_LAYER = 'crf_decoding'
    NCE_LAYER = 'nce'
-    RANK_COST = "rank-cost"
+    RANK_COST = 'rank-cost'
-    LAMBDA_COST = "lambda_cost"
+    LAMBDA_COST = 'lambda_cost'
-    HUBER = "huber"
+    HUBER = 'huber'
-    CROSS_ENTROPY = "multi-class-cross-entropy"
+    CROSS_ENTROPY = 'multi-class-cross-entropy'
-    CROSS_ENTROPY_WITH_SELFNORM = "multi_class_cross_entropy_with_selfnorm"
+    CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
-    SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
+    SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
-    MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
+    MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
-    SUM_COST = "sum_cost"
+    SUM_COST = 'sum_cost'
-    SMOOTH_L1 = "smooth_l1"
+    SMOOTH_L1 = 'smooth_l1'
+    PRELU = 'prelu'
    @staticmethod
    def is_layer_type(type_name):
@@ -969,7 +976,7 @@ def fc_layer(input,
 @wrap_name_default("print")
-def print_layer(input, name=None):
+def printer_layer(input, name=None):
    """
    Print the output value of input layers. This layer is useful for debugging.
@@ -991,6 +998,13 @@ def print_layer(input, name=None):
        inputs=[l.name for l in input], )
    # this layer don't return anything, can not be input of other layer.
+# Keep print_layer for compatibility with V1 API.
+# 'print_layer' does not work for V2 API because it will be changed to
+# 'print' for V2 API. But 'print' is a reserved key word in python.
+print_layer = printer_layer
 @wrap_name_default("priorbox")
 def priorbox_layer(input,
@@ -2916,11 +2930,11 @@ def memory(name,
    to specify the layer needs to be remembered as the following:
    .. code-block:: python
       mem = memory(size=256)
       state = fc_layer(input=mem, size=256)
       mem.set_input(mem)
    :param name: the name of the layer which this memory remembers.
                 If name is None, user should call set_input() to specify the
                 name of the layer which this memory remembers.
@@ -3407,7 +3421,7 @@ def recurrent_group(step,
                          else, for training or testing, one of the input type must
                          be LayerOutput.
-    : type is_generating: bool
+    :type is_generating: bool
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -3760,7 +3774,6 @@ def beam_search(step,
    assert generated_input_index != -1
    gipt = input[generated_input_index]
-    assert isinstance(gipt, BaseGeneratedInput)
    gipt.bos_id = bos_id
    gipt.eos_id = eos_id
@@ -3780,7 +3793,6 @@ def beam_search(step,
        predict = gipt.after_real_step(step(*args))
        eos_layer(input=predict, eos_id=eos_id, name=eos_name)
        return predict
    tmp = recurrent_group(
@@ -3814,7 +3826,7 @@ def mse_cost(input, label, weight=None, name=None, coeff=1.0, layer_attr=None):
    ..  math::
-        \frac{1}{N}\sum_{i=1}^N(t_i-y_i)^2
+        \\frac{1}{N}\sum_{i=1}^N(t_i-y_i)^2
    :param name: layer name.
    :type name: basestring
@@ -3852,7 +3864,6 @@ def classification_cost(input,
                        label,
                        weight=None,
                        name=None,
-                        top_k=None,
                        evaluator=classification_error_evaluator,
                        layer_attr=None):
    """
@@ -3867,8 +3878,6 @@ def classification_cost(input,
    :param weight: The weight affects the cost, namely the scale of cost.
                   It is an optional argument.
    :type weight: LayerOutput
-    :param top_k: number k in top-k error rate
-    :type top_k: int
    :param evaluator: Evaluator method.
    :param layer_attr: layer's extra attribute.
    :type layer_attr: ExtraLayerAttribute
@@ -3896,7 +3905,7 @@ def classification_cost(input,
        assert isinstance(e.for_classification, bool)
        assert e.for_classification
-        e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k)
+        e(name=e.__name__, input=input, label=label, weight=weight)
    if not isinstance(evaluator, collections.Sequence):
        evaluator = [evaluator]
@@ -4717,7 +4726,7 @@ def ctc_layer(input,
        fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer
        should also be num_classes + 1.
-    The simple usage:
+    The example usage is:
    .. code-block:: python
@@ -4769,27 +4778,42 @@ def warp_ctc_layer(input,
                   layer_attr=None):
    """
    A layer intergrating the open-source `warp-ctc
-    <https://github.com/baidu-research/warp-ctc>` library, which is used in
+    <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
-    <https://arxiv.org/pdf/1512.02595v1.pdf>`, to compute Connectionist Temporal
+    <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss.
+    Classification (CTC) loss. Besides, another `warp-ctc
+    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    the official one, is maintained to enable more compiling options. During the
+    building process, PaddlePaddle will clone the source codes, build and
+    install it to :code:`third_party/install/warpctc` directory.
+    To use warp_ctc layer, you need to specify the path of :code:`libwarpctc.so`,
+    using following methods:
+    1. Set it in :code:`paddle.init` (python api) or :code:`paddle_init` (c api),
+    such as :code:`paddle.init(use_gpu=True,
+    warpctc_dir=your_paddle_source_dir/third_party/install/warpctc/lib)`.
+    2. Set environment variable LD_LIBRARY_PATH on Linux or DYLD_LIBRARY_PATH
+    on Mac OS. For instance, :code:`export
+    LD_LIBRARY_PATH=your_paddle_source_dir/third_party/install/warpctc/lib:$LD_LIBRARY_PATH`.
    More details of CTC can be found by referring to `Connectionist Temporal
    Classification: Labelling Unsegmented Sequence Data with Recurrent
    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_
+    icml2006_GravesFGS06.pdf>`_.
    Note:
        - Let num_classes represent the category number. Considering the 'blank'
-          label needed by CTC, you need to use (num_classes + 1) as the input
+          label needed by CTC, you need to use (num_classes + 1) as the input size.
-          size. Thus, the size of both warp_ctc_layer and 'input' layer should
+          Thus, the size of both warp_ctc layer and 'input' layer should be set to
-          be set to num_classes + 1.
+          num_classes + 1.
        - You can set 'blank' to any value ranged in [0, num_classes], which
          should be consistent as that used in your labels.
        - As a native 'softmax' activation is interated to the warp-ctc library,
          'linear' activation is expected instead in the 'input' layer.
-    The simple usage:
+    The example usage is:
    .. code-block:: python
@@ -4850,7 +4874,7 @@ def crf_layer(input,
    A layer for calculating the cost of sequential conditional random
    field model.
-    The simple usage:
+    The example usage is:
    .. code-block:: python
@@ -4924,7 +4948,7 @@ def crf_decoding_layer(input,
    this layer will also calculate error. output.value[i] is 1 for incorrect
    decoding or 0 for correct decoding.
-    The simple usage:
+    The example usage is:
    .. code-block:: python
@@ -5117,7 +5141,7 @@ def rank_cost(left,
      - :math:`o_i` and :math:`o_j`: the left output and right output.
        Their dimension is one.
-    The simple usage:
+    The example usage is:
    .. code-block:: python
@@ -5174,7 +5198,7 @@ def lambda_cost(input,
    """
    lambdaCost for lambdaRank LTR approach.
-    The simple usage:
+    The example usage is:
    .. code-block:: python
@@ -5232,6 +5256,8 @@ def cross_entropy(input,
    """
    A loss layer for multi class entropy.
+    The example usage is:
    .. code-block:: python
       cost = cross_entropy(input=input_layer,
@@ -5278,6 +5304,8 @@ def cross_entropy_with_selfnorm(input,
    A loss layer for multi class entropy with selfnorm.
    Input should be a vector of positive numbers, without normalization.
+    The example usage is:
    .. code-block:: python
       cost = cross_entropy_with_selfnorm(input=input_layer,
@@ -5319,6 +5347,8 @@ def sum_cost(input, name=None, layer_attr=None):
    """
    A loss layer which calculate the sum of the input as loss
+    The example usage is:
    .. code-block:: python
       cost = sum_cost(input=input_layer)
@@ -5348,6 +5378,8 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
    """
    A loss layer for huber loss.
+    The example usage is:
    .. code-block:: python
       cost = huber_cost(input=input_layer,
@@ -5388,6 +5420,8 @@ def multi_binary_label_cross_entropy(input,
    """
    A loss layer for multi binary label cross entropy.
+    The example usage is:
    .. code-block:: python
       cost = multi_binary_label_cross_entropy(input=input_layer,
@@ -5447,6 +5481,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
    More details can be found by referring to `Fast R-CNN
    <https://arxiv.org/pdf/1504.08083v2.pdf>`_
+    The example usage is:
    .. code-block:: python
       cost = smooth_l1_cost(input=input_layer,
@@ -5496,6 +5532,8 @@ def multiplex_layer(input, name=None, layer_attr=None):
    where, y is output. :math:`x_{k}` is the k-th input layer and
    :math:`k = x_{0}[i] + 1`.
+    The example usage is:
    .. code-block:: python
       maxid = multiplex_layer(input=layers)
@@ -5528,3 +5566,155 @@ def multiplex_layer(input, name=None, layer_attr=None):
        layer_type=LayerType.MULTIPLEX_LAYER,
        parents=input,
        size=l.config.size)
+@wrap_name_default("dropout")
+def dropout_layer(input, dropout_rate, name=None):
+    """
+    @TODO(yuyang18): Add comments.
+    :param name:
+    :param input:
+    :param dropout_rate:
+    :return:
+    """
+    return addto_layer(
+        name=name,
+        input=input,
+        act=LinearActivation(),
+        bias_attr=False,
+        layer_attr=ExtraAttr(drop_rate=dropout_rate))
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support(DROPOUT)
+def row_conv_layer(input,
+                   context_len,
+                   act=None,
+                   name=None,
+                   param_attr=None,
+                   layer_attr=None):
+    """
+    The row convolution is called lookahead convolution. It is firstly
+    introduced in paper of `Deep Speech 2: End-toEnd Speech Recognition
+    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
+    The bidirectional RNN that learns representation for a sequence by
+    performing a forward and a backward pass through the entire sequence.
+    However, unlike unidirectional RNNs, bidirectional RNNs are challenging
+    to deploy in an online and low-latency setting. The lookahead convolution
+    incorporates information from future subsequences in a computationally
+    efficient manner to improve unidirectional recurrent neural networks.
+    The connection of row convolution is different form the 1D sequence
+    convolution. Assumed that, the future context-length is k, that is to say,
+    it can get the output at timestep t by using the the input feature from t-th
+    timestep to (t+k+1)-th timestep. Assumed that the hidden dim of input
+    activations are d, the activations r_t for the new layer at time-step t are:
+    .. math::
+        r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
+                  \quad \text{for} \quad  (1 \leq i \leq d)
+    Note:
+        The `context_len` is `k + 1`. That is to say, the lookahead step
+        number plus one equals context_len.
+    .. code-block:: python
+       row_conv = row_conv_layer(input=input_layer, context_len=3)
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param context_len: The context length equals the lookahead step number
+                        plus one.
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The Parameter Attribute. If None, the parameter will be
+                       initialized smartly. It's better set it by yourself.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert context_len > 0, "the context_len must be greatet than 0."
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        context_length=context_len,
+        type=LayerType.ROW_CONV_LAYER,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
+@layer_support()
+@wrap_name_default()
+@wrap_param_attr_default()
+def prelu_layer(input,
+                name=None,
+                partial_sum=1,
+                param_attr=None,
+                layer_attr=None):
+    """
+    The Parameter Relu activation that actives outputs with a learnable weight.
+    Reference:
+        Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+    .. math::
+       z_i &\\quad if \\quad z_i > 0 \\\\
+       a_i * z_i  &\\quad \\mathrm{otherwise}
+    The example usage is:
+    .. code-block:: python
+       prelu = prelu_layer(input=layers, partial_sum=1)
+    :param name: Name of this layer.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param partial_sum: this parameter makes a group of inputs share a same weight.
+        - partial_sum = 1, indicates the element-wise activation: each element has a weight.
+        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share a same weight.
+        - partial_sum = number of outputs, indicates all elements share a same weight.
+    :type partial_sum: int
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute|None
+    :param layer_attr: Extra layer configurations. Default is None.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input'
+    assert isinstance(param_attr, ParameterAttribute)
+    l = Layer(
+        name=name,
+        type=LayerType.PRELU,
+        inputs=Input(input.name, **param_attr.attr),
+        partial_sum=partial_sum,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.PRELU,
+        parents=input,
+        size=l.config.size)
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -26,10 +26,10 @@ from paddle.trainer.config_parser import *
 __all__ = [
    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
-    "img_conv_bn_pool", 'dropout_layer', 'lstmemory_group', 'lstmemory_unit',
+    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
-    'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group',
+    'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_gru', 'simple_attention', 'simple_gru2', 'bidirectional_gru',
+    'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
-    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
+    'bidirectional_lstm', 'inputs', 'outputs'
 ]
 ######################################################
@@ -1366,29 +1366,6 @@ def simple_attention(encoded_sequence,
        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
-############################################################################
-#                         Miscs                                            #
-############################################################################
-@wrap_name_default("dropout")
-def dropout_layer(input, dropout_rate, name=None):
-    """
-    @TODO(yuyang18): Add comments.
-    :param name:
-    :param input:
-    :param dropout_rate:
-    :return:
-    """
-    return addto_layer(
-        name=name,
-        input=input,
-        act=LinearActivation(),
-        bias_attr=False,
-        layer_attr=ExtraAttr(drop_rate=dropout_rate))
 def inputs(layers, *args):
    """
    Declare the inputs of network. The order of input should be as same as

--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -5,6 +5,7 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
+test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
+test_prelu_layer test_row_conv)
 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__prelu_layer_0__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_0__.w0"
+  }
+}
+parameters {
+  name: "___prelu_layer_0__.w0"
+  size: 300
+  initial_mean: 0.0
+  initial_std: 0.057735026919
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "input"
+output_layer_names: "__prelu_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__prelu_layer_0__"
+  input_layer_names: "input"
+  output_layer_names: "__prelu_layer_0__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2560
+  active_type: ""
+}
+layers {
+  name: "__row_conv_layer_0__"
+  type: "maxout"
+  size: 2560
+  active_type: "relu"
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___row_conv_layer_0__.w0"
+    row_conv_conf {
+      context_length: 19
+    }
+  }
+}
+parameters {
+  name: "___row_conv_layer_0__.w0"
+  size: 48640
+  initial_mean: 0.0
+  initial_std: 0.229415733871
+  dims: 19
+  dims: 2560
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__row_conv_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__row_conv_layer_0__"
+  input_layer_names: "data"
+  output_layer_names: "__row_conv_layer_0__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+from paddle.trainer_config_helpers import *
+data = data_layer(name='input', size=300)
+prelu = prelu_layer(input=data)
+outputs(prelu)
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
+from paddle.trainer_config_helpers import *
+settings(batch_size=1000, learning_rate=1e-5)
+data = data_layer(name='data', size=2560)
+row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())
+outputs(row_conv)
--- a/python/paddle/utils/image_multiproc.py
+++ b/python/paddle/utils/image_multiproc.py
@@ -12,7 +12,7 @@ from paddle.trainer.config_parser import logger
 try:
    import cv2
 except ImportError:
-    logger.warning("OpenCV2 is not installed, using PIL to prcoess")
+    logger.warning("OpenCV2 is not installed, using PIL to process")
    cv2 = None
 __all__ = ["CvTransformer", "PILTransformer", "MultiProcessImageTransformer"]

--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -26,6 +26,7 @@ import evaluator
 from . import dataset
 from . import reader
 from . import plot
+from . import master
 import attr
 import op
 import pooling
@@ -37,9 +38,26 @@ import plot
 import image
 __all__ = [
-    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
+    'optimizer',
-    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
+    'layer',
-    'topology', 'networks', 'infer', 'plot', 'evaluator', 'image'
+    'activation',
+    'parameters',
+    'init',
+    'trainer',
+    'event',
+    'data_type',
+    'attr',
+    'pooling',
+    'data_feeder',
+    'dataset',
+    'reader',
+    'topology',
+    'networks',
+    'infer',
+    'plot',
+    'evaluator',
+    'image',
+    'master',
 ]

--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -149,3 +149,57 @@ def cluster_files_reader(files_pattern,
                    yield line
    return reader
+def convert(output_path,
+            reader,
+            num_shards,
+            name_prefix,
+            max_lines_to_shuffle=1000):
+    import recordio
+    import cPickle as pickle
+    import random
+    """
+    Convert data from reader to recordio format files.
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read data instances.
+    :param num_shards: the number of shards that the dataset will be partitioned into.
+    :param name_prefix: the name prefix of generated files.
+    :param max_lines_to_shuffle: the max lines numbers to shuffle before writing.
+    """
+    assert num_shards >= 1
+    assert max_lines_to_shuffle >= 1
+    def open_writers():
+        w = []
+        for i in range(0, num_shards):
+            n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i,
+                                        num_shards - 1)
+            w.append(recordio.writer(n))
+        return w
+    def close_writers(w):
+        for i in range(0, num_shards):
+            w[i].close()
+    def write_data(w, lines):
+        random.shuffle(lines)
+        for i, d in enumerate(lines):
+            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
+            w[i % num_shards].write(d)
+    w = open_writers()
+    lines = []
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle:
+            write_data(w, lines)
+            lines = []
+            continue
+    write_data(w, lines)
+    close_writers(w)
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html 
+and parse train/test set intopaddle reader creators.
+This set contains images of flowers belonging to 102 different categories. 
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+The database was used in:
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+ number of classes.Proceedings of the Indian Conference on Computer Vision, 
+Graphics and Image Processing (2008) 
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
+"""
+import cPickle
+import itertools
+from common import download
+import tarfile
+import scipy.io as scio
+from paddle.v2.image import *
+import os
+import numpy as np
+import paddle.v2 as paddle
+from multiprocessing import cpu_count
+__all__ = ['train', 'test', 'valid']
+DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
+LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
+SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+def default_mapper(sample):
+    '''
+    map image bytes data to type needed by model input layer
+    '''
+    img, label = sample
+    img = paddle.image.load_image_bytes(img)
+    img = paddle.image.simple_transform(img, 256, 224, True)
+    return img.flatten().astype('float32'), label
+def reader_creator(data_file,
+                   label_file,
+                   setid_file,
+                   dataset_name,
+                   mapper=default_mapper,
+                   buffered_size=1024):
+    '''
+    1. read images from tar file and 
+        merge images into batch files in 102flowers.tgz_batch/
+    2. get a reader to read sample from batch file
+    :param data_file: downloaded data file 
+    :type data_file: string
+    :param label_file: downloaded label file 
+    :type label_file: string
+    :param setid_file: downloaded setid file containing information
+                        about how to split dataset
+    :type setid_file: string
+    :param dataset_name: data set name (tstid|trnid|valid)
+    :type dataset_name: string
+    :param mapper: a function to map image bytes data to type 
+                    needed by model input layer
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: data reader
+    :rtype: callable
+    '''
+    labels = scio.loadmat(label_file)['labels'][0]
+    indexes = scio.loadmat(setid_file)[dataset_name][0]
+    img2label = {}
+    for i in indexes:
+        img = "jpg/image_%05d.jpg" % i
+        img2label[img] = labels[i - 1]
+    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
+    def reader():
+        for file in open(file_list):
+            file = file.strip()
+            batch = None
+            with open(file, 'r') as f:
+                batch = cPickle.load(f)
+            data = batch['data']
+            labels = batch['label']
+            for sample, label in itertools.izip(data, batch['label']):
+                yield sample, int(label)
+    return paddle.reader.xmap_readers(mapper, reader,
+                                      cpu_count(), buffered_size)
+def train(mapper=default_mapper, buffered_size=1024):
+    '''
+    Create flowers training set reader. 
+    It returns a reader, each sample in the reader is   
+    image pixels in [0, 1] and label in [1, 102] 
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: train data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
+        buffered_size)
+def test(mapper=default_mapper, buffered_size=1024):
+    '''
+    Create flowers test set reader. 
+    It returns a reader, each sample in the reader is   
+    image pixels in [0, 1] and label in [1, 102] 
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: test data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
+        buffered_size)
+def valid(mapper=default_mapper, buffered_size=1024):
+    '''
+    Create flowers validation set reader. 
+    It returns a reader, each sample in the reader is   
+    image pixels in [0, 1] and label in [1, 102] 
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: test data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
+        buffered_size)
+def fetch():
+    download(DATA_URL, 'flowers', DATA_MD5)
+    download(LABEL_URL, 'flowers', LABEL_MD5)
+    download(SETID_URL, 'flowers', SETID_MD5)
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -57,6 +57,38 @@ class TestCommon(unittest.TestCase):
        for idx, e in enumerate(reader()):
            self.assertEqual(e, str("0"))
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+            return reader
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+        recs.sort()
+        self.assertEqual(total, record_num)
 if __name__ == '__main__':
    unittest.main()
--- a/demo/introduction/evaluate_model.py
+++ b/demo/introduction/evaluate_model.py
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,26 +11,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Print model parameters in last model
-Usage:
-    python evaluate_model.py
-"""
-import numpy as np
-import os
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16)  # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-def main():
+import paddle.v2.dataset.flowers
-    print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
+import unittest
-                                           load('output/pass-00029/b'))
+class TestFlowers(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        size = 224 * 224 * 3
+        for l in reader():
+            self.assertEqual(l[0].size, size)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.flowers.train())
+        self.assertEqual(instances, 1020)
+        self.assertEqual(max_label_value, 102)
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.flowers.test())
+        self.assertEqual(instances, 6149)
+        self.assertEqual(max_label_value, 102)
+    def test_valid(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.flowers.valid())
+        self.assertEqual(instances, 1020)
+        self.assertEqual(max_label_value, 102)
 if __name__ == '__main__':
-    main()
+    unittest.main()
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
 import numpy as np
 try:
    import cv2
-except:
+except ImportError:
-    print(
+    cv2 = None
-        "import cv2 error, please install opencv-python: pip install opencv-python"
+import os
-    )
+import tarfile
+import cPickle
 __all__ = [
-    "load_image", "resize_short", "to_chw", "center_crop", "random_crop",
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "left_right_flip", "simple_transform", "load_and_transform"
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
 ]
 """
 This file contains some common interfaces for image preprocess.
@@ -28,6 +30,90 @@ the image layout as follows.
 """
+def batch_images_from_tar(data_file,
+                          dataset_name,
+                          img2label,
+                          num_per_batch=1024):
+    """
+    Read images from tar file and batch them into batch file.
+    param data_file: path of image tar file
+    type data_file: string
+    param dataset_name: 'train','test' or 'valid'
+    type dataset_name: string
+    param img2label: a dic with image file name as key 
+                    and image's label as value
+    type img2label: dic
+    param num_per_batch: image number per batch file
+    type num_per_batch: int
+    return: path of list file containing paths of batch file
+    rtype: string
+    """
+    batch_dir = data_file + "_batch"
+    out_path = "%s/%s" % (batch_dir, dataset_name)
+    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+    if os.path.exists(out_path):
+        return meta_file
+    else:
+        os.makedirs(out_path)
+    tf = tarfile.open(data_file)
+    mems = tf.getmembers()
+    data = []
+    labels = []
+    file_id = 0
+    for mem in mems:
+        if mem.name in img2label:
+            data.append(tf.extractfile(mem).read())
+            labels.append(img2label[mem.name])
+            if len(data) == num_per_batch:
+                output = {}
+                output['label'] = labels
+                output['data'] = data
+                cPickle.dump(
+                    output,
+                    open('%s/batch_%d' % (out_path, file_id), 'w'),
+                    protocol=cPickle.HIGHEST_PROTOCOL)
+                file_id += 1
+                data = []
+                labels = []
+    if len(data) > 0:
+        output = {}
+        output['label'] = labels
+        output['data'] = data
+        cPickle.dump(
+            output,
+            open('%s/batch_%d' % (out_path, file_id), 'w'),
+            protocol=cPickle.HIGHEST_PROTOCOL)
+    with open(meta_file, 'a') as meta:
+        for file in os.listdir(out_path):
+            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+    return meta_file
+def load_image_bytes(bytes, is_color=True):
+    """
+    Load an color or gray image from bytes array.
+    Example usage:
+    .. code-block:: python
+        with open('cat.jpg') as f:
+            im = load_image_bytes(f.read())
+    :param bytes: the input image bytes array.
+    :type file: str
+    :param is_color: If set is_color True, it will load and
+                     return a color image. Otherwise, it will
+                     load and return a gray image.
+    """
+    flag = 1 if is_color else 0
+    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
+    img = cv2.imdecode(file_bytes, flag)
+    return img
 def load_image(file, is_color=True):
    """
    Load an color or gray image from the file path.

--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """
 `paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2,
-we want to make Paddle a plain Python package. The model config package defined
+we want to make Paddle a plain Python package. The model config package defines
 the way how to configure a neural network topology in Paddle Python code.
 The primary usage shows below.
@@ -30,7 +30,6 @@ The primary usage shows below.
    # use prediction instance where needed.
    parameters = paddle.parameters.create(cost)
 """
 import collections
 import copy
 import re
@@ -44,9 +43,10 @@ __all__ = ['data', 'parse_network']
 def __need_to_keep__(name):
-    if name in ['StaticInput', 'LayerType', 'layer_support']:
+    return name in [
-        return False
+        'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
-    return True
+        'layer_support'
+    ]
 def __need_to_wrap__(name):
@@ -54,6 +54,8 @@ def __need_to_wrap__(name):
 def __convert_name__(inname):
+    if __need_to_keep__(inname):
+        return inname
    if inname == 'maxid_layer':
        return 'max_id'
    elif inname.endswith('memory') or inname.endswith(
@@ -74,8 +76,6 @@ def __convert_name__(inname):
 for name in v1_layers.__all__:
    obj = getattr(v1_layers, name)
-    if not __need_to_keep__(name):
-        continue
    new_name = __convert_name__(name)
    if callable(obj) and __need_to_wrap__(name):
        globals()[new_name] = __convert_to_v2__(obj, new_name, __name__)
@@ -107,7 +107,7 @@ __data_layer__.__doc__ = __map_data_docstr__(v1_layers.data_layer.__doc__)
 data = __convert_to_v2__(__data_layer__, 'name', __name__)
-def __get_used_layers__(output_layers, extra_layers=None):
+def __get_used_layers__(output_layers):
    layer_names = set()
    parents = {}
@@ -132,6 +132,13 @@ def __get_used_layers__(output_layers, extra_layers=None):
                    add_parent(mem.layer_name, mem.boot_layer_name)
                add_parent(mem.link_name, mem.layer_name)
+            if sub_model.HasField('generator'):
+                # according to the implementation of text generation
+                # in recurrent layer group, the generated word must be
+                # the first out link
+                add_parent(sub_model.out_links[0].layer_name,
+                           sub_model.generator.eos_layer_name)
    def dfs_travel(layer_name):
        if layer_name in layer_names:
            return
@@ -149,6 +156,20 @@ def __get_used_layers__(output_layers, extra_layers=None):
    for layer in output_layers:
        dfs_travel(layer.full_name)
+    # print layer needs to be specially handled because no other
+    # layer depends on it. It is used to print the result of some
+    # layers when running the model for debug purpose. So we explicitly
+    # add a print layer to the topolty if its input is in the toplogy.
+    for layer in cp.g_config.model_config.layers:
+        if layer.type == 'print':
+            used = True
+            for inp in layer.inputs:
+                if inp.input_layer_name not in layer_names:
+                    used = False
+                    break
+            if used:
+                layer_names.add(layer.name)
    return layer_names
@@ -233,8 +254,8 @@ def __trim_submodel__(old_submodel, layer_names, input_layer_names,
 def parse_network(output_layers, extra_layers=None):
    if not isinstance(output_layers, collections.Sequence):
        output_layers = [output_layers]
-    if extra_layers is not None and not isinstance(extra_layers,
+    if extra_layers is not None:
-                                                   collections.Sequence):
+        if not isinstance(extra_layers, collections.Sequence):
            extra_layers = [extra_layers]
    else:
        extra_layers = []
@@ -248,18 +269,29 @@ def parse_network(output_layers, extra_layers=None):
    model_config = ModelConfig()
    model_config.type = cp.g_config.model_config.type
+    for layer in output_layers:
+        model_config.output_layer_names.append(layer.full_name)
+        output_layer_names.add(layer.full_name)
    for l in cp.g_config.model_config.layers:
        if l.name not in layer_names:
            continue
        model_config.layers.extend([l])
        if l.type == 'data':
+            if l.name in model_config.output_layer_names:
+                """
+                In text generation, the outlink to save the generated word
+                indices is a data_layer defined in recurrent_group. This
+                data_layer is sure to be the output of the network in text
+                generation task, so this statement excludes such a special
+                data_layer from being inputs of the network, otherwise an error
+                will occur during data feeding.
+                """
+                continue
            model_config.input_layer_names.append(l.name)
            input_layer_names.add(l.name)
-    for layer in output_layers:
-        model_config.output_layer_names.append(layer.full_name)
-        output_layer_names.add(layer.full_name)
    for e in cp.g_config.model_config.evaluators:
        if e.name in evaluator_names:
            model_config.evaluators.extend([e])

--- a/python/paddle/v2/master/.gitignore
+++ b/python/paddle/v2/master/.gitignore
+*.whl
+*.so
+*.pyc
--- a/python/paddle/v2/master/__init__.py
+++ b/python/paddle/v2/master/__init__.py
+from client import *
+__all__ = ['client']
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
+import ctypes
+import os
+path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
+lib = ctypes.cdll.LoadLibrary(path)
+class client(object):
+    """
+    client is a client to the master server.
+    """
+    def __init__(self, addr, buf_size):
+        self.c = lib.paddle_new_master_client(addr, buf_size)
+    def close(self):
+        lib.paddle_release_master_client(self.c)
+        self.c = None
+    def set_dataset(self, paths):
+        holder_type = ctypes.c_char_p * len(paths)
+        holder = holder_type()
+        print paths
+        for idx, path in enumerate(paths):
+            c_ptr = ctypes.c_char_p(path)
+            holder[idx] = c_ptr
+        lib.paddle_set_dataset(self.c, holder, len(paths))
+    def next_record(self):
+        p = ctypes.c_char_p()
+        ret = ctypes.pointer(p)
+        size = lib.paddle_next_record(self.c, ret)
+        if size == 0:
+            # Empty record
+            return ""
+        record = ret.contents.value[:size]
+        # Memory created from C should be freed.
+        lib.mem_free(ret.contents)
+        return record
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -45,7 +45,12 @@ class Optimizer(object):
        return swig_api.ParameterUpdater.createRemoteUpdater(
            self.__opt_conf__, pass_num, use_sparse_updater)
-    def create_updater(self, is_local, num_passes, use_sparse_updater):
+    def __create_new_remote_updater__(self, pserver_spec):
+        return swig_api.ParameterUpdater.createNewRemoteUpdater(
+            self.__opt_conf__, pserver_spec)
+    def create_updater(self, is_local, num_passes, use_sparse_updater,
+                       pserver_spec):
        """
        create proper parameter_updater by configuration.
        :param is_local: create local or remote parameter updater
@@ -64,8 +69,12 @@ class Optimizer(object):
        if is_local:
            parameter_updater = self.__create_local_updater__()
        else:
+            if pserver_spec is None:
                parameter_updater = self.__create_remote_updater__(
                    num_passes, use_sparse_updater)
+            else:
+                parameter_updater = self.__create_new_remote_updater__(
+                    pserver_spec)
        return parameter_updater

--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
 import numpy as np
 import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import paddle.trainer.config_parser as cp
 import struct
 import tarfile
 import cStringIO
@@ -18,8 +19,11 @@ def create(layers):
    """
    topology = Topology(layers)
    pool = Parameters()
+    initializers = cp.g_parameter_initializer_map
    for param in topology.proto().parameters:
        pool.__append_config__(param)
+        if param.name in initializers:
+            pool[param.name] = initializers[param.name](param.name)
    return pool

--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,7 +14,7 @@
 __all__ = [
    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn'
+    'ComposeNotAligned', 'firstn', 'xmap_readers'
 ]
 import itertools
@@ -224,3 +224,74 @@ def firstn(reader, n):
            yield item
    return firstn_reader
+class XmapEndSignal():
+    pass
+def xmap_readers(mapper, reader, process_num, buffer_size):
+    """
+    Use multiprocess to map samples from reader by a mapper defined by user.
+    And this function contains a buffered decorator.
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param reader: the data reader to read from
+    :type reader: callable
+    :param process_num: process number to handle original sample 
+    :type process_num: int
+    :param buffer_size: max buffer size
+    :type buffer_size: int
+    :return: the decarated reader
+    :rtype: callable
+    """
+    end = XmapEndSignal()
+    in_queue = Queue(buffer_size)
+    out_queue = Queue(buffer_size)
+    # define a worker to read samples from reader to in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+    # start a read worker in a thread
+    t = Thread(target=read_worker, args=(reader, in_queue))
+    t.daemon = True
+    t.start()
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+    # start several handle_workers
+    workers = []
+    for i in xrange(process_num):
+        worker = Thread(
+            target=handle_worker, args=(in_queue, out_queue, mapper))
+        worker.daemon = True
+        workers.append(worker)
+    for w in workers:
+        w.start()
+    def xreader():
+        sample = out_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+    return xreader
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -164,6 +164,7 @@ class OtherLayerTest(unittest.TestCase):
        maxid = layer.max_id(input=inference)
        sampling_id = layer.sampling_id(input=inference)
        eos = layer.eos(input=maxid, eos_id=5)
+        layer.printer(maxid)
        print layer.parse_network([maxid, sampling_id, eos])
    def test_slicing_joining_layer(self):

--- a/python/paddle/v2/tests/test_parameters.py
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -11,6 +11,9 @@ except ImportError:
    sys.exit(0)
 import paddle.v2.parameters as parameters
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+from paddle.v2.attr import ParamAttr
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import random
 import cStringIO
@@ -55,6 +58,25 @@ class TestParameters(unittest.TestCase):
            p1 = params_dup.get(name)
            self.assertTrue(numpy.isclose(p0, p1).all())
+    def test_initializer(self):
+        def initializer(name):
+            assert name == "fc.w"
+            mat = numpy.ones((3, 2), dtype=numpy.float32)
+            mat[1, 1] = 2
+            return mat
+        x = layer.data(name="x", type=data_type.dense_vector(3))
+        y = layer.fc(x,
+                     size=2,
+                     bias_attr=False,
+                     param_attr=ParamAttr(
+                         name="fc.w", initializer=initializer))
+        params = parameters.create(y)
+        val = params["fc.w"]
+        assert val.shape == (3, 2)
+        expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32)
+        assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6))
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -31,7 +31,6 @@ class Topology(object):
    def __init__(self, layers, extra_layers=None):
        def __check__(layers):
            if not isinstance(layers, collections.Sequence):
-                __check_layer_type__(layers)
                layers = [layers]
            for layer in layers:
                __check_layer_type__(layer)
@@ -91,6 +90,7 @@ class Topology(object):
        [('image', dense_vector(768)), ('label', integer_value(10))]
        """
        data_layers = self.data_layers()
        return [(nm, data_layers[nm].data_type)
                for nm in self.proto().input_layer_names]

--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -49,7 +49,8 @@ class SGD(object):
                 parameters,
                 update_equation,
                 extra_layers=None,
-                 is_local=True):
+                 is_local=True,
+                 pserver_spec=None):
        if not isinstance(parameters, v2_parameters.Parameters):
            raise TypeError('parameters should be parameters')
@@ -63,6 +64,7 @@ class SGD(object):
        self.__parameters__ = parameters
        self.__topology_in_proto__ = topology.proto()
        self.__is_local__ = is_local
+        self.__pserver_spec__ = pserver_spec
        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
        # # In local mode, disable sparse_remote_update.
@@ -126,7 +128,8 @@ class SGD(object):
        __check_train_args__(**locals())
        self.__parameter_updater__ = self.__optimizer__.create_updater(
-            self.__is_local__, num_passes, self.__use_sparse_updater__)
+            self.__is_local__, num_passes, self.__use_sparse_updater__,
+            self.__pserver_spec__)
        self.__parameter_updater__.init(self.__gradient_machine__)
        self.__gradient_machine__.start()

--- a/python/setup.py.in
+++ b/python/setup.py.in
 from setuptools import setup
 packages=['paddle',
          'paddle.proto',
          'paddle.trainer',
@@ -9,21 +8,25 @@ packages=['paddle',
          'paddle.v2',
          'paddle.v2.dataset',
          'paddle.v2.reader',
-          'paddle.v2.plot']
+          'paddle.v2.plot',
+          'paddle.v2.master']
+setup_requires=["requests",
+                "numpy",
+                "protobuf==3.1",
+                "matplotlib",
+                "rarfile"]
+if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
+    setup_requires+=["opencv-python"]
 setup(name='paddle',
      version='${PADDLE_VERSION}',
      description='Parallel Distributed Deep Learning',
-      install_requires=[
+      install_requires=setup_requires,
-          "requests",
-          "numpy",
-          "protobuf==${PROTOBUF_VERSION}",
-          "matplotlib",
-          "opencv-python",
-          "rarfile"
-      ],
      packages=packages,
+      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
      package_dir={
          '': '${CMAKE_CURRENT_SOURCE_DIR}'
-      }
+      },
 )
--- a/v1_api_demo/README.md
+++ b/v1_api_demo/README.md
+The examples in v1_api_demo are using v1_api now, and will be upgraded into v2_api later.
+Thus, v1_api_demo is a temporary directory. We decide not to maintain it and will delete it in future.
+Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and 
+[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle.
--- a/demo/gan/.gitignore
+++ b/demo/gan/.gitignore
--- a/demo/gan/README.md
+++ b/demo/gan/README.md
--- a/demo/gan/data/download_cifar.sh
+++ b/demo/gan/data/download_cifar.sh
--- a/demo/gan/data/get_mnist_data.sh
+++ b/demo/gan/data/get_mnist_data.sh
--- a/demo/gan/gan_conf.py
+++ b/demo/gan/gan_conf.py
--- a/demo/gan/gan_conf_image.py
+++ b/demo/gan/gan_conf_image.py
--- a/demo/gan/gan_trainer.py
+++ b/demo/gan/gan_trainer.py
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
--- a/demo/mnist/api_train.py
+++ b/demo/mnist/api_train.py
--- a/demo/mnist/data/generate_list.py
+++ b/demo/mnist/data/generate_list.py
--- a/demo/mnist/data/get_mnist_data.sh
+++ b/demo/mnist/data/get_mnist_data.sh
--- a/demo/mnist/light_mnist.py
+++ b/demo/mnist/light_mnist.py
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
--- a/demo/mnist/mnist_util.py
+++ b/demo/mnist/mnist_util.py
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
--- a/demo/mnist/vgg_16_mnist.py
+++ b/demo/mnist/vgg_16_mnist.py
--- a/demo/model_zoo/embedding/.gitignore
+++ b/demo/model_zoo/embedding/.gitignore
--- a/demo/model_zoo/embedding/extract_para.py
+++ b/demo/model_zoo/embedding/extract_para.py
--- a/demo/model_zoo/embedding/paraconvert.py
+++ b/demo/model_zoo/embedding/paraconvert.py
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
--- a/demo/model_zoo/resnet/.gitignore
+++ b/demo/model_zoo/resnet/.gitignore
--- a/demo/model_zoo/resnet/classify.py
+++ b/demo/model_zoo/resnet/classify.py
--- a/demo/model_zoo/resnet/example/.gitignore
+++ b/demo/model_zoo/resnet/example/.gitignore
--- a/demo/model_zoo/resnet/example/__init__.py
+++ b/demo/model_zoo/resnet/example/__init__.py
--- a/demo/model_zoo/resnet/example/cat.jpg
+++ b/demo/model_zoo/resnet/example/cat.jpg
--- a/demo/model_zoo/resnet/example/dog.jpg
+++ b/demo/model_zoo/resnet/example/dog.jpg
--- a/demo/model_zoo/resnet/example/image_list_provider.py
+++ b/demo/model_zoo/resnet/example/image_list_provider.py
--- a/demo/model_zoo/resnet/example/test.list
+++ b/demo/model_zoo/resnet/example/test.list
--- a/demo/model_zoo/resnet/extract_fea_c++.sh
+++ b/demo/model_zoo/resnet/extract_fea_c++.sh
--- a/demo/model_zoo/resnet/extract_fea_py.sh
+++ b/demo/model_zoo/resnet/extract_fea_py.sh
--- a/demo/model_zoo/resnet/get_model.sh
+++ b/demo/model_zoo/resnet/get_model.sh
--- a/demo/model_zoo/resnet/load_feature.py
+++ b/demo/model_zoo/resnet/load_feature.py
--- a/demo/model_zoo/resnet/net_diagram.sh
+++ b/demo/model_zoo/resnet/net_diagram.sh
--- a/demo/model_zoo/resnet/predict.sh
+++ b/demo/model_zoo/resnet/predict.sh
--- a/demo/model_zoo/resnet/resnet.py
+++ b/demo/model_zoo/resnet/resnet.py
--- a/demo/quick_start/.gitignore
+++ b/demo/quick_start/.gitignore
--- a/demo/quick_start/api_predict.py
+++ b/demo/quick_start/api_predict.py
--- a/demo/quick_start/api_predict.sh
+++ b/demo/quick_start/api_predict.sh
--- a/demo/quick_start/api_train.py
+++ b/demo/quick_start/api_train.py
--- a/demo/quick_start/api_train.sh
+++ b/demo/quick_start/api_train.sh
--- a/demo/quick_start/cluster/cluster_train.sh
+++ b/demo/quick_start/cluster/cluster_train.sh
--- a/demo/quick_start/cluster/env.sh
+++ b/demo/quick_start/cluster/env.sh
--- a/demo/quick_start/cluster/pserver.sh
+++ b/demo/quick_start/cluster/pserver.sh
--- a/demo/quick_start/data/README.md
+++ b/demo/quick_start/data/README.md
--- a/demo/quick_start/data/get_data.sh
+++ b/demo/quick_start/data/get_data.sh
--- a/demo/quick_start/data/proc_from_raw_data/get_data.sh
+++ b/demo/quick_start/data/proc_from_raw_data/get_data.sh
--- a/demo/quick_start/data/proc_from_raw_data/preprocess.py
+++ b/demo/quick_start/data/proc_from_raw_data/preprocess.py
--- a/demo/quick_start/dataprovider_bow.py
+++ b/demo/quick_start/dataprovider_bow.py
--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
--- a/demo/quick_start/trainer_config.bidi-lstm.py
+++ b/demo/quick_start/trainer_config.bidi-lstm.py
--- a/demo/quick_start/trainer_config.cnn.py
+++ b/demo/quick_start/trainer_config.cnn.py
--- a/demo/quick_start/trainer_config.db-lstm.py
+++ b/demo/quick_start/trainer_config.db-lstm.py
--- a/demo/quick_start/trainer_config.emb.py
+++ b/demo/quick_start/trainer_config.emb.py
--- a/demo/quick_start/trainer_config.lr.py
+++ b/demo/quick_start/trainer_config.lr.py
--- a/demo/quick_start/trainer_config.lstm.py
+++ b/demo/quick_start/trainer_config.lstm.py
--- a/demo/quick_start/trainer_config.resnet-lstm.py
+++ b/demo/quick_start/trainer_config.resnet-lstm.py
--- a/demo/sequence_tagging/data/get_data.sh
+++ b/demo/sequence_tagging/data/get_data.sh
--- a/demo/sequence_tagging/data/test.list
+++ b/demo/sequence_tagging/data/test.list
--- a/demo/sequence_tagging/data/train.list
+++ b/demo/sequence_tagging/data/train.list
--- a/demo/sequence_tagging/dataprovider.py
+++ b/demo/sequence_tagging/dataprovider.py
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
--- a/demo/sequence_tagging/readme.md
+++ b/demo/sequence_tagging/readme.md
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
--- a/demo/traffic_prediction/README
+++ b/demo/traffic_prediction/README
--- a/demo/traffic_prediction/data/get_data.sh
+++ b/demo/traffic_prediction/data/get_data.sh
--- a/demo/traffic_prediction/dataprovider.py
+++ b/demo/traffic_prediction/dataprovider.py
--- a/demo/traffic_prediction/gen_result.py
+++ b/demo/traffic_prediction/gen_result.py
--- a/demo/traffic_prediction/predict.sh
+++ b/demo/traffic_prediction/predict.sh
--- a/demo/traffic_prediction/train.sh
+++ b/demo/traffic_prediction/train.sh
--- a/demo/traffic_prediction/trainer_config.py
+++ b/demo/traffic_prediction/trainer_config.py
--- a/v1_api_demo/vae/README.md
+++ b/v1_api_demo/vae/README.md
+#Variational Autoencoder (VAE)
+This demo implements VAE training described in the original paper (https://arxiv.org/abs/1312.6114).
+In order to run the model, first download the MNIST dataset by running the shell script in ./data.
+Then you can run the command below. The flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
+$python vae_train.py [--use_gpu 1]
+The generated images will be stored in ./samples/
+The corresponding models will be stored in ./params/
--- a/v1_api_demo/vae/data/get_mnist_data.sh
+++ b/v1_api_demo/vae/data/get_mnist_data.sh
+#!/usr/bin/env sh
+# This script downloads the mnist data and unzips it.
+set -e
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+rm -rf "$DIR/mnist_data"
+mkdir "$DIR/mnist_data"
+cd "$DIR/mnist_data"
+echo "Downloading..."
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+    if [ ! -e $fname ]; then
+        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
+        gunzip ${fname}.gz
+    fi
+done
--- a/demo/sentiment/data/get_imdb.sh
+++ b/demo/sentiment/data/get_imdb.sh
-#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,39 +12,49 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
+import numpy as np
-set -x
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-#download the dataset
+class MNISTloader():
-echo "Downloading aclImdb..."
+    def __init__(self,
-#http://ai.stanford.edu/%7Eamaas/data/sentiment/
+                 data_path="./data/mnist_data/",
-wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+                 batch_size=60,
+                 process='train'):
+        self.batch_size = batch_size
+        self.data_path = data_path
+        self._pointer = 0
+        self.image_batches = np.array([])
+        self.process = process
-echo "Downloading mosesdecoder..."
+    def _extract_images(self, filename, n):
-#https://github.com/moses-smt/mosesdecoder
+        f = open(filename, 'rb')
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+        f.read(16)
+        data = np.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
+        #Mapping data into [-1, 1]
+        data = data / 255. * 2. - 1
+        data_batches = np.split(data, 60000 / self.batch_size, 0)
-#extract package
+        f.close()
-echo "Unzipping..."
-tar -zxvf aclImdb_v1.tar.gz
-unzip master.zip
-#move train and test set to imdb_data directory 
+        return data_batches
-#in order to process when traing
-mkdir -p imdb/train
-mkdir -p imdb/test
-cp -r aclImdb/train/pos/ imdb/train/pos
+    @property
-cp -r aclImdb/train/neg/ imdb/train/neg
+    def pointer(self):
+        return self._pointer
-cp -r aclImdb/test/pos/ imdb/test/pos
+    def load_data(self):
-cp -r aclImdb/test/neg/ imdb/test/neg
+        TRAIN_IMAGES = '%s/train-images-idx3-ubyte' % self.data_path
+        TEST_IMAGES = '%s/t10k-images-idx3-ubyte' % self.data_path
-#remove compressed package
+        if self.process == 'train':
-rm aclImdb_v1.tar.gz
+            self.image_batches = self._extract_images(TRAIN_IMAGES, 60000)
-rm master.zip
+        else:
+            self.image_batches = self._extract_images(TEST_IMAGES, 10000)
-echo "Done."
+    def next_batch(self):
+        batch = self.image_batches[self._pointer]
+        self._pointer = (self._pointer + 1) % (60000 / self.batch_size)
+        return np.array(batch)
+    def reset_pointer(self):
+        self._pointer = 0
--- a/v1_api_demo/vae/vae_conf.py
+++ b/v1_api_demo/vae/vae_conf.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+import numpy as np
+is_generating = get_config_arg("is_generating", bool, False)
+settings(batch_size=32, learning_rate=1e-3, learning_method=AdamOptimizer())
+X_dim = 28 * 28
+h_dim = 128
+z_dim = 100
+def reparameterization(mu, logvar):
+    eps = ParamAttr(initial_mean=0., initial_std=1)
+    with mixed_layer() as sigma:
+        sigma += dotmul_projection(layer_math.exp(logvar) * 0.5, param_attr=eps)
+    return mu + sigma
+def q_func(X):
+    """
+    xavier initialization
+    """
+    param_attr = ParamAttr(
+        name='share.w', initial_mean=0., initial_std=1. / np.sqrt(X_dim / 2.))
+    mu_param = ParamAttr(
+        name='mu.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
+    logvar_param = ParamAttr(
+        name='logvar.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
+    bias_attr = ParamAttr(name='share.bias', initial_mean=0., initial_std=0.)
+    mu_bias = ParamAttr(name='mu.bias', initial_mean=0., initial_std=0.)
+    logvar_bias = ParamAttr(name='logvar.bias', initial_mean=0., initial_std=0.)
+    share_layer = fc_layer(
+        X,
+        size=h_dim,
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        act=ReluActivation())
+    return (fc_layer(
+        share_layer,
+        size=z_dim,
+        param_attr=mu_param,
+        bias_attr=mu_bias,
+        act=LinearActivation()), fc_layer(
+            share_layer,
+            size=z_dim,
+            param_attr=logvar_param,
+            bias_attr=logvar_bias,
+            act=LinearActivation()))
+def generator(z):
+    hidden_param = ParamAttr(
+        name='hidden.w', initial_mean=0., initial_std=1. / np.sqrt(z_dim / 2.))
+    hidden_bias = ParamAttr(name='hidden.bias', initial_mean=0., initial_std=0.)
+    prob_param = ParamAttr(
+        name='prob.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
+    prob_bias = ParamAttr(name='prob.bias', initial_mean=0., initial_std=0.)
+    hidden_layer = fc_layer(
+        z,
+        size=h_dim,
+        act=ReluActivation(),
+        param_attr=hidden_param,
+        bias_attr=hidden_bias)
+    prob = fc_layer(
+        hidden_layer,
+        size=X_dim,
+        act=SigmoidActivation(),
+        param_attr=prob_param,
+        bias_attr=prob_bias)
+    return prob
+def reconstruct_error(prob, X):
+    cost = multi_binary_label_cross_entropy(input=prob, label=X)
+    return cost
+def KL_loss(mu, logvar):
+    with mixed_layer() as mu_square:
+        mu_square += dotmul_operator(mu, mu, scale=1.)
+    cost = 0.5 * sum_cost(layer_math.exp(logvar) + mu_square - 1. - logvar)
+    return cost
+if not is_generating:
+    x_batch = data_layer(name='x_batch', size=X_dim)
+    mu, logvar = q_func(x_batch)
+    z_samples = reparameterization(mu, logvar)
+    prob = generator(z_samples)
+    outputs(reconstruct_error(prob, x_batch) + KL_loss(mu, logvar))
+else:
+    z_samples = data_layer(name='noise', size=z_dim)
+    outputs(generator(z_samples))
--- a/v1_api_demo/vae/vae_train.py
+++ b/v1_api_demo/vae/vae_train.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import random
+import numpy as np
+import cPickle
+import sys, os
+from PIL import Image
+from paddle.trainer.config_parser import parse_config
+from paddle.trainer.config_parser import logger
+import py_paddle.swig_paddle as api
+import dataloader
+import matplotlib.pyplot as plt
+def plot_samples(samples):
+    fig = plt.figure(figsize=(4, 4))
+    gs = gridspec.GridSpec(4, 4)
+    gs.update(wspace=0.05, hspace=0.05)
+    for i, sample in enumerate(samples):
+        plt.subplot(gs[i])
+        plt.axis('off')
+        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
+    return fig
+def CHECK_EQ(a, b):
+    assert a == b, "a=%s, b=%s" % (a, b)
+def get_fake_samples(generator_machine, batch_size, noise):
+    gen_inputs = api.Arguments.createArguments(1)
+    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
+    gen_outputs = api.Arguments.createArguments(0)
+    generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
+    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
+    return fake_samples
+def copy_shared_parameters(src, dst):
+    '''
+    copy the parameters from src to dst
+    :param src: the source of the parameters
+    :type src: GradientMachine
+    :param dst: the destination of the parameters
+    :type dst: GradientMachine
+    '''
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
+    src_params = dict([(p.getName(), p) for p in src_params])
+    for i in xrange(dst.getParameterSize()):
+        dst_param = dst.getParameter(i)
+        src_param = src_params.get(dst_param.getName(), None)
+        if src_param is None:
+            continue
+        src_value = src_param.getBuf(api.PARAMETER_VALUE)
+        dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
+        CHECK_EQ(len(src_value), len(dst_value))
+        dst_value.copyFrom(src_value)
+        dst_param.setValueUpdated()
+def find(iterable, cond):
+    for item in iterable:
+        if cond(item):
+            return item
+    return None
+def get_layer_size(model_conf, layer_name):
+    layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
+    assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
+    return layer_conf.size
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--use_gpu", default="1", help="1 means use gpu for training")
+    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
+    args = parser.parse_args()
+    use_gpu = args.use_gpu
+    assert use_gpu in ["0", "1"]
+    if not os.path.exists("./samples/"):
+        os.makedirs("./samples/")
+    if not os.path.exists("./params/"):
+        os.makedirs("./params/")
+    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
+                   '--log_period=1000', '--gpu_id=' + args.gpu_id,
+                   '--save_dir=' + "./params/")
+    conf = "vae_conf.py"
+    trainer_conf = parse_config(conf, "is_generating=False")
+    gener_conf = parse_config(conf, "is_generating=True")
+    batch_size = trainer_conf.opt_config.batch_size
+    noise_dim = get_layer_size(gener_conf.model_config, "noise")
+    mnist = dataloader.MNISTloader(batch_size=batch_size)
+    mnist.load_data()
+    training_machine = api.GradientMachine.createFromConfigProto(
+        trainer_conf.model_config)
+    generator_machine = api.GradientMachine.createFromConfigProto(
+        gener_conf.model_config)
+    trainer = api.Trainer.create(trainer_conf, training_machine)
+    trainer.startTrain()
+    for train_pass in xrange(100):
+        trainer.startTrainPass()
+        mnist.reset_pointer()
+        i = 0
+        it = 0
+        while mnist.pointer != 0 or i == 0:
+            X = mnist.next_batch().astype('float32')
+            inputs = api.Arguments.createArguments(1)
+            inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(X))
+            trainer.trainOneDataBatch(batch_size, inputs)
+            if it % 1000 == 0:
+                outputs = api.Arguments.createArguments(0)
+                training_machine.forward(inputs, outputs, api.PASS_TEST)
+                loss = np.mean(outputs.getSlotValue(0).copyToNumpyMat())
+                print "\niter: {}".format(str(it).zfill(3))
+                print "VAE loss: {}".format(str(loss).zfill(3))
+                #Sync parameters between networks (GradientMachine) at the beginning
+                copy_shared_parameters(training_machine, generator_machine)
+                z_samples = np.random.randn(batch_size,
+                                            noise_dim).astype('float32')
+                samples = get_fake_samples(generator_machine, batch_size,
+                                           z_samples)
+                #Generating the first 16 images for a picture. 
+                figure = plot_samples(samples[:16])
+                plt.savefig(
+                    "./samples/{}_{}.png".format(
+                        str(train_pass).zfill(3), str(i).zfill(3)),
+                    bbox_inches='tight')
+                plt.close(figure)
+                i += 1
+            it += 1
+        trainer.finishTrainPass()
+    trainer.finishTrain()
+if __name__ == '__main__':
+    main()