remove conflict

cd38e2d1 · chengduoZH · ce93eea8 · 2ac46d53 · cd38e2d1 · cd38e2d1
112 changed file
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -77,10 +77,13 @@ def lang_type(filename):
    elif filename.endswith(".proto"):
        return "C"
    else:
-        print("Unsupported filetype")
+        print("Unsupported filetype %s", filename)
        exit(0)
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
 def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
@@ -89,8 +92,15 @@ def main(argv=None):
    retv = 0
    for filename in args.filenames:
-        first_line = io.open(filename).readline()
+        fd = io.open(filename)
-        if "COPYRIGHT" in first_line.upper() : continue
+        first_line = fd.readline()
+        if "COPYRIGHT" in first_line.upper(): continue
+        if filename.endswith(".py"):
+            second_line = fd.readline()
+            if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                    second_line) != None or PYTHON_ENCODE.match(
+                        first_line) != None:
+                continue
        original_contents = io.open(filename).read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents

--- a/CODE_OF_CONDUCT_cn.md
+++ b/CODE_OF_CONDUCT_cn.md
-# 貢獻者公約
+# 参与者公约
-## 我們的承諾
+## 我们的保证
-為了促進一個開放透明且受歡迎的環境，我們作為貢獻者和維護者保證，無論年齡、種族、民族、性別認同和表達、體型、殘疾、經驗水平、國籍、個人表現、宗教或性別取向，在我們的專案以及社群的參與者都有不被騷擾的體驗。
+为了促进一个开放透明且友好的环境，我们作为贡献者和维护者保证：无论年龄、种族、民族、性别认同和表达（方式）、体型、身体健全与否、经验水平、国籍、个人表现、宗教或性别取向，参与者在我们项目和社区中都免于骚扰。
-## 我們的準則
+## 我们的标准
-舉例來說有助於創造正面環境的行為包括：
+有助于创造正面环境的行为包括但不限于：
-* 使用歡迎和包容性語言
+* 使用友好和包容性语言
-* 尊重不同的觀點和經驗
+* 尊重不同的观点和经历
-* 優雅地接受建設性批評
+* 耐心地接受建设性批评
-* 關注在對於社群最好的事情上
+* 关注对社区最有利的事情
-* 對其他社群成員的表現友善
+* 友善对待其他社区成员
-舉例來說身為參與者不能接受的行為包括：
+身为参与者不能接受的行为包括但不限于：
-* 使用與性有關的言語或是圖像，以及不受歡迎的性騷擾
+* 使用与性有关的言语或是图像，以及不受欢迎的性骚扰
-* 酸民/反串/釣魚行為或進行侮辱/貶損的評論，人身攻擊及政治攻擊
+* 捣乱/煽动/造谣的行为或进行侮辱/贬损的评论，人身攻击及政治攻击
-* 公開或私下的騷擾
+* 公开或私下的骚扰
-* 未經許可地發布他人的個人資料，例如住址或是電子地址
+* 未经许可地发布他人的个人资料，例如住址或是电子地址
-* 其他可以被合理地認定為不恰當或者違反職業操守的行為
+* 其他可以被合理地认定为不恰当或者违反职业操守的行为
-## 我們的責任
+## 我们的责任
-專案維護者有責任為"可接受的行為"準則做出詮釋，以及對已發生的不被接受的行為採取恰當且公平的糾正措施。
+项目维护者有责任为「可接受的行为」标准做出诠释，以及对已发生的不被接受的行为采取恰当且公平的纠正措施。
-專案維護者有權力及責任去刪除、編輯、拒絕與本行為準則有所違背的評論(comments)、提交(commits)、程式碼、wiki 編輯、問題(issues)和其他貢獻，以及專案維護者可暫時或永久性的禁止任何他們認為有不適當、威脅、冒犯、有害行為的貢獻者。
+项目维护者有权利及责任去删除、编辑、拒绝与本行为标准有所违背的评论(comments)、提交(commits)、代码、wiki 编辑、问题(issues)和其他贡献，以及项目维护者可暂时或永久性的禁止任何他们认为有不适当、威胁、冒犯、有害行为的贡献者。
-## 使用範圍
+## 使用范围
-當一個人代表該專案或是其社群時，本行為準則適用於其專案平台和公共平台。
+当一个人代表该项目或是其社区时，本行为标准适用于其项目平台和公共平台。
-代表專案或是社群的情況，舉例來說包括使用官方專案的電子郵件地址、通過官方的社群媒體帳號發布或線上或線下事件中擔任指定代表。
+代表项目或是社区的情况，举例来说包括使用官方项目的电子邮件地址、通过官方的社区媒体账号发布或线上或线下事件中担任指定代表。
-該專案的呈現方式可由其專案維護者進行進一步的定義及解釋。
+该项目的呈现方式可由其项目维护者进行进一步的定义及解释。
-## 強制執行
+## 强制执行
-可以透過paddle-dev@baidu.com，來聯繫專案團隊來報告濫用、騷擾或其他不被接受的行為。
+可以通过paddle-dev@baidu.com，来联系项目团队来举报滥用、骚扰或其他不被接受的行为。
-任何維護團隊認為有必要且適合的所有投訴都將進行審查及調查，並做出相對應的回應。專案小組有對事件回報者有保密的義務。具體執行的方針近一步細節可能會單獨公佈。
+任何维护团队认为有必要且适合的所有投诉都将进行审查及调查，并做出相对应的回应。项目小组对事件回报者有保密的义务。具体执行方针的进一步细节可能会单独公布。
-沒有真誠的遵守或是執行本行為準則的專案維護人員，可能會因專案領導人或是其他成員的決定，暫時或是永久的取消其身份。
+没有切实地遵守或是执行本行为标准的项目维护人员，可能会因项目领导人或是其他成员的决定，暂时或是永久地取消其参与资格。
-## 來源
+## 来源
-本行為準則改編自[貢獻者公約][首頁]，版本 1.4
+本行为标准改编自[贡献者公约][主页]，版本 1.4
-可在此觀看https://www.contributor-covenant.org/zh-tw/version/1/4/code-of-conduct.html
+可在此观看https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html
-[首頁]: https://www.contributor-covenant.org
+[主页]: https://www.contributor-covenant.org
--- a/adversarial/README.md
+++ b/adversarial/README.md
-# Advbox
-Advbox is a Python toolbox to create adversarial examples that fool neural networks. It requires Python and paddle.
-## How to use
-1. train a model and save it's parameters. (like fluid_mnist.py)
-2. load the parameters which is trained in step1, then reconstruct the model.(like mnist_tutorial_fgsm.py)
-3. use advbox to generate the adversarial sample.
--- a/adversarial/advbox/__init__.py
+++ b/adversarial/advbox/__init__.py
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-   A set of tools for generating adversarial example on paddle platform 
-"""
--- a/adversarial/advbox/attacks/base.py
+++ b/adversarial/advbox/attacks/base.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-"""
-The base model of the model.
-"""
-from abc import ABCMeta, abstractmethod
-class Attack(object):
-    """
-    Abstract base class for adversarial attacks. `Attack` represent an adversarial attack
-    which search an adversarial example. subclass should implement the _apply() method.
-    Args:
-        model(Model): an instance of the class advbox.base.Model.
-    """
-    __metaclass__ = ABCMeta
-    def __init__(self, model):
-        self.model = model
-    def __call__(self, image_label):
-        """
-        Generate the adversarial sample.
-        Args:
-        image_label(list): The image and label tuple list with one element.
-        """
-        adv_img = self._apply(image_label)
-        return adv_img
-    @abstractmethod
-    def _apply(self, image_label):
-        """
-        Search an adversarial example.
-        Args:
-        image_batch(list): The image and label tuple list with one element.
-        """
-        raise NotImplementedError
--- a/adversarial/advbox/attacks/gradientsign.py
+++ b/adversarial/advbox/attacks/gradientsign.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-"""
-This module provide the attack method for FGSM's implement.
-"""
-from __future__ import division
-import numpy as np
-from collections import Iterable
-from .base import Attack
-class GradientSignAttack(Attack):
-    """
-    This attack was originally implemented by Goodfellow et al. (2015) with the
-    infinity norm (and is known as the "Fast Gradient Sign Method"). This is therefore called
-    the Fast Gradient Method.
-    Paper link: https://arxiv.org/abs/1412.6572
-    """
-    def _apply(self, image_label, epsilons=1000):
-        assert len(image_label) == 1
-        pre_label = np.argmax(self.model.predict(image_label))
-        min_, max_ = self.model.bounds()
-        gradient = self.model.gradient(image_label)
-        gradient_sign = np.sign(gradient) * (max_ - min_)
-        if not isinstance(epsilons, Iterable):
-            epsilons = np.linspace(0, 1, num=epsilons + 1)
-        for epsilon in epsilons:
-            adv_img = image_label[0][0].reshape(
-                gradient_sign.shape) + epsilon * gradient_sign
-            adv_img = np.clip(adv_img, min_, max_)
-            adv_label = np.argmax(self.model.predict([(adv_img, 0)]))
-            if pre_label != adv_label:
-                return adv_img
-FGSM = GradientSignAttack
-class IteratorGradientSignAttack(Attack):
-    """
-    This attack was originally implemented by Alexey Kurakin(Google Brain).
-    Paper link: https://arxiv.org/pdf/1607.02533.pdf
-    """
-    def _apply(self, image_label, epsilons=100, steps=10):
-        """
-        Apply the iterative gradient sign attack.
-        Args:
-            image_label(list): The image and label tuple list of one element.
-            epsilons(list|tuple|int): The epsilon (input variation parameter).
-            steps(int): The number of iterator steps.
-        Return:
-            numpy.ndarray: The adversarail sample generated by the algorithm.
-        """
-        assert len(image_label) == 1
-        pre_label = np.argmax(self.model.predict(image_label))
-        gradient = self.model.gradient(image_label)
-        min_, max_ = self.model.bounds()
-        if not isinstance(epsilons, Iterable):
-            epsilons = np.linspace(0, 1, num=epsilons + 1)
-        for epsilon in epsilons:
-            adv_img = image_label[0][0].reshape(gradient.shape)
-            for _ in range(steps):
-                gradient = self.model.gradient([(adv_img, image_label[0][1])])
-                gradient_sign = np.sign(gradient) * (max_ - min_)
-                adv_img = adv_img + epsilon * gradient_sign
-                adv_img = np.clip(adv_img, min_, max_)
-                adv_label = np.argmax(self.model.predict([(adv_img, 0)]))
-                if pre_label != adv_label:
-                    return adv_img
--- a/adversarial/advbox/models/__init__.py
+++ b/adversarial/advbox/models/__init__.py
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Paddle model for target of attack 
-"""
--- a/adversarial/advbox/models/base.py
+++ b/adversarial/advbox/models/base.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-"""
-The base model of the model.
-"""
-from abc import ABCMeta
-import abc
-abstractmethod = abc.abstractmethod
-class Model(object):
-    """
-    Base class of model to provide attack.
-    Args:
-        bounds(tuple): The lower and upper bound for the image pixel.
-        channel_axis(int): The index of the axis that represents the color channel.
-        preprocess(tuple): Two element tuple used to preprocess the input. First
-            substract the first element, then divide the second element.
-    """
-    __metaclass__ = ABCMeta
-    def __init__(self, bounds, channel_axis, preprocess=None):
-        assert len(bounds) == 2
-        assert channel_axis in [0, 1, 2, 3]
-        if preprocess is None:
-            preprocess = (0, 1)
-        self._bounds = bounds
-        self._channel_axis = channel_axis
-        self._preprocess = preprocess
-    def bounds(self):
-        """
-        Return the upper and lower bounds of the model.
-        """
-        return self._bounds
-    def channel_axis(self):
-        """
-        Return the channel axis of the model.
-        """
-        return self._channel_axis
-    def _process_input(self, input_):
-        res = input_
-        sub, div = self._preprocess
-        if sub != 0:
-            res = input_ - sub
-        assert div != 0
-        if div != 1:
-            res /= div
-        return res
-    @abstractmethod
-    def predict(self, image_batch):
-        """
-        Calculate the prediction of the image batch.
-        Args:
-            image_batch(numpy.ndarray): image batch of shape (batch_size, height, width, channels).
-        Return:
-            numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes).
-        """
-        raise NotImplementedError
-    @abstractmethod
-    def num_classes(self):
-        """
-        Determine the number of the classes
-        Return:
-            int: the number of the classes
-        """
-        raise NotImplementedError
-    @abstractmethod
-    def gradient(self, image_batch):
-        """
-        Calculate the gradient of the cross-entropy loss w.r.t the image.
-        Args:
-            image_batch(list): The image and label tuple list.
-        Return:
-            numpy.ndarray: gradient of the cross-entropy loss w.r.t the image with
-                the shape (height, width, channel).
-        """
-        raise NotImplementedError
--- a/adversarial/advbox/models/paddle.py
+++ b/adversarial/advbox/models/paddle.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from __future__ import absolute_import
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-from paddle.v2.fluid.framework import program_guard
-from .base import Model
-class PaddleModel(Model):
-    """
-    Create a PaddleModel instance.
-    When you need to generate a adversarial sample, you should construct an instance of PaddleModel.
-    Args:
-        program(paddle.v2.fluid.framework.Program): The program of the model which generate the adversarial sample.
-        input_name(string): The name of the input.
-        logits_name(string): The name of the logits.
-        predict_name(string): The name of the predict.
-        cost_name(string): The name of the loss in the program.
-    """
-    def __init__(self,
-                 program,
-                 input_name,
-                 logits_name,
-                 predict_name,
-                 cost_name,
-                 bounds,
-                 channel_axis=3,
-                 preprocess=None):
-        super(PaddleModel, self).__init__(
-            bounds=bounds, channel_axis=channel_axis, preprocess=preprocess)
-        if preprocess is None:
-            preprocess = (0, 1)
-        self._program = program
-        self._place = fluid.CPUPlace()
-        self._exe = fluid.Executor(self._place)
-        self._input_name = input_name
-        self._logits_name = logits_name
-        self._predict_name = predict_name
-        self._cost_name = cost_name
-        # gradient
-        loss = self._program.block(0).var(self._cost_name)
-        param_grads = fluid.backward.append_backward(
-            loss, parameter_list=[self._input_name])
-        self._gradient = dict(param_grads)[self._input_name]
-    def predict(self, image_batch):
-        """
-            Predict the label of the image_batch.
-            Args:
-                image_batch(list): The image and label tuple list.
-            Return:
-                numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes).
-        """
-        feeder = fluid.DataFeeder(
-            feed_list=[self._input_name, self._logits_name],
-            place=self._place,
-            program=self._program)
-        predict_var = self._program.block(0).var(self._predict_name)
-        predict = self._exe.run(self._program,
-                                feed=feeder.feed(image_batch),
-                                fetch_list=[predict_var])
-        return predict
-    def num_classes(self):
-        """
-            Calculate the number of classes of the output label. 
-        Return:
-            int: the number of classes
-        """
-        predict_var = self._program.block(0).var(self._predict_name)
-        assert len(predict_var.shape) == 2
-        return predict_var.shape[1]
-    def gradient(self, image_batch):
-        """
-        Calculate the gradient of the loss w.r.t the input.
-        Args:
-            image_batch(list): The image and label tuple list.
-        Return:
-            list: The list of the gradient of the image.
-        """
-        feeder = fluid.DataFeeder(
-            feed_list=[self._input_name, self._logits_name],
-            place=self._place,
-            program=self._program)
-        grad, = self._exe.run(self._program,
-                              feed=feeder.feed(image_batch),
-                              fetch_list=[self._gradient])
-        return grad
--- a/adversarial/mnist_tutorial_fgsm.py
+++ b/adversarial/mnist_tutorial_fgsm.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-"""
-FGSM demos on mnist using advbox tool.
-"""
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-import matplotlib.pyplot as plt
-import numpy as np
-from advbox.models.paddle import PaddleModel
-from advbox.attacks.gradientsign import GradientSignAttack
-def cnn_model(img):
-    """
-    Mnist cnn model
-    Args:
-        img(Varaible): the input image to be recognized
-    Returns:
-        Variable: the label prediction
-    """
-    #conv1 = fluid.nets.conv2d()
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        num_filters=20,
-        filter_size=5,
-        pool_size=2,
-        pool_stride=2,
-        act='relu')
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        num_filters=50,
-        filter_size=5,
-        pool_size=2,
-        pool_stride=2,
-        act='relu')
-    logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
-    return logits
-def main():
-    """
-    Advbox demo which demonstrate how to use advbox.
-    """
-    IMG_NAME = 'img'
-    LABEL_NAME = 'label'
-    img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32')
-    # gradient should flow
-    img.stop_gradient = False
-    label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64')
-    logits = cnn_model(img)
-    cost = fluid.layers.cross_entropy(input=logits, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    BATCH_SIZE = 1
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.mnist.train(), buf_size=500),
-        batch_size=BATCH_SIZE)
-    feeder = fluid.DataFeeder(
-        feed_list=[IMG_NAME, LABEL_NAME],
-        place=place,
-        program=fluid.default_main_program())
-    fluid.io.load_params(
-        exe, "./mnist/", main_program=fluid.default_main_program())
-    # advbox demo
-    m = PaddleModel(fluid.default_main_program(), IMG_NAME, LABEL_NAME,
-                    logits.name, avg_cost.name, (-1, 1))
-    att = GradientSignAttack(m)
-    for data in train_reader():
-        # fgsm attack
-        adv_img = att(data)
-        plt.imshow(n[0][0], cmap='Greys_r')
-        plt.show()
-        #np.save('adv_img', adv_img)
-        break
-if __name__ == '__main__':
-    main()
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -504,3 +504,8 @@ l2_normalize
 ------------
 ..  autofunction:: paddle.v2.fluid.layers.l2_normalize
    :noindex:
+sequence_reshape
+----------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+    :noindex:
--- a/doc/design/error_clip.md
+++ b/doc/design/error_clip.md
@@ -46,12 +46,12 @@ class ErrorClipByValue(BaseErrorClipAttr):
        self.min = min
    def append_clip_op(self, block, grad_name):
-        block.append_op(
+        clip_op_desc = block.desc.append_op()
-            type="clip",
+        clip_op_desc.set_type("clip")
-            inputs={"X": grad_name},
+        clip_op_desc.set_input("X", [grad_name])
-            outputs={"Out": grad_name},
+        clip_op_desc.set_output("Out", [grad_name])
-            attrs={"min": self.min,
+        clip_op_desc.set_attr("min", self.min)
-                   "max": self.max})
+        clip_op_desc.set_attr("max", self.max)
 ```
 The `BaseErrorClipAttr` have one main member functions: `append_clip_op(self, block, grad_name)`.
@@ -80,6 +80,11 @@ def error_clip_callback(block, context):
                         op_desc.output_arg_names()):
        fwd_var = block.var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (error_clip is None or isinstance(error_clip,
+                                                 BaseErrorClipAttr)):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
        if error_clip is not None:
            error_clip.append_clip_op(block, grad_n)
 ```

--- a/doc/design/switch_kernel.md
+++ b/doc/design/switch_kernel.md
 ## Background
-Every operator has many kernels because there are multiple data types, places, data layout that Fluid supports. We use the `KernelType` to describe kernel types that operators can hold. 
+Every operator has many kernels because Fluid supports multiple data types, places, data layouts, and library types. We use the `OpKernelType` to describe kernel types that operators can hold.
-The `KernelType` is as follows.
+The `OpKernelType` is as follows:
-```
+```cpp
-struct KernelType {
+struct OpKernelType {
  Place place_;
  DataType data_type_;
-  LayoutType layout_;
+  DataLayout data_layout_;
+  LibraryType library_type_;
 };
 ```
-The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
+- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
-The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`.
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be one major `data_type`. For example, the `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
-The `layout` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
+- The `data_layout_` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke a different kernel.
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
 ## Problem
@@ -25,42 +28,72 @@ We register a kernel for every operator and every kernel type ideally. However,
 2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem.
 3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`.
-Problems under these situations are similar. We can formalise this problem as follow.
+Take one situation as a detailed example: suppose we have two operators, OP1 and OP2, where OP1 has one output `op1_2_op2`, and `op1_2_op2` is the input of OP2.
+If OP1 and OP2 run on the same place (for example, CPUPlace), then `op1_2_op2` can be used directly by OP2.
+```
+OP1(CPUPlace)
+     |
+ op1_2_op2
+     |
+OP2(CPUPlace)
+```
+If OP1 and OP2 run on different places, then OP2 cannot use `op1_2_op2` directly.
+Problems under these situations are similar. We can formalize this problem as follows.
 We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$.
-## Solution
+## Solution: data transform
-It is clearly that transforming inputs of an operator toadapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+It is clear that transforming inputs of an operator to adapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
-We can infer a kernel type from the inputs of an operators. We let this kernel type as `actual kernel type`, which means this kernel type is the actually kernel type that operator should be performed.
+We can infer kernel type for each input of an operator. We let this kernel type as `actual kernel type for var`, which means this kernel type is the kernel type that can process this input variable.
 We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`.
-We transform the input data from `actual` to `expect` if the expect kernel type is not as same as actual kernel type.
+We transform the input data from `actual` to `expect` if the actual kernel type is not the same as the expect kernel type.
-The algorithm is described as follow
+The algorithm is described as follows:
 ```cpp
-using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
+void OperatorWithKernel::Run(
-using KernelTypePair = std::pair<KernelType, KernelType>;
+        const Scope& scope,
+        const platform::Place& place) const {
-map<KernelTypePair, DataTransformationFN> g_data_transformation_;
+  ExecutionContext ctx(...);
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
-void OpWithKernel::Run() {
-  vec<Tensor> inputs = ...
+  Scope& new_scope = scope.NewScope();
-  auto actual_kernel_type = GetActualKernelType(inputs);
+  for (auto& var_name : this->Inputs()) {
-  // The expected kernel type is related to actual kernel type.
+    auto* tensor_in = GetTensor(var_name);
-  // For the most operators, the expected kernel type is as same as
+    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
-  // actual kernel type.
+    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
-  //
+      auto* trans_var = new_scope.Var(var_name);
-  // So we pass `actual_kernel_type` as a parameter of 
+      auto* out = DataTransform(expected_kernel_key,
-  // GetExpectedKernelType
+                                kernel_type_for_var,
-  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);
+                                *tensor_in);
+      CopyVariableWithTensor(...);
-  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];
+    }
+  }
-  kernel.run(trans(inputs));
+  auto kernel = kernels.find(expected_kernel_key);
+  kernel->Compute(ExecutionContext(...));
 }
 ```
+Then the actual process for the multi-device case above will be:
+```
+OP1(CPUPlace)
+     |
+op1_2_op2(on CPU)
+     |
+[transform](from CPU to GPU)
+     |
+op1_2_op2(on GPU)
+     |
+OP2(CUDAPlace)
+```
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -4,7 +4,8 @@
 - [Implementing C++ Types](#implementing-c-types)
   - [Defining ProtoMaker](#defining-protomaker)
   - [Defining Operator](#defining-operator)
-   - [Registering Operator](#registering-operator)
+   - [Defining OpKernel](#defining-opkernel)
+   - [Registering Operator and OpKernel](#registering-operator-and-opkernel)
   - [Compilation](#compilation)
 - [Python Binding](#python-binding)
 - [Unit Tests](#unit-tests)
@@ -16,12 +17,13 @@
 Here are the base types needed. For details, please refer to the design docs.
- `framework::OperatorBase`: Operator (Op)base class.
- `framework::OpKernel`: Base class for Op computation.
- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation.
 - `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+- `framework::OperatorBase`: Operator (Op)base class.
+- `framework::OpKernel`: Base class for Op computation kernel.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels.
-An operator can be differentiated by whether in has kernel methods. An operator with kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
 Information           | Where is it defined
@@ -32,7 +34,7 @@ Kernel implementation       | The kernel methods shared between CPU and CUDA are
 Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
 Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
@@ -156,7 +158,8 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 - `typename T` denotes data type, such as `float` or `double`.
 `MulKernel` types need to rewrite the interface for `Compute`.
- `Compute` takes one input variable `const framework::ExecutionContext& context`.
+- `Compute` takes one input parameter: `const framework::ExecutionContext& context`.
 - Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
 - `Compute` implements the computation logics of an `OpKernel`.
@@ -177,7 +180,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
  };
  ```
-Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+Note that **different devices (CPU, CUDA) share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
 `MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
@@ -188,13 +191,14 @@ This concludes the forward implementation of an operator. Next its operation and
 The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
-### Registering Operator
+### Registering Operator and OpKernel
 - In `.cc` files, register forward and backward operator classes and the CPU kernel.
    ```cpp
    namespace ops = paddle::operators;
    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
    REGISTER_OP_CPU_KERNEL(mul_grad,
                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -204,6 +208,7 @@ The definition of its corresponding backward operator, if applicable, is similar
    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
@@ -225,6 +230,7 @@ The definition of its corresponding backward operator, if applicable, is similar
 Run the following commands to compile.
 ```
+# maybe you need to rerun cmake
 make mul_op
 ```

--- a/doc/howto/dev/new_op_kernel_en.md
+++ b/doc/howto/dev/new_op_kernel_en.md
+## Add Kernels for a New Device
+### Background
+PaddlePaddle Fluid has hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
+### Write Kernels for A New Device 
+#### Add A New Device
+  For some historical reasons, we misuse the word *library* for *device*.  For example, we call the device type the *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
+To register a new device, we need to add an enum value to `LibraryType`:
+```
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+```
+#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
+```cpp
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+  int device;
+};
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+#### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
+```cpp
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+  virtual void Wait() const {}
+};
+```
+#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+Detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md).
+```cpp
+class OpKernelBase {
+ public:
+  /**
+   * ExecutionContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * ExecutionContext. User should construct it before run the Operator.
+   */
+  virtual void Compute(const ExecutionContext& context) const = 0;
+  virtual ~OpKernelBase() = default;
+};
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
+};
+```
+#### Register the OpKernel to framework
+After writing the components described above, we should register the kernel to the framework.
+We use `REGISTER_OP_KERNEL` to do the registration.
+```cpp
+REGISTER_OP_KERNEL(
+	op_type,
+	library_type,
+	place_type,
+	kernel0, kernel1, ...)
+```
+kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
+Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318) as an example:
+	```cpp
+	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
+    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+	REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+	       paddle::operators::CUDNNConvOpKernel<float>,
+	       paddle::operators::CUDNNConvOpKernel<double>);
+	```
+In the code above:
+ - `conv2d` is the type/name of the operator
+ - `CUDNN/CPU` is `library`
+ - `paddle::platform::CUDAPlace/CPUPlace` is `place`
+ - template parameter `float/double` on `CUDNNConvOpKernel<T>` is `data_type`.
--- a/doc/howto/usage/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md
@@ -101,9 +101,11 @@ t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
 ... #create executor
 # in pserver, run this
-exe.run(fluid.default_startup_program())
 #current_endpoint here means current pserver IP:PORT you wish to run on
-exe.run(t.get_pserver_program(current_endpoint, optimize_ops)) 
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
 # in trainer, run this
 ... # define data reader

--- a/doc/v1_api_tutorials/README.md
+++ b/doc/v1_api_tutorials/README.md
-The tutorials in v1_api_tutorials are using v1_api currently, and will be upgraded to v2_api later.
-Thus, v1_api_tutorials is a temporary directory. We decide not to maintain it and will delete it in future.
-Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and 
-[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle.
--- a/doc/v1_api_tutorials/embedding_model/index_cn.md
+++ b/doc/v1_api_tutorials/embedding_model/index_cn.md
-# 中文词向量模型的使用 #
----------
-本文档介绍如何在PaddlePaddle平台上,使用预训练的标准格式词向量模型。
-在此感谢 @lipeng 提出的代码需求，并给出的相关模型格式的定义。
-## 介绍 ###
-### 中文字典 ###
-我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下： "《红楼梦》"将被分为 "《"，"红楼梦"，"》"，和 "《红楼梦》"。字典采用UTF8编码，输出有2列：词本身和词频。字典共包含 3206326个词和4个特殊标记：
-  - `<s>`: 分词序列的开始
-  - `<e>`: 分词序列的结束
-  - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: 占位符，没有实际意义
-  - `<unk>`: 未知词
-### 中文词向量的预训练模型 ###
-遵循文章 [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)中介绍的方法，模型采用 n-gram 语言模型，结构如下图：6元上下文作为输入层->全连接层->softmax层 。对应于字典，我们预训练得到4种不同维度的词向量，分别为：32维、64维、128维和256维。
-<center>![](./neural-n-gram-model.png)</center>
-<center>Figure 1. neural-n-gram-model</center>
-### 下载和数据抽取 ###
-运行以下的命令下载和获取我们的字典和预训练模型：
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    ./pre_DictAndModel.sh
-## 中文短语改写的例子 ##
-以下示范如何使用预训练的中文字典和词向量进行短语改写。
-### 数据的准备和预处理 ###
-首先，运行以下的命令下载数据集。该数据集（utf8编码）包含20个训练样例，5个测试样例和2个生成式样例。
-    cd $PADDLE_ROOT/demo/seqToseq/data
-    ./paraphrase_data.sh
-第二步，将数据处理成规范格式，在训练数集上训练生成词向量字典（数据将保存在 `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`）:
-    cd $PADDLE_ROOT/demo/seqToseq/
-    python preprocess.py -i data/paraphrase [--mergeDict]
- 其中，如果使用`--mergeDict`选项，源语言短语和目标语言短语的字典将被合并（源语言和目标语言共享相同的编码字典）。本实例中，源语言和目标语言都是相同的语言，因此可以使用该选项。
-### 使用用户指定的词向量字典 ###
-使用如下命令，从预训练模型中，根据用户指定的字典，抽取对应的词向量构成新的词表:
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL--usrDict USRDICT -d DIM
- `--preModel PREMODEL`: 预训练词向量字典模型的路径
- `--preDict PREDICT`:  预训练模型使用的字典的路径
- `--usrModel USRMODEL`: 抽取出的新词表的保存路径
- `--usrDict USRDICT`: 用户指定新的字典的路径，用于构成新的词表
- `-d DIM`: 参数（词向量）的维度
-此处，你也可以简单的运行以下的命令：
-    cd $PADDLE_ROOT/demo/seqToseq/data/
-    ./paraphrase_model.sh
-运行成功以后，你将会看到以下的模型结构：
-    paraphrase_model
-    |--- _source_language_embedding
-    |--- _target_language_embedding
-### 在PaddlePaddle平台训练模型 ###
-首先，配置模型文件，配置如下（可以参考保存在 `demo/seqToseq/paraphrase/train.conf`的配置）:
-    from seqToseq_net import *
-    is_generating = False
-    ################## Data Definition #####################
-    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
-                                 job_mode = job_mode)
-    ############## Algorithm Configuration ##################
-    settings(
-          learning_method = AdamOptimizer(),
-          batch_size = 50,
-          learning_rate = 5e-4)
-    ################# Network configure #####################
-    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
-这个配置与`demo/seqToseq/translation/train.conf` 基本相同
-然后，使用以下命令进行模型训练:
-    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
-    ./train.sh
-其中，`train.sh` 与`demo/seqToseq/translation/train.sh` 基本相同，只有2个配置不一样:
- `--init_model_path`: 初始化模型的路径配置为`data/paraphrase_modeldata/paraphrase_model`
- `--load_missing_parameter_strategy`：如果参数模型文件缺失，除词向量模型外的参数将使用正态分布随机初始化
-如果用户想要了解详细的数据集的格式、模型的结构和训练过程，请查看 [Text generation Tutorial](../text_generation/index_cn.md).
-## 可选功能 ##
-###  观测词向量
-PaddlePaddle 平台为想观测词向量的用户提供了将二进制词向量模型转换为文本模型的功能:
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
- `-i INPUT`: 输入的（二进制）词向量模型名称
- `-o OUTPUT`: 输出的文本模型名称
- `-d DIM`: （词向量）参数维度
-运行完以上命令，用户可以在输出的文本模型中看到:
-    0,4,32156096
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
- 其中，第一行是`PaddlePaddle` 输出文件的格式说明，包含3个属性：:
-  - `PaddlePaddle`的版本号，本例中为0
-  - 浮点数占用的字节数，本例中为4
-  - 总计的参数个数，本例中为32,156,096
- 其余行是（词向量）参数行（假设词向量维度为32）
-  - 每行打印32个参数以','分隔
-  - 共有32,156,096/32 = 1,004,877行，也就是说，模型共包含1,004,877个被向量化的词
-### 词向量模型的修正
-`PaddlePaddle` 为想修正词向量模型的用户提供了将文本词向量模型转换为二进制模型的命令:
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --t2b -i INPUT -o OUTPUT
- `-i INPUT`: 输入的文本词向量模型名称
- `-o OUTPUT`: 输出的二进制词向量模型名称
-请注意，输入的文本格式如下:
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
- 输入文本中没有头部（格式说明）行
- （输入文本）每行存储一个词，以逗号','分隔
--- a/doc/v1_api_tutorials/embedding_model/index_en.md
+++ b/doc/v1_api_tutorials/embedding_model/index_en.md
-# Chinese Word Embedding Model Tutorial #
----------
-This tutorial is to guide you through the process of using a Pretrained Chinese Word Embedding Model in the PaddlePaddle standard format.
-We thank @lipeng for the pull request that defined the model schemas and pretrained the models.
-## Introduction ###
-### Chinese Word Dictionary ###
-Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《"，"红楼梦"，"》"，and "《红楼梦》". Our dictionary (using UTF-8 format) has has two columns: word and its frequency. The total word count is 3206326, including 4 special token:
-  - `<s>`: the start of a sequence
-  - `<e>`: the end of a sequence
-  - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder, just ignore it and its embedding
-  - `<unk>`: a word not included in dictionary
-### Pretrained Chinese Word Embedding Model ###
-Inspired by paper [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), our model architecture (**Embedding joint of six words->FullyConnect->SoftMax**) is as following graph. And for our dictionary, we pretrain four models with different word vector dimenstions, i.e 32, 64, 128, 256.
-<center>![](./neural-n-gram-model.png)</center>
-<center>Figure 1. neural-n-gram-model</center>
-### Download and Extract ###
-To download and extract our dictionary and pretrained model, run the following commands.
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    ./pre_DictAndModel.sh
-## Chinese Paraphrasing Example ##
-We provide a paraphrasing task to show the usage of pretrained Chinese Word Dictionary and Embedding Model.
-### Data Preparation and Preprocess ###
-First, run the following commands to download and extract the in-house dataset. The dataset (using UTF-8 format) has 20 training samples, 5 testing samples and 2 generating samples.
-    cd $PADDLE_ROOT/demo/seqToseq/data
-    ./paraphrase_data.sh
-Second, preprocess data and build dictionary on train data by running the following commands, and the preprocessed dataset is stored in `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`:
-    cd $PADDLE_ROOT/demo/seqToseq/
-    python preprocess.py -i data/paraphrase [--mergeDict]
- `--mergeDict`: if using this option, the source and target dictionary are merged, i.e, two dictionaries have the same context. Here, as source and target data are all chinese words, this option can be used.
-### User Specified Embedding Model ###
-The general command of extracting desired parameters from the pretrained embedding model based on user dictionary is:
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL--usrDict USRDICT -d DIM
- `--preModel PREMODEL`: the name of pretrained embedding model
- `--preDict PREDICT`: the name of pretrained dictionary
- `--usrModel USRMODEL`: the name of extracted embedding model
- `--usrDict USRDICT`: the name of user specified dictionary
- `-d DIM`: dimension of parameter
-Here, you can simply run the command:
-    cd $PADDLE_ROOT/demo/seqToseq/data/
-    ./paraphrase_model.sh
-And you will see following embedding model structure:
-    paraphrase_model
-    |--- _source_language_embedding
-    |--- _target_language_embedding
-### Training Model in PaddlePaddle ###
-First, create a model config file, see example `demo/seqToseq/paraphrase/train.conf`:
-    from seqToseq_net import *
-    is_generating = False
-    ################## Data Definition #####################
-    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
-                                 job_mode = job_mode)
-    ############## Algorithm Configuration ##################
-    settings(
-          learning_method = AdamOptimizer(),
-          batch_size = 50,
-          learning_rate = 5e-4)
-    ################# Network configure #####################
-    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
-This config is almost the same as `demo/seqToseq/translation/train.conf`.
-Then, train the model by running the command:
-    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
-    ./train.sh
-where `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, the only difference is following two command arguments:
- `--init_model_path`: path of the initialization model, here is `data/paraphrase_model`
- `--load_missing_parameter_strategy`: operations when model file is missing, here use a normal distibution to initialize the other parameters except for the embedding layer
-For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/index_en.md).
-## Optional Function ##
-###  Embedding Parameters Observation
-For users who want to observe the embedding parameters, this function can convert a PaddlePaddle binary embedding model to a text model by running the command:
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
- `-i INPUT`: the name of input binary embedding model
- `-o OUTPUT`: the name of output text embedding model
- `-d DIM`: the dimension of parameter
-You will see parameters like this in output text model:
-    0,4,32156096
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
- 1st line is **PaddlePaddle format file head**, it has 3 attributes:
-  - version of PaddlePaddle, here is 0
-  - sizeof(float), here is 4
-  - total number of parameter, here is 32156096
- Other lines print the paramters (assume `<dim>` = 32)
-  - each line print 32 paramters splitted by ','
-  - there is 32156096/32 = 1004877 lines, meaning there is 1004877 embedding words
-### Embedding Parameters Revision
-For users who want to revise the embedding parameters, this function can convert a revised text embedding model to a PaddlePaddle binary model by running the command:
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --t2b -i INPUT -o OUTPUT
- `-i INPUT`: the name of input text embedding model.
- `-o OUTPUT`: the name of output binary embedding model
-Note that the format of input text model is as follows:
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
- there is no file header in 1st line
- each line stores parameters for one word, the separator is commas ','
--- a/doc/v1_api_tutorials/embedding_model/neural-n-gram-model.png
+++ b/doc/v1_api_tutorials/embedding_model/neural-n-gram-model.png
--- a/doc/v1_api_tutorials/gan/gan.png
+++ b/doc/v1_api_tutorials/gan/gan.png
--- a/doc/v1_api_tutorials/gan/index_en.md
+++ b/doc/v1_api_tutorials/gan/index_en.md
-# Generative Adversarial Networks (GAN) 
-This demo implements GAN training described in the original [GAN paper](https://arxiv.org/abs/1406.2661) and deep convolutional generative adversarial networks [DCGAN paper](https://arxiv.org/abs/1511.06434).
-The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game in which generator is trying to generate image to look as real as possible to fool the discriminator, while the discriminator is trying to distinguish between real and fake images. 
-<center>![](./gan.png)</center>
-<p align="center">
-    Figure 1. GAN-Model-Structure
-    <a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
-</p>
-The generator and discriminator take turn to be trained using SGD. The objective function of the generator is for its generated images being classified as real by the discriminator, and the objective function of the discriminator is to correctly classify real and fake images. When the GAN model is trained to converge to the equilibrium state, the generator will transform the given noise distribution to the distribution of real images, and the discriminator will not be able to distinguish between real and fake images at all. 
-## Implementation of GAN Model Structure
-Since GAN model involves multiple neural networks, it requires to use paddle python API. So the code walk-through below can also partially serve as an introduction to the usage of Paddle Python API.
-There are three networks defined in gan_conf.py, namely **generator_training**, **discriminator_training** and **generator**. The relationship to the model structure we defined above is that **discriminator_training** is the discriminator, **generator** is the generator, and the **generator_training** combined the generator and discriminator since training generator would require the discriminator to provide loss function. This relationship is described in the following code:
-```python
-if is_generator_training:
-    noise = data_layer(name="noise", size=noise_dim)
-    sample = generator(noise)
-if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim)
-if is_generator_training or is_discriminator_training:
-    label = data_layer(name="label", size=1)
-    prob = discriminator(sample)
-    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(
-        input=prob, label=label, name=mode + '_error')
-    outputs(cost)
-if is_generator:
-    noise = data_layer(name="noise", size=noise_dim)
-    outputs(generator(noise))
-```
-In order to train the networks defined in gan_conf.py, one first needs to initialize a Paddle environment, parse the config, create GradientMachine from the config and create trainer from GradientMachine as done in the code chunk below:
-```python
-import py_paddle.swig_paddle as api
-# init paddle environment
-api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
-               '--log_period=100', '--gpu_id=' + args.gpu_id,
-               '--save_dir=' + "./%s_params/" % data_source)
-# Parse config
-gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
-dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
-generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
-# Create GradientMachine
-dis_training_machine = api.GradientMachine.createFromConfigProto(
-dis_conf.model_config)
-gen_training_machine = api.GradientMachine.createFromConfigProto(
-gen_conf.model_config)
-generator_machine = api.GradientMachine.createFromConfigProto(
-generator_conf.model_config)
-# Create trainer
-dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
-gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
-```
-In order to balance the strength between generator and discriminator, we schedule to train whichever one is performing worse by comparing their loss function value. The loss function value can be calculated by a forward pass through the GradientMachine.
-```python
-def get_training_loss(training_machine, inputs):
-    outputs = api.Arguments.createArguments(0)
-    training_machine.forward(inputs, outputs, api.PASS_TEST)
-    loss = outputs.getSlotValue(0).copyToNumpyMat()
-    return numpy.mean(loss)
-```
-After training one network, one needs to sync the new parameters to the other networks. The code below demonstrates one example of such use case:
-```python
-# Train the gen_training
-gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
-# Copy the parameters from gen_training to dis_training and generator
-copy_shared_parameters(gen_training_machine,
-dis_training_machine)
-copy_shared_parameters(gen_training_machine, generator_machine)
-```
-## A Toy Example 
-With the infrastructure explained above, we can now walk you through a toy example of generating two dimensional uniform distribution using 10 dimensional Gaussian noise. 
-The Gaussian noises are generated using the code below:
-```python
-def get_noise(batch_size, noise_dim):
-    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
-```
-The real samples (2-D uniform) are generated using the code below:
-```python
-# synthesize 2-D uniform data in gan_trainer.py:114
-def load_uniform_data():
-    data = numpy.random.rand(1000000, 2).astype('float32')
-    return data
-```
-The generator and discriminator network are built using fully-connected layer and batch_norm layer, and are defined in gan_conf.py. 
-To train the GAN model, one can use the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
-```bash
-$python gan_trainer.py -d uniform --useGpu 1
-```
-The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. 
-<center>![](./uniform_sample.png)</center>
-<p align="center">
-    Figure 2. Uniform Sample
-</p>
-## MNIST Example
-### Data preparation
-To download the MNIST data, one can use the following commands:
-```bash
-$cd data/
-$./get_mnist_data.sh
-```
-### Model description
-Following the DC-Gan paper (https://arxiv.org/abs/1511.06434), we use convolution/convolution-transpose layer in the discriminator/generator network to better deal with images. The details of the network structures are defined in gan_conf_image.py. 
-### Training the model
-To train the GAN model on mnist data, one can use the following command:
-```bash
-$python gan_trainer.py -d mnist --useGpu 1
-```
-The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. 
-<center>![](./mnist_sample.png)</center>
-<p align="center">
-    Figure 3. MNIST Sample
-</p>
--- a/doc/v1_api_tutorials/gan/mnist_sample.png
+++ b/doc/v1_api_tutorials/gan/mnist_sample.png
--- a/doc/v1_api_tutorials/gan/uniform_sample.png
+++ b/doc/v1_api_tutorials/gan/uniform_sample.png
--- a/doc/v1_api_tutorials/imagenet_model/resnet_block.jpg
+++ b/doc/v1_api_tutorials/imagenet_model/resnet_block.jpg
--- a/doc/v1_api_tutorials/imagenet_model/resnet_model_cn.md
+++ b/doc/v1_api_tutorials/imagenet_model/resnet_model_cn.md
-# Model Zoo - ImageNet #
-[ImageNet](http://www.image-net.org/) 是通用物体分类领域一个众所周知的数据库。本教程提供了一个用于ImageNet上的卷积分类网络模型。
-## ResNet 介绍
-论文 [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) 中提出的ResNet网络结构在2015年ImageNet大规模视觉识别竞赛(ILSVRC 2015)的分类任务中赢得了第一名。他们提出残差学习的框架来简化网络的训练，所构建网络结构的的深度比之前使用的网络有大幅度的提高。下图展示的是基于残差的连接方式。左图构造网络模块的方式被用于34层的网络中，而右图的瓶颈连接模块用于50层，101层和152层的网络结构中。
-<center>![resnet_block](./resnet_block.jpg)</center>
-<center>图 1. ResNet 网络模块</center>
-本教程中我们给出了三个ResNet模型，这些模型都是由原作者提供的模型<https://github.com/KaimingHe/deep-residual-networks>转换过来的。我们使用PaddlePaddle在ILSVRC的验证集共50,000幅图像上测试了模型的分类错误率，其中输入图像的颜色通道顺序为**BGR**，保持宽高比缩放到短边为256，只截取中心方形的图像区域。分类错误率和模型大小由下表给出。
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">ResNet</th>
-<th scope="col" class="left">Top-1</th>
-<th scope="col" class="left">Model Size</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">ResNet-50</td>
-<td class="left">24.9%</td>
-<td class="left">99M</td>
-</tr>
-<tr>
-<td class="left">ResNet-101</td>
-<td class="left">23.7%</td>
-<td class="left">173M</td>
-</tr>
-<tr>
-<td class="left">ResNet-152</td>
-<td class="left">23.2%</td>
-<td class="left">234M</td>
-</tr>
-</tbody>
-</table></center>
-<br>
-## ResNet 模型
-50层，101层和152层的网络配置文件可参照```demo/model_zoo/resnet/resnet.py```。你也可以通过在命令行参数中增加一个参数如```--config_args=layer_num=50```来指定网络层的数目。
-### 网络可视化
-你可以通过执行下面的命令来得到ResNet网络的结构可视化图。该脚本会生成一个dot文件，然后可以转换为图片。需要安装graphviz来转换dot文件为图片。
-```
-cd demo/model_zoo/resnet
-./net_diagram.sh
-```
-### 模型下载
-```
-cd demo/model_zoo/resnet
-./get_model.sh
-```
-你可以执行上述命令来下载所有的模型和均值文件，如果下载成功，这些文件将会被保存在```demo/model_zoo/resnet/model```路径下。
-```
-mean_meta_224  resnet_101  resnet_152  resnet_50
-```
-   * resnet_50: 50层网络模型。
-   * resnet_101: 101层网络模型。
-   * resnet_152: 152层网络模型。
-   * mean\_meta\_224: 均值图像文件，图像大小为3 x 224 x 224，颜色通道顺序为**BGR**。你也可以使用这三个值: 103.939, 116.779, 123.68。
-### 参数信息
-* **卷积层权重**
-  由于每个卷积层后面连接的是batch normalization层，因此该层中没有偏置(bias)参数，并且只有一个权重。
-  形状: `(Co, ky, kx, Ci)`
-   * Co: 输出特征图的通道数目
-   * ky: 滤波器核在垂直方向上的尺寸
-   * kx: 滤波器核在水平方向上的尺寸
-   * Ci: 输入特征图的通道数目
-  二维矩阵: (Co * ky * kx, Ci), 行优先次序存储。
-* **全连接层权重**
-  二维矩阵: (输入层尺寸, 本层尺寸), 行优先次序存储。
-* **[Batch Normalization](<http://arxiv.org/abs/1502.03167>) 层权重**
-本层有四个参数，实际上只有.w0和.wbias是需要学习的参数，另外两个分别是滑动均值和方差。在测试阶段它们将会被加载到模型中。下表展示了batch normalization层的参数。
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">参数名</th>
-<th scope="col" class="left">尺寸</th>
-<th scope="col" class="left">含义</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">_res2_1_branch1_bn.w0</td>
-<td class="left">256</td>
-<td class="left">gamma, 缩放参数</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w1</td>
-<td class="left">256</td>
-<td class="left">特征图均值</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w2</td>
-<td class="left">256</td>
-<td class="left">特征图方差</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.wbias</td>
-<td class="left">256</td>
-<td class="left">beta, 偏置参数</td>
-</tr>
-</tbody>
-</table></center>
-<br>
-### 参数读取
-使用者可以使用下面的Python脚本来读取参数值:
-```
-import sys
-import numpy as np
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-if __name__=='__main__':
-    weight = load(sys.argv[1])
-```
-或者直接使用下面的shell命令:
-```
-od -j 16 -f _res2_1_branch1_bn.w0
-```
-## 特征提取
-我们提供了C++和Python接口来提取特征。下面的例子使用了`demo/model_zoo/resnet/example`中的数据，详细地展示了整个特征提取的过程。
-### C++接口
-首先，在配置文件中的`define_py_data_sources2`里指定图像数据列表，具体请参照示例`demo/model_zoo/resnet/resnet.py`。
-```
-    train_list = 'train.list' if not is_test else None
-    # mean.meta is mean file of ImageNet dataset.
-    # mean.meta size : 3 x 224 x 224.
-    # If you use three mean value, set like:
-    # "mean_value:103.939,116.779,123.68;"
-    args={
-        'mean_meta': "model/mean_meta_224/mean.meta",
-        'image_size': 224, 'crop_size': 224,
-        'color': True,'swap_channel:': [2, 1, 0]}
-    define_py_data_sources2(train_list,
-                           'example/test.list',
-                           module="example.image_list_provider",
-                           obj="processData",
-                           args=args)
-```
-第二步，在`resnet.py`文件中指定要提取特征的网络层的名字。例如，
-```
-Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-```
-第三步，在`extract_fea_c++.sh`文件中指定模型路径和输出的目录，然后执行下面的命令。
-```
-cd demo/model_zoo/resnet
-./extract_fea_c++.sh
-```
-如果执行成功，特征将会存到`fea_output/rank-00000`文件中，如下所示。同时你可以使用`load_feature.py`文件中的`load_feature_c`接口来加载该文件。
-```
-0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123;
-0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769;
-```
-* 每行存储的是一个样本的特征。其中，第一行存的是图像`example/dog.jpg`的特征，第二行存的是图像`example/cat.jpg`的特征。
-* 不同层的特征由分号`;`隔开，并且它们的顺序与`Outputs()`中指定的层顺序一致。这里，左边是`res5_3_branch2c_conv`层的特征，右边是`res5_3_branch2c_bn`层特征。
-### Python接口
-示例`demo/model_zoo/resnet/classify.py`中展示了如何使用Python来提取特征。下面的例子同样使用了`./example/test.list`中的数据。执行的命令如下：
-```
-cd demo/model_zoo/resnet
-./extract_fea_py.sh
-```
-extract_fea_py.sh:
-```
-python classify.py \
-     --job=extract \
-     --conf=resnet.py\
-     --use_gpu=1 \
-     --mean=model/mean_meta_224/mean.meta \
-     --model=model/resnet_50 \
-     --data=./example/test.list \
-     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
-     --output_dir=features
-```
-* \--job=extract:              指定工作模式来提取特征。
-* \--conf=resnet.py:           网络配置文件。
-* \--use_gpu=1:                指定是否使用GPU。
-* \--model=model/resnet_50:    模型路径。
-* \--data=./example/test.list: 数据列表。
-* \--output_layer="xxx,xxx":   指定提取特征的层。
-* \--output_dir=features:      输出目录。
-如果运行成功，你将会看到特征存储在`features/batch_0`文件中，该文件是由cPickle产生的。你可以使用`load_feature.py`中的`load_feature_py`接口来打开该文件，它将返回如下的字典：
-```
-{
-'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248  , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)},
-'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)}
-}
-```
-仔细观察，这些特征值与上述使用C++接口提取的结果是一致的。
-## 预测
-`classify.py`文件也可以用于对样本进行预测。我们提供了一个示例脚本`predict.sh`，它使用50层的ResNet模型来对`example/test.list`中的数据进行预测。
-```
-cd demo/model_zoo/resnet
-./predict.sh
-```
-predict.sh调用了`classify.py`:
-```
-python classify.py \
-     --job=predict \
-     --conf=resnet.py\
-     --multi_crop \
-     --model=model/resnet_50 \
-     --use_gpu=1 \
-     --data=./example/test.list
-```
-* \--job=predict:              指定工作模式进行预测。
-* \--conf=resnet.py:           网络配置文件。
-* \--multi_crop:               使用10个裁剪图像块，预测概率取平均。
-* \--use_gpu=1:                指定是否使用GPU。
-* \--model=model/resnet_50:    模型路径。
-* \--data=./example/test.list: 数据列表。
-如果运行成功，你将会看到如下结果，其中156和282是这些图像的分类标签。
-```
-Label of example/dog.jpg is: 156
-Label of example/cat.jpg is: 282
-```
--- a/doc/v1_api_tutorials/imagenet_model/resnet_model_en.md
+++ b/doc/v1_api_tutorials/imagenet_model/resnet_model_en.md
-# Model Zoo - ImageNet #
-[ImageNet](http://www.image-net.org/) is a popular dataset for generic object classification. This tutorial provides convolutional neural network(CNN) models for ImageNet.
-## ResNet Introduction
-ResNets from paper [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) won the 1st place on the ILSVRC 2015 classification task. They present residual learning framework to ease the training of networks that are substantially deeper than those used previously. The residual connections are shown in following figure. The left building block is used in network of 34 layers and the right bottleneck building block is used in network of 50, 101, 152 layers .
-<center>![resnet_block](./resnet_block.jpg)</center>
-<center>Figure 1. ResNet Block</center>
-We present three ResNet models, which are converted from the models provided by the authors <https://github.com/KaimingHe/deep-residual-networks>.  The classification errors, tested in PaddlePaddle on the 50,000-image ILSVRC validation set with input images in **BGR** channel order, at a single scale with the shorter side of 256 and a single crop, are shown in the following table.
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">ResNet</th>
-<th scope="col" class="left">Top-1</th>
-<th scope="col" class="left">Model Size</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">ResNet-50</td>
-<td class="left">24.9%</td>
-<td class="left">99M</td>
-</tr>
-<tr>
-<td class="left">ResNet-101</td>
-<td class="left">23.7%</td>
-<td class="left">173M</td>
-</tr>
-<tr>
-<td class="left">ResNet-152</td>
-<td class="left">23.2%</td>
-<td class="left">234M</td>
-</tr>
-</tbody>
-</table></center>
-<br>
-## ResNet Model
-See ```demo/model_zoo/resnet/resnet.py```. This config contains network of 50, 101 and 152 layers. You can specify layer number by adding argument like ```--config_args=layer_num=50``` in command line arguments.
-### Network Visualization
-You can get a diagram of ResNet network by running the following commands. The script generates dot file and then converts dot file to PNG file, which needs to install graphviz to convert.
-```
-cd demo/model_zoo/resnet
-./net_diagram.sh
-```
-### Model Download
-```
-cd demo/model_zoo/resnet
-./get_model.sh
-```
-You can run above command to download all models and mean file and save them in ```demo/model_zoo/resnet/model``` if downloading successfully.
-```
-mean_meta_224  resnet_101  resnet_152  resnet_50
-```
-   * resnet_50: model of 50 layers.
-   * resnet_101: model of 101 layers.
-   * resnet_152: model of 152 layers.
-   * mean\_meta\_224: mean file with 3 x 224 x 224 size in **BGR** order. You also can use three mean values: 103.939, 116.779, 123.68.
-### Parameter Info
-* **Convolution Layer Weight**
-  As batch normalization layer is connected after each convolution layer, there is no parameter of bias and only one weight in this layer.
-  shape: `(Co, ky, kx, Ci)`
-   * Co: channel number of output feature map.
-   * ky: filter size in vertical direction.
-   * kx: filter size in horizontal direction.
-   * Ci: channel number of input feature map.
-  2-Dim matrix: (Co * ky * kx, Ci), saved in row-major order.
-* **Fully connected Layer Weight**
-  2-Dim matrix: (input layer size, this layer size), saved in row-major order.
-* **[Batch Normalization](<http://arxiv.org/abs/1502.03167>) Layer Weight**
-There are four parameters in this layer. In fact, only .w0 and .wbias are the learned parameters. The other two are the running mean and variance respectively. They will be loaded in testing. The following table shows the parameters of a batch normalization layer.
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">Parameter Name</th>
-<th scope="col" class="left">Number</th>
-<th scope="col" class="left">Meaning</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">_res2_1_branch1_bn.w0</td>
-<td class="left">256</td>
-<td class="left">gamma, scale parameter</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w1</td>
-<td class="left">256</td>
-<td class="left">mean value of feature map</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w2</td>
-<td class="left">256</td>
-<td class="left">variance of feature map</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.wbias</td>
-<td class="left">256</td>
-<td class="left">beta, shift parameter</td>
-</tr>
-</tbody>
-</table></center>
-<br>
-### Parameter Observation
-Users who want to observe the parameters can use Python to read:
-```
-import sys
-import numpy as np
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-if __name__=='__main__':
-    weight = load(sys.argv[1])
-```
-or simply use following shell command:
-```
-od -j 16 -f _res2_1_branch1_bn.w0
-```
-## Feature Extraction
-We provide both C++ and Python interfaces to extract features. The following examples use data in `demo/model_zoo/resnet/example` to show the extracting process in detail.
-### C++ Interface
-First, specify image data list in `define_py_data_sources2` in the config, see example `demo/model_zoo/resnet/resnet.py`.
-```
-    train_list = 'train.list' if not is_test else None
-    # mean.meta is mean file of ImageNet dataset.
-    # mean.meta size : 3 x 224 x 224.
-    # If you use three mean value, set like:
-    # "mean_value:103.939,116.779,123.68;"
-    args={
-        'mean_meta': "model/mean_meta_224/mean.meta",
-        'image_size': 224, 'crop_size': 224,
-        'color': True,'swap_channel:': [2, 1, 0]}
-    define_py_data_sources2(train_list,
-                           'example/test.list',
-                           module="example.image_list_provider",
-                           obj="processData",
-                           args=args)
-```
-Second, specify layers to extract features in `Outputs()` of `resnet.py`. For example,
-```
-Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-```
-Third, specify model path and output directory in `extract_fea_c++.sh`, and then run the following commands.
-```
-cd demo/model_zoo/resnet
-./extract_fea_c++.sh
-```
-If successful, features are saved in `fea_output/rank-00000` as follows. You can use the `load_feature_c` interface in `load_feature.py` to load such a file.
-```
-0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123;
-0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769;
-```
-* Each line stores features of a sample. Here, the first line stores features of `example/dog.jpg` and second line stores features of `example/cat.jpg`.
-* Features of different layers are separated by `;`, and their order is consistent with the layer order in `Outputs()`. Here, the left features are from the `res5_3_branch2c_conv` layer and the right features are from the `res5_3_branch2c_bn` layer.
-### Python Interface
-`demo/model_zoo/resnet/classify.py` is an example to show how to use Python to extract features. Following example still uses data of `./example/test.list`. Command is as follows:
-```
-cd demo/model_zoo/resnet
-./extract_fea_py.sh
-```
-extract_fea_py.sh:
-```
-python classify.py \
-     --job=extract \
-     --conf=resnet.py\
-     --use_gpu=1 \
-     --mean=model/mean_meta_224/mean.meta \
-     --model=model/resnet_50 \
-     --data=./example/test.list \
-     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
-     --output_dir=features
-```
-* \--job=extract:              specify job mode to extract feature.
-* \--conf=resnet.py:           network configure.
-* \--use_gpu=1:             specify GPU mode.
-* \--model=model/resnet_50:    model path.
-* \--data=./example/test.list: data list.
-* \--output_layer="xxx,xxx":   specify layers to extract features.
-* \--output_dir=features:      output directory.
-If run successfully, you will see features saved in `features/batch_0`, this file is produced with cPickle. You can use `load_feature_py` interface in `load_feature.py` to open the file, and it returns a dictionary as follows:
-```
-{
-'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248  , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)},
-'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)}
-}
-```
-Observed carefully, these feature values are consistent with the above results extracted by C++ interface.
-## Prediction
-`classify.py` also can be used to predict. We provide an example script `predict.sh` to predict data in `example/test.list` using a ResNet model with 50 layers.
-```
-cd demo/model_zoo/resnet
-./predict.sh
-```
-predict.sh calls the `classify.py`:
-```
-python classify.py \
-     --job=predict \
-     --conf=resnet.py\
-     --multi_crop \
-     --model=model/resnet_50 \
-     --use_gpu=1 \
-     --data=./example/test.list
-```
-* \--job=predict:              specify job mode to predict.
-* \--conf=resnet.py:           network configure.
-* \--multi_crop:               use 10 crops and average predicting probability.
-* \--use_gpu=1:             specify GPU mode.
-* \--model=model/resnet_50:    model path.
-* \--data=./example/test.list: data list.
-If run successfully, you will see the following results, where 156 and 282 are the labels of the images.
-```
-Label of example/dog.jpg is: 156
-Label of example/cat.jpg is: 282
-```
--- a/doc/v1_api_tutorials/quick_start/index_cn.rst
+++ b/doc/v1_api_tutorials/quick_start/index_cn.rst
-=============
-快速入门教程
-=============
-我们将以 `文本分类问题 <https://en.wikipedia.org/wiki/Document_classification>`_ 为例,
-介绍PaddlePaddle的基本使用方法。
-安装
-====
-请参考 :ref:`install_steps` 安装PaddlePaddle。
-使用概述
-========
-**文本分类问题**：对于给定的一条文本，我们从提前给定的类别集合中选择其所属类别。
-比如, 在购物网站上，通过查看买家对某个产品的评价反馈, 评估该产品的质量。
- 这个显示器很棒！ （好评）
- 用了两个月之后这个显示器屏幕碎了。（差评）
-使用PaddlePaddle, 每一个任务流程都可以被划分为如下五个步骤。
-    ..  image:: src/Pipeline_cn.jpg
-        :align: center
-        :scale: 80%
-1. 数据格式准备
-    - 本例每行保存一条样本，类别Id和文本信息用 ``Tab`` 间隔，文本中的单词用空格分隔（如果不切词，则字与字之间用空格分隔），例如：``类别Id '\t' 这 个 显 示 器 很 棒 ！``
-2. 向系统传送数据
-    - PaddlePaddle可以执行用户的python脚本程序来读取各种格式的数据文件。
-    - 本例的所有字符都将转换为连续整数表示的Id传给模型。
-3. 描述网络结构和优化算法
-    - 本例由易到难展示4种不同的文本分类网络配置：逻辑回归模型，词向量模型，卷积模型，时序模型。
-    - 常用优化算法包括Momentum, RMSProp，AdaDelta，AdaGrad，Adam，Adamax等，本例采用Adam优化方法，加了L2正则和梯度截断。
-4. 训练模型
-5. 应用模型
-数据格式准备
------------
-接下来我们将展示如何用PaddlePaddle训练一个文本分类模型，将 `Amazon电子产品评论数据 <http://jmcauley.ucsd.edu/data/amazon/>`_ 分为好评(正样本)和差评(负样本)两种类别。
-`源代码 <https://github.com/PaddlePaddle/Paddle>`_ 的 ``demo/quick_start`` 目录里提供了该数据的下载脚本和预处理脚本，你只需要在命令行输入以下命令，就能够很方便的完成数据下载和相应的预处理工作。
-.. code-block:: bash
-    cd demo/quick_start
-    ./data/get_data.sh
-    ./preprocess.sh
-数据预处理完成之后，通过配置类似于 ``dataprovider_*.py`` 的数据读取脚本和类似于 ``trainer_config.*.py`` 的训练模型脚本，PaddlePaddle将以设置参数的方式来设置
-相应的数据读取脚本和训练模型脚本。接下来，我们将对这两个步骤给出详细的解释，你也可以先跳过本文的解释环节，直接进入训练模型章节, 使用 ``sh train.sh`` 开始训练模型，
-查看 ``train.sh`` 内容，通过 **自底向上法** (bottom-up approach)来帮助你理解PaddlePaddle的内部运行机制。
-向系统传送数据
-==============
-Python脚本读取数据
------------------
-`DataProvider` 是PaddlePaddle负责提供数据的模块，主要职责在于将训练数据传入内存或者显存，让模型能够得到训练更新，其包括两个函数：
-* initializer：PaddlePaddle会在调用读取数据的Python脚本之前，先调用initializer函数。在下面例子里，我们在initializer函数里初始化词表，并且在随后的读取数据过程中填充词表。
-* process：PaddlePaddle调用process函数来读取数据。每次读取一条数据后，process函数会用yield语句输出这条数据，从而能够被PaddlePaddle 捕获 (harvest)。
-``dataprovider_bow.py`` 文件给出了完整例子：
-..  literalinclude:: ../../../demo/quick_start/dataprovider_bow.py
-     :language: python
-     :lines: 21-70
-     :linenos:
-     :emphasize-lines: 8,33
-详细内容请参见 :ref:`api_dataprovider` 。
-配置中的数据加载定义
--------------------
-在模型配置中通过 ``define_py_data_sources2`` 接口来加载数据：
-..  literalinclude:: ../../../demo/quick_start/trainer_config.emb.py
-     :language: python
-     :lines: 19-35
-     :linenos:
-     :emphasize-lines: 12
-以下是对上述数据加载的解释：
- data/train.list,data/test.list: 指定训练数据和测试数据
- module="dataprovider_bow": 处理数据的Python脚本文件
- obj="process": 指定生成数据的函数
- args={"dictionary": word_dict}: 额外的参数，这里指定词典
-更详细数据格式和用例请参考 :ref:`api_pydataprovider2` 。
-模型网络结构
-============
-本小节我们将介绍模型网络结构。
-    ..  image:: src/PipelineNetwork_cn.jpg
-        :align: center
-        :scale: 80%
-我们将以最基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置连接请参考 :ref:`api_trainer_config_helpers_layers` 。
-所有配置都能在 `源代码 <https://github.com/PaddlePaddle/Paddle>`_ 的 ``demo/quick_start`` 目录下找到。
-逻辑回归模型
------------
-具体流程如下:
-    ..  image:: src/NetLR_cn.jpg
-        :align: center
-        :scale: 80%
- 获取利用 `one-hot vector <https://en.wikipedia.org/wiki/One-hot>`_ 表示的每个单词，维度是词典大小
-    .. code-block:: python
-        word = data_layer(name="word",  size=word_dim)
- 获取该条样本类别Id，维度是类别个数。
-    .. code-block:: python
-        label = data_layer(name="label", size=label_dim)
- 利用逻辑回归模型对该向量进行分类，同时会计算分类准确率
-    .. code-block:: python
-        # Define a fully connected layer with logistic activation (also called softmax activation).
-        output = fc_layer(input=word,
-                        size=label_dim,
-                        act_type=SoftmaxActivation())
-        # Define cross-entropy classification loss and error.
-        classification_cost(input=output, label=label)
- - input: 除去data层，每个层都有一个或多个input,多个input以list方式输入
- - size: 该层神经元个数
- - act_type: 激活函数类型
-**效果总结**：我们将在后面介绍训练和预测流程的脚本。在此为方便对比不同网络结构，我们总结了各个网络的复杂度和效果。
-    =====================  ===============================  =================
-    网络名称                        参数数量                    错误率
-    =====================  ===============================  =================
-    逻辑回归                      252 KB                       8.652 %
-    =====================  ===============================  =================
-词向量模型
----------
-embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovider_emb.py``，词向量模型、
-卷积模型、时序模型均使用该脚本。其中文本输入类型定义为整数时序类型integer_value_sequence。
-.. code-block:: python
-    def initializer(settings, dictionary, **kwargs):
-        settings.word_dict = dictionary
-        settings.input_types = [
-            # Define the type of the first input as sequence of integer.
-            # The value of the integers range from 0 to len(dictrionary)-1
-            integer_value_sequence(len(dictionary)),
-            # Define the second input for label id
-            integer_value(2)]
-    @provider(init_hook=initializer)
-    def process(settings, file_name):
-        ...
-        # omitted, it is same as the data provider for LR model
-该模型依然使用逻辑回归分类网络的框架， 只是将句子用稀疏向量表示替换为用连续向量表示， 即对第三步进行替换。句子表示的计算更新为两步：
-..  image:: src/NetContinuous_cn.jpg
-    :align: center
-    :scale: 80%
- 利用单词Id查找该单词对应的连续向量(维度为word_dim)， 输入N个单词，输出为N个word_dim维度向量
-    .. code-block:: python
-        emb = embedding_layer(input=word, size=word_dim)
- 将该句话包含的所有单词向量求平均, 得到句子的表示
-    .. code-block:: python
-        avg = pooling_layer(input=emb, pooling_type=AvgPooling())
-其它部分和逻辑回归网络结构一致。
-**效果总结：**
-    =====================  ===============================  ==================
-    网络名称                        参数数量                    错误率
-    =====================  ===============================  ==================
-    词向量模型                      15 MB                       8.484 %
-    =====================  ===============================  ==================
-卷积模型
-----------
-卷积网络是一种特殊的从词向量表示到句子表示的方法， 也就是将词向量模型进一步演化为三个新步骤。
-..  image:: src/NetConv_cn.jpg
-    :align: center
-    :scale: 80%
-文本卷积可分为三个步骤:
-1. 首先，从每个单词左右两端分别获取k个相邻的单词, 拼接成一个新的向量；
-2. 其次，对该向量进行非线性变换(例如Sigmoid变换), 使其转变为维度为hidden_dim的新向量；
-3. 最后，对整个新向量集合的每一个维度取最大值来表示最后的句子。
-这三个步骤可配置为:
-.. code-block:: python
-    text_conv = sequence_conv_pool(input=emb,
-                                context_start=k,
-                                context_len=2 * k + 1)
-**效果总结：**
-    =====================  ===============================  ========================
-    网络名称                        参数数量                    错误率
-    =====================  ===============================  ========================
-    卷积模型                      16 MB                       5.628 %
-    =====================  ===============================  ========================
-时序模型
----------
-..  image:: src/NetRNN_cn.jpg
-    :align: center
-    :scale: 80%
-时序模型，也称为RNN模型, 包括简单的 `RNN模型 <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_, `GRU模型 <https://en.wikipedia.org/wiki/Gated_recurrent_unit>`_ 和 `LSTM模型 <https://en.wikipedia.org/wiki/Long_short-term_memory>`_ 等等。
- GRU模型配置：
-    .. code-block:: python
-        gru = simple_gru(input=emb, size=gru_size)
- LSTM模型配置：
-    .. code-block:: python
-        lstm = simple_lstm(input=emb, size=lstm_size)
-本次试验，我们采用单层LSTM模型，并使用了Dropout，**效果总结：**
-    =====================  ===============================  =========================
-    网络名称                        参数数量                    错误率
-    =====================  ===============================  =========================
-    时序模型                      16 MB                       4.812 %
-    =====================  ===============================  =========================
-优化算法
-=========
-`优化算法 <http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/optimizers_index.html>`_ 包括
-Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优化方法，同时使用了L2正则(L2 Regularization)和梯度截断(Gradient Clipping)。
-.. code-block:: python
-    settings(batch_size=128,
-            learning_rate=2e-3,
-            learning_method=AdamOptimizer(),
-            regularization=L2Regularization(8e-4),
-            gradient_clipping_threshold=25)
-训练模型
-=========
-在数据加载和网络配置完成之后， 我们就可以训练模型了。
-..  image:: src/PipelineTrain_cn.jpg
-    :align: center
-    :scale: 80%
-训练模型，我们只需要运行 ``train.sh`` 训练脚本：
-    .. code-block:: bash
-        ./train.sh
-``train.sh`` 中包含了训练模型的基本命令。训练时所需设置的主要参数如下：
-    .. code-block:: bash
-        paddle train \
-        --config=trainer_config.py \
-        --log_period=20 \
-        --save_dir=./output \
-        --num_passes=15 \
-        --use_gpu=false
-这里只简单介绍了单机训练，如何进行分布式训练，请参考 :ref:`cluster_train` 。
-预测
-=====
-当模型训练好了之后，我们就可以进行预测了。
-..  image:: src/PipelineTest_cn.jpg
-    :align: center
-    :scale: 80%
-之前配置文件中 ``test.list`` 指定的数据将会被测试，这里直接通过预测脚本 ``predict.sh`` 进行预测,
-更详细的说明，请参考 :ref:`api_swig_py_paddle` 。
-    .. code-block:: bash
-        model="output/pass-00003"
-        paddle train \
-            --config=trainer_config.lstm.py \
-            --use_gpu=false \
-            --job=test \
-            --init_model_path=$model \
-            --config_args=is_predict=1 \
-            --predict_output_dir=. \
-        mv rank-00000 result.txt
-这里以 ``output/pass-00003`` 为例进行预测，用户可以根据训练日志，选择测试结果最好的模型来预测。
-预测结果以文本的形式保存在 ``result.txt`` 中，一行为一个样本，格式如下：
-    .. code-block:: bash
-        预测ID;ID为0的概率 ID为1的概率
-        预测ID;ID为0的概率 ID为1的概率
-总体效果总结
-==============
-在 ``/demo/quick_start`` 目录下，能够找到这里使用的所有数据, 网络配置, 训练脚本等等。
-对于Amazon-Elec测试集(25k), 如下表格，展示了上述网络模型的训练效果:
-    =====================  ===============================  =============  ==================================
-    网络名称                       参数数量                    错误率          配置文件
-    =====================  ===============================  =============  ==================================
-    逻辑回归模型                      252 KB                     8.652%          trainer_config.lr.py
-    词向量模型                         15 MB                      8.484%         trainer_config.emb.py
-    卷积模型                        16 MB                     5.628%          trainer_config.cnn.py
-    时序模型                         16 MB                     4.812%          trainer_config.lstm.py
-    =====================  ===============================  =============  ==================================
-附录
-=====
-命令行参数
----------
-* \--config：网络配置
-* \--save_dir：模型存储路径
-* \--log_period：每隔多少batch打印一次日志
-* \--num_passes：训练轮次，一个pass表示过一遍所有训练样本
-* \--config_args：命令指定的参数会传入网络配置中。
-* \--init_model_path：指定初始化模型路径，可用在测试或训练时指定初始化模型。
-默认一个pass保存一次模型，也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。
-可以通过show_parameter_stats_period设置打印参数信息等。
-其他参数请参考 命令行参数文档（链接待补充）。
-输出日志
---------
-.. code-block:: bash
-    TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
-模型训练会看到类似上面这样的日志信息，详细的参数解释，请参考如下表格：
-    ===========================================  ==============================================================
-    名称                                             解释
-    ===========================================  ==============================================================
-    Batch=20                                      表示过了20个batch
-    samples=2560                                  表示过了2560个样本
-    AvgCost                                          每个pass的第0个batch到当前batch所有样本的平均cost
-    CurrentCost                                      当前log_period个batch所有样本的平均cost
-    Eval: classification_error_evaluator          每个pass的第0个batch到当前batch所有样本的平均分类错误率
-    CurrentEval: classification_error_evaluator      当前log_period个batch所有样本的平均分类错误率
-    ===========================================  ==============================================================
--- a/doc/v1_api_tutorials/quick_start/index_en.md
+++ b/doc/v1_api_tutorials/quick_start/index_en.md
-# Quick Start
-This tutorial will teach the basics of deep learning (DL), including how to implement many different models in PaddlePaddle. You will learn how to:
-  - Prepare data into the standardized format that PaddlePaddle accepts.
-  - Write data providers that read data into PaddlePaddle.
-  - Configure neural networks in PaddlePaddle layer by layer.
-  - Train models.
-  - Perform inference with trained models.
-## Install
-To get started, please install PaddlePaddle on your computer. Throughout this tutorial, you will learn by implementing different DL models for text classification.
-To install PaddlePaddle, please follow the instructions here: <a href = "../../getstarted/build_and_install/index_en.html" >Build and Install</a>.
-## Overview
-For the first step, you will use PaddlePaddle to build a **text classification** system. For example, suppose you run an e-commerce website, and you want to analyze the sentiment of user reviews to evaluate product quality.
-For example, given the input
-```
-This monitor is fantastic.
-```
-Your classifier should output “positive”, since this text snippet shows that the user is satisfied with the product. Given this input:
-```
-The monitor breaks down two months after purchase.
-```
-the classifier should output “negative“.
-To build your text classification system, your code will need to perform five steps:
-<center> ![](./src/Pipeline_en.jpg) </center>
-  - Preprocess data into a standardized format.
-  - Provide data to the learning model.
-  - Specify the neural network structure.
-  - Train the model.
-  - Inference (make prediction on test examples).
-1. Preprocess data into standardized format
-    - In the text classification example, you will start with a text file with one training example per line. Each line contains category id (in machine learning, often denoted the target y), followed by the input text (often denoted x); these two elements are separated by a Tab. For example: ```positive [tab] This monitor is fantastic```. You will preprocess this raw data into a format that Paddle can use.
-2. Provide data to the learning model.
-    - You can write data providers in Python. For any required data preprocessing step, you can add the preprocessing code to the PyDataProvider Python file.
-    - In our text classification example, every word or character will be converted into an integer id, specified in a dictionary file. PyDataProvider performs a dictionary lookup to get the id.
-3. Specify neural network structure.  (From easy to hard, we provide 4 kinds of network configurations)
-    - A logistic regression model.
-    - A word embedding model.
-    - A convolutional neural network model.
-    - A sequential recurrent neural network model.
-    - You will also learn different learning algorithms.
-4. Training model.
-5. Inference.
-## Preprocess data into standardized format
-In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
-`demo/quick_start` in the [source code](https://github.com/PaddlePaddle/Paddle) provides script for downloading the preprocessed data as shown below. (If you want to process the raw data, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`).
-```bash
-cd demo/quick_start
-./data/get_data.sh
-```
-## Transfer Data to Model
-### Write Data Provider with Python
-The following `dataprovider_bow.py` gives a complete example of writing data provider with Python. It includes the following parts:
-* initializer: defines the additional meta-data of the data provider and the types of the input data.
-* process: Each `yield` returns a data sample. In this case, it returns the text representation and category id. The order of features in the returned result needs to be consistent with the definition of the input types in `initializer`.
-```python
-from paddle.trainer.PyDataProvider2 import *
-# id of the word not in dictionary
-UNK_IDX = 0
-# initializer is called by the framework during initialization.
-# It allows the user to describe the data types and setup the
-# necessary data structure for later use.
-# `settings` is an object. initializer need to properly fill settings.input_types.
-# initializer can also store other data structures needed to be used at process().
-# In this example, dictionary is stored in settings.
-# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-    # setting.input_types specifies what the data types the data provider
-    # generates.
-    settings.input_types = [
-        # The first input is a sparse_binary_vector,
-        # which means each dimension of the vector is either 0 or 1. It is the
-        # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
-        # The second input is an integer. It represents the category id of the
-        # sample. 2 means there are two labels in the dataset.
-        # (1 for positive and 0 for negative)
-        integer_value(2)]
-# Delaring a data provider. It has an initializer 'data_initialzer'.
-# It will cache the generated data of the first pass in memory, so that
-# during later pass, no on-the-fly data generation will be needed.
-# `setting` is the same object used by initializer()
-# `file_name` is the name of a file listed train_list or test_list file given
-# to define_py_data_sources2(). See trainer_config.lr.py.
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    # Open the input data file.
-    with open(file_name, 'r') as f:
-        # Read each line.
-        for line in f:
-            # Each line contains the label and text of the comment, separated by \t.
-            label, comment = line.strip().split('\t')
-            # Split the words into a list.
-            words = comment.split()
-            # convert the words into a list of ids by looking them up in word_dict.
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            # Return the features for the current comment. The first is a list
-            # of ids representing a 0-1 binary sparse vector of the text,
-            # the second is the integer id of the label.
-            yield word_vector, int(label)
-```
-### Define Python Data Provider in Configuration files.
-You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies:
- The path of the training and testing data (`data/train.list`, `data/test.list`).
- The location of the data provider file (`dataprovider_bow`).
- The function to call to get data. (`process`).
- Additional arguments or data. Here it passes the path of word dictionary.
-```python
-from paddle.trainer_config_helpers import *
-dict_file = "data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-# define the data sources for the model.
-# We need to use different process for training and prediction.
-# For training, the input data includes both word IDs and labels.
-# For prediction, the input data only includes word IDs.
-define_py_data_sources2(train_list='data/train.list',
-                        test_list='data/test.list',
-                        module="dataprovider_bow",
-                        obj="process",
-                        args={"dictionary": word_dict})
-```
-You can refer to the following link for more detailed examples and data formats: <a href = "../../api/v1/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
-## Network Architecture
-We will describe four kinds of network architectures in this section.
-<center> ![](./src/PipelineNetwork_en.jpg) </center>
-First, you will build a logistic regression model. Later, you will also get a chance to build other more powerful network architectures.
-For more detailed documentation, you could refer to: <a href = "../../api/v1/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
-### Logistic Regression
-The architecture is illustrated in the following picture:
-<center> ![](./src/NetLR_en.png) </center>
- You need to define the data for text features. The size of the data layer is the number of words in the dictionary.
-```python
-word = data_layer(name="word",  size=voc_dim)
-```
- You also need to define the category id for each example. The size of the data layer is the number of labels.
-```python
-label = data_layer(name="label", size=label_dim)
-```
- It uses a logistic regression model to classify the vector, and it will output the classification error during training.
-    - Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
-    - *size* for each layer means the number of neurons of the layer.
-    - *act_type* means activation function applied to the output of each neuron independently.
-    - Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
-```python
-# Define a fully connected layer with logistic activation (also called softmax activation).
-output = fc_layer(input=word,
-                  size=label_dim,
-                  act_type=SoftmaxActivation())
-# Define cross-entropy classification loss and error.
-classification_cost(input=output, label=label)
-```
-Performance summary: You can refer to the training and testing scripts later. In order to compare different network architectures, the model complexity and test classification error are listed in the following table:
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<thead>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">Logistic regression</td>
-<td class="left">252 KB</td>
-<td class="left">8.652%</td>
-</tr>
-</tbody>
-</table></center>
-</html>
-<br>
-### Word Embedding Model
-In order to use the word embedding model, you need to change the data provider a little bit so that the input words are represented as a sequence of word IDs. The revised data provider `dataprovider_emb.py` is listed below. You only need to change initializer() for the type of the first input. It is changed from sparse_binary_vector to a sequence of integers. process() remains the same. This data provider can also be used for later sequence models.
-```python
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-    settings.input_types = [
-        # Define the type of the first input as a sequence of integers.
-        integer_value_sequence(len(dictionary)),
-        # Define the second input for label id
-        integer_value(2)]
-@provider(init_hook=initializer)
-def process(settings, file_name):
-    ...
-    # omitted, it is same as the data provider for LR model
-```
-This model is very similar to the framework of logistic regression, but it uses word embedding vectors instead of sparse vectors to represent words.
-<center> ![](./src/NetContinuous_en.png) </center>
- It can look up the dense word embedding vector in the dictionary (the dimension of each word embedding vector is `word_dim`). The input is a sequence of N words, and the output is N word_dim-dimensional vectors.
-```python
-emb = embedding_layer(input=word, dim=word_dim)
-```
- It averages all the word embedding in a sentence to get its sentence representation.
-```python
-avg = pooling_layer(input=emb, pooling_type=AvgPooling())
-```
-The other parts of the model are the same as logistic regression network.
-The performance is summarized in the following table:
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<thead>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">Word embedding model</td>
-<td class="left">15 MB</td>
-<td class="left">8.484%</td>
-</tr>
-</tbody>
-</table></center>
-</html>
-<br>
-### Convolutional Neural Network Model
-Convolutional neural network converts a sequence of word embeddings into a sentence representation using temporal convolutions. You will transform the fully connected layer of the word embedding model to 3 new sub-steps.
-<center> ![](./src/NetConv_en.png) </center>
-Text convolution has 3 steps:
-1. Get K nearest neighbor context of each word in a sentence, stack them into a 2D vector representation.
-2. Apply temporal convolution to this representation to produce a new hidden_dim dimensional vector.
-3. Apply max-pooling to the new vectors at all the time steps in a sentence to get a sentence representation.
-```python
-# context_len means convolution kernel size.
-# context_start means the start of the convolution. It can be negative. In that case, zero padding is applied.
-text_conv = sequence_conv_pool(input=emb,
-                               context_start=k,
-                               context_len=2 * k + 1)
-```
-The performance is summarized in the following table:
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<thead>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">Convolutional model</td>
-<td class="left">16 MB</td>
-<td class="left">5.628%</td>
-</tr>
-</tbody>
-</table></center>
-</html>
-<br>
-### Recurrent Model
-<center> ![](./src/NetRNN_en.png) </center>
-You can use a recurrent neural network as the time sequence model, including the simple RNN, GRU, and LSTM models.
- GRU model can be specified via:
-```python
-gru = simple_gru(input=emb, size=gru_size)
-```
- LSTM model can be specified via:
-```python
-lstm = simple_lstm(input=emb, size=lstm_size)
-```
-You can use single layer LSTM model with Dropout for our text classification problem. The performance is summarized in the following table:
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<thead>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">Recurrent model</td>
-<td class="left">16 MB</td>
-<td class="left">4.812%</td>
-</tr>
-</tbody>
-</table></center>
-</html>
-<br>
-## Optimization Algorithm
-<a href = "../../api/v1/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use the Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been shown to work very well for training recurrent neural networks.
-```python
-settings(batch_size=128,
-         learning_rate=2e-3,
-         learning_method=AdamOptimizer(),
-         regularization=L2Regularization(8e-4),
-         gradient_clipping_threshold=25)
-```
-## Training Model
-After completing data preparation and network architecture specification, you will run the training script.
-<center> ![](./src/PipelineTrain_en.png) </center>
-Training script: our training script is in `train.sh` file. The training arguments are listed below:
-```bash
-paddle train \
--config=trainer_config.py \
--log_period=20 \
--save_dir=./output \
--num_passes=15 \
--use_gpu=false
-```
-We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/usage/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.
-## Inference
-You can use the trained model to perform prediction on the dataset with no labels. You can also evaluate the model on dataset with labels to obtain its test accuracy.
-<center> ![](./src/PipelineTest_en.png) </center>
-The test script is listed below. PaddlePaddle can evaluate a model on the data with labels specified in `test.list`.
-```bash
-paddle train \
--config=trainer_config.lstm.py \
--use_gpu=false \
--job=test \
--init_model_path=./output/pass-0000x
-```
-We will give an example of performing prediction using the Recurrent model on a dataset with no labels. You can refer to the <a href = "../../api/v1/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial, or other <a href = "../../tutorials/index_en.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
-Inference script (predict.sh):
-```bash
-model="output/pass-00003"
-paddle train \
-    --config=trainer_config.lstm.py \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=.
-mv rank-00000 result.txt
-```
-Users can choose the best model based on the training log instead of the model `output/pass-00003`. There are several differences between training and inference network configurations.
- You do not need labels during inference.
- Outputs need to be specified to the classification probability layer (the output of softmax layer), or the id of maximum probability (`max_id` layer). An example to output the id and probability is given in the code snippet.
- batch_size = 1.
- You need to specify the location of `test_list` in the test data.
-The results in `result.txt` are as follows; each line is one sample.
-```
-predicted_label_id;probability_of_label_0 probability_of_label_1  # the first sample
-predicted_label_id;probability_of_label_0 probability_of_label_1  # the second sample
-```
-```python
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-obj = 'process' if not is_predict else 'process_pre'
-batch_size = 128 if not is_predict else 1
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid,output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
-```
-## Summary
-The scripts for data downloading, network configurations, and training scripts are in `/demo/quick_start`. The following table summarizes the performance of our network architecture on Amazon-Elec dataset (25k):
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<thead>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Error rate</th>
-<th scope="col" class="left">Configuration file name</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td class="left">Logistic regression model(BOW)</td>
-<td class="left"> 252KB </td>
-<td class="left">8.652%</td>
-<td class="left">trainer_config.lr.py</td>
-</tr>
-<tr>
-<td class="left">Word embedding</td>
-<td class="left"> 15MB </td>
-<td class="left"> 8.484%</td>
-<td class="left">trainer_config.emb.py</td>
-</tr>
-<tr>
-<td class="left">Convolution model</td>
-<td class="left"> 16MB </td>
-<td class="left"> 5.628%</td>
-<td class="left">trainer_config.cnn.py</td>
-</tr>
-<tr>
-<td class="left">Time sequence model</td>
-<td class="left"> 16MB </td>
-<td class="left"> 4.812%</td>
-<td class="left">trainer_config.lstm.py</td>
-</tr>
-</tbody>
-</table>
-</center>
-<br>
-## Appendix
-### Command Line Argument
-* \--config: network architecture path.
-* \--save_dir: model save directory.
-* \--log_period: the logging period per batch.
-* \--num_passes: number of training passes. One pass means the training would go over the whole training dataset once.
-* \--config_args: other configuration arguments.
-* \--init_model_path: the path of the initial model parameter.
-By default, the trainer will save the model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in the <a href = "../../howto/usage/cmd_parameter/index_en.html">command line argument documentation</a>.
-### Log
-```
-TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
-```
-During model training, you will see log lines like the example above. Each field is explained in the following table:
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<thead>
-<th scope="col" class="left">Name</th>
-<th scope="col" class="left">Explanation</th>
-</tr>
-</thead>
-<tr>
-<td class="left">Batch=20</td>
-<td class="left"> You have trained 20 batches. </td>
-</tr>
-<tr>
-<td class="left">samples=2560</td>
-<td class="left"> You have trained 2560 examples. </td>
-</tr>
-<tr>
-<td class="left">AvgCost</td>
-<td class="left"> The average cost from the first batch to the current batch. </td>
-</tr>
-<tr>
-<td class="left">CurrentCost</td>
-<td class="left"> the average cost of the last log_period batches </td>
-</tr>
-<tr>
-<td class="left">Eval: classification_error_evaluator</td>
-<td class="left"> The average classification error from the first batch to the current batch.</td>
-</tr>
-<tr>
-<td class="left">CurrentEval: classification_error_evaluator</td>
-<td class="left"> The average error rate of the last log_period batches </td>
-</tr>
-</tbody>
-</table>
-</center>
-<br>
--- a/doc/v1_api_tutorials/quick_start/src/NetContinuous_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/NetContinuous_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/NetContinuous_en.png
+++ b/doc/v1_api_tutorials/quick_start/src/NetContinuous_en.png
--- a/doc/v1_api_tutorials/quick_start/src/NetConv_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/NetConv_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/NetConv_en.png
+++ b/doc/v1_api_tutorials/quick_start/src/NetConv_en.png
--- a/doc/v1_api_tutorials/quick_start/src/NetLR_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/NetLR_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/NetLR_en.png
+++ b/doc/v1_api_tutorials/quick_start/src/NetLR_en.png
--- a/doc/v1_api_tutorials/quick_start/src/NetRNN_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/NetRNN_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/NetRNN_en.png
+++ b/doc/v1_api_tutorials/quick_start/src/NetRNN_en.png
--- a/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_en.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_en.jpg
--- a/doc/v1_api_tutorials/quick_start/src/PipelineTest_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/PipelineTest_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/PipelineTest_en.png
+++ b/doc/v1_api_tutorials/quick_start/src/PipelineTest_en.png
--- a/doc/v1_api_tutorials/quick_start/src/PipelineTrain_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/PipelineTrain_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/PipelineTrain_en.png
+++ b/doc/v1_api_tutorials/quick_start/src/PipelineTrain_en.png
--- a/doc/v1_api_tutorials/quick_start/src/Pipeline_cn.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/Pipeline_cn.jpg
--- a/doc/v1_api_tutorials/quick_start/src/Pipeline_en.jpg
+++ b/doc/v1_api_tutorials/quick_start/src/Pipeline_en.jpg
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -33,8 +33,13 @@ cc_library(scope SRCS scope.cc DEPS glog threadpool)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
+nv_test(data_device_transform_test SRCS data_device_transform_test.cu
+        DEPS operator op_registry init math_function)
 cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
 cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
+cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
        framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
@@ -82,5 +87,3 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry init math_function)
--- a/paddle/framework/data_device_transform_test.cu
+++ b/paddle/framework/data_device_transform_test.cu
@@ -150,6 +150,7 @@ TEST(Operator, CPUtoGPU) {
  // get output
  auto* output2 = scope.Var("OUT2");
  gpu_op->Run(scope, cuda_place);
+  VLOG(3) << "after gpu_op run";
  // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
  DeviceContextPool& pool = DeviceContextPool::Instance();

--- a/paddle/framework/data_layout_transform.cc
+++ b/paddle/framework/data_layout_transform.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,12 +14,23 @@ limitations under the License. */
 #include "paddle/framework/data_layout_transform.h"
-#include "paddle/framework/tensor.h"
 #include "paddle/operators/math/math_function.h"
 namespace paddle {
 namespace framework {
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
+  PADDLE_ENFORCE_NE(from, to,
+                    "layout transform should transform different layout");
+  if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) {
+    return {0, 2, 3, 1};
+  } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) {
+    return {0, 3, 1, 2};
+  } else {
+    PADDLE_THROW("unsupported transform");
+  }
+}
 struct CastDataLayout {
  CastDataLayout(const platform::DeviceContext* ctx,
                 const std::vector<int>& axis, const framework::Tensor& in,
@@ -44,38 +55,36 @@ struct CastDataLayout {
  }
 };
-void TransDataLayout(const std::vector<int>& axis,
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
-                     const platform::DeviceContext* ctx,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
-                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Tensor* out) {
-                     Variable* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
  PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_pair.first.place_,
+      platform::places_are_same_class(kernel_type_for_var.place_,
-                                      kernel_pair.second.place_),
+                                      expected_kernel_type.place_),
      "TransDataLayout only support DataLayout transform on same place!");
-  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
-                 "TransDataLayout only support Datatype are same!");
-  auto src = in.Get<Tensor>();
+  PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!");
-  auto* dst = out->GetMutable<Tensor>();
-  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
+  auto& pool = platform::DeviceContextPool::Instance();
-  auto src_dim = src.dims();
+  auto src_dim = in.dims();
  std::vector<int64_t> dst_dim;
+  auto axis = GetAxis(kernel_type_for_var.data_layout_,
+                      expected_kernel_type.data_layout_);
  dst_dim.resize(axis.size());
  for (size_t i = 0; i < axis.size(); i++) {
    dst_dim[i] = src_dim[axis[i]];
  }
-  dst->Resize(make_ddim(dst_dim));
+  out->Resize(make_ddim(dst_dim));
-  auto place = kernel_pair.second.place_;
+  out->mutable_data(expected_kernel_type.place_, in.type());
-  dst->mutable_data(place, src.type());
-  auto src_type = kernel_pair.first.data_type_;
+  framework::VisitDataType(
-  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
+      framework::ToDataType(in.type()),
+      CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out));
-  dst->set_layout(kernel_pair.second.data_layout_);
+  out->set_layout(expected_kernel_type.data_layout_);
 }
 }  // namespace framework

--- a/paddle/framework/data_layout_transform.h
+++ b/paddle/framework/data_layout_transform.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,17 +15,17 @@ limitations under the License. */
 #pragma once
 #include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
 namespace paddle {
 namespace framework {
-using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
-void TransDataLayout(const std::vector<int>& axis,
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
-                     const platform::DeviceContext* ctx,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
-                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Tensor* out);
-                     Variable* out);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/data_layout_transform_test.cc
+++ b/paddle/framework/data_layout_transform_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/data_layout_transform.h"
+#include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+TEST(DataTransform, DataLayoutFunction) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  auto place = CPUPlace();
+  Tensor in = Tensor();
+  Tensor out = Tensor();
+  in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
+  in.set_layout(DataLayout::kNHWC);
+  auto kernel_nhwc = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNHWC, LibraryType::kPlain);
+  auto kernel_ncwh = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNCHW, LibraryType::kPlain);
+  TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
+  EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
+  EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
+  TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
+  EXPECT_TRUE(in.layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2}));
+}
\ No newline at end of file
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -15,18 +15,43 @@ limitations under the License. */
 #include "paddle/framework/data_transform.h"
 #include "paddle/framework/data_device_transform.h"
+#include "paddle/framework/data_layout_transform.h"
 namespace paddle {
 namespace framework {
+static void PassTensorData(Tensor* from, Tensor* to) {
+  to->ShareDataWith(*from);
+  *from = Tensor();
+}
 void DataTransform(const OpKernelType& expected_kernel_type,
                   const OpKernelType& kernel_type_for_var,
-                   const Tensor& input_tensor, Tensor* out) {
+                   const Tensor& input_tensor, Tensor* output_tensor) {
+  bool transformed = false;
+  Tensor in;
+  in.ShareDataWith(input_tensor);
+  Tensor out;
+  // do layout transform
+  if (NeedTransformLayout(expected_kernel_type.data_layout_,
+                          kernel_type_for_var.data_layout_)) {
+    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
+  }
+  // do device transform
  if (!platform::is_same_place(kernel_type_for_var.place_,
                               expected_kernel_type.place_)) {
-    DeviceTransform(input_tensor, expected_kernel_type.place_, out);
+    DeviceTransform(in, expected_kernel_type.place_, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
  }
-  PADDLE_ENFORCE_NOT_NULL(out, "out should not be null");
+  PADDLE_ENFORCE(transformed, "no transform is done, please check!");
+  // get output data
+  output_tensor->ShareDataWith(in);
 }
 void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,

--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -286,18 +286,18 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
  DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
-// TODO(tonyyang-svail): make this function support LoD
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
    const std::vector<platform::Place> places) const {
  check_memory_size();
-  PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
+  int batch_size =
-  size_t result_size = std::min(static_cast<size_t>(dims()[0]), places.size());
+      lod().empty() ? dims()[0] : static_cast<int>(lod()[0].size()) - 1;
-  size_t remainder = dims()[0] % places.size();
+  size_t result_size = std::min(static_cast<size_t>(batch_size), places.size());
+  size_t remainder = batch_size % places.size();
  std::vector<LoDTensor> results;
  results.reserve(result_size);
-  int step_width = static_cast<int>(dims()[0] / result_size);
+  int step_width = static_cast<int>(batch_size / result_size);
  for (size_t i = 0; i < result_size; ++i) {
    int begin = static_cast<int>(i * step_width);
    int end = static_cast<int>((i + 1) * step_width);
@@ -305,13 +305,28 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
      end += remainder;
    }
-    auto src = Slice(begin, end);
-    auto &dst_place = places[i];
    LoDTensor dst;
-    if (!(dst_place == place())) {
+    if (lod().empty()) {
+      auto src = Slice(begin, end);
+      auto &dst_place = places[i];
      framework::Copy(src, dst_place, &dst);
-    } else {  // It is no need to copy if src_place and dst_place are same.
+    } else {
-      dst.ShareDataWith(src);
+      auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
+      auto &offset = lod_and_offset.second;
+      auto src = Slice(offset.first, offset.second);
+      auto &dst_place = places[i];
+      framework::Copy(src, dst_place, &dst);
+      LoD my_lod;
+      for (auto &l : lod_and_offset.first) {
+        std::vector<size_t> v{0};
+        for (auto &ll : l) {
+          v.push_back(ll + v.back());
+        }
+        my_lod.emplace_back(v);
+      }
+      dst.set_lod(my_lod);
    }
    results.emplace_back(dst);
  }
@@ -319,29 +334,38 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
  return results;
 }
-// TODO(tonyyang-svail): make this function support LoD
 void LoDTensor::MergeLoDTensor(
    const std::vector<const LoDTensor *> &lod_tensors,
    platform::Place dst_place) {
  PADDLE_ENFORCE(!lod_tensors.empty());
  framework::DDim new_dim = lod_tensors[0]->dims();
  std::type_index new_type = lod_tensors[0]->type();
-  auto new_layout = lod_tensors[0]->layout();
+  framework::DataLayout new_layout = lod_tensors[0]->layout();
-  int64_t new_height = 0;
+  LoD new_lod = lod_tensors[0]->lod();
-  for (auto *lod : lod_tensors) {
+  for (size_t i = 1; i < lod_tensors.size(); ++i) {
-    new_height += lod->dims()[0];
+    auto *t = lod_tensors[i];
-    for (int i = 1; i < new_dim.size(); ++i) {
+    PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code());
-      PADDLE_ENFORCE_EQ(new_dim[i], lod->dims()[i]);
+    PADDLE_ENFORCE_EQ(new_layout, t->layout());
+    PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0],
+                      framework::product(t->dims()) / t->dims()[0]);
+    new_dim[0] += t->dims()[0];
+    auto &lod = t->lod();
+    for (size_t j = 0; j < lod.size(); ++j) {
+      auto &sub_lod = new_lod[j];
+      auto &offset = sub_lod.back();
+      for (size_t k = 1; k < lod[j].size(); ++k) {
+        sub_lod.push_back(lod[j][k] + offset);
+      }
    }
-    PADDLE_ENFORCE_EQ(new_type, lod->type());
-    PADDLE_ENFORCE_EQ(new_layout, lod->layout());
  }
-  new_dim[0] = new_height;
  Resize(new_dim);
  set_layout(new_layout);
+  set_lod(new_lod);
  mutable_data(dst_place, new_type);
  int begin = 0;
  for (auto *src : lod_tensors) {
    int end = begin + src->dims()[0];

--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -100,6 +100,71 @@ TEST(LoD, ToAbsOffset) {
  EXPECT_EQ(abs_lod, expected);
 }
+TEST(LoD, SplitLoDTensor) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
+  platform::CPUPlace place;
+  LoDTensor lod_tensor;
+  lod_tensor.Resize({20, 1});
+  float* dst_ptr = lod_tensor.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+  lod_tensor.set_lod(lod);
+  std::vector<platform::Place> places{platform::CPUPlace(),
+                                      platform::CPUPlace()};
+  LoD lod0;
+  lod0.push_back(std::vector<size_t>({0, 2, 4}));
+  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
+  LoD lod1;
+  lod1.push_back(std::vector<size_t>({0, 1, 2}));
+  lod1.push_back(std::vector<size_t>({0, 2, 7}));
+  auto lods = lod_tensor.SplitLoDTensor(places);
+  EXPECT_EQ(lods[0].lod(), lod0);
+  EXPECT_EQ(lods[1].lod(), lod1);
+}
+TEST(LoD, MergeLoDTensor) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
+  platform::CPUPlace place;
+  LoDTensor lod_tensor0;
+  LoD lod0;
+  lod0.push_back(std::vector<size_t>({0, 2, 4}));
+  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
+  lod_tensor0.set_lod(lod0);
+  lod_tensor0.Resize({13, 1});
+  float* dst_ptr = lod_tensor0.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor0.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+  LoDTensor lod_tensor1;
+  LoD lod1;
+  lod1.push_back(std::vector<size_t>({0, 1, 2}));
+  lod1.push_back(std::vector<size_t>({0, 2, 7}));
+  lod_tensor1.set_lod(lod1);
+  lod_tensor1.Resize({7, 1});
+  dst_ptr = lod_tensor1.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor1.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+  std::vector<const LoDTensor*> lods{&lod_tensor0, &lod_tensor1};
+  LoDTensor lod_tensor;
+  lod_tensor.MergeLoDTensor(lods, place);
+  EXPECT_EQ(lod_tensor.lod(), lod);
+}
 TEST(LoD, CheckLoD) {
  LoD relative_lod;
  relative_lod.push_back(std::vector<size_t>({0, 2}));

--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
@@ -85,9 +85,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
  return stream.str();
 }
+// Returns true only when both layouts are concrete (neither is kAnyLayout)
+// and they differ from each other.
+inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
+  if (l == DataLayout::kAnyLayout || r == DataLayout::kAnyLayout) {
+    return false;
+  }
+  return l != r;
+}
 inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
  return (!platform::places_are_same_class(l.place_, r.place_)) ||
-         (l.data_type_ != r.data_type_) || (l.data_layout_ != r.data_layout_);
+         (l.data_type_ != r.data_type_) ||
+         NeedTransformLayout(l.data_layout_, r.data_layout_);
 }
 }  // namespace framework

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -485,9 +485,15 @@ void OperatorWithKernel::Run(const Scope& scope,
  // }
  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  auto kernel_iter = kernels.find(expected_kernel_key);
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("op %s does not have kernel for %s", type_,
+                 KernelTypeToString(expected_kernel_key));
+  }
+  // do data transform
  Scope& new_scope = scope.NewScope();
  for (auto& var_name_item : this->Inputs()) {
@@ -520,8 +526,6 @@ void OperatorWithKernel::Run(const Scope& scope,
    }
  }
-  auto kernel_iter = kernels.find(expected_kernel_key);
  auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
  kernel_iter->second->Compute(
      ExecutionContext(*this, new_scope, *new_dev_ctx));

--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -8,27 +8,6 @@ cc_library(paddle_fluid_api
 # Merge all modules into a simgle static library
 cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES})
-# ptools
-# just for testing, we may need to change the storing format for inference_model
-# and move the dependent of pickle.
-# download from http://www.picklingtools.com/
-# build in the C++ sub-directory, using command
-#     make -f Makefile.Linux libptools.so
-set(PTOOLS_LIB)
-set(PTOOLS_ROOT $ENV{PTOOLS_ROOT} CACHE PATH "Folder contains PicklingTools")
-find_path(PTOOLS_INC_DIR chooseser.h PATHS ${PTOOLS_ROOT}/C++)
-find_library(PTOOLS_SHARED_LIB NAMES ptools PATHS ${PTOOLS_ROOT}/C++)
-if(PTOOLS_INC_DIR AND PTOOLS_SHARED_LIB)
-  add_definitions(-DPADDLE_USE_PTOOLS)
-  set(PTOOLS_LIB ptools)
-  message(STATUS "Found PicklingTools: ${PTOOLS_SHARED_LIB}")
-  add_library(${PTOOLS_LIB} SHARED IMPORTED GLOBAL)
-  set_property(TARGET ${PTOOLS_LIB} PROPERTY IMPORTED_LOCATION ${PTOOLS_SHARED_LIB})
-  include_directories(${PTOOLS_ROOT}/C++)
-  include_directories(${PTOOLS_ROOT}/C++/opencontainers_1_8_5/include)
-  add_definitions(-DOC_NEW_STYLE_INCLUDES) # used in ptools
-endif()
 add_executable(example example.cc)
 if(APPLE)
  set(OPTIONAL_LINK_FLAGS)

--- a/paddle/inference/example.cc
+++ b/paddle/inference/example.cc
@@ -18,33 +18,21 @@ limitations under the License. */
 #include "paddle/inference/inference.h"
 DEFINE_string(dirname, "", "Directory of the inference model.");
-DEFINE_string(feed_var_names, "", "Names of feeding variables");
-DEFINE_string(fetch_var_names, "", "Names of fetching variables");
 int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_dirname.empty() || FLAGS_feed_var_names.empty() ||
+  if (FLAGS_dirname.empty()) {
-      FLAGS_fetch_var_names.empty()) {
    // Example:
    //   ./example --dirname=recognize_digits_mlp.inference.model
-    //             --feed_var_names="x"
+    std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
-    //             --fetch_var_names="fc_2.tmp_2"
-    std::cout << "Usage: ./example --dirname=path/to/your/model "
-                 "--feed_var_names=x --fetch_var_names=y"
-              << std::endl;
    exit(1);
  }
  std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::cout << "FLAGS_feed_var_names: " << FLAGS_feed_var_names << std::endl;
-  std::cout << "FLAGS_fetch_var_names: " << FLAGS_fetch_var_names << std::endl;
  std::string dirname = FLAGS_dirname;
-  std::vector<std::string> feed_var_names = {FLAGS_feed_var_names};
-  std::vector<std::string> fetch_var_names = {FLAGS_fetch_var_names};
  paddle::InferenceEngine* engine = new paddle::InferenceEngine();
-  engine->LoadInferenceModel(dirname, feed_var_names, fetch_var_names);
+  engine->LoadInferenceModel(dirname);
  paddle::framework::LoDTensor input;
  srand(time(0));

--- a/paddle/inference/inference.cc
+++ b/paddle/inference/inference.cc
@@ -25,19 +25,37 @@ limitations under the License. */
 namespace paddle {
+// Loads the serialized ProgramDesc from `dirname`/__model__.dat, generates
+// the load program for `dirname`, and collects the feed/fetch variable names
+// from the "feed"/"fetch" ops of block 0.
+void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
+  std::string model_filename = dirname + "/__model__.dat";
+  LOG(INFO) << "loading model from " << model_filename;
+  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
+  // Fail early with a readable message: on a stream that could not be opened
+  // tellg() returns -1, and the resize() below would request a bogus size.
+  if (!inputfs.is_open()) {
+    LOG(FATAL) << "Cannot open model file " << model_filename;
+  }
+  // Read the whole file into program_desc_str.
+  std::string program_desc_str;
+  inputfs.seekg(0, std::ios::end);
+  program_desc_str.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+  inputfs.read(&program_desc_str[0], program_desc_str.size());
+  inputfs.close();
+  program_ = new framework::ProgramDesc(program_desc_str);
+  GenerateLoadProgram(dirname);
+  framework::BlockDesc* global_block = program_->MutableBlock(0);
+  feed_var_names_.clear();
+  fetch_var_names_.clear();
+  for (auto* op : global_block->AllOps()) {
+    if (op->Type() == "feed") {
+      // Inserted at the front, so feed names end up in the reverse of the
+      // order in which the feed ops appear in the block.
+      feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
+    } else if (op->Type() == "fetch") {
+      fetch_var_names_.push_back(op->Input("X")[0]);
+    }
+  }
+}
 void InferenceEngine::LoadInferenceModel(
    const std::string& dirname,
    const std::vector<std::string>& feed_var_names,
    const std::vector<std::string>& fetch_var_names) {
-#ifdef PADDLE_USE_PTOOLS
-  std::string model_filename = dirname + "/__model__";
-  LOG(INFO) << "Using PicklingTools, loading model from " << model_filename;
-  Val v;
-  LoadValFromFile(model_filename.c_str(), v, SERIALIZE_P0);
-  std::string program_desc_str = v["program_desc_str"];
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-// PicklingTools cannot parse the vector of strings correctly.
-#else
  std::string model_filename = dirname + "/__model__.dat";
  LOG(INFO) << "loading model from " << model_filename;
  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
@@ -48,7 +66,7 @@ void InferenceEngine::LoadInferenceModel(
  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
  inputfs.read(&program_desc_str[0], program_desc_str.size());
  inputfs.close();
-#endif
  program_ = new framework::ProgramDesc(program_desc_str);
  GenerateLoadProgram(dirname);
@@ -62,7 +80,7 @@ void InferenceEngine::LoadInferenceModel(
 }
 bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
-  if (var->Persistable()) {
+  if (var->Persistable() && var->Name() != "feed" && var->Name() != "fetch") {
    // There are many unreachable variables in the program
    for (size_t i = 0; i < program_->Size(); ++i) {
      const framework::BlockDesc& block = program_->Block(i);

--- a/paddle/inference/inference.h
+++ b/paddle/inference/inference.h
@@ -28,6 +28,7 @@ public:
    delete load_program_;
  }
+  void LoadInferenceModel(const std::string& dirname);
  void LoadInferenceModel(const std::string& dirname,
                          const std::vector<std::string>& feed_var_names,
                          const std::vector<std::string>& fetch_var_names);

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -178,14 +178,13 @@ foreach(src ${GENERAL_OPS})
 endforeach()
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
+cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 if(WITH_GPU)
    cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)

--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -29,7 +29,7 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
  PruneEndidCandidates(pre_ids, &selected_items);
  // calculate the output tensor's height
  size_t num_instances = std::accumulate(
-      std::begin(items), std::end(items), 0,
+      std::begin(selected_items), std::end(selected_items), 0,
      [](size_t a, std::vector<Item> &b) { return a + b.size(); });
  // the output tensor shape should be [num_instances, 1]
  auto dims = framework::make_ddim(
@@ -48,12 +48,20 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
  size_t low_offset = 0;
  for (auto &items : selected_items) {
    low_level.push_back(low_offset);
+    // Order by offset first and by id second. The id is compared only when
+    // the offsets are equal, which keeps the comparator a strict weak
+    // ordering as required by sort.
+    sort(items.begin(), items.end(), [](const Item &a, const Item &b) {
+      if (a.offset != b.offset) {
+        return a.offset < b.offset;
+      }
+      return a.id < b.id;
+    });
    for (auto &item : items) {
      ids_data[low_offset] = item.id;
      scores_data[low_offset] = item.score;
      low_offset++;
    }
  }
+  low_level.push_back(low_offset);
  // fill lod
  auto abs_lod = framework::ToAbsOffset(ids_->lod());
  auto &high_level = abs_lod[lod_level_];
@@ -64,16 +72,21 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
  selected_scores->set_lod(lod);
 }
-void BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
+// Clears the candidate list at every offset whose prefix id in `pre_ids`
+// equals end_id_, and returns the number of offsets that were not cleared.
+int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
-                                      std::vector<std::vector<Item>> *items) {
+                                     std::vector<std::vector<Item>> *items) {
  auto *pre_ids_data = pre_ids.data<int64_t>();
+  // counts the offsets whose prefix id differs from end_id_
+  int res = 0;
  for (size_t offset = 0; offset < items->size(); offset++) {
    auto prefix_id = pre_ids_data[offset];
    if (prefix_id == end_id_) {
      items->at(offset).clear();
+    } else {
+      res++;
    }
  }
+  return res;
 }
 std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
@@ -121,11 +134,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
  auto ids = *ids_;
  auto scores = *scores_;
-  auto source_abs_two_level_lod = framework::SliceInLevel(
-      ids.lod(), lod_level_, sent_offset_, sent_offset_ + 1);
-  source_abs_two_level_lod = framework::ToAbsOffset(source_abs_two_level_lod);
  auto abs_lod = framework::ToAbsOffset(ids.lod());
-  PADDLE_ENFORCE_GE(source_abs_two_level_lod.size(), 2UL);
  auto *ids_data = ids.data<int64_t>();
  auto *scores_data = scores.data<float>();

--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@@ -73,7 +73,15 @@ namespace operators {
 * second level:
 * [0, 2, 4]
 *
- * tensor's data
+ * id tensor's data
+ * [[
+ * 4,
+ * 1,
+ * 3,
+ * 8,
+ * ]]
+ *
+ * score tensor's data
 * [[
 * 0.5,
 * 0.3,
@@ -137,16 +145,21 @@ class BeamSearch {
    Item() {}
    Item(size_t offset, size_t id, float score)
        : offset(offset), id(id), score(score) {}
-    // offset in the lod_level_+1
+    // offset in the higher lod level.
    size_t offset;
+    // // prefix id in the lower lod level.
+    // size_t prefix;
    // the candidate id
    id_t id;
    // the corresponding score
    score_t score;
  };
-  void PruneEndidCandidates(const framework::LoDTensor& pre_ids,
+  /*
-                            std::vector<std::vector<Item>>* items);
+   * Delete all the records that follow the end token.
+   * Returns the number of prefixes that are not the end token.
+   */
+  int PruneEndidCandidates(const framework::LoDTensor& pre_ids,
+                           std::vector<std::vector<Item>>* items);
  /*
   * Transform the items into a map whose key is offset, value is the items.

--- a/paddle/operators/beam_search_op_test.cc
+++ b/paddle/operators/beam_search_op_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/beam_search_op.h"
+#include <gtest/gtest.h>
+#include <vector>
+namespace paddle {
+namespace test {
+using std::vector;
+using framework::LoDTensor;
+using framework::LoD;
+using operators::BeamSearch;
+using paddle::platform::CPUPlace;
+using std::cout;
+using std::endl;
+// Builds the test fixture: `ids` and `scores` share one two-level LoD and a
+// [4, 3] shape, and are filled on the CPU from the literal tables below.
+void CreateInput(LoDTensor* ids, LoDTensor* scores) {
+  const vector<int64_t> id_values({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
+  const vector<float> score_values(
+      {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1});
+  LoD lod;
+  lod.push_back(vector<size_t>({0, 1, 4}));
+  lod.push_back(vector<size_t>({0, 1, 2, 3, 4}));
+  ids->set_lod(lod);
+  scores->set_lod(lod);
+  const auto shape = framework::make_ddim(vector<int64_t>({4, 3}));
+  ids->Resize(shape);
+  scores->Resize(shape);
+  CPUPlace cpu;
+  int64_t* ids_out = ids->mutable_data<int64_t>(cpu);
+  float* scores_out = scores->mutable_data<float>(cpu);
+  // Both tables hold 12 entries, one per tensor element.
+  for (size_t k = 0; k < id_values.size(); ++k) {
+    ids_out[k] = id_values[k];
+    scores_out[k] = score_values[k];
+  }
+}
+// Runs BeamSearch on the fixture from CreateInput and checks the selected
+// ids and scores against the expected values.
+TEST(beam_search_op, run) {
+  CPUPlace place;
+  LoDTensor ids, scores;
+  CreateInput(&ids, &scores);
+  // pre_ids holds four prefix ids, so its shape must be [4, 1].
+  // vector<int64_t>(4, 1) would instead build four 1s, i.e. shape
+  // [1, 1, 1, 1] with a single element, and the loop below would write past
+  // the end of the buffer; the braces select the element list.
+  LoDTensor pre_ids;
+  pre_ids.Resize(framework::make_ddim(vector<int64_t>({4, 1})));
+  auto* pre_ids_data = pre_ids.mutable_data<int64_t>(place);
+  for (int i = 0; i < 4; i++) {
+    pre_ids_data[i] = i + 1;
+  }
+  BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0);
+  LoDTensor sids, sscores;
+  beamsearch(pre_ids, &sids, &sscores);
+  LOG(INFO) << "score: " << sscores << endl;
+  // Selected ids and scores must share the same LoD.
+  ASSERT_EQ(sids.lod(), sscores.lod());
+  vector<int> tids({2, 4, 3, 8});
+  vector<float> tscores({0.3, 0.5, 0.9, 0.7});
+  for (int i = 0; i < 4; i++) {
+    ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
+    ASSERT_EQ(tscores[i], sscores.data<float>()[i]);
+  }
+}
+}  // namespace test
+}  // namespace paddle
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -70,6 +70,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
  framework::LibraryType library_;
  if (use_cudnn) {
    library_ = framework::LibraryType::kCUDNN;
@@ -283,6 +290,14 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
  framework::LibraryType library_;
  if (use_cudnn) {
    library_ = framework::LibraryType::kCUDNN;

--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -61,6 +61,13 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
  framework::LibraryType library_;
  if (use_cudnn) {
    library_ = framework::LibraryType::kCUDNN;
@@ -263,6 +270,13 @@ void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
  framework::LibraryType library_;
  if (use_cudnn) {
    library_ = framework::LibraryType::kCUDNN;

--- a/paddle/operators/ctc_align_op.cc
+++ b/paddle/operators/ctc_align_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/ctc_align_op.h"
+namespace paddle {
+namespace operators {
+// Operator definition of ctc_align: validates that "Input" and "Output" are
+// present and picks the kernel type from the input tensor's data type.
+class CTCAlignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires "Input" and "Output" to exist and gives "Output" the same dims
+  // as "Input".
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input of CTCAlignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output of CTCAlignOp should not be null.");
+    auto input_dims = ctx->GetInputDim("Input");
+    // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
+    ctx->SetOutputDim("Output", input_dims);
+  }
+ protected:
+  // The kernel is selected by the data type of "Input" and the device
+  // context of the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+// Declares the proto of ctc_align: one input ("Input"), one output
+// ("Output"), and the attributes "blank" (default 0) and "merge_repeated"
+// (default true), followed by the operator's documentation text.
+class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LodTensor, default: LoDTensor<int>), Its shape is "
+             "[Lp, 1], where Lp is the sum of all input sequences' length.");
+    AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
+    AddAttr<int>("blank",
+                 "(int, default: 0), the blank label setted in Connectionist "
+                 "Temporal Classification (CTC) op.")
+        .SetDefault(0);
+    AddAttr<bool>("merge_repeated",
+                  "(bool, default: true), whether to "
+                  "merge repeated elements between two blanks. ")
+        .SetDefault(true);
+    // The text below is the operator documentation, including a worked
+    // example of the alignment.
+    AddComment(R"DOC(
+CTCAlign op is used to merge repeated elements between two blanks
+and then delete all blanks in sequence.
+Given:
+    Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
+                  6, 0, 0, 7, 7, 7, 0]
+    Input.dims = {18, 1}
+    Input.LoD = [[0, 11, 18]]
+And:
+    blank = 0
+    merge_repeated = True
+Then:
+    Output.data = [1, 2, 4, 4, 5, 6,
+                   6, 7]
+    Output.dims = {8, 1}
+    Output.LoD = [[0, 6, 8]]
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ctc_align, ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <stdio.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/operators/ctc_align_op.h"
+namespace paddle {
+namespace operators {
+// Walks every sequence described by lod0, copies into `output` the tokens
+// that are not `blank` and (when merge_repeated is non-zero) do not repeat
+// the previous token of the same sequence, and records the resulting
+// sequence boundaries in out_lod0.
+// The body does not use thread indices, so it does the whole job serially;
+// the caller in this file launches it with a single thread.
+//   num_token      - not used by the kernel body
+//   tokens         - input tokens, indexed by the offsets in lod0
+//   num_seq        - number of sequences; lod0 and out_lod0 are accessed at
+//                    indices 0..num_seq
+//   blank          - token value to drop
+//   merge_repeated - non-zero to drop consecutive repeats
+template <typename T>
+__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
+                                      const size_t num_seq, size_t* lod0,
+                                      const int blank, const int merge_repeated,
+                                      size_t* out_lod0, T* output) {
+  // Offsets are size_t, so index with size_t instead of narrowing to int.
+  size_t output_idx = 0;
+  out_lod0[0] = 0;
+  for (size_t i = 0; i < num_seq; ++i) {
+    // -1 stands for "no previous token" at the start of each sequence.
+    T pre_token = -1;
+    for (size_t j = lod0[i]; j < lod0[i + 1]; ++j) {
+      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
+        output[output_idx] = tokens[j];
+        ++output_idx;
+      }
+      pre_token = tokens[j];
+    }
+    out_lod0[i + 1] = output_idx;
+  }
+}
+// CUDA kernel of ctc_align: launches MergeAndDelCudaKernel on the input
+// tokens, then copies the produced offsets back to the host to set the
+// output LoD and shrink the output to the number of kept tokens.
+template <typename T>
+class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    const size_t level = 0;
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    auto input_lod = framework::ToAbsOffset(input->lod());
+    const T* tokens = input->data<T>();
+    const int64_t num_tokens = input->dims()[0];
+    // Same validation as the CPU kernel: the offsets handed to the device
+    // kernel must not reach past the input tokens.
+    PADDLE_ENFORCE_EQ(num_tokens,
+                      static_cast<int64_t>(input_lod[level].back()),
+                      "The first dimension of Input(Input) should be equal to "
+                      "the sum of all sequences' lengths.");
+    const size_t num_seq = input_lod[level].size() - 1;
+    const int blank = ctx.Attr<int>("blank");
+    const int merge_repeated =
+        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
+    // prepare a lod to record lod information while merging elements
+    thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
+    size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
+    // merge elements and delete blank; the output is first allocated with
+    // room for every input token and resized at the end
+    T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
+    auto stream = ctx.cuda_device_context().stream();
+    MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
+        num_tokens, tokens, num_seq, input_lod[level].data(), blank,
+        merge_repeated, dev_out_lod0_ptr, output_data);
+    // set output lod
+    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
+                                              dev_out_lod0.end());
+    framework::LoD out_lod;
+    out_lod.push_back(host_out_lod0);
+    output->set_lod(out_lod);
+    // resize output dims to the number of tokens that were kept
+    output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel<int>,
+                        paddle::operators::CTCAlignOpCUDAKernel<int64_t>);
--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/operators/ctc_align_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string.h>
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+// CPU kernel of ctc_align: for every level-0 sequence of "Input" it keeps the
+// tokens that are not `blank` and (when merge_repeated is set) do not repeat
+// the previous token of the same sequence, then sets the output LoD and
+// shrinks "Output" to the number of kept tokens.
+template <typename DeviceContext, typename T>
+class CTCAlignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    const size_t level = 0;
+    auto input_lod = framework::ToAbsOffset(input->lod());
+    const auto& offsets = input_lod[level];
+    // the input must hold exactly as many rows as the LoD describes
+    PADDLE_ENFORCE_EQ(input->dims()[0], static_cast<int64_t>(offsets.back()),
+                      "The first dimension of Input(Input) should be equal to "
+                      "the sum of all sequences' lengths.");
+    size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
+    bool merge_repeated = ctx.Attr<bool>("merge_repeated");
+    T* kept = output->mutable_data<T>(ctx.GetPlace());
+    const T* src = input->data<T>();
+    // kept_offsets[k] is the number of tokens written before sequence k
+    std::vector<size_t> kept_offsets(1, 0);
+    size_t written = 0;
+    for (size_t seq = 0; seq + 1 < offsets.size(); ++seq) {
+      // -1 stands for "no previous token" at the start of each sequence
+      T last_seen = -1;
+      const size_t seq_end = offsets[seq + 1];
+      for (size_t pos = offsets[seq]; pos < seq_end; ++pos) {
+        const T token = src[pos];
+        const bool repeated = merge_repeated && token == last_seen;
+        if (token != blank && !repeated) {
+          kept[written] = token;
+          ++written;
+        }
+        last_seen = token;
+      }
+      kept_offsets.push_back(written);
+    }
+    // publish the new sequence boundaries
+    framework::LoD output_lod;
+    output_lod.push_back(kept_offsets);
+    output->set_lod(output_lod);
+    // shrink the output to the tokens that were kept
+    output->Resize({static_cast<int64_t>(kept_offsets.back()), 1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/detail/grpc_client.cc
+++ b/paddle/operators/detail/grpc_client.cc
@@ -63,9 +63,6 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
  sendrecv::VariableMessage req;
  req.set_varname(var_name);
-  auto* var = scope.FindVar(var_name);
-  SerializeToMessage(var_name, var, ctx, &req);
  // varhandle
  VarHandle var_h;
  var_h.ep = ep;

--- a/paddle/operators/detail/grpc_server.cc
+++ b/paddle/operators/detail/grpc_server.cc
@@ -36,7 +36,10 @@ class RequestBase {
  CallStatus Status() { return status_; }
  void SetStatus(CallStatus status) { status_ = status; }
-  virtual std::string GetReqName() { assert(false); }
+  virtual std::string GetReqName() {
+    assert(false);
+    return "";
+  }
 protected:
  grpc::ServerContext ctx_;
@@ -80,11 +83,13 @@ class RequestGet final : public RequestBase {
 public:
  explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
                      grpc::ServerCompletionQueue* cq, framework::Scope* scope,
-                      const platform::DeviceContext* dev_ctx)
+                      const platform::DeviceContext* dev_ctx,
+                      SimpleBlockQueue<char>* queue)
      : RequestBase(service, cq),
        responder_(&ctx_),
        scope_(scope),
-        dev_ctx_(dev_ctx) {
+        dev_ctx_(dev_ctx),
+        queue_(queue) {
    service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
  }
@@ -100,6 +105,7 @@ class RequestGet final : public RequestBase {
    // TODO(gongwb): check var's info.
    responder_.Finish(reply_, grpc::Status::OK, this);
    status_ = FINISH;
+    queue_->Push('c');
  }
 protected:
@@ -108,8 +114,15 @@ class RequestGet final : public RequestBase {
  ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
  framework::Scope* scope_;
  const platform::DeviceContext* dev_ctx_;
+  SimpleBlockQueue<char>* queue_;
 };
+// Pops `count` entries from var_get_queue_, one Pop() call per entry.
+// RequestGet pushes one entry each time it finishes a reply.
+void AsyncGRPCServer::WaitClientGet(int count) {
+  while (count-- > 0) {
+    var_get_queue_.Pop();
+  }
+}
 void AsyncGRPCServer::RunSyncUpdate() {
  grpc::ServerBuilder builder;
  builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
@@ -149,7 +162,6 @@ void AsyncGRPCServer::ShutdownQueue() {
 }
 // This URL explains why shutdown is complicate:
-// https://stackoverflow.com/questions/35708348/grpc-what-is-the-recommended-way-to-shut-down-an-asynchronous-server-in-c
 void AsyncGRPCServer::ShutDown() {
  server_->Shutdown();
  ShutdownQueue();
@@ -170,10 +182,12 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
  if (is_shut_down_) {
    return;
  }
-  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_);
+  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
+                                   &var_get_queue_);
  VLOG(4) << "create Requestget status:" << get->Status();
 }
+// FIXME(typhoonzero): remove wait argument and change cq_name to enum.
 void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
                                    std::string cq_name,
                                    std::function<void()> TryToRegisterNewOne) {
@@ -188,9 +202,9 @@ void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
    }
    PADDLE_ENFORCE(tag);
-    if (wait && !done_) {
+    // FIXME(typhoonzero): de-couple the barriers with recv_op
-      Wait();
+    if (cq_name == "cq_get") WaitCond(1);
-    }
+    if (cq_name == "cq_send") WaitCond(0);
    RequestBase* base = (RequestBase*)tag;
    // reference:
@@ -222,22 +236,18 @@ void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
  }
 }
-void AsyncGRPCServer::Wait() {
+void AsyncGRPCServer::WaitCond(int cond) {
-  std::unique_lock<std::mutex> lock(this->mutex_);
+  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
-  condition_.wait(lock, [=] { return this->done_ == true; });
+  barrier_condition_.wait(lock,
-}
+                          [=] { return this->barrier_cond_step_ == cond; });
-void AsyncGRPCServer::Reset() {
-  std::lock_guard<std::mutex> lock(this->mutex_);
-  done_ = false;
 }
-void AsyncGRPCServer::Done() {
+void AsyncGRPCServer::SetCond(int cond) {
  {
-    std::lock_guard<std::mutex> lock(this->mutex_);
+    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
-    done_ = true;
+    barrier_cond_step_ = cond;
  }
-  condition_.notify_all();
+  barrier_condition_.notify_all();
 }
 }  // namespace detail

--- a/paddle/operators/detail/grpc_server.h
+++ b/paddle/operators/detail/grpc_server.h
@@ -41,9 +41,10 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
  void RunSyncUpdate();
-  void Reset();
+  // functions to sync server barrier status.
+  void WaitCond(int cond);
-  void Done();
+  void SetCond(int cond);
+  void WaitClientGet(int count);
  void SetScope(framework::Scope *scope) { scope_ = scope; }
@@ -56,7 +57,6 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
  void ShutDown();
 protected:
-  void Wait();
  void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq,
                     std::string cq_name,
                     std::function<void()> TryToRegisterNewOne);
@@ -78,11 +78,12 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
  const platform::DeviceContext *dev_ctx_;
  // received variable from RPC, operators fetch variable from this queue.
  SimpleBlockQueue<MessageWithName> var_recv_queue_;
+  SimpleBlockQueue<char> var_get_queue_;
  // condition of the sub program
-  std::mutex mutex_;
+  std::mutex barrier_mutex_;
-  volatile mutable bool done_;
+  mutable int barrier_cond_step_;
-  std::condition_variable condition_;
+  std::condition_variable barrier_condition_;
  std::unique_ptr<std::thread> t_send_;
  std::unique_ptr<std::thread> t_get_;

--- a/paddle/operators/edit_distance_op.cc
+++ b/paddle/operators/edit_distance_op.cc
@@ -49,10 +49,10 @@ class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Hyps",
-             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
             "The indices for hypothesis strings.");
    AddInput("Refs",
-             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
             "The indices for reference strings.");
    AddAttr<bool>("normalized",
                  "(bool, default false) Indicated whether to normalize "
@@ -66,22 +66,22 @@ class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
 EditDistance operator computes the edit distances between a batch of hypothesis
 strings and their references.
-Edit distance, also called Levenshtein distance, measures how dissimilar two strings 
+Edit distance, also called Levenshtein distance, measures how dissimilar two strings
-are by counting the minimum number of operations to transform one string into anthor. 
+are by counting the minimum number of operations to transform one string into anthor.
-Here the operations include insertion, deletion, and substitution. For example, 
+Here the operations include insertion, deletion, and substitution. For example,
-given hypothesis string A = "kitten" and reference B = "sitting", the edit distance 
+given hypothesis string A = "kitten" and reference B = "sitting", the edit distance
-is 3 for A will be transformed into B at least after two substitutions and one 
+is 3 for A will be transformed into B at least after two substitutions and one
 insertion:
   "kitten" -> "sitten" -> "sittin" -> "sitting"
-Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total 
+Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total
-number denoted by `batch_size`, and the separation is specified by the LoD information. 
+number denoted by `batch_size`, and the separation is specified by the LoD information.
-And the `batch_size` reference strings are arranged in order in the same way in the 
+And the `batch_size` reference strings are arranged in order in the same way in the
 LoDTensor Input(Refs).
-Output(Out) contains the `batch_size` results and each stands for the edit stance 
+Output(Out) contains the `batch_size` results and each stands for the edit stance
-for a pair of strings respectively. If Attr(normalized) is true, the edit distance 
+for a pair of strings respectively. If Attr(normalized) is true, the edit distance
 will be divided by the length of reference string.
 )DOC");
  }

--- a/paddle/operators/edit_distance_op.cu
+++ b/paddle/operators/edit_distance_op.cu
@@ -39,8 +39,8 @@ __global__ void FillFirstColumn(T* dist, const int M, const int N) {
 }
 template <typename T>
-__global__ void Levenshtein(T* dist, const int* x1, const int* x2, const int M,
+__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2,
-                            const int N, const int start) {
+                            const int M, const int N, const int start) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int offset = N;
  int index = start + idx * offset;
@@ -113,8 +113,8 @@ class EditDistanceGPUKernel : public framework::OpKernel<T> {
        dist_t.Resize({m + 1, n + 1});
        dist_t.mutable_data<T>(ctx.GetPlace());
        auto dist = dist_t.data<T>();
-        auto x1 = x1_t->data<int>() + hyp_lod[num];
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
-        auto x2 = x2_t->data<int>() + ref_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
        FillFirstColumn<T><<<1 + m / PADDLE_CUDA_NUM_THREADS,
                             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n);

--- a/paddle/operators/edit_distance_op.h
+++ b/paddle/operators/edit_distance_op.h
@@ -60,8 +60,8 @@ class EditDistanceKernel : public framework::OpKernel<T> {
        dist_t.Resize({m + 1, n + 1});
        dist_t.mutable_data<T>(ctx.GetPlace());
        auto dist = dist_t.data<T>();
-        auto x1 = x1_t->data<int>() + hyp_lod[num];
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
-        auto x2 = x2_t->data<int>() + ref_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
        for (int64_t i = 0; i < m + 1; ++i) {
          dist[i * (n + 1)] = i;
        }

--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -187,7 +187,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };
@@ -248,7 +248,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
        framework::ToDataType(
            ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
                ->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };

--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
    const size_t level = 0;
    const size_t seq_num = in_lod[level].size() - 1;
-    // These local variables hold the inputs and outputs, garanteeing them on
+    const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
-    // CPU memory, to provide a consistent reference.
+    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
-    // TODO(caoying) Fix this by moving all these local variables into the
+    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
-    // class's data members once we can profile the whole training process.
-    LoDTensor* emission_weights = nullptr;
+    Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
-    LoDTensor emission_weight_tensor;
+    Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
-    Tensor* transition_weights = nullptr;
+    Tensor* alpha = ctx.Output<Tensor>("Alpha");
-    Tensor transition_weight_tensor;
+    Tensor* ll = ctx.Output<Tensor>("LogLikelihood");
-    LoDTensor* label = nullptr;
-    LoDTensor label_tensor;
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor* ll = nullptr;
-    Tensor ll_tensor;
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      emission_weights = &emission_weight_tensor;
-      transition_weights = &transition_weight_tensor;
-      label = &label_tensor;
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Emission"),
-          *ctx.Input<Tensor>("Transition"), *ctx.Input<LoDTensor>("Label"),
-          emission_weights, transition_weights, label);
-      emission_exps = &emission_exps_tensor;
-      emission_exps->Resize(emission_weights->dims());
-      transition_exps = &transition_exps_tensor;
-      transition_exps->Resize(transition_weights->dims());
-      alpha = &alpha_tensor;
-      alpha->Resize(ctx.Output<Tensor>("Alpha")->dims());
-      ll = &ll_tensor;
-    } else {
-      emission_weights =
-          const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Emission"));
-      transition_weights = const_cast<Tensor*>(ctx.Input<Tensor>("Transition"));
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-      emission_exps = ctx.Output<Tensor>("EmissionExps");
-      transition_exps = ctx.Output<Tensor>("TransitionExps");
-      alpha = ctx.Output<Tensor>("Alpha");
-      ll = ctx.Output<Tensor>("LogLikelihood");
-    }
    // Because the computation code runs only on the CPU, the memory for all
    // the outputs is FIXED to be allocated in CPU memory.
@@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
          one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
          *transition_exps, one_seq_label, &one_seq_alpha);
    }
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), *emission_exps, *transition_exps, *alpha, *ll,
-          ctx.Output<Tensor>("EmissionExps"),
-          ctx.Output<Tensor>("TransitionExps"), ctx.Output<Tensor>("Alpha"),
-          ctx.Output<Tensor>("LogLikelihood"));
-    }
  };
 private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& emission_weights_src,
-                             const Tensor& transition_weights_src,
-                             const LoDTensor& label_src,
-                             LoDTensor* emission_weights_dst,
-                             Tensor* transition_weights_dst,
-                             LoDTensor* label_dst) const {
-    // Copy the inputs from GPU memory to CPU memory if this operators runs on
-    // GPU device.
-    auto copyLoDTensor = [](const platform::DeviceContext& ctx,
-                            const LoDTensor& src, LoDTensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-    copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
-    copyLoDTensor(ctx, label_src, label_dst);
-    transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
-                                            platform::CPUPlace());
-    framework::Copy(transition_weights_src, platform::CPUPlace(), ctx,
-                    transition_weights_dst);
-  }
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor& emission_exps_src,
-                              const Tensor& transition_exps_src,
-                              const Tensor& alpha_src, const Tensor& ll_src,
-                              Tensor* emission_exps_dst,
-                              Tensor* transition_exps_dst, Tensor* alpha_dst,
-                              Tensor* ll_dst) const {
-    // Copy the forward results from CPU memory to GPU memory if this
-    // operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(platform::CUDAPlace());
-      framework::Copy(src, platform::CUDAPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_src, ll_dst);
-  }
  T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
                       const Tensor& emission_exps, const Tensor& trans_weights,
                       const Tensor& trans_weight_exps, const Tensor& label,
@@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
    auto lod = ctx.Input<LoDTensor>("Label")->lod();
    PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
-    // These local variables hold the inputs and outputs, garanteeing them on
+    const Tensor* label = ctx.Input<LoDTensor>("Label");
-    // CPU memory, to provide a consistent reference.
+    const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
-    // TODO(caoying) Fix this by moving all these local variables into the
+    const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
-    // class's data members once we can profile the training process, or
+    const Tensor* alpha = ctx.Input<Tensor>("Alpha");
-    // implementing a real GPU kernel for CRF.
+    const T* ll_grad =
-    Tensor* label = nullptr;
+        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
-    Tensor label_tensor;
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor ll_grad_tensor;
-    T* ll_grad = nullptr;
-    Tensor* emission_grad = nullptr;
-    Tensor emission_grad_tensor;
-    Tensor* transition_grad = nullptr;
-    Tensor transition_grad_tensor;
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      label = &label_tensor;
-      emission_exps = &emission_exps_tensor;
-      transition_exps = &transition_exps_tensor;
-      alpha = &alpha_tensor;
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Label"),
-          *ctx.Input<Tensor>("EmissionExps"),
-          *ctx.Input<Tensor>("TransitionExps"), *ctx.Input<Tensor>("Alpha"),
-          *ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")), label,
-          emission_exps, transition_exps, alpha, &ll_grad_tensor);
-      ll_grad = ll_grad_tensor.data<T>();
-      if (ctx.Output<Tensor>(framework::GradVarName("Emission"))) {
-        emission_grad = &emission_grad_tensor;
-        emission_grad->Resize(emission_exps->dims());
-      }
-      if (ctx.Output<Tensor>(framework::GradVarName("Transition"))) {
+    Tensor* emission_grad =
-        transition_grad = &transition_grad_tensor;
+        ctx.Output<Tensor>(framework::GradVarName("Emission"));
-        transition_grad->Resize(transition_exps->dims());
+    Tensor* transition_grad =
-      }
+        ctx.Output<Tensor>(framework::GradVarName("Transition"));
-    } else {
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-      emission_exps = const_cast<Tensor*>(ctx.Input<Tensor>("EmissionExps"));
-      transition_exps =
-          const_cast<Tensor*>(ctx.Input<Tensor>("TransitionExps"));
-      alpha = const_cast<Tensor*>(ctx.Input<Tensor>("Alpha"));
-      ll_grad = const_cast<Tensor*>(
-                    ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")))
-                    ->data<T>();
-      emission_grad = ctx.Output<Tensor>(framework::GradVarName("Emission"));
-      transition_grad =
-          ctx.Output<Tensor>(framework::GradVarName("Transition"));
-    }
    // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
    // data reader operator, it can have no gradients.
@@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
          one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
          &one_seq_beta, transition_grad, &one_seq_emission_grad);
    }
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), emission_grad, transition_grad,
-          ctx.Output<Tensor>(framework::GradVarName("Emission")),
-          ctx.Output<Tensor>(framework::GradVarName("Transition")));
-    }
  };
 private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& label_src,
-                             const Tensor& emission_exps_src,
-                             const Tensor& transition_exps_src,
-                             const Tensor& alpha_src, const Tensor& ll_grad_src,
-                             Tensor* label_dst, Tensor* emission_exps_dst,
-                             Tensor* transition_exps_dst, Tensor* alpha_dst,
-                             Tensor* ll_grad_dst) const {
-    // Copy the inputs from GPU memory to CPU memory when this operators runs on
-    // GPU device.
-    label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
-    framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst);
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_grad_src, ll_grad_dst);
-  }
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor* emission_grad_src,
-                              const Tensor* transition_grad_src,
-                              Tensor* emission_grad_dst,
-                              Tensor* transition_grad_dst) const {
-    // Copy the backward results from CPU memory to GPU
-    // memory if this operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
-                         Tensor* dst) {
-      if (src && dst) {
-        dst->mutable_data<T>(platform::CUDAPlace());
-        framework::Copy(*src, platform::CUDAPlace(), ctx, dst);
-      }
-    };
-    copyTensor(ctx, emission_grad_src, emission_grad_dst);
-    copyTensor(ctx, transition_grad_src, transition_grad_dst);
-  }
  void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
                           const T ll_grad, const Tensor& emission_exps,
                           const Tensor& transition_exps, const Tensor& alpha,

--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -117,7 +117,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("C0",
             "(Tensor, optional) the initial cell state is an optional "
             "input. This is a tensor with shape (N x D), where N is the "
-             "batch size. `H0` and `C0` can be NULL but only at the same time")
+             "batch size. `H0` and `C0` can be NULL but only at the same time.")
        .AsDispensable();
    AddInput("Weight",
             "(Tensor) the learnable hidden-hidden weights."

--- a/paddle/operators/math/sequence_padding.cc
+++ b/paddle/operators/math/sequence_padding.cc
@@ -32,7 +32,8 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
    auto seq_dims = seq.dims();
-    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
                      "The first dimension of LoDTensor seq should be "
                      "equal to the sum of all sequences's length.");
@@ -41,32 +42,32 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                      "The input padding should be a 3-D Tensor of shape "
                      "[max_sequence_length, num_sequences, sequence_width].");
-    const size_t max_sequence_length = MaximumSequenceLength(lod, level);
+    const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                      "The first dimension of Tensor padding should be the "
                      "maximum length of all sequences in LoDTensor seq.");
-    const size_t num_sequences = abs_offset_lod[level].size() - 1;
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                      "The second dimension of Tensor padding should be the "
                      "number of sequences in LoDTensor seq.");
-    const size_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                      "The third dimension of Tensor padding should be the "
                      "width of sequence in LoDTensor seq.");
    const T* seq_data = seq.data<T>();
    T* padding_data = padding.data<T>();
-    for (size_t i = 0; i < max_sequence_length; ++i) {
+    for (int64_t i = 0; i < max_sequence_length; ++i) {
-      for (size_t j = 0; j < num_sequences; ++j) {
+      for (int64_t j = 0; j < num_sequences; ++j) {
-        size_t start_pos = abs_offset_lod[level][j];
+        int64_t start_pos = abs_offset_lod[level][j];
-        size_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
+        int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
        if (i < sequence_length) {
          // sequence_length > i >= 0 => sequence_length > 0
          T scale =
              norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
-          for (size_t k = 0; k < sequence_width; ++k) {
+          for (int64_t k = 0; k < sequence_width; ++k) {
            padding_data[(i * num_sequences + j) * sequence_width + k] =
                seq_data[(start_pos + i) * sequence_width + k] * scale;
          }
@@ -93,7 +94,8 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
    auto seq_dims = seq.dims();
-    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
                      "The first dimension of LoDTensor seq should be "
                      "equal to the sum of all sequences's length.");
@@ -102,31 +104,31 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                      "The input padding should be a 3-D Tensor of shape "
                      "[max_sequnece_length, num_sequences, sequence_width].");
-    const size_t max_sequence_length = MaximumSequenceLength(lod, level);
+    const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                      "The first dimension of Tensor padding should be "
                      "the maximum length of all sequences in LoDTensor seq.");
-    const size_t num_sequences = abs_offset_lod[level].size() - 1;
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                      "The second dimension of Tensor padding should be "
                      "the number of sequences in LoDTensor seq.");
-    const size_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                      "The third dimension of Tensor padding should be the "
                      "width of sequence in LoDTensor seq.");
    const T* padding_data = padding.data<T>();
    T* seq_data = seq.data<T>();
-    for (size_t i = 0; i < num_sequences; ++i) {
+    for (int64_t i = 0; i < num_sequences; ++i) {
-      size_t start_pos = abs_offset_lod[level][i];
+      int64_t start_pos = abs_offset_lod[level][i];
-      size_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
+      int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
-      for (size_t j = 0; j < sequence_length; ++j) {
+      for (int64_t j = 0; j < sequence_length; ++j) {
        // sequence_length > j >= 0 => sequence_length > 0
        T scale =
            norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
-        for (size_t k = 0; k < sequence_width; ++k) {
+        for (int64_t k = 0; k < sequence_width; ++k) {
          seq_data[(start_pos + j) * sequence_width + k] =
              padding_data[(j * num_sequences + i) * sequence_width + k] *
              scale;

--- a/paddle/operators/math/sequence_padding.cu
+++ b/paddle/operators/math/sequence_padding.cu
@@ -71,7 +71,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
    auto seq_dims = seq.dims();
-    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
                      "The first dimension of LoDTensor seq should be "
                      "equal to the sum of all sequences's length.");
@@ -80,17 +81,17 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                      "The input padding should be a 3-D Tensor of shape "
                      "[max_sequence_length, num_sequences, sequence_width].");
-    size_t max_sequence_length = MaximumSequenceLength(lod, level);
+    int64_t max_sequence_length = MaximumSequenceLength(lod, level);
    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                      "The first dimension of Tensor padding should be the "
                      "maximum length of all sequences in LoDTensor seq.");
-    const size_t num_sequences = abs_offset_lod[level].size() - 1;
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                      "The second dimension of Tensor padding should be the "
                      "number of sequences in LoDTensor seq.");
-    const size_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                      "The third dimension of Tensor padding should be the "
                      "width of sequence in LoDTensor seq.");
@@ -101,7 +102,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
      return;
    }
-    const size_t kBlockSize = 512;
+    const int64_t kBlockSize = 512;
    /* At least use 32 threads to copy sequence_width elements,
     * and at least 8 elements for each thread.
@@ -143,7 +144,8 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
    auto seq_dims = seq.dims();
-    PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
                      "The first dimension of LoDTensor seq should be "
                      "equal to the sum of all sequences's length.");
@@ -152,17 +154,17 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                      "The input padding should be a 3-D Tensor of shape "
                      "[max_sequnece_length, num_sequences, sequence_width].");
-    size_t max_sequence_length = MaximumSequenceLength(lod, level);
+    int64_t max_sequence_length = MaximumSequenceLength(lod, level);
    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
                      "The first dimension of Tensor padding should be "
                      "the maximum length of all sequences in LoDTensor seq.");
-    const size_t num_sequences = abs_offset_lod[level].size() - 1;
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
                      "The second dimension of Tensor padding should be "
                      "the number of sequences in LoDTensor seq.");
-    const size_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                      "The third dimension of Tensor padding should be the "
                      "width of sequence in LoDTensor seq.");
@@ -173,7 +175,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
      return;
    }
-    const size_t kBlockSize = 512;
+    const int64_t kBlockSize = 512;
    /* At least use 32 threads to copy sequence_width elements,
     * and at least 8 elements for each thread.

--- a/paddle/operators/math/sequence_padding_test.cc
+++ b/paddle/operators/math/sequence_padding_test.cc
@@ -31,7 +31,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  cpu_seq.set_lod(lod);
  cpu_seq.mutable_data<T>(seq_dims, paddle::platform::CPUPlace());
-  for (size_t i = 0; i < cpu_seq.numel(); ++i) {
+  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
    cpu_seq.data<T>()[i] = static_cast<T>(i);
  }
@@ -69,7 +69,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
  EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel());
  EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims());
-  for (size_t i = 0; i < cpu_seq.numel(); ++i) {
+  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
    EXPECT_EQ(cpu_seq.data<T>()[i], cpu_seq_back.data<T>()[i]);
  }

--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -64,6 +64,12 @@ static void SplitTensorAndMoveTensorToScopes(
  }
 }
+void WaitOnPlace(const platform::Place place) {  // calls Wait() on the device context of one place
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(place);  // context the global pool returns for `place`
+  dev_ctx.Wait();  // presumably blocks until work queued on this context completes -- TODO confirm
+}
 void WaitOnPlaces(const std::vector<platform::Place> places) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
@@ -214,6 +220,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
        auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get<LoDTensor>();
        if (!(places[i] == places[0])) {
          framework::Copy(tensor_to_merge, places[0], tmp);
+          WaitOnPlace(places[0]);
        } else {
          tmp->ShareDataWith(tensor_to_merge);
        }
@@ -222,12 +229,13 @@ class ParallelDoGradOp : public framework::OperatorBase {
            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
            framework::AttributeMap{});
        sum_op->Run(*sub_scopes[0], places[0]);
-        WaitOnPlaces(places);
+        WaitOnPlace(places[0]);
      }
      VLOG(3) << result;
      framework::Copy(result, place, scope.FindVar(s)->GetMutable<LoDTensor>());
    }
+    WaitOnPlaces(places);
  }
 };

--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -64,6 +64,13 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOp::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
  framework::LibraryType library_;
  if (use_cudnn) {
    library_ = framework::LibraryType::kCUDNN;
@@ -88,6 +95,13 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext &ctx) const {
  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
  framework::LibraryType library_;
  if (use_cudnn) {
    library_ = framework::LibraryType::kCUDNN;

--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -27,12 +27,17 @@ limitations under the License. */
 #include "paddle/operators/detail/grpc_server.h"
 #include "paddle/operators/detail/sendrecvop_utils.h"
 #include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/string/printf.h"
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 namespace paddle {
 namespace operators {
+constexpr int kCondStart = 0;
+constexpr int kCondRunning = 1;
+constexpr int kCondDone = 2;
 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
  service->RunSyncUpdate();
  VLOG(4) << "RunServer thread end";
@@ -77,42 +82,41 @@ class RecvOp : public framework::OperatorBase {
    if (grads_counter_.find(varname) == grads_counter_.end()) {
      grads_counter_[varname] = 0;
    }
-    char ret[256];
+    return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
-    snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(),
-             grads_counter_[varname]++);
-    return std::string(ret);
  }
  void Run(const framework::Scope &scope,
           const platform::Place &dev_place) const override {
-    // FIXME(typhoonzero): no new scopes for every run.
-    framework::Scope &recv_scope = scope.NewScope();
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(dev_place);
+    framework::Scope &recv_scope = scope.NewScope();
    // FIXME(Yancey1989): initialize rpc server with lazy mode.
    rpc_service_->SetScope(&recv_scope);
    rpc_service_->SetDevCtx(&dev_ctx);
    auto param_list = Attr<std::vector<std::string>>("ParamList");
    auto grad_list = Attr<std::vector<std::string>>("GradList");
-    auto trainer_count = Attr<int>("Trainers");
+    auto fan_in = Attr<int>("Fanin");
    size_t param_count = param_list.size();
-    rpc_service_->Reset();
+    std::string program_str = Attr<std::string>("OptimizeProgram");
+    framework::proto::ProgramDesc program_desc;
+    program_desc.ParseFromString(program_str);
+    framework::ProgramDesc program(program_desc);
+    framework::Executor executor(dev_place);
    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
    bool exit_flag = false;
-    VLOG(4) << "param_count:" << param_count
+    int64_t barrier_size = param_count * fan_in;
-            << " trainer_count:" << trainer_count;
    while (!exit_flag) {
-      // TODO(gognwb): simply this loop.
+      // Get from multiple trainers, we don't care about the order in which
-      // Get from multiple trainers, we don't care about order in which
+      // the gradients arrive, just add suffix 0~n and merge the gradient.
-      // the gradient arrives, just add suffix 0~n then average the gradient.
+      rpc_service_->SetCond(0);
-      for (size_t i = 0; i < param_count * trainer_count; ++i) {
+      for (size_t i = 0; i < barrier_size; ++i) {
-        // blocking get one var from client.
        const detail::MessageWithName &v = rpc_service_->Get();
        auto grad_var_name = v.first;
        if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
-          VLOG(4) << "received LISTEN_TERMINATE_MESSAGE and RunOp.Run() exit";
+          LOG(INFO) << "received terminate message and exit";
          exit_flag = true;
          break;
        }
@@ -121,49 +125,31 @@ class RecvOp : public framework::OperatorBase {
        if (it != grad_list.end()) {
          param_var_name = param_list[it - grad_list.begin()];
        } else {
-          LOG(ERROR) << "grad have no paired param found!\"" << grad_var_name
+          LOG(ERROR) << "grad have no paired param:" << grad_var_name;
-                     << "\"";
        }
        VLOG(3) << "recved grad: " << grad_var_name
                << " updating param: " << param_var_name;
+        if (fan_in > 1) {
-        auto *merged_grad = recv_scope.FindVar(grad_var_name);
-        if (merged_grad == nullptr) {
-          auto *ptr = recv_scope.Var(grad_var_name);
-          CreateTensorFromMessageType(ptr, v.second.type());
-          VLOG(3) << "Create Variable " << grad_var_name
-                  << " on recv scope, which pointer is " << ptr << " type is "
-                  << v.second.type();
-        }
-        if (trainer_count > 1) {
          grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
        }
+        auto *var = recv_scope.FindVar(grad_var_name);
-        auto *var = recv_scope.Var(grad_var_name);
+        if (var == nullptr) {
+          LOG(ERROR) << "can not find server side var: " << grad_var_name;
+          PADDLE_THROW("can not find server side var");
+        }
        detail::DeserializeFromMessage(v.second, dev_ctx, var);
      }
      if (exit_flag) {
        break;
      }
-      rpc_service_->Reset();
-      std::string program_str = Attr<std::string>("OptimizeProgram");
-      framework::proto::ProgramDesc program_desc;
-      program_desc.ParseFromString(program_str);
-      framework::ProgramDesc program(program_desc);
-      framework::Executor executor(dev_place);
-      // Run sub graph to get optimized tensor
      try {
        executor.Run(program, &recv_scope, 0, /*global_block*/
                     false /*create_local_scope*/, false /*create_vars*/);
      } catch (std::exception &e) {
        LOG(ERROR) << "run sub program error " << e.what();
      }
+      rpc_service_->SetCond(1);
-      rpc_service_->Done();
+      rpc_service_->WaitClientGet(barrier_size);
      grads_counter_.clear();
    }  // while(true)
  }
@@ -199,7 +185,7 @@ This operator will recv tensor from send_op
        "GradList", "type list of string",
        "grad->param name mapping to find which param to optimize.")
        .SetDefault({});
-    AddAttr<int>("Trainers", "type int",
+    AddAttr<int>("Fanin", "type int",
                 "Number of trainers in the current cluster job")
        .SetDefault(1);
  }

--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -41,10 +41,13 @@ class SendOp : public framework::OperatorBase {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(place);
    for (size_t i = 0; i < ins.size(); i++) {
+      VLOG(3) << "sending " << ins[i];
      client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
    }
+    PADDLE_ENFORCE(client_.Wait());
    for (size_t i = 0; i < outs.size(); i++) {
+      VLOG(3) << "getting " << outs[i];
      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
    }

--- a/paddle/operators/sequence_erase_op.cc
+++ b/paddle/operators/sequence_erase_op.cc
@@ -86,4 +86,5 @@ REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp,
                             ops::SequenceEraseOpMaker);
 REGISTER_OP_CPU_KERNEL(
    sequence_erase,
-    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>);
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>,
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/operators/sequence_erase_op.cu
+++ b/paddle/operators/sequence_erase_op.cu
@@ -23,27 +23,22 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 using LoDTensor = framework::LoDTensor;
 template <typename T>
-__global__ void LabelErasedIdx(const T* in_dat, const int in_len,
+__global__ void LabelErasedIdx(const T* in_dat, const int64_t in_len,
-                               const T* tokens, const int tokens_len,
+                               const int* tokens, const size_t tokens_len,
-                               int* num_erased) {
+                               size_t* num_erased) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < in_len) {
-    int erased = 0;
+    for (size_t i = 0; i < tokens_len; ++i) {
-    for (int i = 0; i < tokens_len; ++i) {
      if (in_dat[index] == tokens[i]) {
-        erased = 1;
+        num_erased[index + 1] = 1;
+        break;
      }
    }
-    num_erased[index + 1] = erased;
-    if (index == 0) {
-      num_erased[0] = 0;
-    }
  }
 }
-template <typename T>
+__global__ void GetOutLod(const size_t* num_erased, const size_t* in_lod,
-__global__ void GetOutLod(const T* num_erased, const int* in_lod,
+                          const size_t lod_len, size_t* out_lod0) {
-                          const int lod_len, int* out_lod0) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < lod_len) {
    out_lod0[index] = in_lod[index] - num_erased[in_lod[index]];
@@ -51,11 +46,11 @@ __global__ void GetOutLod(const T* num_erased, const int* in_lod,
 }
 template <typename T>
-__global__ void SetOutput(const T* in_dat, const int in_len,
+__global__ void SetOutput(const T* in_dat, const int64_t in_len,
-                          const int* num_erased, T* out_dat) {
+                          const size_t* num_erased, T* out_dat) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < in_len) {
-    if (in_dat[index] != in_dat[index + 1]) {
+    if (num_erased[index] == num_erased[index + 1]) {
      out_dat[index - num_erased[index]] = in_dat[index];
    }
  }
@@ -72,53 +67,44 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
                      "The actual size mismatches with the LoD information.");
-    auto tokens = ctx.Attr<std::vector<T>>("tokens");
+    auto tokens = ctx.Attr<std::vector<int>>("tokens");
-    auto tokens_len = tokens.size();
    auto in_len = in->numel();
    auto in_dat = in->data<T>();
-    auto lod0 = lod[0];
+    // Copy tokens to GPU
+    thrust::device_vector<int> dev_tokens(tokens.begin(), tokens.end());
-    thrust::host_vector<T> host_tokens(tokens_len);
+    int* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
-    for (size_t i = 0; i < tokens.size(); ++i) {
-      host_tokens[i] = tokens[i];
-    }
-    thrust::device_vector<T> dev_tokens = host_tokens;
-    thrust::device_vector<int> num_erased(in_len + 1);
-    T* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
-    int* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
+    // Count number of elements to be erased
+    thrust::device_vector<size_t> num_erased(in_len + 1, 0);
+    size_t* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
    auto stream = ctx.cuda_device_context().stream();
    LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                     PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
-        in_dat, in_len, dev_tokens_ptr, tokens_len, num_erased_ptr);
+        in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr);
    thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(),
                           num_erased.begin() + 1);
-    // Calc LoD
+    // Copy LoD to GPU
+    auto lod0 = lod[0];
    auto lod_len = lod0.size();
-    thrust::host_vector<int> host_lod(lod_len);
+    thrust::device_vector<size_t> dev_in_lod = lod0;
-    for (size_t i = 0; i < lod_len; ++i) {
+    size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
-      host_lod[i] = lod0[i];
-    }
+    // Calc output LoD
-    thrust::device_vector<int> dev_in_lod = host_lod;
+    thrust::device_vector<size_t> dev_out_lod(lod_len);
-    thrust::device_vector<int> dev_out_lod(lod_len);
+    size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
-    int* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
-    int* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
    GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
-    thrust::host_vector<int> host_out_lod = dev_out_lod;
-    std::vector<int> out_lod0(lod_len, 0);
+    // Set LoD for output
-    for (size_t i = 0; i < lod_len; i++) {
+    thrust::host_vector<size_t> out_lod0 = dev_out_lod;
-      out_lod0[i] = host_out_lod[i];
-    }
    framework::LoD out_lod;
    out_lod.push_back(out_lod0);
    out->set_lod(out_lod);
    // Set output
-    out->Resize({out_lod0.back(), 1});
+    out->Resize({static_cast<int64_t>(out_lod0.back()), 1});
    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
    SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,
@@ -130,4 +116,5 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 REGISTER_OP_CUDA_KERNEL(sequence_erase,
-                        paddle::operators::SequenceEraseOpCUDAKernel<int32_t>);
+                        paddle::operators::SequenceEraseOpCUDAKernel<int32_t>,
+                        paddle::operators::SequenceEraseOpCUDAKernel<int64_t>);
--- a/paddle/operators/sequence_expand_op.cc
+++ b/paddle/operators/sequence_expand_op.cc
@@ -58,7 +58,7 @@ This operator expands input(X) according to LOD of input(Y).
 Following are cases to better explain how this works:
 Case 1:
-Given 2-level a LoDTensor input(X)
+Given a 2-level LoDTensor input(X)
    X.lod = [[0,       2, 3],
             [0, 1,    3, 4]]
    X.data = [a, b, c, d]
@@ -75,9 +75,8 @@ then we get 2-level LoDTensor
 Case 2:
-Given a 0-level LoDTensor input(X)
+Given a common Tensor input(X)
    X.data = [a, b, c]
-    X.lod = NULL
    X.dims = [3, 1]
 and input(Y)
    Y.lod = [[0, 2, 3, 6]]
@@ -89,9 +88,8 @@ then we get 1-level LoDTensor
 Case 3:
-Given a 0-level LoDTensor input(X)
+Given a common Tensor input(X)
    X.data = [[a, b], [c, d], [e, f]]
-    X.lod = NULL
    X.dims = [3, 2]
 and input(Y)
    Y.lod = [[0, 2, 3, 6]]

--- a/paddle/operators/sequence_reshape_op.cc
+++ b/paddle/operators/sequence_reshape_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/sequence_reshape_op.h"
+#include "paddle/framework/ddim.h"
+namespace paddle {
+namespace operators {
+class SequenceReshapeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceReshapeOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_numel = product(x_dims);
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
+    int new_dim = ctx->Attrs().Get<int>("new_dim");
+    ctx->SetOutputDim("Out",
+                      {x_numel / new_dim, static_cast<int64_t>(new_dim)});
+  }
+};
+class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with shape "
+             "being [N, M].");
+    AddOutput("Out",
+              "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with "
+              "shape [T, new_dim] where T is calculated based on X.lod, M and "
+              "new_dim.");
+    AddAttr<int>("new_dim", "Sequence dimension of the output LoDTensor.");
+    AddComment(R"DOC(
+Sequence Reshape Operator.
+This operator will rearrange the input sequences. The new dimension is set by
+an attribute, and the length of each sequence may become longer or shorter,
+as determined by the original length, original dimension and new dimension. The
+following example illustrates the function of this operator:
+x is a LoDTensor:
+    x.lod  = [[0, 2, 6]]
+    x.data = [[1, 2], [3, 4],
+              [5, 6], [7, 8], [9, 10], [11, 12]]
+    x.dims = [6, 2]
+set new_dim = 4
+then out is a LoDTensor:
+    out.lod  = [[0, 1, 3]]
+    out.data = [[1, 2, 3, 4],
+                [5, 6, 7, 8], [9, 10, 11, 12]]
+    out.dims = [3, 4]
+Currently, only 1-level LoDTensor is supported and please make sure (original
+length * original dimension) can be divided by new_dim with no remainder for
+each sequence.
+)DOC");
+  }
+};
+class SequenceReshapeGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceReshapeGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceReshapeGradOp should  not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+  }
+};
+class SequenceReshapeGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op_desc_ptr = new framework::OpDesc();
+    op_desc_ptr->SetType("sequence_reshape_grad");
+    op_desc_ptr->SetInput("X", Input("X"));
+    op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op_desc_ptr->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeOp,
+                  ops::SequenceReshapeOpMaker, ops::SequenceReshapeGradOpMaker);
+REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_reshape,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_reshape_grad,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int>);
--- a/paddle/operators/sequence_reshape_op.cu
+++ b/paddle/operators/sequence_reshape_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/sequence_reshape_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_reshape,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_reshape_grad,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext,
+                                   int64_t>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, int>);
--- a/paddle/operators/sequence_reshape_op.h
+++ b/paddle/operators/sequence_reshape_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class SequenceReshapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int out_width = context.Attr<int>("new_dim");
+    auto in_dims = in->dims();
+    int64_t in_width = in_dims[1];
+    auto& in_lod = in->lod();
+    PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
+                      "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(
+        in_dims[0], in_lod[0].back(),
+        "Inconsistent size between X.shape[0] and X.lod()[0].back().");
+    auto in_lod_l0 = in_lod[0];
+    int seq_num = in_lod_l0.size() - 1;
+    if (in_width == out_width) {
+      out->set_lod(in->lod());
+    } else {
+      auto& out_lod = *out->mutable_lod();
+      out_lod.resize(1);
+      out_lod[0].resize(seq_num + 1);
+      out_lod[0][0] = 0;
+      for (int i = 0; i < seq_num; ++i) {
+        size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i];
+        size_t offset = 0;
+        offset = (seq_len * in_width) / out_width;
+        PADDLE_ENFORCE_EQ(offset * out_width, seq_len * in_width,
+                          "Please make sure (sequence_length * dimension) can "
+                          "be divided by new_dim with no remainder for each "
+                          "sequence. The %dth sequence is invalid.",
+                          i + 1);
+        out_lod[0][i + 1] = out_lod[0][i] + offset;
+      }
+    }
+    framework::Copy(*in, context.GetPlace(), out);
+    out->Resize({static_cast<int64_t>(out->lod()[0].back()), out_width});
+  }
+};
+template <typename DeviceContext, typename T>
+class SequenceReshapeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x_tensor_ptr = context.Input<LoDTensor>("X");
+    auto* outg_tensor_ptr =
+        context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* xg_tensor_ptr =
+        context.Output<LoDTensor>(framework::GradVarName("X"));
+    xg_tensor_ptr->mutable_data<T>(context.GetPlace());
+    framework::Copy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr);
+    xg_tensor_ptr->Resize(x_tensor_ptr->dims());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/split_selected_rows_op.cc
+++ b/paddle/operators/split_selected_rows_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/split_selected_rows_op.h"
+namespace paddle {
+namespace operators {
+class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input SelectedRows.");
+    AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable();
+    AddAttr<std::vector<int>>("rows_sections", "Rows section for output.")
+        .SetDefault(std::vector<int>({}));
+    AddAttr<std::vector<int>>("height_sections",
+                              "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int>({}));
+    AddComment(R"DOC(
+Split a SelectedRows with a specified rows section.
+height_sections is only needed when the dims of the original tensor need to be split.
+Example:
+  Input:
+    X.rows = {0, 7, 5}
+    X.height = 12
+  Attr:
+    rows_sections = {1, 2}
+    height_sections = {}
+  Out:
+    out0.rows = {0}
+    out0.height = 12
+    out1.rows = {7, 5}
+    out1.height = 12
+)DOC");
+  }
+};
+class SplitSelectedRowsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "SplitSelectedRowsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "SplitSelectedRowsOp must has output Out.");
+    std::vector<int> height_sections =
+        ctx->Attrs().Get<std::vector<int>>("height_sections");
+    std::vector<int> rows_sections =
+        ctx->Attrs().Get<std::vector<int>>("rows_sections");
+    PADDLE_ENFORCE_EQ(
+        rows_sections.size(), ctx->Outputs("Out").size(),
+        "The size of rows section should be the same with Outputs size.");
+    int64_t n = ctx->Outputs("Out").size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(n);
+    // make output dims
+    for (int64_t i = 0; i < n; ++i) {
+      auto dims = ctx->GetInputDim("X");
+      if (height_sections.size()) {
+        PADDLE_ENFORCE_EQ(
+            height_sections.size(), static_cast<size_t>(n),
+            "The size of height section should be the same with height"
+            " section size.");
+        dims[0] = height_sections[i];
+      }
+      outs_dims.push_back(dims);
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+};
+class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("sum");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
+                  ops::SplitSelectedRowsOpMaker,
+                  ops::SplitSelectedRowsGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split_selected_rows,
+    ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/split_selected_rows_op.cu
+++ b/paddle/operators/split_selected_rows_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/split_selected_rows_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split_selected_rows,
+    ops::SplitSelectedRowsOpKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/operators/split_selected_rows_op.h
+++ b/paddle/operators/split_selected_rows_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::SelectedRows>("X");
+    auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+    auto rows_sections = ctx.Attr<std::vector<int>>("rows_sections");
+    auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
+    int64_t n = outs.size();
+    int offset = 0;
+    for (int64_t i = 0; i < n; ++i) {
+      framework::Vector<int64_t> out_rows;
+      for (int64_t j = 0; j < rows_sections[i]; ++j) {
+        out_rows.push_back(x->rows()[offset + j]);
+      }
+      auto& out = outs[i];
+      auto x_dims = x->GetCompleteDims();
+      x_dims[0] = rows_sections[i];
+      out->mutable_value()->mutable_data<T>(x_dims, ctx.GetPlace());
+      framework::Copy(x->value().Slice(offset, rows_sections[i] + offset),
+                      x->place(), ctx.device_context(), out->mutable_value());
+      outs[i]->set_rows(out_rows);
+      if (height_sections.size()) {
+        outs[i]->set_height(height_sections[i]);
+      }
+      offset += rows_sections[i];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -407,7 +407,7 @@ class DistributeTranspiler:
            outputs=opt_op.outputs,
            attrs=opt_op.attrs)
-    def get_pserver_program(self, endpoint, optimize_ops):
+    def get_pserver_program(self, endpoint):
        """
        get pserver side program by endpoint
@@ -420,11 +420,24 @@ class DistributeTranspiler:
        pserver_program = Program()
        for v in self.param_grad_ep_mapping[endpoint]["params"]:
            self._clone_var(pserver_program.global_block(), v)
+        for v in self.param_grad_ep_mapping[endpoint]["grads"]:
+            # create vars for each trainer in global scope, so
+            # we don't need to create them when grad arrives.
+            pserver_program.global_block().create_var(
+                name=v.name, persistable=True, dtype=v.dtype, shape=v.shape)
+            for trainer_id in xrange(self.trainers):
+                print("create variable for program: %s.trainer_%d" %
+                      (v.name, trainer_id))
+                pserver_program.global_block().create_var(
+                    name="%s.trainer_%d" % (v.name, trainer_id),
+                    persistable=True,
+                    dtype=v.dtype,
+                    shape=v.shape)
        # step6
        optimize_sub_program = Program()
-        for idx, opt_op in enumerate(optimize_ops):
+        for idx, opt_op in enumerate(self.optimize_ops):
-            is_op_on_pserver = self._is_op_on_pserver(endpoint, optimize_ops,
+            is_op_on_pserver = self._is_op_on_pserver(endpoint,
-                                                      idx)
+                                                      self.optimize_ops, idx)
            if not is_op_on_pserver:
                continue
            if opt_op.inputs.has_key("Grad"):
@@ -449,7 +462,7 @@ class DistributeTranspiler:
                    p.name
                    for p in self.param_grad_ep_mapping[endpoint]["grads"]
                ],
-                "Trainers": self.trainers
+                "Fanin": self.trainers
            })
        pserver_program.sync_with_cpp()
        return pserver_program

--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -15,6 +15,7 @@ import os
 import cPickle as pickle
 from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
+from . import core
 __all__ = [
    'save_vars',
@@ -191,6 +192,33 @@ def get_inference_program(target_vars, main_program=None):
    return inference_program
+def prepend_feed_ops(inference_program, feeded_var_names):
+    """
+    Prepend one 'feed' op per entry of `feeded_var_names` to the global block
+    of `inference_program`.
+    A persistable variable named 'feed' of type FEED_MINIBATCH is created and
+    used as input 'X' of every feed op. The i-th name is looked up in the
+    global block and becomes output 'Out' of an op with attribute 'col' = i.
+    Args:
+        inference_program: program whose global block is modified in place.
+        feeded_var_names: iterable of names of variables in the global block.
+    """
+    global_block = inference_program.global_block()
+    feed_var = global_block.create_var(
+        name='feed', type=core.VarDesc.VarType.FEED_MINIBATCH, persistable=True)
+    for i, name in enumerate(feeded_var_names):
+        # Look up the variable that receives the i-th column of the feed.
+        out = global_block.var(name)
+        # NOTE(review): each op is prepended, so the ops presumably end up in
+        # reverse order of the names; 'col' still carries the original index.
+        global_block.prepend_op(
+            type='feed',
+            inputs={'X': [feed_var]},
+            outputs={'Out': [out]},
+            attrs={'col': i})
+def append_fetch_ops(inference_program, fetch_var_names):
+    """
+    Append one 'fetch' op per entry of `fetch_var_names` to the global block
+    of `inference_program`.
+    A persistable variable named 'fetch' of type FETCH_LIST is created and
+    used as output 'Out' of every fetch op. The i-th name is passed as input
+    'X' of an op with attribute 'col' = i.
+    Args:
+        inference_program: program whose global block is modified in place.
+        fetch_var_names: iterable of variable names to fetch.
+    """
+    global_block = inference_program.global_block()
+    fetch_var = global_block.create_var(
+        name='fetch', type=core.VarDesc.VarType.FETCH_LIST, persistable=True)
+    for i, name in enumerate(fetch_var_names):
+        # The input is given by name (a string), not as a Variable object.
+        global_block.append_op(
+            type='fetch',
+            inputs={'X': [name]},
+            outputs={'Out': [fetch_var]},
+            attrs={'col': i})
 def save_inference_model(dirname,
                         feeded_var_names,
                         target_vars,
@@ -241,6 +269,9 @@ def save_inference_model(dirname,
            "fetch_var_names": fetch_var_names
        }, f, -1)
+    prepend_feed_ops(inference_program, feeded_var_names)
+    append_fetch_ops(inference_program, fetch_var_names)
    # Save only programDesc of inference_program in binary format
    # in another file: __model__.dat
    with open(model_file_name + ".dat", "wb") as fp:

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -22,36 +22,13 @@ from ..param_attr import ParamAttr
 from tensor import concat
 __all__ = [
-    'fc',
+    'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
-    'embedding',
+    'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
-    'dynamic_lstm',
+    'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
-    'gru_unit',
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand',
-    'linear_chain_crf',
+    'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min',
-    'crf_decoding',
+    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
-    'cos_sim',
+    'l2_normalize', 'matmul', 'warpctc', 'sequence_reshape'
-    'cross_entropy',
-    'square_error_cost',
-    'accuracy',
-    'chunk_eval',
-    'sequence_conv',
-    'conv2d',
-    'sequence_pool',
-    'pool2d',
-    'batch_norm',
-    'beam_search_decode',
-    'conv2d_transpose',
-    'sequence_expand',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'sequence_first_step',
-    'sequence_last_step',
-    'dropout',
-    'split',
-    'l2_normalize',
-    'matmul',
 ]
@@ -229,6 +206,102 @@ def dynamic_lstm(input,
                 cell_activation='tanh',
                 candidate_activation='tanh',
                 dtype='float32'):
+    """
+    **Dynamic LSTM Layer**
+    The default implementation is diagonal/peephole connection
+    (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
+    .. math::
+        i_t & = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
+        f_t & = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
+        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
+        o_t & = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
+        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
+        h_t & = o_t \odot act_h(c_t)
+    where the :math:`W` terms denote weight matrices (e.g. :math:`W_{xi}` is
+    the matrix of weights from the input gate to the input), :math:`W_{ic}, \
+    W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
+    our implementation, we use vectors to represent these diagonal weight
+    matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
+    gate bias vector), :math:`\sigma` is the non-linear activation, such as
+    logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
+    gate, forget gate, output gate, and cell activation vectors, respectively,
+    all of which have the same size as the cell output activation vector :math:`h`.
+    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
+    and :math:`act_h` are the cell input and cell output activation functions
+    and `tanh` is usually used for them. :math:`\\tilde{c_t}` is also called
+    candidate hidden state, which is computed based on the current input and
+    the previous hidden state.
+    Set `use_peepholes` to `False` to disable peephole connection. The formula
+    is omitted here, please refer to the paper
+    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
+    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
+    operations on the input :math:`x_{t}` are NOT included in this operator.
+    Users can choose to use a fully-connected layer before the LSTM layer.
+    Args:
+        input(Variable): The input of dynamic_lstm layer, which supports
+                         variable-time length input sequence. The underlying
+                         tensor in this Variable is a matrix with shape
+                         (T X 4D), where T is the total time steps in this
+                         mini-batch, D is the hidden size.
+        size(int): 4 * hidden size.
+        param_attr(ParamAttr): The parameter attribute for the learnable
+                               hidden-hidden weights.
+                               - The shape is (D x 4D), where D is the hidden
+                                 size.
+                               - Weights = {:math:`W_{ch}, W_{ih}, \
+                                                W_{fh}, W_{oh}`}
+        bias_attr(ParamAttr): The bias attribute for the learnable bias
+                              weights, which contains two parts, input-hidden
+                              bias weights and peephole connections weights if
+                              setting `use_peepholes` to `True`.
+                              1. `use_peepholes = False`
+                                - The shape is (1 x 4D).
+                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                              2. `use_peepholes = True`
+                                - The shape is (1 x 7D).
+                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                                 W_{fc}, W_{oc}`}.
+        use_peepholes(bool): Whether to enable diagonal/peephole connections,
+                             default `True`.
+        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
+        gate_activation(str): The activation for input gate, forget gate and
+                              output gate. Choices = ["sigmoid", "tanh", "relu",
+                              "identity"], default "sigmoid".
+        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
+                              "tanh", "relu", "identity"], default "tanh".
+        candidate_activation(str): The activation for candidate hidden state.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+    Returns:
+        tuple: The hidden state, and cell state of LSTM. The shape of both \
+        is (T x D), and lod is the same with the `input`.
+    Examples:
+        .. code-block:: python
+            hidden_dim = 512
+            forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                           act=None, bias_attr=None)
+            forward, _ = fluid.layers.dynamic_lstm(
+                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
+    """
    helper = LayerHelper('lstm', **locals())
    size = size / 4
    weight = helper.create_parameter(
@@ -676,6 +749,7 @@ def conv2d(input,
           groups=None,
           param_attr=None,
           bias_attr=None,
+           use_cudnn=True,
           act=None):
    """
    **Convlution2D Layer**
@@ -739,6 +813,8 @@ def conv2d(input,
            connected to the second half of the input channels. Default: groups=1
        param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
        act(str): Activation type. Default: None
    Returns:
@@ -774,6 +850,8 @@ def conv2d(input,
        stride = [stride, stride]
    if isinstance(padding, int):
        padding = [padding, padding]
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
    input_shape = input.shape
    filter_shape = [num_filters, num_filter_channels] + filter_size
@@ -797,9 +875,12 @@ def conv2d(input,
            'Filter': filter_param,
        },
        outputs={"Output": pre_bias},
-        attrs={'strides': stride,
+        attrs={
-               'paddings': padding,
+            'strides': stride,
-               'groups': groups})
+            'paddings': padding,
+            'groups': groups,
+            'use_cudnn': use_cudnn
+        })
    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
@@ -948,6 +1029,7 @@ def pool2d(input,
           pool_stride=None,
           pool_padding=None,
           global_pooling=False,
+           use_cudnn=True,
           name=None):
    """
    This function adds the operator for pooling in 2 dimensions, using the
@@ -967,6 +1049,8 @@ def pool2d(input,
        pool_stride = [pool_stride, pool_stride]
    if isinstance(pool_padding, int):
        pool_padding = [pool_padding, pool_padding]
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
    helper = LayerHelper('pool2d', **locals())
    dtype = helper.input_dtype()
@@ -981,7 +1065,8 @@ def pool2d(input,
            "ksize": pool_size,
            "global_pooling": global_pooling,
            "strides": pool_stride,
-            "paddings": pool_padding
+            "paddings": pool_padding,
+            "use_cudnn": use_cudnn
        })
    return pool_out
@@ -1096,6 +1181,7 @@ def conv2d_transpose(input,
                     stride=None,
                     dilation=None,
                     param_attr=None,
+                     use_cudnn=True,
                     name=None):
    """
    The transpose of conv2d layer.
@@ -1123,6 +1209,8 @@ def conv2d_transpose(input,
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation.
        param_attr: Parameter Attribute.
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
        name(str|None): A name for this layer(optional). If set None, the layer
                       will be named automatically.
@@ -1151,6 +1239,10 @@ def conv2d_transpose(input,
    elif dilation is not None:
        op_attr['dilations'] = dilation
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+    op_attr['use_cudnn'] = use_cudnn
    if filter_size is None:
        if output_size is None:
            raise ValueError("output_size must be set when filter_size is None")
@@ -1709,11 +1801,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
-    - If a transpose flag is specified, the last two dimensions of the tensor 
+    - If a transpose flag is specified, the last two dimensions of the tensor
-      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for 
+      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
-      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as 
+      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
-      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the 
+      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
-      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as 
+      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
      :math:`[1, D]` in transposed form.
    - After transpose, the two tensors are 2-D or n-D and matrix multiplication
@@ -1733,7 +1825,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
        y (Variable): The input variable which is a Tensor or LoDTensor.
        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
-        name(str|None): A name for this layer(optional). If set None, the layer 
+        name(str|None): A name for this layer(optional). If set None, the layer
            will be named automatically.
    Returns:
@@ -1772,3 +1864,110 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
        attrs={'transpose_X': transpose_x,
               'transpose_Y': transpose_y})
    return out
+def warpctc(input, label, blank=0, norm_by_times=False, **kwargs):
+    """
+    An operator integrating the open source Warp-CTC library
+    (https://github.com/baidu-research/warp-ctc)
+    to compute Connectionist Temporal Classification (CTC) loss.
+    It can be aliased as softmax with CTC, since a native softmax activation is
+    integrated into the Warp-CTC library to normalize the values of each row of
+    the input tensor.
+    Args:
+       input(Variable): (LodTensor, default: LoDTensor<float>),
+         the unscaled probabilities of variable-length sequences,
+         which is a 2-D Tensor with LoD information.
+         Its shape is [Lp, num_classes + 1], where Lp is the sum of all input
+         sequences' length and num_classes is the true number of classes
+         (not including the blank label).
+       label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
+         of variable-length sequence, which is a 2-D Tensor with LoD
+         information. It is of the shape [Lg, 1], where Lg is the sum of
+         all labels' length.
+       blank: (int, default: 0), the blank label index of Connectionist
+         Temporal Classification (CTC) loss, which is in the
+         half-opened interval [0, num_classes + 1).
+       norm_by_times: (bool, default: false), whether to normalize
+         the gradients by the number of time-steps, which is also the
+         sequence's length. There is no need to normalize the gradients
+         if the warpctc layer is followed by a mean_op.
+       **kwargs: forwarded unchanged to LayerHelper.
+    Returns:
+        Variable: The Connectionist Temporal Classification (CTC) loss,
+        which is a 2-D Tensor of the shape [batch_size, 1].
+    Examples:
+        .. code-block:: python
+            y = layers.data(name='y', shape=[11, 8], dtype='float32', lod_level=1)
+            y_predict = layers.data(name='y_predict', shape=[11, 1], dtype='float32')
+            cost = layers.warpctc(input=y_predict, label=y)
+    """
+    # NOTE(review): the docstring example passes a float32 variable as `label`
+    # and a variable without lod_level as `input`, which does not match the
+    # Args description above — confirm and correct the example.
+    helper = LayerHelper('warpctc', **kwargs)
+    loss_out = helper.create_tmp_variable(dtype=input.dtype)
+    # Bound to the op's 'WarpCTCGrad' output; it is not returned to the caller.
+    grad_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='warpctc',
+        inputs={'Logits': [input],
+                'Label': [label]},
+        outputs={'WarpCTCGrad': [grad_out],
+                 'Loss': [loss_out]},
+        attrs={'blank': blank,
+               'norm_by_times': norm_by_times})
+    return loss_out
+def sequence_reshape(input, new_dim):
+    """
+    **Sequence Reshape Layer**
+    This layer will rearrange the input sequences. The new dimension is set by
+    user. Length of each sequence is computed according to original length,
+    original dimension and new dimension. The following example will help to
+    illustrate the function of this layer:
+    .. code-block:: text
+        x is a LoDTensor:
+            x.lod  = [[0, 2, 6]]
+            x.data = [[1, 2], [3, 4],
+                      [5, 6], [7, 8], [9, 10], [11, 12]]
+            x.dims = [6, 2]
+        set new_dim = 4
+        then out is a LoDTensor:
+            out.lod  = [[0, 1, 3]]
+            out.data = [[1, 2, 3, 4],
+                        [5, 6, 7, 8], [9, 10, 11, 12]]
+            out.dims = [3, 4]
+    Currently, only 1-level LoDTensor is supported and please make sure
+    (original length * original dimension) can be divided by new dimension with
+    no remainder for each sequence.
+    Args:
+       input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
+                with shape being [N, M] where M for dimension.
+       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+    Returns:
+        Variable: Reshaped LoDTensor according to new dimension.
+    Examples:
+        .. code-block:: python
+            x = fluid.layers.data(name='x', shape=[5, 20],
+                              dtype='float32', lod_level=1)
+            x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
+    """
+    # locals() is passed to LayerHelper, so no extra local variable may be
+    # defined before this line without changing what the helper receives.
+    helper = LayerHelper('sequence_reshape', **locals())
+    # The output variable uses the dtype the helper reports for the input.
+    out = helper.create_tmp_variable(helper.input_dtype())
+    helper.append_op(
+        type='sequence_reshape',
+        inputs={'X': [input]},
+        outputs={'Out': [out]},
+        attrs={'new_dim': new_dim})
+    return out
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -28,19 +28,22 @@ def simple_img_conv_pool(input,
                         pool_stride,
                         act,
                         param_attr=None,
-                         pool_type='max'):
+                         pool_type='max',
+                         use_cudnn=True):
    conv_out = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
-        act=act)
+        act=act,
+        use_cudnn=use_cudnn)
    pool_out = layers.pool2d(
        input=conv_out,
        pool_size=pool_size,
        pool_type=pool_type,
-        pool_stride=pool_stride)
+        pool_stride=pool_stride,
+        use_cudnn=use_cudnn)
    return pool_out
@@ -54,7 +57,8 @@ def img_conv_group(input,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=None,
                   pool_stride=1,
-                   pool_type=None):
+                   pool_type=None,
+                   use_cudnn=True):
    """
    Image Convolution Group, Used for vgg net.
    """
@@ -85,7 +89,8 @@ def img_conv_group(input,
            filter_size=conv_filter_size[i],
            padding=conv_padding[i],
            param_attr=param_attr[i],
-            act=local_conv_act)
+            act=local_conv_act,
+            use_cudnn=use_cudnn)
        if conv_with_batchnorm[i]:
            tmp = layers.batch_norm(input=tmp, act=conv_act)
@@ -97,7 +102,8 @@ def img_conv_group(input,
        input=tmp,
        pool_size=pool_size,
        pool_type=pool_type,
-        pool_stride=pool_stride)
+        pool_stride=pool_stride,
+        use_cudnn=use_cudnn)
    return pool_out

--- a/python/paddle/v2/fluid/registry.py
+++ b/python/paddle/v2/fluid/registry.py
 #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import re
 import cStringIO
 import warnings
@@ -167,13 +167,18 @@ def register_layer(op_type):
            inputs[ipt.name] = val
        outputs = dict()
-        out = helper.create_tmp_variable(dtype=dtype)
+        out = kwargs.pop(_convert_(o_name), [])
-        outputs[o_name] = [out]
+        if out:
+            out_var = out[0] if (isinstance(out, list) or
+                                 isinstance(out, tuple)) else out
+        else:
+            out_var = helper.create_tmp_variable(dtype=dtype)
+        outputs[o_name] = [out_var]
        for name in intermediate_output_names:
            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
        helper.append_op(
            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
-        return helper.append_activation(out)
+        return helper.append_activation(out_var)
    func.__name__ = op_type
    func.__doc__ = _generate_doc_string_(op_proto)

--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
@@ -53,8 +53,9 @@ if training_role == "PSERVER":
    if not current_endpoint:
        print("need env SERVER_ENDPOINT")
        exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+    pserver_prog = t.get_pserver_program(current_endpoint)
-    exe.run(fluid.default_startup_program())
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
    exe.run(pserver_prog)
 else:
    trainer_prog = t.get_trainer_program()

--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
+#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+from __future__ import print_function
+import sys
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+import sys
+# Number of trainers handed to DistributeTranspiler.transpile(trainers=...).
+TRAINERS = 5
+# Mini-batch size used when batching the CIFAR-10 training reader.
+BATCH_SIZE = 128
+# Number of passes over the training reader in the trainer loop.
+PASS_NUM = 100
+def resnet_cifar10(input, depth=32):
+    """
+    Build a residual network on top of `input` and return the pooled output.
+    The network is one conv+batch_norm layer, three stages of `n` basic blocks
+    with 16, 32 and 64 output channels, and an average pool of size 8.
+    Args:
+        input: the input variable of the network.
+        depth(int): network depth; (depth - 2) must be divisible by 6, and
+            n = (depth - 2) // 6 blocks are built per stage.
+    Returns:
+        The output variable of the final pool2d layer.
+    """
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        # conv2d without bias or activation, followed by batch_norm, which
+        # applies `act`.
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+    def shortcut(input, ch_in, ch_out, stride):
+        # Use a 1x1 conv projection when the channel count changes, otherwise
+        # pass the input through unchanged.
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+    def basicblock(input, ch_in, ch_out, stride):
+        # Two 3x3 conv_bn layers added to the shortcut, then relu.
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        # The first block may change channels/stride; the remaining count - 1
+        # blocks keep ch_out channels and stride 1.
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+    assert (depth - 2) % 6 == 0
+    # Floor division keeps n an int under both Python 2 and true division;
+    # a float n would make range(1, count) in layer_warp raise TypeError.
+    n = (depth - 2) // 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+def vgg16_bn_drop(input):
+    """
+    Build a VGG-style network with batch normalization and dropout on top of
+    `input` and return the output of its last fully-connected layer.
+    Five img_conv_group blocks (64, 128, 256, 512, 512 filters) are followed
+    by dropout, fc(512), batch_norm with relu, dropout and fc(512).
+    """
+    def conv_block(input, num_filter, groups, dropouts):
+        # `groups` 3x3 relu conv layers with batch norm, each with `num_filter`
+        # filters and the given per-layer dropout rates, then 2x2 max pooling
+        # with stride 2.
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    # The activation is applied by batch_norm rather than by the fc layer.
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+# Driver script: builds the network selected by sys.argv[1] ("vgg" by default,
+# or "resnet"), transpiles it for distributed training and then runs either as
+# a parameter server or as a trainer depending on the TRAINING_ROLE env var.
+classdim = 10
+data_shape = [3, 32, 32]
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
+predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.cifar.train10(), buf_size=128 * 10),
+    batch_size=BATCH_SIZE)
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for splitting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(
+    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS)
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    print("start pserver at:", current_endpoint)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    # Run the pserver-specific startup program before the pserver program.
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+    print("pserver run end")
+elif training_role == "TRAINER":
+    print("start trainer")
+    trainer_prog = t.get_trainer_program()
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+    exe.run(fluid.default_startup_program())
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_reader():
+            loss, acc = exe.run(trainer_prog,
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+                pass_acc))
+            # this model is slow, so if we can train two mini batch, we think it works properly.
+    print("trainer run end")
+else:
+    # Only an unknown role is an error; a finished pserver or trainer run
+    # falls through and exits with status 0.
+    print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
+    exit(1)
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
@@ -197,8 +197,9 @@ def main():
        if not current_endpoint:
            print("need env SERVER_ENDPOINT")
            exit(1)
-        pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+        pserver_prog = t.get_pserver_program(current_endpoint)
-        exe.run(fluid.default_startup_program())
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()

--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
@@ -87,8 +87,9 @@ if training_role == "PSERVER":
    if not current_endpoint:
        print("need env SERVER_ENDPOINT")
        exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+    pserver_prog = t.get_pserver_program(current_endpoint)
-    exe.run(fluid.default_startup_program())
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
    exe.run(pserver_prog)
 elif training_role == "TRAINER":
    feeder = fluid.DataFeeder(

--- a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
@@ -52,26 +52,27 @@ train_reader = paddle.batch(
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
-t = fluid.DistributeTranspiler()
+pserver_endpoints = os.getenv("PSERVERS")  # all pserver endpoints
-# all parameter server endpoints list for spliting parameters
+trainers = int(os.getenv("TRAINERS"))  # total trainer count
-pserver_endpoints = os.getenv("PSERVERS")
+current_endpoint = os.getenv("SERVER_ENDPOINT")  # current pserver endpoint
-# server endpoint for current node
-current_endpoint = os.getenv("SERVER_ENDPOINT")
-# run as trainer or parameter server
 training_role = os.getenv("TRAINING_ROLE",
                          "TRAINER")  # get the training role: trainer/pserver
-t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=trainers)
 if training_role == "PSERVER":
    if not current_endpoint:
        print("need env SERVER_ENDPOINT")
        exit(1)
-    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+    pserver_prog = t.get_pserver_program(current_endpoint)
-    exe.run(fluid.default_startup_program())
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
    exe.run(pserver_prog)
 elif training_role == "TRAINER":
    trainer_prog = t.get_trainer_program()
    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
    exe.run(fluid.default_startup_program())
    for pass_id in range(PASS_NUM):

--- a/adversarial/fluid_mnist.py
+++ b/adversarial/fluid_mnist.py
@@ -11,89 +11,78 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-"""
+from __future__ import print_function
-CNN on mnist data using fluid api of paddlepaddle
+import numpy as np
-"""
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import os
+BATCH_SIZE = 128
+PASS_NUM = 100
-def mnist_cnn_model(img):
+images = fluid.layers.data(name='x', shape=[784], dtype='float32')
-    """
-    Mnist cnn model
-    Args:
+# TODO(aroraabhinav) Add regularization and error clipping after
-        img(Varaible): the input image to be recognized
+# Issue 7432(https://github.com/PaddlePaddle/Paddle/issues/7432) is resolved.
+hidden1 = fluid.layers.fc(input=images, size=128, act='relu')
+hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-    Returns:
+label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-        Variable: the label prediction
-    """
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=img,
-        num_filters=20,
-        filter_size=5,
-        pool_size=2,
-        pool_stride=2,
-        act='relu')
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+cost = fluid.layers.cross_entropy(input=predict, label=label)
-        input=conv_pool_1,
+avg_cost = fluid.layers.mean(x=cost)
-        num_filters=50,
-        filter_size=5,
-        pool_size=2,
-        pool_stride=2,
-        act='relu')
-    logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
-    return logits
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-def main():
+train_reader = paddle.batch(
-    """
+    paddle.reader.shuffle(
-    Train the cnn model on mnist datasets
+        paddle.dataset.mnist.train(), buf_size=8192),
-    """
+    batch_size=BATCH_SIZE)
-    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    logits = mnist_cnn_model(img)
-    cost = fluid.layers.cross_entropy(input=logits, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    optimizer = fluid.optimizer.Adam(learning_rate=0.01)
-    optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=logits, label=label)
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
-    BATCH_SIZE = 50
+t = fluid.DistributeTranspiler()
-    PASS_NUM = 3
+# list of all parameter server endpoints, used for splitting parameters
-    ACC_THRESHOLD = 0.98
+pserver_endpoints = os.getenv("PSERVERS")
-    LOSS_THRESHOLD = 10.0
+# server endpoint for current node
-    train_reader = paddle.batch(
+current_endpoint = os.getenv("SERVER_ENDPOINT")
-        paddle.reader.shuffle(
+# run as trainer or parameter server
-            paddle.dataset.mnist.train(), buf_size=500),
+training_role = os.getenv("TRAINING_ROLE",
-        batch_size=BATCH_SIZE)
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
-    place = fluid.CPUPlace()
+if training_role == "PSERVER":
-    exe = fluid.Executor(place)
+    if not current_endpoint:
-    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":
+    trainer_prog = t.get_trainer_program()
+    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
    exe.run(fluid.default_startup_program())
    for pass_id in range(PASS_NUM):
        accuracy.reset(exe)
+        batch_id = 0
        for data in train_reader():
-            loss, acc = exe.run(fluid.default_main_program(),
+            loss, acc = exe.run(trainer_prog,
                                feed=feeder.feed(data),
                                fetch_list=[avg_cost] + accuracy.metrics)
            pass_acc = accuracy.eval(exe)
-            print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc="
+            if batch_id % 100 == 0:
-                  + str(pass_acc))
+                print("batch_id %d, loss: %f, acc: %f" %
-            if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD:
+                      (batch_id, loss, pass_acc))
-                break
+            batch_id += 1
        pass_acc = accuracy.eval(exe)
        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
-    fluid.io.save_params(
+else:
-        exe, dirname='./mnist', main_program=fluid.default_main_program())
+    print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-    print('train mnist done')
-if __name__ == '__main__':
-    main()
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
@@ -92,15 +92,16 @@ def main():
    t.transpile(
        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
-    exe.run(fluid.default_startup_program())
    if training_role == "PSERVER":
        if not current_endpoint:
            print("need env SERVER_ENDPOINT")
            exit(1)
-        pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
+        exe.run(fluid.default_startup_program())
        trainer_prog = t.get_trainer_program()
        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)

--- a/python/paddle/v2/fluid/tests/test_ctc_align.py
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import sys
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+def CTCAlign(input, lod, blank, merge_repeated):
+    """Reference implementation that builds the expected ``ctc_align`` output.
+
+    Walks every sequence delimited by consecutive offsets in ``lod[0]`` and
+    keeps the tokens that differ from ``blank``. When ``merge_repeated`` is
+    true, a token equal to the token immediately before it in the same
+    sequence is dropped as well.
+
+    Args:
+        input: 2-D indexable of tokens; only column 0 of each row is read.
+        lod: nested offset list; only ``lod[0]`` is used.
+        blank: token value to discard.
+        merge_repeated: whether consecutive duplicate tokens are collapsed.
+
+    Returns:
+        An int32 numpy array of shape ``[num_kept, 1]``.
+    """
+    offsets = lod[0]
+    kept = []
+    for begin, end in zip(offsets[:-1], offsets[1:]):
+        # The "previous token" memory is reset at every sequence boundary.
+        previous = -1
+        for row in range(begin, end):
+            current = input[row][0]
+            repeated = merge_repeated and current == previous
+            if current != blank and not repeated:
+                kept.append(current)
+            previous = current
+    return np.array(kept).reshape([len(kept), 1]).astype("int32")
+class TestCTCAlignOp(OpTest):
+    """Checks the ``ctc_align`` op against CTCAlign with repeats kept.
+
+    Subclasses override ``config`` to supply a different fixture; ``setUp``
+    derives the op inputs, attributes and expected output from it.
+    """
+    def config(self):
+        # Two sequences: rows 0-10 and rows 11-17 of the token column.
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 11, 18]]
+        self.blank = 0
+        self.merge_repeated = False
+        tokens = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]
+        self.input = np.array(tokens).reshape([18, 1]).astype("int32")
+    def setUp(self):
+        self.config()
+        # The expected output comes from the pure-Python reference.
+        expected = CTCAlign(self.input, self.input_lod, self.blank,
+                            self.merge_repeated)
+        self.inputs = {"Input": (self.input, self.input_lod)}
+        self.outputs = {"Output": expected}
+        self.attrs = {
+            "blank": self.blank,
+            "merge_repeated": self.merge_repeated,
+        }
+    def test_check_output(self):
+        self.check_output()
+class TestCTCAlignOpCase1(TestCTCAlignOp):
+    """Variant of TestCTCAlignOp with ``merge_repeated`` enabled.
+
+    Only ``config`` is overridden; ``setUp`` and ``test_check_output`` are
+    inherited from TestCTCAlignOp.
+    """
+    def config(self):
+        self.op_type = "ctc_align"
+        # The second sequence covers rows 11-18: the base fixture plus one
+        # extra trailing 0 token.
+        self.input_lod = [[0, 11, 19]]
+        self.blank = 0
+        self.merge_repeated = True
+        self.input = np.array(
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0, 0]).reshape(
+                [19, 1]).astype("int32")
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_edit_distance_op.py
+++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
@@ -51,8 +51,8 @@ class TestEditDistanceOp(OpTest):
    def setUp(self):
        self.op_type = "edit_distance"
        normalized = False
-        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int32")
+        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int32")
+        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64")
        x1 = np.transpose(x1)
        x2 = np.transpose(x2)
        x1_lod = [0, 1, 5]
@@ -79,8 +79,8 @@ class TestEditDistanceOpNormalized(OpTest):
    def setUp(self):
        self.op_type = "edit_distance"
        normalized = True
-        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int32")
+        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int32")
+        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64")
        x1 = np.transpose(x1)
        x2 = np.transpose(x2)
        x1_lod = [0, 1, 3, 6]

--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -216,6 +216,14 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(x)
        print(str(program))
+    def test_sequence_reshape(self):
+        # Builds a fresh Program containing a sequence_reshape layer over a
+        # lod_level=1 float32 input of width 8, asking for new_dim=16, and
+        # checks that the layer call returns a non-None output variable.
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8], dtype='float32', lod_level=1)
+            out = layers.sequence_reshape(input=x, new_dim=16)
+            self.assertIsNotNone(out)
+        # Dump the generated program description for manual inspection.
+        print(str(program))
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
@@ -15,9 +15,6 @@ import unittest
 import paddle.v2.fluid as fluid
 import numpy
-import sys
-# TODO(dzhwinter): get places op check need to be enhanced.
-sys.exit(0)
 class BaseParallelForTest(unittest.TestCase):
@@ -165,13 +162,13 @@ class ParallelOpTest(BaseParallelForTest):
            feed={
                'img': numpy.random.random(size=(51, 784)).astype('float32')
            },
-            fetch='fc1.w@GRAD')
+            fetch=['fc1.w@GRAD'])
    def test_fc_with_tiny_data(self):
        self.run_test(
            callback=ParallelOpTest.__network__,
            feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
-            fetch='fc1.w@GRAD')
+            fetch=['fc1.w@GRAD'])
 if __name__ == '__main__':

--- a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
@@ -29,7 +29,7 @@ def sequence_erase(in_seq, lod0, tokens):
    return np.array(out_seq).astype("int32"), new_lod0
-class TestSequenceEraseOp(OpTest):
+class TestSequenceEraseOpInt32(OpTest):
    def setUp(self):
        self.op_type = "sequence_erase"
        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
@@ -44,5 +44,35 @@ class TestSequenceEraseOp(OpTest):
        self.check_output()
+class TestSequenceEraseOpInt64(OpTest):
+    """Runs the ``sequence_erase`` op on an int64 input.
+
+    The input is 30 random tokens in [0, 10) split into four sequences by
+    ``lod``; tokens 2, 3 and 5 are requested to be erased.
+    """
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
+        lod = [[0, 9, 13, 24, 30]]
+        tokens = [2, 3, 5]
+        # NOTE(review): the sequence_erase reference helper in this file
+        # casts its result to int32, so the expected output dtype differs
+        # from the int64 input -- confirm check_output tolerates this.
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+    def test_check_output(self):
+        self.check_output()
+class TestSequenceEraseOpEmpty(OpTest):
+    """Runs the ``sequence_erase`` op with an empty ``tokens`` attribute.
+
+    Uses the same int32 input layout as the other cases in this file (30
+    random tokens in four sequences) but passes no tokens to erase.
+    """
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        lod = [[0, 9, 13, 24, 30]]
+        # Empty erase list: the expected output is whatever the reference
+        # helper produces when it is given nothing to remove.
+        tokens = []
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+    def test_check_output(self):
+        self.check_output()
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_sequence_reshape.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_reshape.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+class TestSequenceReshape(OpTest):
+    """Checks ``sequence_reshape`` when rows are narrowed from 24 to 12.
+
+    ``setUp`` builds a random [11, 24] float32 input with four sequences and
+    uses ``compute_output`` to derive the expected tensor and LoD. Subclasses
+    reuse ``compute_output`` and the two test methods with other fixtures.
+    """
+    def setUp(self):
+        self.op_type = 'sequence_reshape'
+        dimension = 12
+        x_lod = [[0, 4, 5, 8, 11]]
+        x = np.random.uniform(0.1, 1, [11, 24]).astype('float32')
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'new_dim': dimension}
+        out, out_lod = self.compute_output(x, x_lod, dimension)
+        self.outputs = {'Out': (out, out_lod)}
+    def compute_output(self, x, x_lod, dimension):
+        """Build the expected output tensor and LoD for width ``dimension``.
+
+        Args:
+            x: 2-D numpy array holding the input rows.
+            x_lod: nested offset list; only ``x_lod[0]`` is read.
+            dimension: row width of the reshaped output.
+
+        Returns:
+            ``(out, out_lod)``: a float32 array of shape
+            ``[out_lod[0][-1], dimension]`` filled with the flattened
+            contents of ``x``, and the matching single-level LoD.
+
+        Raises:
+            AssertionError: if a sequence's element count is not a multiple
+                of ``dimension``.
+        """
+        x_width = x.shape[1]
+        out_lod = [[0]]
+        # ``range`` and ``//`` give the same result on Python 2 and 3;
+        # the former ``xrange`` does not exist on Python 3.
+        for i in range(len(x_lod[0]) - 1):
+            seq_len = x_lod[0][i + 1] - x_lod[0][i]
+            numel = seq_len * x_width
+            offset = numel // dimension
+            # Each sequence must split evenly into rows of the new width.
+            assert offset * dimension == numel
+            out_lod[0].append(out_lod[0][-1] + offset)
+        out = np.zeros(shape=(out_lod[0][-1], dimension)).astype('float32')
+        # Only the row boundaries change; element order is preserved.
+        out.ravel()[:] = x.ravel()[:]
+        return out, out_lod
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+class TestSequenceReshape_reduce(TestSequenceReshape):
+    """Case where ``new_dim`` (24) is wider than the input rows (12).
+
+    Only ``setUp`` is overridden; the expected output is still produced by
+    the inherited ``compute_output`` and the inherited tests run against it.
+    """
+    def setUp(self):
+        self.op_type = 'sequence_reshape'
+        dimension = 24
+        # Sequence lengths 4, 2, 2, 4: each holds a multiple of 24 elements.
+        x_lod = [[0, 4, 6, 8, 12]]
+        x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'new_dim': dimension}
+        out, out_lod = self.compute_output(x, x_lod, dimension)
+        self.outputs = {'Out': (out, out_lod)}
+class TestSequenceReshape_same(TestSequenceReshape):
+    """Case where ``new_dim`` equals the input row width (both 12).
+
+    Only ``setUp`` is overridden; the expected output is still produced by
+    the inherited ``compute_output`` and the inherited tests run against it.
+    """
+    def setUp(self):
+        self.op_type = 'sequence_reshape'
+        # Same width as the input rows, so the row count is unchanged.
+        dimension = 12
+        x_lod = [[0, 4, 6, 8, 12]]
+        x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'new_dim': dimension}
+        out, out_lod = self.compute_output(x, x_lod, dimension)
+        self.outputs = {'Out': (out, out_lod)}
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import paddle.v2.fluid.core as core
+import numpy as np
+from paddle.v2.fluid.op import Operator
+class TestSpliteSelectedRows(unittest.TestCase):
+    """Exercises the ``split_selected_rows`` operator on each available place.
+
+    The forward check splits one SelectedRows variable into two outputs; the
+    "grad" check runs a ``sum`` operator over two gradient SelectedRows and
+    inspects the combined result.
+    """
+    def get_places(self):
+        """Return CPUPlace, plus CUDAPlace(0) when core reports a GPU build."""
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+        return places
+    def test_check_output(self):
+        # Run the forward check once per place.
+        for place in self.get_places():
+            self.check_with_place(place)
+    def test_check_grad(self):
+        # Run the gradient check once per place.
+        for place in self.get_places():
+            self.check_grad_with_place(place)
+    def check_with_place(self, place):
+        """Split a 4-row SelectedRows into two 2-row outputs and verify them.
+
+        Args:
+            place: device place the tensors are set on and the op is run on.
+        """
+        scope = core.Scope()
+        rows = [0, 5, 7, 4]
+        height = 10
+        row_numel = 2
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(rows)
+        x.set_height(height)
+        # All ones except two marker values, used below to check which
+        # output each input row lands in.
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 1] = 4.0
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+        rows_sections = [2, 2]
+        height_sections = []
+        # initialize output variables [out0, out1]
+        out0 = scope.var('out0').get_selected_rows()
+        out1 = scope.var('out1').get_selected_rows()
+        # expected output selected rows
+        expected_out0_rows = [0, 5]
+        expected_out1_rows = [7, 4]
+        expected_height = height
+        op = Operator(
+            "split_selected_rows",
+            X="X",
+            Out=["out0", "out1"],
+            rows_sections=rows_sections,
+            height_sections=height_sections)
+        op.run(scope, place)
+        self.assertEqual(out0.rows(), expected_out0_rows)
+        self.assertEqual(out1.rows(), expected_out1_rows)
+        self.assertEqual(out0.height(), expected_height)
+        self.assertEqual(out1.height(), expected_height)
+        # Marker from input row 0 is expected in out0 row 0; marker from
+        # input row 2 is expected in out1 row 0.
+        self.assertAlmostEqual(2.0, np.array(out0.get_tensor())[0, 0])
+        self.assertAlmostEqual(4.0, np.array(out1.get_tensor())[0, 1])
+    def check_grad_with_place(self, place):
+        """Sum two gradient SelectedRows into ``X@GRAD`` and verify the result.
+
+        Args:
+            place: device place the tensors are set on and the op is run on.
+        """
+        scope = core.Scope()
+        height = 10
+        row_numel = 2
+        # attr
+        rows_sections = [2, 2]
+        height_sections = []
+        # initialize the upstream gradient out0@GRAD
+        out0_grad = scope.var("out0@GRAD").get_selected_rows()
+        rows0 = [0, 5]
+        out0_grad.set_rows(rows0)
+        out0_grad.set_height(height)
+        out0_grad_tensor = out0_grad.get_tensor()
+        np_array = np.ones((len(rows0), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        out0_grad_tensor.set(np_array, place)
+        # initialize the upstream gradient out1@GRAD
+        out1_grad = scope.var("out1@GRAD").get_selected_rows()
+        # NOTE(review): the forward check uses rows [7, 4] for out1, while
+        # this uses [7, 5] -- confirm the difference is intentional.
+        rows1 = [7, 5]
+        out1_grad.set_rows(rows1)
+        out1_grad.set_height(height)
+        out1_grad_tensor = out1_grad.get_tensor()
+        np_array = np.ones((len(rows1), row_numel)).astype("float32")
+        np_array[0, 1] = 4.0
+        out1_grad_tensor.set(np_array, place)
+        x_grad = scope.var("X@GRAD").get_selected_rows()
+        # NOTE(review): the split attrs are passed to the "sum" operator
+        # here; whether that operator reads them is not visible in this file.
+        grad_op = Operator(
+            "sum",
+            X=["out0@GRAD", "out1@GRAD"],
+            Out="X@GRAD",
+            rows_sections=rows_sections,
+            height_sections=height_sections)
+        grad_op.run(scope, place)
+        # The result is expected to hold the rows of both inputs in order,
+        # so out1@GRAD's first row appears at index 2.
+        self.assertEqual(x_grad.rows(), rows0 + rows1)
+        self.assertEqual(x_grad.height(), height)
+        self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0])
+        self.assertAlmostEqual(4.0, np.array(x_grad.get_tensor())[2, 1])
+if __name__ == "__main__":
+    unittest.main()