Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dist_train_benchmark_vgg16

08b529a7 · typhoonzero · b38452df · 336e8db1 · 08b529a7 · 08b529a7
430 changed files
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -49,10 +49,15 @@ def generate_copyright(template, lang='C'):
        LANG_COMMENT_MARK = "//"
    lines = template.split(NEW_LINE_MARK)
-    ans = LANG_COMMENT_MARK + " " + COPYRIGHT_HEADER + NEW_LINE_MARK
+    BLANK = " "
+    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
    for lino, line in enumerate(lines):
        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
-        ans += LANG_COMMENT_MARK + " " + line + NEW_LINE_MARK
+        if len(line)  == 0:
+            BLANK = ""
+        else:
+            BLANK = " "
+        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
    return ans + "\n"
@@ -62,6 +67,8 @@ def lang_type(filename):
        return "Python"
    elif filename.endswith(".h"):
        return "C"
+    elif filename.endswith(".c"):
+        return "C"
    elif filename.endswith(".hpp"):
        return "C"
    elif filename.endswith(".cc"):
@@ -77,10 +84,13 @@ def lang_type(filename):
    elif filename.endswith(".proto"):
        return "C"
    else:
-        print("Unsupported filetype")
+        print("Unsupported filetype %s", filename)
        exit(0)
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
 def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
@@ -89,9 +99,14 @@ def main(argv=None):
    retv = 0
    for filename in args.filenames:
-        first_line = io.open(filename).readline()
+        fd = io.open(filename, encoding="utf-8")
-        if "COPYRIGHT" in first_line.upper() : continue
+        first_line = fd.readline()
-        original_contents = io.open(filename).read()
+        second_line = fd.readline()
+        if "COPYRIGHT (C)" in first_line.upper(): continue
+        if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
+            continue
+        original_contents = io.open(filename, encoding="utf-8").read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))

--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
-#!/usr/bin/env python
 from paddle.trainer_config_helpers import *

--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/env python
 from paddle.trainer_config_helpers import *

--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import io, os
 import random
 import numpy as np

--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/env python
 from paddle.trainer_config_helpers import *

--- a/benchmark/paddle/image/smallnet_mnist_cifar.py
+++ b/benchmark/paddle/image/smallnet_mnist_cifar.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/env python
 from paddle.trainer_config_helpers import *

--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/env python
 from paddle.trainer_config_helpers import *

--- a/benchmark/paddle/rnn/imdb.py
+++ b/benchmark/paddle/rnn/imdb.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from __future__ import print_function
 import six.moves.cPickle as pickle
 import gzip

--- a/benchmark/paddle/rnn/provider.py
+++ b/benchmark/paddle/rnn/provider.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import io, os
 import random
 import numpy as np

--- a/benchmark/paddle/rnn/rnn.py
+++ b/benchmark/paddle/rnn/rnn.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/env python
 from paddle.trainer_config_helpers import *

--- a/benchmark/tensorflow/image/alexnet.py
+++ b/benchmark/tensorflow/image/alexnet.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math

--- a/benchmark/tensorflow/image/alexnet_multi_gpu.py
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math

--- a/benchmark/tensorflow/image/googlenet.py
+++ b/benchmark/tensorflow/image/googlenet.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from six.moves import xrange
 from datetime import datetime
 import math

--- a/benchmark/tensorflow/image/googlenet_multi_gpu.py
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math

--- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math

--- a/benchmark/tensorflow/rnn/reader.py
+++ b/benchmark/tensorflow/rnn/reader.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import os.path
 import io
 import numpy as np

--- a/benchmark/tensorflow/rnn/rnn.py
+++ b/benchmark/tensorflow/rnn/rnn.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/env python
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import math

--- a/benchmark/tensorflow/rnn/rnn_multi_gpu.py
+++ b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/env python
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import re

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -100,6 +100,11 @@ IF(NOT ${CBLAS_FOUND})
                \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
            )"
        )
+        INSTALL(CODE "execute_process(
+            COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
+                    ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
+            )"
+        )
    ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})

--- a/cmake/make_resource.py
+++ b/cmake/make_resource.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import os
 import re
 import sys

--- a/doc/api/v1/data_provider/src/mnist_config.py
+++ b/doc/api/v1/data_provider/src/mnist_config.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.trainer_config_helpers import *
 define_py_data_sources2(

--- a/doc/api/v1/data_provider/src/mnist_provider.dict.py
+++ b/doc/api/v1/data_provider/src/mnist_provider.dict.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.trainer.PyDataProvider2 import *

--- a/doc/api/v1/data_provider/src/sentimental_config.py
+++ b/doc/api/v1/data_provider/src/sentimental_config.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.trainer_config_helpers import *
 dictionary = dict()

--- a/doc/api/v1/data_provider/src/sentimental_provider.py
+++ b/doc/api/v1/data_provider/src/sentimental_provider.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.trainer.PyDataProvider2 import *

--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -504,3 +504,8 @@ l2_normalize
 ------------
 ..  autofunction:: paddle.v2.fluid.layers.l2_normalize
    :noindex:
+sequence_reshape
+----------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+    :noindex:
--- a/doc/design/fluid.md
+++ b/doc/design/fluid.md
@@ -105,18 +105,10 @@ There are two ways to execute a Fluid program.  When a program is executed, it c
 There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
-Fluid is moving towards the direction of a compiler, which is explain in more detail later in this article.
+Fluid is moving towards the direction of a compiler, which is explained in [fluid_compiler.md](fluid_compiler.md).
 ## Backward Compatibility of Fluid
 Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
 For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
-## Towards a Deep Learning Language and the Compiler
-We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
-Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler, which translates each invocation to an operator, into a C++ call to a kernel function of that operator. For example, a transpiler that weaves the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`.  Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel.  More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server.  For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer.  The following figure explains the proposed two-stage process:
-![](fluid-compiler.png)
--- a/doc/design/fluid_compiler.md
+++ b/doc/design/fluid_compiler.md
+# PaddlePaddle Fluid: Towards a Compiled Programming Language
+As described in [fluid.md](fluid.md), when a Fluid application program
+runs, it generates a `ProgramDesc` protobuf message as an intermediate
+representation of itself.  The C++ class `Executor` can run this
+protobuf message as an interpreter.  This article describes the Fluid
+compiler.
+![](fluid-compiler.png)
+## ProgramDesc
+Before we go deeper into the idea of compiled language, let us take a
+look at a simple example Fluid application.
+```python
+import "fluid"
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+This program consists of a [block](block.md) of three operators --
+`read`, `assign`, and `mult`.  Its `ProgramDesc` message looks like
+the following
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+## Transpilers
+We can write a transpiler program that takes a `ProgramDesc`, e.g.,
+the above one, and outputs another `ProgramDesc`.  Let us take some
+examples:
+1. *Memory optimization transpiler*: We can write a transpiler that
+   inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so
+   as to free memory early, before the end of an iteration, and thus keep a
+   small memory footprint.
+1. *Distributed training transpiler*: We can write a transpiler that
+   converts a `ProgramDesc` into its distributed version of two
+   `ProgramDesc`s -- one for running by the trainer processes and the
+   other for the parameter server.
+In the rest of this article, we talk about a special kind of
+transpiler, *Native code generator*, which takes a `ProgramDesc` and
+generates a `.cu` (or `.cc`) file, which could be built by C++
+compilers (gcc, nvcc, icc) into binaries.
+## Native Code Generator
+For the above example, the native code generator transpiler, say, the
+CUDA code generator, should generate a `main` function:
+```c++
+void main() {
+  auto X = fluid_cuda_read(...);
+  auto W = fluid_cuda_create_tensor(...);
+  auto Y = fluid_cuda_mult(X, W);
+}
+```
+and the definitions of functions `fluid_cuda_read`,
+`fluid_cuda_create_tensor`, and `fluid_cuda_mult`.  Please be aware
+that each function could just define a C++ instance of an operator and
+run it.  For example
+```c++
+paddle::Tensor fluid_cuda_read(...) {
+  paddle::Tensor t;
+  paddle::operator::Read r(&t, ...);
+  r.Run();
+  return t;
+}
+```
+For computational operators that have multiple *kernels*, each for a
+specific hardware platform, for example, the `mult` operator, the
+generated code should call its CUDA kernel:
+```c++
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, 
+                               const paddle::Tensor& b) {
+  paddle::Tensor t;
+  paddle::operator::Mult m(a, b, ...);
+  Mult.Run(cuda_context);
+}
+```
+where `cuda_context` could be a global variable of type
+`paddle::CUDADeviceContext`.
+## Multi-Block Code Generation
+Most Fluid application programs may have more than one block.  To
+execute them, we need to trace [scopes](scope.md).
--- a/doc/faq/local/src/reduce_min_pool_size.py
+++ b/doc/faq/local/src/reduce_min_pool_size.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 @provider(min_pool_size=0, ...)
 def process(settings, filename):
    os.system('shuf %s > %s.shuf' % (filename, filename))  # shuffle before.

--- a/doc/faq/local/src/word2vec_config.py
+++ b/doc/faq/local/src/word2vec_config.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 ...  # the settings and define data provider is omitted.
 DICT_DIM = 3000  # dictionary dimension.
 word_ids = data_layer('word_ids', size=DICT_DIM)

--- a/doc/faq/local/src/word2vec_dataprovider.py
+++ b/doc/faq/local/src/word2vec_dataprovider.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 DICT_DIM = 3000

--- a/doc/faq/model/index_cn.rst
+++ b/doc/faq/model/index_cn.rst
@@ -67,3 +67,14 @@
  * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程，它并不是一个完整的recurrent layer，也不能接收序列数据作为输入；
  * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用；
+5. PaddlePaddle的softmax能否指定计算的维度
+-----------------------------------------
+PaddlePaddle的softmax不能指定计算维度，只能按行计算。
+在图像任务中，对于NCHW，如果需要在C维度计算softmax，可以先使用 :code:`paddle.layer.switch_order` 改变维度顺序，即将NCHW转换成NHWC，再做一定的reshape，最后计算softmax。
+6. PaddlePaddle是否支持维数可变的数据输入
+------------------------------------------
+PaddlePaddle提供的 :code:`paddle.data_type.dense_array` 支持维数可变的数据输入。在使用时，将对应数据层的维数设置成一个大于输入数据维数的值用于占位即可。
--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import paddle.v2 as paddle
 import numpy as np

--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import paddle.v2 as paddle
 import numpy as np

--- a/doc/howto/dev/new_op_kernel_en.md
+++ b/doc/howto/dev/new_op_kernel_en.md
+## Add Kernels for a New Device
+### Background
+PaddlePaddle Fluid has hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
+### Write Kernels for A New Device 
+#### Add A New Device
+  For some historical reasons, we misuse the word *library* for *device*.  For example, we call the device type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
+To register a new device, we need to add an enum value to `LibraryType`:
+```
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+```
+#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
+```cpp
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+  int device;
+};
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+#### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
+```cpp
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+  virtual void Wait() const {}
+};
+```
+#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+Detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md).
+```cpp
+class OpKernelBase {
+ public:
+  /**
+   * ExecutionContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * ExecutionContext. User should construct it before run the Operator.
+   */
+  virtual void Compute(const ExecutionContext& context) const = 0;
+  virtual ~OpKernelBase() = default;
+};
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
+};
+```
+#### Register the OpKernel to framework
+After writing the components described above, we should register the kernel to the framework.
+We use `REGISTER_OP_KERNEL` to do the registration.
+```cpp
+REGISTER_OP_KERNEL(
+	op_type,
+	library_type,
+	place_type,
+	kernel0, kernel1, ...)
+```
+kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
+Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318) as an example:
+	```cpp
+	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
+    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+	REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+	       paddle::operators::CUDNNConvOpKernel<float>,
+	       paddle::operators::CUDNNConvOpKernel<double>);
+	```
+In the code above:
+ - `conv2d` is the type/name of the operator
+ - `CUDNN/CPU` is `library`
+ - `paddle::platform::CUDAPlace/CPUPlace` is `place`
+ - template parameter `float/double` on `CUDNNConvOpKernel<T>` is `data_type`.
--- a/doc/howto/usage/cluster/src/k8s_train/start_paddle.py
+++ b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
 #!/usr/bin/python
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #

--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import gzip
 import math

--- a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import math
 import os
 import paddle.v2 as paddle

--- a/doc/howto/usage/cluster/src/word2vec/prepare.py
+++ b/doc/howto/usage/cluster/src/word2vec/prepare.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.v2 as paddle
 import tarfile
 import os

--- a/go/pserver/client/c/test/test_mnist.py
+++ b/go/pserver/client/c/test/test_mnist.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import paddle.v2 as paddle
 import gzip

--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
 import paddle.v2.master as master

--- a/paddle/api/test/testTrainConfig.py
+++ b/paddle/api/test/testTrainConfig.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.trainer_config_helpers import *
 settings(batch_size=100, learning_method=AdamOptimizer())

--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #include <paddle/capi.h>
 #include <time.h>

--- a/paddle/capi/examples/model_inference/dense/merge_v2_model.py
+++ b/paddle/capi/examples/model_inference/dense/merge_v2_model.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.utils.merge_model import merge_v2_model
 from mnist_v2 import network

--- a/paddle/capi/examples/model_inference/dense/mnist_v2.py
+++ b/paddle/capi/examples/model_inference/dense/mnist_v2.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 import os
 import sys
 import gzip

--- a/paddle/capi/examples/model_inference/dense/trainer_config.py
+++ b/paddle/capi/examples/model_inference/dense/trainer_config.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
-from paddle.trainer_config_helpers import *
-img = data_layer(name='pixel', size=784)
-hidden = fc_layer(
-    input=img,
-    size=200,
-    param_attr=ParamAttr(name='hidden.w'),
-    bias_attr=ParamAttr(name='hidden.b'))
-prob = fc_layer(
-    input=hidden,
-    size=10,
-    act=SoftmaxActivation(),
-    param_attr=ParamAttr(name='prob.w'),
-    bias_attr=ParamAttr(name='prob.b'))
-outputs(prob)
--- a/paddle/capi/examples/model_inference/multi_thread/main.c
+++ b/paddle/capi/examples/model_inference/multi_thread/main.c
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #include <paddle/capi.h>
 #include <pthread.h>
 #include <time.h>

--- a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #include <paddle/capi.h>
 #include <pthread.h>
 #include <time.h>

--- a/paddle/capi/examples/model_inference/sequence/main.c
+++ b/paddle/capi/examples/model_inference/sequence/main.c
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #include <paddle/capi.h>
 #include <time.h>
 #include "../common/common.h"

--- a/paddle/capi/examples/model_inference/sequence/trainer_config.py
+++ b/paddle/capi/examples/model_inference/sequence/trainer_config.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.trainer_config_helpers import *
 WORD_DIM = 3000

--- a/paddle/capi/examples/model_inference/sparse_binary/main.c
+++ b/paddle/capi/examples/model_inference/sparse_binary/main.c
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #include <paddle/capi.h>
 #include <time.h>

--- a/paddle/capi/tests/test_predict_network.py
+++ b/paddle/capi/tests/test_predict_network.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
+# Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
+# you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
+# Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
+# See the License for the specific language governing permissions and
-#limitations under the License.
+# limitations under the License.
 from paddle.trainer_config_helpers import *
 settings(batch_size=100)

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -33,8 +33,14 @@ cc_library(scope SRCS scope.cc DEPS glog threadpool)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
+nv_test(data_device_transform_test SRCS data_device_transform_test.cu
+        DEPS operator op_registry init math_function)
 cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
 cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
+cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
        framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
@@ -82,5 +88,3 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-nv_test(data_device_transform_test SRCS data_device_transform_test.cu
-        DEPS operator op_registry init math_function)
--- a/paddle/framework/data_device_transform.cc
+++ b/paddle/framework/data_device_transform.cc
@@ -31,7 +31,7 @@ static const platform::DeviceContext* GetDeviceContext(
  }
 }
-void DeviceTransform(const Tensor& in, const platform::Place& dst_place,
+void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
                     Tensor* out) {
  VLOG(3) << "DeviceTransform in, src_place " << in.place()
          << " dst_place: " << dst_place;

--- a/paddle/framework/data_device_transform.h
+++ b/paddle/framework/data_device_transform.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-void DeviceTransform(const Tensor& in, const platform::Place& dst_place,
+void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
                     Tensor* out);
 }  // namespace framework

--- a/paddle/framework/data_device_transform_test.cu
+++ b/paddle/framework/data_device_transform_test.cu
@@ -150,6 +150,7 @@ TEST(Operator, CPUtoGPU) {
  // get output
  auto* output2 = scope.Var("OUT2");
  gpu_op->Run(scope, cuda_place);
+  VLOG(3) << "after gpu_op run";
  // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
  DeviceContextPool& pool = DeviceContextPool::Instance();

--- a/paddle/framework/data_layout_transform.cc
+++ b/paddle/framework/data_layout_transform.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
-Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+// you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+// You may obtain a copy of the License at
+//
-    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
-Unless required by applicable law or agreed to in writing, software
+// Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
+// distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+// See the License for the specific language governing permissions and
-limitations under the License. */
+// limitations under the License.
 #include "paddle/framework/data_layout_transform.h"
-#include "paddle/framework/tensor.h"
 #include "paddle/operators/math/math_function.h"
 namespace paddle {
 namespace framework {
+// Returns the axis permutation that reorders the dimensions of a 4-D tensor
+// from layout `from` to layout `to`. Only NCHW <-> NHWC is handled: equal
+// layouts fail the PADDLE_ENFORCE_NE check, any other pair throws.
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
+  PADDLE_ENFORCE_NE(from, to,
+                    "layout transform should transform different layout");
+  if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) {
+    // Entry i names the source dimension placed at position i:
+    // (N, C, H, W) -> (N, H, W, C).
+    return {0, 2, 3, 1};
+  } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) {
+    // (N, H, W, C) -> (N, C, H, W).
+    return {0, 3, 1, 2};
+  } else {
+    PADDLE_THROW("unsupported transform");
+  }
+}
 struct CastDataLayout {
  CastDataLayout(const platform::DeviceContext* ctx,
                 const std::vector<int>& axis, const framework::Tensor& in,
@@ -44,38 +55,36 @@ struct CastDataLayout {
  }
 };
+// Transposes the 4-D tensor `in` from the layout recorded in
+// kernel_type_for_var.data_layout_ to the one in
+// expected_kernel_type.data_layout_ and writes the result into `out`.
+// Both kernel types must live on the same class of place; the element type
+// is taken from `in` itself, not from the kernel types.
-void TransDataLayout(const std::vector<int>& axis,
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
-                     const platform::DeviceContext* ctx,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
-                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Tensor* out) {
-                     Variable* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
  PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_pair.first.place_,
+      platform::places_are_same_class(kernel_type_for_var.place_,
-                                      kernel_pair.second.place_),
+                                      expected_kernel_type.place_),
      "TransDataLayout only support DataLayout transform on same place!");
-  PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
-                 "TransDataLayout only support Datatype are same!");
-  auto src = in.Get<Tensor>();
+  PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!");
-  auto* dst = out->GetMutable<Tensor>();
-  PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
+  auto& pool = platform::DeviceContextPool::Instance();
-  auto src_dim = src.dims();
+  auto src_dim = in.dims();
  std::vector<int64_t> dst_dim;
+  // The permutation is now derived from the two layouts instead of being
+  // passed in by the caller.
+  auto axis = GetAxis(kernel_type_for_var.data_layout_,
+                      expected_kernel_type.data_layout_);
  dst_dim.resize(axis.size());
  for (size_t i = 0; i < axis.size(); i++) {
    dst_dim[i] = src_dim[axis[i]];
  }
-  dst->Resize(make_ddim(dst_dim));
+  out->Resize(make_ddim(dst_dim));
-  auto place = kernel_pair.second.place_;
+  out->mutable_data(expected_kernel_type.place_, in.type());
-  dst->mutable_data(place, src.type());
-  auto src_type = kernel_pair.first.data_type_;
+  // Dispatch on the tensor's own element type and fetch the device context
+  // for the destination place from the global pool.
+  framework::VisitDataType(
-  framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
+      framework::ToDataType(in.type()),
+      CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out));
-  dst->set_layout(kernel_pair.second.data_layout_);
+  out->set_layout(expected_kernel_type.data_layout_);
 }
 }  // namespace framework

--- a/paddle/framework/data_layout_transform.h
+++ b/paddle/framework/data_layout_transform.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
-Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+// you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+// You may obtain a copy of the License at
+//
-    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
-Unless required by applicable law or agreed to in writing, software
+// Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
+// distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+// See the License for the specific language governing permissions and
-limitations under the License. */
+// limitations under the License.
 #pragma once
 #include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
 namespace paddle {
 namespace framework {
-using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
-void TransDataLayout(const std::vector<int>& axis,
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
-                     const platform::DeviceContext* ctx,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
-                     const KernelTypePair& kernel_pair, const Variable& in,
+                     Tensor* out);
-                     Variable* out);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/data_layout_transform_test.cc
+++ b/paddle/framework/data_layout_transform_test.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/framework/data_layout_transform.h"
+#include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+// Round-trips a 4-D CPU tensor NHWC -> NCHW -> NHWC through TransDataLayout
+// and checks the layout tag and dims of the produced tensor after each step.
+TEST(DataTransform, DataLayoutFunction) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  auto place = CPUPlace();
+  Tensor in = Tensor();
+  Tensor out = Tensor();
+  // The tensor holds doubles although the kernel types below say FP32;
+  // TransDataLayout dispatches on the tensor's own type, so this is accepted.
+  in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
+  in.set_layout(DataLayout::kNHWC);
+  auto kernel_nhwc = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNHWC, LibraryType::kPlain);
+  auto kernel_ncwh = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNCHW, LibraryType::kPlain);
+  // NHWC (2, 3, 1, 2) -> NCHW (2, 2, 3, 1).
+  TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
+  EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
+  EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
+  // Transform the NCHW result back and assert on the tensor that was
+  // actually written; asserting on the untouched input proves nothing.
+  Tensor restored = Tensor();
+  TransDataLayout(kernel_ncwh, kernel_nhwc, out, &restored);
+  EXPECT_TRUE(restored.layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(restored.dims() == make_ddim({2, 3, 1, 2}));
+}
\ No newline at end of file
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -15,18 +15,50 @@ limitations under the License. */
 #include "paddle/framework/data_transform.h"
 #include "paddle/framework/data_device_transform.h"
+#include "paddle/framework/data_layout_transform.h"
+#include "paddle/framework/data_type_transform.h"
 namespace paddle {
 namespace framework {
+// Hands the buffer of `from` over to `to`: `to` is made to share `from`'s
+// data, then `from` is reset to a default-constructed Tensor so it can be
+// reused as the output of the next transform step.
+static void PassTensorData(Tensor* from, Tensor* to) {
+  to->ShareDataWith(*from);
+  *from = Tensor();
+}
+// Converts input_tensor from the kernel type it was produced for
+// (kernel_type_for_var) to the one the consuming kernel expects
+// (expected_kernel_type). Layout, data type and device are handled in that
+// order, each step only when the two kernel types differ in that respect;
+// the final result is shared into output_tensor. Fails the PADDLE_ENFORCE
+// when none of the three steps applied.
 void DataTransform(const OpKernelType& expected_kernel_type,
                   const OpKernelType& kernel_type_for_var,
-                   const Tensor& input_tensor, Tensor* out) {
+                   const Tensor& input_tensor, Tensor* output_tensor) {
+  bool transformed = false;
+  // `in` and `out` are scratch tensors: after every step the result is moved
+  // from `out` back into `in` so the next step consumes it.
+  Tensor in;
+  in.ShareDataWith(input_tensor);
+  Tensor out;
+  // do layout transform
+  if (NeedTransformLayout(expected_kernel_type.data_layout_,
+                          kernel_type_for_var.data_layout_)) {
+    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
+  }
+  // do data type transform
+  if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
+    TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
+  }
+  // do device transform
  if (!platform::is_same_place(kernel_type_for_var.place_,
                               expected_kernel_type.place_)) {
-    DeviceTransform(input_tensor, expected_kernel_type.place_, out);
+    TransDataDevice(in, expected_kernel_type.place_, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
  }
-  PADDLE_ENFORCE_NOT_NULL(out, "out should not be null");
+  PADDLE_ENFORCE(transformed, "No transform is applied, please check!");
+  // get output data
+  output_tensor->ShareDataWith(in);
 }
 void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,

--- a/paddle/framework/data_type_transform.cc
+++ b/paddle/framework/data_type_transform.cc
@@ -38,14 +38,11 @@ struct CastDataType {
  template <typename OutType>
  void operator()() {
-    auto place = ctx_->GetPlace();
    auto* in_begin = in_.data<InType>();
-    auto numel = in_.numel();
+    auto* in_end = in_begin + in_.numel();
-    auto* in_end = in_begin + numel;
+    auto* out_begin = out_->mutable_data<OutType>(in_.place());
-    auto* out_begin = out_->mutable_data<OutType>(place);
-    if (platform::is_cpu_place(place)) {
+    if (platform::is_cpu_place(in_.place())) {
      platform::Transform<platform::CPUDeviceContext> trans;
      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
      trans(*context, in_begin, in_end, out_begin,
@@ -57,38 +54,31 @@ struct CastDataType {
  }
 };
-void TransDataType(const platform::DeviceContext* ctx,
+void TransDataType(const OpKernelType& kernel_type_for_var,
-                   const KernelTypePair& kernel_pair, const Variable& in,
+                   const OpKernelType& expected_kernel_type, const Tensor& in,
-                   Variable* out) {
+                   Tensor* out) {
-  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  PADDLE_ENFORCE(
-      platform::places_are_same_class(kernel_pair.first.place_,
-                                      kernel_pair.second.place_),
-      "TransDataType Only Support DataType transform on same place!");
-  auto src = in.Get<Tensor>();
-  auto* dst = out->GetMutable<Tensor>();
-  auto dims = src.dims();
+  out->Resize(in.dims());
-  dst->Resize(dims);
+  auto src_type = kernel_type_for_var.data_type_;
-  auto dst_type = kernel_pair.second.data_type_;
+  auto dst_type = expected_kernel_type.data_type_;
-  auto src_type = kernel_pair.first.data_type_;
+  auto ctx = pool.Get(in.place());
  switch (src_type) {
    case proto::DataType::FP32:
-      framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
+      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
      break;
    case proto::DataType::FP64:
-      framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
+      framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
      break;
    case proto::DataType::INT32:
-      framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
+      framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
      break;
    case proto::DataType::INT64:
-      framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
      break;
    case proto::DataType::BOOL:
-      framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
      break;
    default:
      PADDLE_THROW("Not support type %d", src_type);

--- a/paddle/framework/data_type_transform.h
+++ b/paddle/framework/data_type_transform.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/tensor.h"
 #include "paddle/framework/variable.h"
 #include "paddle/platform/device_context.h"
@@ -23,9 +24,9 @@ namespace framework {
 using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
-void TransDataType(const platform::DeviceContext* ctx,
+void TransDataType(const OpKernelType& kernel_type_for_var,
-                   const KernelTypePair& kernel_pair, const Variable& in,
+                   const OpKernelType& expected_kernel_type, const Tensor& in,
-                   Variable* out);
+                   Tensor* out);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/data_type_transform_test.cc
+++ b/paddle/framework/data_type_transform_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/data_type_transform.h"
+#include "gtest/gtest.h"
+// Casts a 2x3 float tensor on the CPU to double and to int with
+// TransDataType and checks every element of both results.
+TEST(DataTypeTransform, CPUTransform) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  auto place = CPUPlace();
+  Tensor in;
+  Tensor out;
+  float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+  int data_number = 2 * 3;
+  // i / 3 is integer division, so the input holds 0, 0, 0, 1, 1, 1.
+  for (int i = 0; i < data_number; ++i) {
+    ptr[i] = i / 3;
+  }
+  // The three kernel types differ only in their data type.
+  auto kernel_fp32 = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp64 = OpKernelType(proto::DataType::FP64, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int32 = OpKernelType(proto::DataType::INT32, place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+  // float -> double
+  TransDataType(kernel_fp32, kernel_fp64, in, &out);
+  double* out_data_double = out.data<double>();
+  for (int i = 0; i < data_number; ++i) {
+    ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+  }
+  // float -> int, writing into the same `out` tensor again
+  TransDataType(kernel_fp32, kernel_int32, in, &out);
+  int* out_data_int = out.data<int>();
+  for (int i = 0; i < data_number; ++i) {
+    ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
+  }
+}
--- a/paddle/framework/eigen_test.cc
+++ b/paddle/framework/eigen_test.cc
@@ -11,18 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
 #include "paddle/framework/eigen.h"
 #include <gtest/gtest.h>

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/platform/place.h"
+DECLARE_bool(do_memory_benchmark);
 DEFINE_bool(check_nan_inf, false,
            "Checking whether operator produce NAN/INF or not. It will be "
            "extremely slow so please use this flag wisely.");
@@ -117,6 +118,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    VLOG(3) << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
+    if (FLAGS_do_memory_benchmark) {
+      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+              << memory::memory_usage(place_);
+    }
    if (FLAGS_check_nan_inf) {
      for (auto& vname : op->OutputVars(true)) {
        auto* var = local_scope->FindVar(vname);
@@ -130,6 +135,12 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
  if (create_vars && create_local_scope) {
    scope->DeleteScope(local_scope);
  }
+  if (FLAGS_do_memory_benchmark) {
+    VLOG(2) << "-------------------------------------------------------";
+    VLOG(2) << "Memory used after deleting local scope: "
+            << memory::memory_usage(place_);
+    VLOG(2) << "-------------------------------------------------------";
+  }
 }
 }  // namespace framework

--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
-Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+// you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+// You may obtain a copy of the License at
+//
-    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
-Unless required by applicable law or agreed to in writing, software
+// Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
+// distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+// See the License for the specific language governing permissions and
-limitations under the License. */
+// limitations under the License.
 #include "paddle/framework/lod_tensor.h"

--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
 #include <cuda.h>
 #include <cuda_runtime.h>

--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
@@ -85,9 +85,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
  return stream.str();
 }
+// Returns true only when both layouts are concrete (neither is kAnyLayout)
+// and they differ; kAnyLayout on either side never requires a transform.
+inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
+  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+}
+// Returns true when the two kernel types differ in place class or data type,
+// or when NeedTransformLayout reports that their layouts need a transform.
 inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
  return (!platform::places_are_same_class(l.place_, r.place_)) ||
-         (l.data_type_ != r.data_type_) || (l.data_layout_ != r.data_layout_);
+         (l.data_type_ != r.data_type_) ||
+         NeedTransformLayout(l.data_layout_, r.data_layout_);
 }
 }  // namespace framework

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -485,9 +485,15 @@ void OperatorWithKernel::Run(const Scope& scope,
  // }
  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  auto kernel_iter = kernels.find(expected_kernel_key);
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("op %s does not have kernel for %s", type_,
+                 KernelTypeToString(expected_kernel_key));
+  }
+  // do data transform
  Scope& new_scope = scope.NewScope();
  for (auto& var_name_item : this->Inputs()) {
@@ -520,8 +526,6 @@ void OperatorWithKernel::Run(const Scope& scope,
    }
  }
-  auto kernel_iter = kernels.find(expected_kernel_key);
  auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
  kernel_iter->second->Compute(
      ExecutionContext(*this, new_scope, *new_dev_ctx));

--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -20,6 +20,10 @@ limitations under the License. */
 #include "paddle/framework/threadpool.h"
 #include "paddle/string/printf.h"
+DEFINE_bool(do_memory_benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs");
 namespace paddle {
 namespace framework {
@@ -88,8 +92,12 @@ void Scope::DeleteScope(Scope* scope) {
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
-  // Make delete async.
+  // When making memory benchmark on Fluid, we have to delete scope sync.
-  Async([scope] { delete scope; });
+  if (FLAGS_do_memory_benchmark) {
+    delete scope;
+  } else {
+    Async([scope] { delete scope; });
+  }
 }
 void Scope::Rename(const std::string& origin_name,

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -11,18 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
 #include "paddle/framework/tensor.h"
 #include <gtest/gtest.h>
@@ -60,9 +48,6 @@ TEST(Tensor, DataAssert) {
  ASSERT_TRUE(caught);
 }
-/* following tests are not available at present
-   because Memory::Alloc() and Memory::Free() have not been ready.
-*/
 TEST(Tensor, MutableData) {
  {
    framework::Tensor src_tensor;

--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
@@ -11,18 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
 #include "paddle/framework/tensor_util.h"
 #include <gtest/gtest.h>

--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -11,18 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
 #pragma once
 #include <memory>

--- a/paddle/framework/variable_test.cc
+++ b/paddle/framework/variable_test.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 /*
  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
  Licensed under the Apache License, Version 2.0 (the "License");

--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -65,14 +65,19 @@ bool PriorBoxLayer::init(const LayerMap& layerMap,
  std::copy(pbConf.aspect_ratio().begin(),
            pbConf.aspect_ratio().end(),
            std::back_inserter(tmp));
-  // flip
-  int inputRatioLength = tmp.size();
+  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
-  for (int index = 0; index < inputRatioLength; index++) {
-    aspectRatio_.push_back(tmp[index]);
+  // flip aspect ratios
-    aspectRatio_.push_back(1 / tmp[index]);
+  for (int index = 0; index < tmp.size(); index++) {
+    real ar = tmp[index];
+    if (fabs(ar - 1.) < 1e-6) continue;
+    aspectRatio_.push_back(ar);
+    aspectRatio_.push_back(1. / ar);
  }
-  numPriors_ = aspectRatio_.size();
-  if (maxSize_.size() > 0) numPriors_++;
+  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
  return true;
 }
@@ -99,50 +104,39 @@ void PriorBoxLayer::forward(PassType passType) {
    for (int w = 0; w < layerWidth; ++w) {
      real centerX = (w + 0.5) * stepW;
      real centerY = (h + 0.5) * stepH;
-      real minSize = 0;
      for (size_t s = 0; s < minSize_.size(); s++) {
-        // first prior.
+        real minSize = minSize_[s];
-        minSize = minSize_[s];
        real boxWidth = minSize;
        real boxHeight = minSize;
-        // xmin, ymin, xmax, ymax.
-        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+        // priors with different aspect ratios
-        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+        for (size_t r = 0; r < aspectRatio_.size(); r++) {
-        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          real ar = aspectRatio_[r];
-        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          boxWidth = minSize * sqrt(ar);
-        // set the variance.
+          boxHeight = minSize / sqrt(ar);
-        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          // set the variance.
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+        }
        if (maxSize_.size() > 0) {
-          CHECK_EQ(minSize_.size(), maxSize_.size());
+          // square prior with size sqrt(minSize * maxSize)
-          // second prior.
+          real maxSize = maxSize_[s];
-          for (size_t s = 0; s < maxSize_.size(); s++) {
+          boxWidth = boxHeight = sqrt(minSize * maxSize);
-            real maxSize = maxSize_[s];
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-            boxWidth = boxHeight = sqrt(minSize * maxSize);
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-            tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-            tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-            tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          // set the variance.
-            tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-            // set the variance.
-            for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-          }
        }
      }
-      // rest of priors.
-      for (size_t r = 0; r < aspectRatio_.size(); r++) {
-        real ar = aspectRatio_[r];
-        if (fabs(ar - 1.) < 1e-6) continue;
-        real boxWidth = minSize * sqrt(ar);
-        real boxHeight = minSize / sqrt(ar);
-        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-        // set the variance.
-        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-      }
    }
  }
  // clip the prior's coordidate such that it is within [0, 1]
  for (int d = 0; d < dim * 2; ++d)
    if ((d % 8) < 4)

--- a/paddle/operators/assign_value_op.cc
+++ b/paddle/operators/assign_value_op.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
-Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+// you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+// You may obtain a copy of the License at
+//
-    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
-Unless required by applicable law or agreed to in writing, software
+// Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
+// distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+// See the License for the specific language governing permissions and
-limitations under the License. */
+// limitations under the License.
 #include "paddle/operators/assign_value_op.h"

--- a/paddle/operators/assign_value_op.h
+++ b/paddle/operators/assign_value_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
-Licensed under the Apache License, Version 2.0 (the "License");
+// Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+// you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+// You may obtain a copy of the License at
+//
-    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
-Unless required by applicable law or agreed to in writing, software
+// Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
+// distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+// See the License for the specific language governing permissions and
-limitations under the License. */
+// limitations under the License.
 #pragma once

--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -39,6 +39,11 @@ N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
 calculated by %s
 )DOC",
                               comment.type, comment.equation));
+    AddAttr<int>("axis",
+                 "(int, default -1). The start dimension index "
+                 "for broadcasting Y onto X.")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
  }
 };
@@ -95,11 +100,5 @@ REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
 REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
 REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
 REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
-REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
-REGISTER_LOGICAL_KERNEL(greater_than, CPU,
-                        paddle::operators::GreaterThanFunctor);
-REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
-REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
-                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_OP(equal, "Out = X == Y");
 REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@@ -16,8 +16,4 @@ limitations under the License. */
 REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
 REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
-REGISTER_LOGICAL_KERNEL(greater_than, CUDA,
-                        paddle::operators::GreaterThanFunctor);
-REGISTER_LOGICAL_KERNEL(greater_equal, CUDA,
-                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <math.h>
 #include <type_traits>
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/elementwise_op_function.h"
 #include "paddle/platform/transform.h"
 namespace paddle {
@@ -33,18 +34,6 @@ struct LessEqualFunctor {
  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
 };
-template <typename T>
-struct GreaterThanFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
-};
-template <typename T>
-struct GreaterEqualFunctor {
-  using ELEM_TYPE = T;
-  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
-};
 template <typename T>
 struct EqualFunctor {
  using ELEM_TYPE = T;
@@ -65,14 +54,7 @@ class CompareOpKernel
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    using T = typename Functor::ELEM_TYPE;
-    auto* x = context.Input<framework::Tensor>("X");
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
-    auto* y = context.Input<framework::Tensor>("Y");
-    auto* out = context.Output<framework::Tensor>("Out");
-    Functor binary_func;
-    platform::Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x->data<T>(),
-          x->data<T>() + x->numel(), y->data<T>(),
-          out->mutable_data<bool>(context.GetPlace()), binary_func);
  }
 };

--- a/paddle/operators/ctc_align_op.cc
+++ b/paddle/operators/ctc_align_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/ctc_align_op.h"
+namespace paddle {
+namespace operators {
+class CTCAlignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input of CTCAlignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output of CTCAlignOp should not be null.");
+    auto input_dims = ctx->GetInputDim("Input");
+    // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
+    ctx->SetOutputDim("Output", input_dims);
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LodTensor, default: LoDTensor<int>), Its shape is "
+             "[Lp, 1], where Lp is the sum of all input sequences' length.");
+    AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
+    AddAttr<int>("blank",
+                 "(int, default: 0), the blank label setted in Connectionist "
+                 "Temporal Classification (CTC) op.")
+        .SetDefault(0);
+    AddAttr<bool>("merge_repeated",
+                  "(bool, default: true), whether to "
+                  "merge repeated elements between two blanks. ")
+        .SetDefault(true);
+    AddComment(R"DOC(
+CTCAlign op is used to merge repeated elements between two blanks
+and then delete all blanks in sequence.
+Given:
+    Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
+                  6, 0, 0, 7, 7, 7, 0]
+    Input.dims = {18, 1}
+    Input.LoD = [[0, 11, 18]]
+And:
+    blank = 0
+    merge_repeated = True
+Then:
+    Output.data = [1, 2, 4, 4, 5, 6,
+                   6, 7]
+    Output.dims = {8, 1}
+    Output.LoD = [[0, 6, 8]]
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ctc_align, ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <stdio.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/operators/ctc_align_op.h"
+namespace paddle {
+namespace operators {
+template <typename T>
+__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
+                                      const size_t num_seq, size_t* lod0,
+                                      const int blank, const int merge_repeated,
+                                      size_t* out_lod0, T* output) {
+  int ouput_idx = 0;
+  out_lod0[0] = 0;
+  for (int i = 0; i < num_seq; ++i) {
+    T pre_token = -1;
+    for (int j = lod0[i]; j < lod0[i + 1]; ++j) {
+      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
+        output[ouput_idx] = tokens[j];
+        ++ouput_idx;
+      }
+      pre_token = tokens[j];
+    }
+    out_lod0[i + 1] = ouput_idx;
+  }
+}
+template <typename T>
+class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    const size_t level = 0;
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    auto input_lod = framework::ToAbsOffset(input->lod());
+    const T* tokens = input->data<T>();
+    const int64_t num_tokens = input->dims()[0];
+    const size_t num_seq = input_lod[level].size() - 1;
+    const int blank = ctx.Attr<int>("blank");
+    const int merge_repeated =
+        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
+    // prepare a lod to record lod information while merging elements
+    thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
+    size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
+    // merge elements and delete blank
+    T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
+    auto stream = ctx.cuda_device_context().stream();
+    MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
+        num_tokens, tokens, num_seq, input_lod[level].data(), blank,
+        merge_repeated, dev_out_lod0_ptr, output_data);
+    // set output lod
+    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
+                                              dev_out_lod0.end());
+    framework::LoD out_lod;
+    out_lod.push_back(host_out_lod0);
+    output->set_lod(out_lod);
+    // resize output dims
+    output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel<int>,
+                        paddle::operators::CTCAlignOpCUDAKernel<int64_t>);
--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/operators/ctc_align_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string.h>
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class CTCAlignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    const size_t level = 0;
+    auto input_lod = framework::ToAbsOffset(input->lod());
+    // check input dims and lod
+    auto input_dims = input->dims();
+    PADDLE_ENFORCE_EQ(input_dims[0],
+                      static_cast<int64_t>(input_lod[level].back()),
+                      "The first dimension of Input(Input) should be equal to "
+                      "the sum of all sequences' lengths.");
+    const size_t num_sequences = input_lod[level].size() - 1;
+    size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
+    bool merge_repeated = ctx.Attr<bool>("merge_repeated");
+    // merge repeated tokens and delete blank
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    size_t output_idx = 0;
+    std::vector<size_t> output_lod0(1, 0);
+    const T* input_data = input->data<T>();
+    for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
+      T prev_token = -1;
+      for (size_t i = input_lod[level][seq_idx];
+           i < input_lod[level][seq_idx + 1]; ++i) {
+        if (input_data[i] != blank &&
+            !(merge_repeated && input_data[i] == prev_token)) {
+          output_data[output_idx] = input_data[i];
+          ++output_idx;
+        }
+        prev_token = input_data[i];
+      }
+      output_lod0.push_back(output_idx);
+    }
+    // set output lod
+    framework::LoD output_lod;
+    output_lod.push_back(output_lod0);
+    output->set_lod(output_lod);
+    // resize output dims
+    output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/edit_distance_op.cc
+++ b/paddle/operators/edit_distance_op.cc
@@ -49,10 +49,10 @@ class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Hyps",
-             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
             "The indices for hypothesis strings.");
    AddInput("Refs",
-             "(2-D LoDTensor<int>, 2nd dim. equal to 1) "
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
             "The indices for reference strings.");
    AddAttr<bool>("normalized",
                  "(bool, default false) Indicated whether to normalize "
@@ -66,22 +66,22 @@ class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
 EditDistance operator computes the edit distances between a batch of hypothesis
 strings and their references.
-Edit distance, also called Levenshtein distance, measures how dissimilar two strings 
+Edit distance, also called Levenshtein distance, measures how dissimilar two strings
-are by counting the minimum number of operations to transform one string into anthor. 
+are by counting the minimum number of operations to transform one string into anthor.
-Here the operations include insertion, deletion, and substitution. For example, 
+Here the operations include insertion, deletion, and substitution. For example,
-given hypothesis string A = "kitten" and reference B = "sitting", the edit distance 
+given hypothesis string A = "kitten" and reference B = "sitting", the edit distance
-is 3 for A will be transformed into B at least after two substitutions and one 
+is 3 for A will be transformed into B at least after two substitutions and one
 insertion:
   "kitten" -> "sitten" -> "sittin" -> "sitting"
-Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total 
+Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total
-number denoted by `batch_size`, and the separation is specified by the LoD information. 
+number denoted by `batch_size`, and the separation is specified by the LoD information.
-And the `batch_size` reference strings are arranged in order in the same way in the 
+And the `batch_size` reference strings are arranged in order in the same way in the
 LoDTensor Input(Refs).
-Output(Out) contains the `batch_size` results and each stands for the edit stance 
+Output(Out) contains the `batch_size` results and each stands for the edit stance
-for a pair of strings respectively. If Attr(normalized) is true, the edit distance 
+for a pair of strings respectively. If Attr(normalized) is true, the edit distance
 will be divided by the length of reference string.
 )DOC");
  }

--- a/paddle/operators/edit_distance_op.cu
+++ b/paddle/operators/edit_distance_op.cu
@@ -39,8 +39,8 @@ __global__ void FillFirstColumn(T* dist, const int M, const int N) {
 }
 template <typename T>
-__global__ void Levenshtein(T* dist, const int* x1, const int* x2, const int M,
+__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2,
-                            const int N, const int start) {
+                            const int M, const int N, const int start) {
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  int offset = N;
  int index = start + idx * offset;
@@ -113,8 +113,8 @@ class EditDistanceGPUKernel : public framework::OpKernel<T> {
        dist_t.Resize({m + 1, n + 1});
        dist_t.mutable_data<T>(ctx.GetPlace());
        auto dist = dist_t.data<T>();
-        auto x1 = x1_t->data<int>() + hyp_lod[num];
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
-        auto x2 = x2_t->data<int>() + ref_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
        FillFirstColumn<T><<<1 + m / PADDLE_CUDA_NUM_THREADS,
                             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n);

--- a/paddle/operators/edit_distance_op.h
+++ b/paddle/operators/edit_distance_op.h
@@ -60,8 +60,8 @@ class EditDistanceKernel : public framework::OpKernel<T> {
        dist_t.Resize({m + 1, n + 1});
        dist_t.mutable_data<T>(ctx.GetPlace());
        auto dist = dist_t.data<T>();
-        auto x1 = x1_t->data<int>() + hyp_lod[num];
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
-        auto x2 = x2_t->data<int>() + ref_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
        for (int64_t i = 0; i < m + 1; ++i) {
          dist[i * (n + 1)] = i;
        }

--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -176,14 +176,15 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext>
 };
 #endif
-template <typename Functor, typename T, typename DeviceContext>
+template <typename Functor, typename T, typename DeviceContext,
+          typename OutType = T>
 class TransformFunctor {
 public:
  TransformFunctor(const framework::Tensor* x, const framework::Tensor* y,
                   framework::Tensor* z, const DeviceContext& ctx, Functor func)
      : x_(x->data<T>()),
        y_(y->data<T>()),
-        z_(z->mutable_data<T>(ctx.GetPlace())),
+        z_(z->mutable_data<OutType>(ctx.GetPlace())),
        nx_(x->numel()),
        ctx_(ctx),
        func_(func) {}
@@ -208,7 +209,7 @@ class TransformFunctor {
 private:
  const T* x_;
  const T* y_;
-  T* z_;
+  OutType* z_;
  int64_t nx_;
  const DeviceContext& ctx_;
  Functor func_;
@@ -364,15 +365,16 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
  }
 }
-template <typename Functor, typename DeviceContext, typename T>
+template <typename Functor, typename DeviceContext, typename T,
+          typename OutType = T>
 void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
  using Tensor = framework::Tensor;
  auto* x = ctx.Input<Tensor>("X");
  auto* y = ctx.Input<Tensor>("Y");
  auto* z = ctx.Output<Tensor>("Out");
-  z->mutable_data<T>(ctx.GetPlace());
+  z->mutable_data<OutType>(ctx.GetPlace());
-  TransformFunctor<Functor, T, DeviceContext> functor(
+  TransformFunctor<Functor, T, DeviceContext, OutType> functor(
      x, y, z, ctx.template device_context<DeviceContext>(), Functor());
  auto x_dims = x->dims();

--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -187,7 +187,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };
@@ -248,7 +248,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
        framework::ToDataType(
            ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
                ->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };

--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
    const size_t level = 0;
    const size_t seq_num = in_lod[level].size() - 1;
-    // These local variables hold the inputs and outputs, garanteeing them on
+    const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
-    // CPU memory, to provide a consistent reference.
+    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
-    // TODO(caoying) Fix this by moving all these local variables into the
+    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
-    // class's data members once we can profile the whole training process.
-    LoDTensor* emission_weights = nullptr;
+    Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
-    LoDTensor emission_weight_tensor;
+    Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
-    Tensor* transition_weights = nullptr;
+    Tensor* alpha = ctx.Output<Tensor>("Alpha");
-    Tensor transition_weight_tensor;
+    Tensor* ll = ctx.Output<Tensor>("LogLikelihood");
-    LoDTensor* label = nullptr;
-    LoDTensor label_tensor;
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor* ll = nullptr;
-    Tensor ll_tensor;
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      emission_weights = &emission_weight_tensor;
-      transition_weights = &transition_weight_tensor;
-      label = &label_tensor;
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Emission"),
-          *ctx.Input<Tensor>("Transition"), *ctx.Input<LoDTensor>("Label"),
-          emission_weights, transition_weights, label);
-      emission_exps = &emission_exps_tensor;
-      emission_exps->Resize(emission_weights->dims());
-      transition_exps = &transition_exps_tensor;
-      transition_exps->Resize(transition_weights->dims());
-      alpha = &alpha_tensor;
-      alpha->Resize(ctx.Output<Tensor>("Alpha")->dims());
-      ll = &ll_tensor;
-    } else {
-      emission_weights =
-          const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Emission"));
-      transition_weights = const_cast<Tensor*>(ctx.Input<Tensor>("Transition"));
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-      emission_exps = ctx.Output<Tensor>("EmissionExps");
-      transition_exps = ctx.Output<Tensor>("TransitionExps");
-      alpha = ctx.Output<Tensor>("Alpha");
-      ll = ctx.Output<Tensor>("LogLikelihood");
-    }
    // Because the computation codes only runs on CPU, here the memory for all
    // the outputs is FIXED to be allocated on the CPU memory.
@@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
          one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
          *transition_exps, one_seq_label, &one_seq_alpha);
    }
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), *emission_exps, *transition_exps, *alpha, *ll,
-          ctx.Output<Tensor>("EmissionExps"),
-          ctx.Output<Tensor>("TransitionExps"), ctx.Output<Tensor>("Alpha"),
-          ctx.Output<Tensor>("LogLikelihood"));
-    }
  };
 private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& emission_weights_src,
-                             const Tensor& transition_weights_src,
-                             const LoDTensor& label_src,
-                             LoDTensor* emission_weights_dst,
-                             Tensor* transition_weights_dst,
-                             LoDTensor* label_dst) const {
-    // Copy the inputs from GPU memory to CPU memory if this operators runs on
-    // GPU device.
-    auto copyLoDTensor = [](const platform::DeviceContext& ctx,
-                            const LoDTensor& src, LoDTensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-    copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
-    copyLoDTensor(ctx, label_src, label_dst);
-    transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
-                                            platform::CPUPlace());
-    framework::Copy(transition_weights_src, platform::CPUPlace(), ctx,
-                    transition_weights_dst);
-  }
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor& emission_exps_src,
-                              const Tensor& transition_exps_src,
-                              const Tensor& alpha_src, const Tensor& ll_src,
-                              Tensor* emission_exps_dst,
-                              Tensor* transition_exps_dst, Tensor* alpha_dst,
-                              Tensor* ll_dst) const {
-    // Copy the forward results from CPU memory to GPU memory if this
-    // operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(platform::CUDAPlace());
-      framework::Copy(src, platform::CUDAPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_src, ll_dst);
-  }
  T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
                       const Tensor& emission_exps, const Tensor& trans_weights,
                       const Tensor& trans_weight_exps, const Tensor& label,
@@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
    auto lod = ctx.Input<LoDTensor>("Label")->lod();
    PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
-    // These local variables hold the inputs and outputs, garanteeing them on
+    const Tensor* label = ctx.Input<LoDTensor>("Label");
-    // CPU memory, to provide a consistent reference.
+    const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
-    // TODO(caoying) Fix this by moving all these local variables into the
+    const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
-    // class's data members once we can profile the training process, or
+    const Tensor* alpha = ctx.Input<Tensor>("Alpha");
-    // implementing a real GPU kernel for CRF.
+    const T* ll_grad =
-    Tensor* label = nullptr;
+        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
-    Tensor label_tensor;
-    Tensor* emission_exps = nullptr;
-    Tensor emission_exps_tensor;
-    Tensor* transition_exps = nullptr;
-    Tensor transition_exps_tensor;
-    Tensor* alpha = nullptr;
-    Tensor alpha_tensor;
-    Tensor ll_grad_tensor;
-    T* ll_grad = nullptr;
-    Tensor* emission_grad = nullptr;
-    Tensor emission_grad_tensor;
-    Tensor* transition_grad = nullptr;
-    Tensor transition_grad_tensor;
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      label = &label_tensor;
-      emission_exps = &emission_exps_tensor;
-      transition_exps = &transition_exps_tensor;
-      alpha = &alpha_tensor;
-      CopyInputsToCpuMemory(
-          ctx.device_context(), *ctx.Input<LoDTensor>("Label"),
-          *ctx.Input<Tensor>("EmissionExps"),
-          *ctx.Input<Tensor>("TransitionExps"), *ctx.Input<Tensor>("Alpha"),
-          *ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")), label,
-          emission_exps, transition_exps, alpha, &ll_grad_tensor);
-      ll_grad = ll_grad_tensor.data<T>();
-      if (ctx.Output<Tensor>(framework::GradVarName("Emission"))) {
-        emission_grad = &emission_grad_tensor;
-        emission_grad->Resize(emission_exps->dims());
-      }
-      if (ctx.Output<Tensor>(framework::GradVarName("Transition"))) {
+    Tensor* emission_grad =
-        transition_grad = &transition_grad_tensor;
+        ctx.Output<Tensor>(framework::GradVarName("Emission"));
-        transition_grad->Resize(transition_exps->dims());
+    Tensor* transition_grad =
-      }
+        ctx.Output<Tensor>(framework::GradVarName("Transition"));
-    } else {
-      label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
-      emission_exps = const_cast<Tensor*>(ctx.Input<Tensor>("EmissionExps"));
-      transition_exps =
-          const_cast<Tensor*>(ctx.Input<Tensor>("TransitionExps"));
-      alpha = const_cast<Tensor*>(ctx.Input<Tensor>("Alpha"));
-      ll_grad = const_cast<Tensor*>(
-                    ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")))
-                    ->data<T>();
-      emission_grad = ctx.Output<Tensor>(framework::GradVarName("Emission"));
-      transition_grad =
-          ctx.Output<Tensor>(framework::GradVarName("Transition"));
-    }
    // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
    // data reader operator, it can have no gradients.
@@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
          one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
          &one_seq_beta, transition_grad, &one_seq_emission_grad);
    }
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-      CopyOutputsToGpuMemory(
-          ctx.device_context(), emission_grad, transition_grad,
-          ctx.Output<Tensor>(framework::GradVarName("Emission")),
-          ctx.Output<Tensor>(framework::GradVarName("Transition")));
-    }
  };
 private:
-  void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
-                             const LoDTensor& label_src,
-                             const Tensor& emission_exps_src,
-                             const Tensor& transition_exps_src,
-                             const Tensor& alpha_src, const Tensor& ll_grad_src,
-                             Tensor* label_dst, Tensor* emission_exps_dst,
-                             Tensor* transition_exps_dst, Tensor* alpha_dst,
-                             Tensor* ll_grad_dst) const {
-    // Copy the inputs from GPU memory to CPU memory when this operators runs on
-    // GPU device.
-    label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
-    framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst);
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
-                         Tensor* dst) {
-      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
-      framework::Copy(src, platform::CPUPlace(), ctx, dst);
-    };
-    copyTensor(ctx, emission_exps_src, emission_exps_dst);
-    copyTensor(ctx, transition_exps_src, transition_exps_dst);
-    copyTensor(ctx, alpha_src, alpha_dst);
-    copyTensor(ctx, ll_grad_src, ll_grad_dst);
-  }
-  void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
-                              const Tensor* emission_grad_src,
-                              const Tensor* transition_grad_src,
-                              Tensor* emission_grad_dst,
-                              Tensor* transition_grad_dst) const {
-    // Copy the backward results from CPU memory to GPU
-    // memory if this operators runs on GPU device.
-    auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
-                         Tensor* dst) {
-      if (src && dst) {
-        dst->mutable_data<T>(platform::CUDAPlace());
-        framework::Copy(*src, platform::CUDAPlace(), ctx, dst);
-      }
-    };
-    copyTensor(ctx, emission_grad_src, emission_grad_dst);
-    copyTensor(ctx, transition_grad_src, transition_grad_dst);
-  }
  void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
                           const T ll_grad, const Tensor& emission_exps,
                           const Tensor& transition_exps, const Tensor& alpha,

--- a/paddle/operators/math/matmul.h
+++ b/paddle/operators/math/matmul.h
@@ -41,10 +41,24 @@ class MatMulFunctor {
                      "Input tensor a must be at least 1-dimensional.");
    PADDLE_ENFORCE_GE(dim_b.size(), 1,
                      "Input tensor b must be at least 1-dimensional.");
-    PADDLE_ENFORCE_LE(dim_a.size(), 3,
-                      "Input tensor a must be at most 3-dimensional.");
+    std::vector<int64_t> out_dim;
-    PADDLE_ENFORCE_LE(dim_b.size(), 3,
+    int64_t batch_count = 1;
-                      "Input tensor b must be at most 3-dimensional.");
+    if (dim_a.size() > 3) {
+      PADDLE_ENFORCE(dim_b.size() == dim_a.size(),
+                     "The dimensions of X and Y must be the same, and both of "
+                     "them should be %d-dimensional.",
+                     dim_b.size());
+      // The first (rank - 2) dimensions are folded into batch_count, and the
+      // last two dimensions are used for matrix multiplication.
+      for (int j = 0; j < dim_a.size() - 2; ++j) {
+        PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j],
+                          "The %d-th dimension of X and Y must be the same.",
+                          j);
+        out_dim.push_back(dim_a[j]);
+        batch_count *= dim_a[j];
+      }
+    }
    int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0,
        strideA = 0, strideB = 0;
@@ -67,7 +81,11 @@ class MatMulFunctor {
        strideA = M * kA;
        break;
      default:
-        assert(false);
+        batchCountA = batch_count;
+        size_t mat_s = dim_a.size() - 2;
+        M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s];
+        kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1];
+        strideA = M * kA;
    }
    switch (dim_b.size()) {
@@ -88,7 +106,11 @@ class MatMulFunctor {
        strideB = kB * N;
        break;
      default:
-        assert(false);
+        batchCountB = batch_count;
+        size_t mat_s = dim_b.size() - 2;
+        kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s];
+        N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1];
+        strideB = kB * N;
    }
    PADDLE_ENFORCE_EQ(

--- a/paddle/operators/math/sampler.cc
+++ b/paddle/operators/math/sampler.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "sampler.h"
+#include <cmath>
+namespace paddle {
+namespace operators {
+namespace math {
+// Out-of-line definition of the virtual destructor declared in sampler.h.
+Sampler::~Sampler() {}
+// Builds a sampler over [0, range) seeded by the base class (random_device).
+// std::uniform_int_distribution samples the CLOSED interval [a, b], so the
+// upper bound is range - 1; this keeps Probability() == 1 / range exact.
+// NOTE(review): the distribution member uses the default (int) result type,
+// so a range above INT_MAX cannot be represented without changing sampler.h.
+UniformSampler::UniformSampler(int64 range)
+    : Sampler(range), inv_range_(1.0 / range) {
+  // The engine type must match the std::mt19937_64 member in sampler.h.
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range - 1);
+}
+// Same as above, but with a caller-provided seed for reproducible draws.
+UniformSampler::UniformSampler(int64 range, unsigned int seed)
+    : Sampler(range, seed), inv_range_(1.0 / range) {
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range - 1);
+}
+// Returns one draw from the integer distribution.
+int64 UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
+// Every value is equally likely, so the argument is ignored.
+float UniformSampler::Probability(int64 value) const { return inv_range_; }
+// Builds a log-uniform sampler over [0, range); log_range_ caches
+// ln(range + 1), the normalizer used by Sample() and Probability().
+LogUniformSampler::LogUniformSampler(int64 range)
+    : Sampler(range), log_range_(std::log(range + 1)) {
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
+}
+// Same as above, but with a caller-provided seed for reproducible draws.
+LogUniformSampler::LogUniformSampler(int64 range, unsigned int seed)
+    : Sampler(range, seed), log_range_(std::log(range + 1)) {
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
+}
+int64 LogUniformSampler::Sample() const {
+  // Got Log Uniform distribution from uniform distribution by
+  // inverse_transform_sampling method
+  // More details:
+  // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
+  const double u = (*dist_)(*random_engine_);
+  const int64 value = static_cast<int64>(std::exp(u * log_range_)) - 1;
+  // Mathematically, value lies in [0, range_), but floating point roundoff
+  // can push it up to range_, so we mod by range_.
+  return value % range_;
+}
+float LogUniformSampler::Probability(int64 value) const {
+  // Given f(x) = 1/[(x+1) * log_range_]
+  // The value's probability is the integral of f(x) from value to (value + 1)
+  // More details:
+  // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler
+  return (std::log((value + 2.0) / (value + 1.0))) / log_range_;
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/sampler.h
+++ b/paddle/operators/math/sampler.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include <random>
+typedef long int64;
+namespace paddle {
+namespace operators {
+namespace math {
+// TODO(wanghaoshuang): Support for GPU
+/**
+ * Abstract base class for samplers that draw integers from [0, range).
+ *
+ * Stores the (strictly positive) range and the seed that derived classes
+ * use to initialize their random engines.
+ */
+class Sampler {
+ public:
+  // Uses a seed obtained from std::random_device. Enforces range > 0.
+  explicit Sampler(int64 range) : range_(range) {
+    PADDLE_ENFORCE_GT(range, 0);
+    std::random_device r;
+    seed_ = r();
+  }
+  // Uses the caller-provided seed. Enforces range > 0.
+  explicit Sampler(int64 range, unsigned int seed)
+      : range_(range), seed_(seed) {
+    PADDLE_ENFORCE_GT(range, 0);
+  }
+  virtual ~Sampler();
+  // Sample a single value
+  virtual int64 Sample() const = 0;
+  // The probability that a single call to Sample() returns the given value.
+  virtual float Probability(int64 value) const = 0;
+  // Exclusive upper bound of the sampled values. Const so that it can be
+  // called through const references, like Sample() and Probability().
+  int64 range() const { return range_; }
+ protected:
+  const int64 range_;
+  unsigned int seed_;
+};
+/**
+ * Sample integers from [0, range).
+ * And the distribution function is:
+ * P(x) = 1 / range
+ *
+ * The one-argument constructor relies on the seed chosen by the Sampler base
+ * class; the two-argument constructor uses the given seed.
+ */
+class UniformSampler : public Sampler {
+ public:
+  explicit UniformSampler(int64 range);
+  explicit UniformSampler(int64 range, unsigned int seed);
+  ~UniformSampler() override {}
+  // Draws one value from dist_ using random_engine_.
+  int64 Sample() const override;
+  // Returns the cached 1 / range; the argument does not affect the result.
+  float Probability(int64 value) const override;
+ private:
+  // 1 / range, computed once in the constructor.
+  const float inv_range_;
+  // Held by pointer; the const Sample() draws through these objects.
+  std::shared_ptr<std::mt19937_64> random_engine_;
+  std::shared_ptr<std::uniform_int_distribution<>> dist_;
+};
+/**
+ * Sample integers from [0, range).
+ * And the distribution function is:
+ * P(x) = (1/ln(range+1)) * ln(1 + 1/(x + 1))
+ *
+ * The one-argument constructor relies on the seed chosen by the Sampler base
+ * class; the two-argument constructor uses the given seed.
+ */
+class LogUniformSampler : public Sampler {
+ public:
+  explicit LogUniformSampler(int64 range);
+  explicit LogUniformSampler(int64 range, unsigned int seed);
+  ~LogUniformSampler() override {}
+  // Transforms one uniform real draw into a log-uniformly distributed integer.
+  int64 Sample() const override;
+  // Evaluates P(value) as given by the formula above.
+  float Probability(int64 value) const override;
+ private:
+  // ln(range + 1), computed once in the constructor.
+  const float log_range_;
+  // Held by pointer; the const Sample() draws through these objects.
+  std::shared_ptr<std::mt19937_64> random_engine_;
+  std::shared_ptr<std::uniform_real_distribution<>> dist_;
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -41,10 +41,26 @@ class MatMulOp : public framework::OperatorWithKernel {
                      "Input tensor X must be at least 1-dimensional.");
    PADDLE_ENFORCE_GE(dim_y.size(), 1,
                      "Input tensor Y must be at least 1-dimensional.");
-    PADDLE_ENFORCE_LE(dim_x.size(), 3,
-                      "Input tensor X must be at most 3-dimensional.");
+    std::vector<int64_t> out_dim;
-    PADDLE_ENFORCE_LE(dim_y.size(), 3,
+    int64_t batch_count = 1;
-                      "Input tensor Y must be at most 3-dimensional.");
+    if (dim_x.size() > 3) {
+      PADDLE_ENFORCE_EQ(
+          dim_y.size(), dim_x.size(),
+          "The dimensions of X and Y must be the same, and both of "
+          "them should be %d-dimensional.",
+          dim_x.size());
+      // The first (rank - 2) dimensions are folded into batch_count, and the
+      // last two dimensions are used for matrix multiplication.
+      for (int j = 0; j < dim_x.size() - 2; ++j) {
+        PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j],
+                          "The %d-th dimension of X and Y must be the same.",
+                          j);
+        out_dim.push_back(dim_x[j]);
+        batch_count *= dim_x[j];
+      }
+    }
    int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
    bool remove_initial_dim = false, remove_final_dim = false;
@@ -70,7 +86,11 @@ class MatMulOp : public framework::OperatorWithKernel {
        KX = transpose_x ? dim_x[1] : dim_x[2];
        break;
      default:
-        assert(false);
+        batchCountX = batch_count;
+        size_t mat_s = dim_x.size() - 2;
+        M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s];
+        KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1];
+        break;
    }
    switch (dim_y.size()) {
@@ -94,7 +114,10 @@ class MatMulOp : public framework::OperatorWithKernel {
        N = transpose_y ? dim_y[1] : dim_y[2];
        break;
      default:
-        assert(false);
+        batchCountY = batch_count;
+        size_t mat_s = dim_y.size() - 2;
+        KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s];
+        N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1];
    }
    PADDLE_ENFORCE_EQ(
@@ -110,7 +133,11 @@ class MatMulOp : public framework::OperatorWithKernel {
    std::vector<int64_t> dim_out;
    if (batchCount) {
-      dim_out.push_back(batchCount);
+      if (dim_x.size() > 3) {
+        dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end());
+      } else {
+        dim_out.push_back(batchCount);
+      }
    }
    if (!remove_initial_dim) {
      dim_out.push_back(M);
@@ -162,10 +189,14 @@ Examples without transpose:
 - X: [B, M, K], Y: [K] => Out: [B, M]
 - X: [M, K], Y: [B, K, N] => Out: [B, M, N]
 - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N]
+- X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N]
 The behavior is designed to be similar to the `numpy.matmul` function.
 The differences are:
- Currently only rank 1 to rank 3 input tensors are supported.
+- When the rank of the input data is less than or equal to 3, it
+  is similar to the `numpy.matmul` function.
+- When the rank of the input is greater than 3, the rank of X and
+  Y must be equal, and the first `rank - 2` dimensions must be equal.
 - We add `transpose_X` and `transpose_Y` flags.
 Both the input `X` and `Y` can carry the LoD (Level of Details) information,

--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -137,6 +137,13 @@ class MatMulGradKernel : public framework::OpKernel<T> {
      y_dims.push_back(1);
    }
+    int batch_count = 0;
+    // The first (rank - 2) dimensions are folded into batch_count, and the
+    // last two dimensions are used for matrix multiplication.
+    if (x_dims.size() > 3) {
+      batch_count = accumulate(x_dims.begin(), x_dims.end() - 2, 1,
+                               std::multiplies<int>());
+    }
    // Fix the dOut dimensions.
    int M = 0, N = 0, batchCountX = 0, batchCountY = 0;
@@ -149,7 +156,9 @@ class MatMulGradKernel : public framework::OpKernel<T> {
        M = transpose_x ? x_dims[2] : x_dims[1];
        break;
      default:
-        assert(false);
+        batchCountX = batch_count;
+        size_t mat_s = x_dims.size() - 2;
+        M = transpose_x ? x_dims[mat_s + 1] : x_dims[mat_s];
    }
    switch (y_dims.size()) {
@@ -161,7 +170,9 @@ class MatMulGradKernel : public framework::OpKernel<T> {
        N = transpose_y ? y_dims[1] : y_dims[2];
        break;
      default:
-        assert(false);
+        batchCountY = batch_count;
+        size_t mat_s = y_dims.size() - 2;
+        N = transpose_y ? y_dims[mat_s] : y_dims[mat_s + 1];
    }
    if (batchCountX && batchCountY) {
      PADDLE_ENFORCE_EQ(
@@ -172,7 +183,11 @@ class MatMulGradKernel : public framework::OpKernel<T> {
    int batchCount = std::max(batchCountX, batchCountY);
    std::vector<int64_t> dout_dims = {M, N};
    if (batchCount) {
-      dout_dims.insert(dout_dims.begin(), batchCount);
+      if (x_dims.size() > 3) {
+        dout_dims.insert(dout_dims.begin(), x_dims.begin(), x_dims.end() - 2);
+      } else {
+        dout_dims.insert(dout_dims.begin(), batchCount);
+      }
    }
    Tensor X = Reshape<T>(x, make_ddim(x_dims));
    Tensor Y = Reshape<T>(y, make_ddim(y_dims));

--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
@@ -11,21 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
 #include "paddle/operators/net_op.h"
 #include <set>

--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -34,9 +34,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-constexpr int kCondStart = 0;
+constexpr char kOptimizeBlock[] = "OptimizeBlock";
-constexpr int kCondRunning = 1;
-constexpr int kCondDone = 2;
 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
  service->RunSyncUpdate();
@@ -99,15 +97,13 @@ class RecvOp : public framework::OperatorBase {
    auto fan_in = Attr<int>("Fanin");
    size_t param_count = param_list.size();
-    std::string program_str = Attr<std::string>("OptimizeProgram");
+    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-    framework::proto::ProgramDesc program_desc;
+    auto *program = block->Program();
-    program_desc.ParseFromString(program_str);
-    framework::ProgramDesc program(program_desc);
    framework::Executor executor(dev_place);
    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
    bool exit_flag = false;
-    int64_t barrier_size = param_count * fan_in;
+    size_t barrier_size = param_count * fan_in;
    while (!exit_flag) {
      // Get from multiple trainers, we don't care about the order in which
      // the gradients arrives, just add suffix 0~n and merge the gradient.
@@ -142,8 +138,9 @@ class RecvOp : public framework::OperatorBase {
      if (exit_flag) {
        break;
      }
      try {
-        executor.Run(program, &recv_scope, 0, /*global_block*/
+        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
                     false /*create_local_scope*/, false /*create_vars*/);
      } catch (std::exception &e) {
        LOG(ERROR) << "run sub program error " << e.what();
@@ -175,8 +172,8 @@ This operator will recv tensor from send_op
                         "IP address to listen on.")
        .SetDefault("127.0.0.1:6164")
        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<std::string>("OptimizeProgram", "type string",
+    AddAttr<framework::BlockDesc *>(
-                         "Serialized ProgramDesc string for recv to run.");
+        kOptimizeBlock, "Serialized ProgramDesc string for recv to run.");
    AddAttr<std::vector<std::string>>(
        "ParamList", "type list of string",
        "grad->param name mapping to find which param to optimize.")

--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -48,7 +48,7 @@ Scale operator
 $$Out = scale*X$$
 )DOC");
    AddAttr<AttrType>("scale",
-                      "(float, default 0)"
+                      "(float, default 1.0)"
                      "The scaling factor of the scale operator.")
        .SetDefault(1.0);
  }

--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -66,7 +66,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "(Tensor) Output tensor to get from server")
        .AsDuplicable();
    AddComment(R"DOC(
-Recv operator
+Send operator
 This operator will send tensor to recv_op.
 )DOC");

--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
@@ -130,10 +130,7 @@ void StartServerNet(bool is_sparse) {
  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
-  std::string program_proto;
+  attrs.insert({"OptimizeBlock", block});
-  PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto));
-  attrs.insert({"OptimizeProgram", program_proto});
  recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs);
  recv_op->Run(scope, place);
 }

--- a/paddle/operators/sequence_expand_op.cc
+++ b/paddle/operators/sequence_expand_op.cc
@@ -58,7 +58,7 @@ This operator expands input(X) according to LOD of input(Y).
 Following are cases to better explain how this works:
 Case 1:
-Given 2-level a LoDTensor input(X)
+Given a 2-level LoDTensor input(X)
    X.lod = [[0,       2, 3],
             [0, 1,    3, 4]]
    X.data = [a, b, c, d]
@@ -75,9 +75,8 @@ then we get 2-level LoDTensor
 Case 2:
-Given a 0-level LoDTensor input(X)
+Given a common Tensor input(X)
    X.data = [a, b, c]
-    X.lod = NULL
    X.dims = [3, 1]
 and input(Y)
    Y.lod = [[0, 2, 3, 6]]
@@ -89,9 +88,8 @@ then we get 1-level LoDTensor
 Case 3:
-Given a 0-level LoDTensor input(X)
+Given a common Tensor input(X)
    X.data = [[a, b], [c, d], [e, f]]
-    X.lod = NULL
    X.dims = [3, 2]
 and input(Y)
    Y.lod = [[0, 2, 3, 6]]

--- a/paddle/operators/sequence_reshape_op.cc
+++ b/paddle/operators/sequence_reshape_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/operators/sequence_reshape_op.h"
+#include "paddle/framework/ddim.h"
+namespace paddle {
+namespace operators {
+// Forward operator definition for sequence_reshape: validates its inputs and
+// infers the shape of Output(Out) from Input(X) and the `new_dim` attribute.
+class SequenceReshapeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks that X and Out are present, that X is 2-D and that new_dim is
+  // positive, then sets Out's dims to [numel(X) / new_dim, new_dim].
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceReshapeOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
+    int new_dim = ctx->Attrs().Get<int>("new_dim");
+    // new_dim is the divisor below: reject zero (integer division by zero)
+    // and negative values (meaningless output shape) before using it.
+    PADDLE_ENFORCE_GT(new_dim, 0, "Attr(new_dim) should be greater than 0.");
+    auto x_numel = product(x_dims);
+    ctx->SetOutputDim("Out",
+                      {x_numel / new_dim, static_cast<int64_t>(new_dim)});
+  }
+};
+// Proto maker for sequence_reshape: declares the operator's input, output,
+// attribute and the user-facing documentation string.
+class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Single input X, documented as a 2-D LoDTensor of shape [N, M].
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with shape "
+             "being [N, M].");
+    // Single output Out, documented as a 2-D LoDTensor of shape [T, new_dim].
+    AddOutput("Out",
+              "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with "
+              "shape [T, new_dim] where T is calculated based on X.lod, M and "
+              "new_dim.");
+    // new_dim has no default declared here, so callers must supply it.
+    AddAttr<int>("new_dim", "Sequence dimension of the output LoDTensor.");
+    AddComment(R"DOC(
+Sequence Reshape Operator.
+This operator will rearrange the input sequences. The new dimension is set by
+attribute and length of each sequence may change longer or shorter which is
+decided by original length, original dimension and new dimension. The following
+example will help to illustrate the function of this operator:
+x is a LoDTensor:
+    x.lod  = [[0, 2, 6]]
+    x.data = [[1, 2], [3, 4],
+              [5, 6], [7, 8], [9, 10], [11, 12]]
+    x.dims = [6, 2]
+set new_dim = 4
+then out is a LoDTensor:
+    out.lod  = [[0, 1, 3]]
+    out.data = [[1, 2, 3, 4],
+                [5, 6, 7, 8], [9, 10, 11, 12]]
+    out.dims = [3, 4]
+Currently, only 1-level LoDTensor is supported and please make sure (original
+length * original dimension) can be divided by new_dim with no remainder for
+each sequence.
+)DOC");
+  }
+};
+// Gradient operator definition for sequence_reshape: X@GRAD takes the dims
+// and LoD of the forward input X.
+class SequenceReshapeGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires Out@GRAD and X as inputs, then shapes X@GRAD exactly like X.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    const auto out_grad_name = framework::GradVarName("Out");
+    const auto x_grad_name = framework::GradVarName("X");
+    PADDLE_ENFORCE(
+        ctx->HasInput(out_grad_name),
+        "Input(Out@GRAD) of SequenceReshapeGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceReshapeGradOp should  not be null.");
+    const auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(x_grad_name, x_dims);
+    ctx->ShareLoD("X", /*->*/ x_grad_name);
+  }
+};
+// Builds the OpDesc of sequence_reshape_grad from the forward op's
+// description.
+class SequenceReshapeGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  // Returns a grad OpDesc that consumes X and Out@GRAD, produces X@GRAD and
+  // carries over the forward op's attributes.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Own the OpDesc from the moment it is created so it is released even if
+    // one of the setters below throws.
+    std::unique_ptr<framework::OpDesc> op_desc(new framework::OpDesc());
+    op_desc->SetType("sequence_reshape_grad");
+    op_desc->SetInput("X", Input("X"));
+    op_desc->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op_desc->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op_desc->SetAttrMap(Attrs());
+    return op_desc;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Short alias used by the registration macros below.
+namespace ops = paddle::operators;
+// Register the forward operator together with its proto maker and the maker
+// that generates the description of its gradient operator.
+REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeOp,
+                  ops::SequenceReshapeOpMaker, ops::SequenceReshapeGradOpMaker);
+// Register the gradient operator itself.
+REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp);
+// CPU kernels of the forward operator for float, double, int and int64_t.
+// NOTE(review): SequenceReshapeKernel is presumably defined in
+// sequence_reshape_op.h, which is not visible here - confirm.
+REGISTER_OP_CPU_KERNEL(
+    sequence_reshape,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int64_t>);
+// CPU kernels of the gradient operator for the same four element types.
+REGISTER_OP_CPU_KERNEL(
+    sequence_reshape_grad,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int>);
--- a/paddle/operators/sequence_reshape_op.cu
+++ b/paddle/operators/sequence_reshape_op.cu
--- a/paddle/operators/sequence_reshape_op.h
+++ b/paddle/operators/sequence_reshape_op.h
--- a/paddle/operators/split_selected_rows_op.cc
+++ b/paddle/operators/split_selected_rows_op.cc
--- a/paddle/operators/split_selected_rows_op.h
+++ b/paddle/operators/split_selected_rows_op.h
--- a/paddle/optimizer/parameter_optimizer_test.cc
+++ b/paddle/optimizer/parameter_optimizer_test.cc
--- a/paddle/optimizer/serialization.h
+++ b/paddle/optimizer/serialization.h
--- a/paddle/optimizer/serialization_test.cc
+++ b/paddle/optimizer/serialization_test.cc
--- a/paddle/platform/assert.h
+++ b/paddle/platform/assert.h
--- a/paddle/scripts/cluster_train/paddle.py
+++ b/paddle/scripts/cluster_train/paddle.py
--- a/paddle/scripts/cpplint.py
+++ b/paddle/scripts/cpplint.py
--- a/paddle/string/piece.cc
+++ b/paddle/string/piece.cc
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
--- a/paddle/string/piece_test.cc
+++ b/paddle/string/piece_test.cc
--- a/paddle/string/printf.h
+++ b/paddle/string/printf.h
--- a/paddle/string/tinyformat/tinyformat.h
+++ b/paddle/string/tinyformat/tinyformat.h
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
--- a/paddle/trainer/tests/simple_sparse_neural_network.py
+++ b/paddle/trainer/tests/simple_sparse_neural_network.py
--- a/paddle/trainer/tests/simple_sparse_neural_network_dp.py
+++ b/paddle/trainer/tests/simple_sparse_neural_network_dp.py
--- a/paddle/utils/enable_virtualenv.py
+++ b/paddle/utils/enable_virtualenv.py
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
--- a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
--- a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
--- a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+++ b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
--- a/python/paddle/utils/image_multiproc.py
+++ b/python/paddle/utils/image_multiproc.py
--- a/python/paddle/utils/plotcurve.py
+++ b/python/paddle/utils/plotcurve.py
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
--- a/python/paddle/v2/dataset/tests/imikolov_test.py
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
--- a/python/paddle/v2/dataset/tests/test_sentiment.py
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
--- a/python/paddle/v2/dataset/tests/wmt16_test.py
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
--- a/python/paddle/v2/dataset/wmt16.py
+++ b/python/paddle/v2/dataset/wmt16.py
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
--- a/python/paddle/v2/fluid/default_scope_funcs.py
+++ b/python/paddle/v2/fluid/default_scope_funcs.py
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
--- a/python/paddle/v2/fluid/distribute_transpiler_simple.py
+++ b/python/paddle/v2/fluid/distribute_transpiler_simple.py
--- a/python/paddle/v2/fluid/distributed_spliter.py
+++ b/python/paddle/v2/fluid/distributed_spliter.py
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
--- a/python/paddle/v2/fluid/layers/__init__.py
+++ b/python/paddle/v2/fluid/layers/__init__.py
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
--- a/python/paddle/v2/fluid/layers/device.py
+++ b/python/paddle/v2/fluid/layers/device.py
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
--- a/python/paddle/v2/fluid/registry.py
+++ b/python/paddle/v2/fluid/registry.py
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
--- a/python/paddle/v2/fluid/net_drawer.py
+++ b/python/paddle/v2/fluid/net_drawer.py
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
--- a/python/paddle/v2/fluid/op.py
+++ b/python/paddle/v2/fluid/op.py
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
--- a/python/paddle/v2/fluid/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
--- a/python/paddle/v2/fluid/tests/__init__.py
+++ b/python/paddle/v2/fluid/tests/__init__.py
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
--- a/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/decorators.py
+++ b/python/paddle/v2/fluid/tests/decorators.py
--- a/python/paddle/v2/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/v2/fluid/tests/demo/fc_gan.py
--- a/python/paddle/v2/fluid/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
--- a/python/paddle/v2/fluid/tests/test_accuracy_op.py
+++ b/python/paddle/v2/fluid/tests/test_accuracy_op.py
--- a/python/paddle/v2/fluid/tests/test_activation_op.py
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
--- a/python/paddle/v2/fluid/tests/test_adadelta_op.py
+++ b/python/paddle/v2/fluid/tests/test_adadelta_op.py
--- a/python/paddle/v2/fluid/tests/test_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
--- a/python/paddle/v2/fluid/tests/test_adam_op.py
+++ b/python/paddle/v2/fluid/tests/test_adam_op.py
--- a/python/paddle/v2/fluid/tests/test_adamax_op.py
+++ b/python/paddle/v2/fluid/tests/test_adamax_op.py
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
--- a/python/paddle/v2/fluid/tests/test_assign_op.py
+++ b/python/paddle/v2/fluid/tests/test_assign_op.py
--- a/python/paddle/v2/fluid/tests/test_assign_value_op.py
+++ b/python/paddle/v2/fluid/tests/test_assign_value_op.py
--- a/python/paddle/v2/fluid/tests/test_auc_op.py
+++ b/python/paddle/v2/fluid/tests/test_auc_op.py
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
--- a/python/paddle/v2/fluid/tests/test_beam_search_op.py
+++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py
--- a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
+++ b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
--- a/python/paddle/v2/fluid/tests/test_calc_gradient.py
+++ b/python/paddle/v2/fluid/tests/test_calc_gradient.py
--- a/python/paddle/v2/fluid/tests/test_cast_op.py
+++ b/python/paddle/v2/fluid/tests/test_cast_op.py
--- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
--- a/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_clip_op.py
+++ b/python/paddle/v2/fluid/tests/test_clip_op.py
--- a/python/paddle/v2/fluid/tests/test_compare_op.py
+++ b/python/paddle/v2/fluid/tests/test_compare_op.py
--- a/python/paddle/v2/fluid/tests/test_concat_op.py
+++ b/python/paddle/v2/fluid/tests/test_concat_op.py
--- a/python/paddle/v2/fluid/tests/test_cond_op.py
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
--- a/python/paddle/v2/fluid/tests/test_const_value.py
+++ b/python/paddle/v2/fluid/tests/test_const_value.py
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
--- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_conv3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_conv_shift_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv_shift_op.py
--- a/python/paddle/v2/fluid/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/fluid/tests/test_cos_sim_op.py
--- a/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
+++ b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
--- a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
+++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
--- a/python/paddle/v2/fluid/tests/test_crop_op.py
+++ b/python/paddle/v2/fluid/tests/test_crop_op.py
--- a/python/paddle/v2/fluid/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/fluid/tests/test_ctc_align.py
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
--- a/python/paddle/v2/fluid/tests/test_data_feeder.py
+++ b/python/paddle/v2/fluid/tests/test_data_feeder.py
--- a/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
--- a/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
--- a/python/paddle/v2/fluid/tests/test_detection_output_op.py
+++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
--- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py
+++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
--- a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
--- a/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
--- a/python/paddle/v2/fluid/tests/test_edit_distance_op.py
+++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
--- a/python/paddle/v2/fluid/tests/test_elementwise_max_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_max_op.py
--- a/python/paddle/v2/fluid/tests/test_elementwise_min_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_min_op.py
--- a/python/paddle/v2/fluid/tests/test_clip.py
+++ b/python/paddle/v2/fluid/tests/test_clip.py
--- a/python/paddle/v2/fluid/tests/test_exception.py
+++ b/python/paddle/v2/fluid/tests/test_exception.py
--- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
--- a/python/paddle/v2/fluid/tests/test_expand_op.py
+++ b/python/paddle/v2/fluid/tests/test_expand_op.py
--- a/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
--- a/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
--- a/python/paddle/v2/fluid/tests/test_fill_constant_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_constant_op.py
--- a/python/paddle/v2/fluid/tests/test_fill_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_op.py
--- a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
--- a/python/paddle/v2/fluid/tests/test_framework_debug_str.py
+++ b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
--- a/python/paddle/v2/fluid/tests/test_ftrl_op.py
+++ b/python/paddle/v2/fluid/tests/test_ftrl_op.py
--- a/python/paddle/v2/fluid/tests/test_gather_op.py
+++ b/python/paddle/v2/fluid/tests/test_gather_op.py
--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
--- a/python/paddle/v2/fluid/tests/test_get_places_op.py
+++ b/python/paddle/v2/fluid/tests/test_get_places_op.py
--- a/python/paddle/v2/fluid/tests/test_gradient_clip.py
+++ b/python/paddle/v2/fluid/tests/test_gradient_clip.py
--- a/python/paddle/v2/fluid/tests/test_gru_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_op.py
--- a/python/paddle/v2/fluid/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
--- a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_huber_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
--- a/python/paddle/v2/fluid/tests/test_infer_shape.py
+++ b/python/paddle/v2/fluid/tests/test_infer_shape.py
--- a/python/paddle/v2/fluid/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
--- a/python/paddle/v2/fluid/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
--- a/python/paddle/v2/fluid/tests/test_is_empty_op.py
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
--- a/python/paddle/v2/fluid/tests/test_l1_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_l1_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
--- a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
+++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
--- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
--- a/python/paddle/v2/fluid/tests/test_lod_reset_op.py
+++ b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
--- a/python/paddle/v2/fluid/tests/test_log_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_logical_op.py
+++ b/python/paddle/v2/fluid/tests/test_logical_op.py
--- a/python/paddle/v2/fluid/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
--- a/python/paddle/v2/fluid/tests/test_lrn_op.py
+++ b/python/paddle/v2/fluid/tests/test_lrn_op.py
--- a/python/paddle/v2/fluid/tests/test_lstm_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstm_op.py
--- a/python/paddle/v2/fluid/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_math_op_patch.py
+++ b/python/paddle/v2/fluid/tests/test_math_op_patch.py
--- a/python/paddle/v2/fluid/tests/test_matmul_op.py
+++ b/python/paddle/v2/fluid/tests/test_matmul_op.py
--- a/python/paddle/v2/fluid/tests/test_maxout_op.py
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
--- a/python/paddle/v2/fluid/tests/test_mean_op.py
+++ b/python/paddle/v2/fluid/tests/test_mean_op.py
--- a/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py
--- a/python/paddle/v2/fluid/tests/test_minus_op.py
+++ b/python/paddle/v2/fluid/tests/test_minus_op.py
--- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
--- a/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_momentum_op.py
+++ b/python/paddle/v2/fluid/tests/test_momentum_op.py
--- a/python/paddle/v2/fluid/tests/test_mul_op.py
+++ b/python/paddle/v2/fluid/tests/test_mul_op.py
--- a/python/paddle/v2/fluid/tests/test_multiplex_op.py
+++ b/python/paddle/v2/fluid/tests/test_multiplex_op.py
--- a/python/paddle/v2/fluid/tests/test_nce.py
+++ b/python/paddle/v2/fluid/tests/test_nce.py
--- a/python/paddle/v2/fluid/tests/test_net.py
+++ b/python/paddle/v2/fluid/tests/test_net.py
--- a/python/paddle/v2/fluid/tests/test_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
+++ b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
--- a/python/paddle/v2/fluid/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
--- a/python/paddle/v2/fluid/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
--- a/python/paddle/v2/fluid/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
--- a/python/paddle/v2/fluid/tests/test_pad_op.py
+++ b/python/paddle/v2/fluid/tests/test_pad_op.py
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
--- a/python/paddle/v2/fluid/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
--- a/python/paddle/v2/fluid/tests/test_pool2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool2d_op.py
--- a/python/paddle/v2/fluid/tests/test_pool3d_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool3d_op.py
--- a/python/paddle/v2/fluid/tests/test_pool_max_op.py
+++ b/python/paddle/v2/fluid/tests/test_pool_max_op.py
--- a/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
+++ b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
--- a/python/paddle/v2/fluid/tests/test_precision_recall_op.py
+++ b/python/paddle/v2/fluid/tests/test_precision_recall_op.py
--- a/python/paddle/v2/fluid/tests/test_prelu_op.py
+++ b/python/paddle/v2/fluid/tests/test_prelu_op.py
--- a/python/paddle/v2/fluid/tests/test_print_op.py
+++ b/python/paddle/v2/fluid/tests/test_print_op.py
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
--- a/python/paddle/v2/fluid/tests/test_protobuf.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf.py
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
--- a/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
+++ b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
--- a/python/paddle/v2/fluid/tests/test_proximal_gd_op.py
+++ b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py
--- a/python/paddle/v2/fluid/tests/test_rank_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_rank_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
--- a/python/paddle/v2/fluid/tests/test_reduce_op.py
+++ b/python/paddle/v2/fluid/tests/test_reduce_op.py
--- a/python/paddle/v2/fluid/tests/test_registry.py
+++ b/python/paddle/v2/fluid/tests/test_registry.py
--- a/python/paddle/v2/fluid/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
--- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
--- a/python/paddle/v2/fluid/tests/test_reshape_op.py
+++ b/python/paddle/v2/fluid/tests/test_reshape_op.py
--- a/python/paddle/v2/fluid/tests/test_rmsprop_op.py
+++ b/python/paddle/v2/fluid/tests/test_rmsprop_op.py
--- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
--- a/python/paddle/v2/fluid/tests/test_row_conv_op.py
+++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py
--- a/python/paddle/v2/fluid/tests/test_scale_op.py
+++ b/python/paddle/v2/fluid/tests/test_scale_op.py
--- a/python/paddle/v2/fluid/tests/test_scatter_op.py
+++ b/python/paddle/v2/fluid/tests/test_scatter_op.py
--- a/python/paddle/v2/fluid/tests/test_scope.py
+++ b/python/paddle/v2/fluid/tests/test_scope.py
--- a/python/paddle/v2/fluid/tests/test_selected_rows.py
+++ b/python/paddle/v2/fluid/tests/test_selected_rows.py
--- a/python/paddle/v2/fluid/tests/test_seq_concat_op.py
+++ b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
--- a/python/paddle/v2/fluid/tests/test_seq_conv.py
+++ b/python/paddle/v2/fluid/tests/test_seq_conv.py
--- a/python/paddle/v2/fluid/tests/test_seq_pool.py
+++ b/python/paddle/v2/fluid/tests/test_seq_pool.py
--- a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
--- a/python/paddle/v2/fluid/tests/test_sequence_expand.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_expand.py
--- a/python/paddle/v2/fluid/tests/test_sequence_reshape.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_reshape.py
--- a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
--- a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
--- a/python/paddle/v2/fluid/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
--- a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/v2/fluid/tests/test_sign_op.py
+++ b/python/paddle/v2/fluid/tests/test_sign_op.py
--- a/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_softmax_op.py
+++ b/python/paddle/v2/fluid/tests/test_softmax_op.py
--- a/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
--- a/python/paddle/v2/fluid/tests/test_split_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_op.py
--- a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
--- a/python/paddle/v2/fluid/tests/test_spp_op.py
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
--- a/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
+++ b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
--- a/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_sum_op.py
+++ b/python/paddle/v2/fluid/tests/test_sum_op.py
--- a/python/paddle/v2/fluid/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
--- a/python/paddle/v2/fluid/tests/test_top_k_op.py
+++ b/python/paddle/v2/fluid/tests/test_top_k_op.py
--- a/python/paddle/v2/fluid/tests/test_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_transpose_op.py
--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
--- a/python/paddle/v2/fluid/tests/test_unpool_op.py
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
--- a/python/paddle/v2/fluid/tests/test_warpctc_op.py
+++ b/python/paddle/v2/fluid/tests/test_warpctc_op.py
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
--- a/python/paddle/v2/master/__init__.py
+++ b/python/paddle/v2/master/__init__.py
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
--- a/python/paddle/v2/reader/tests/__init__.py
+++ b/python/paddle/v2/reader/tests/__init__.py
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
--- a/python/paddle/v2/tests/test_image.py
+++ b/python/paddle/v2/tests/test_image.py
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
--- a/python/paddle/v2/tests/test_op.py
+++ b/python/paddle/v2/tests/test_op.py
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ b/python/paddle/v2/tests/test_paramconf_order.py
--- a/python/paddle/v2/tests/test_parameters.py
+++ b/python/paddle/v2/tests/test_parameters.py
--- a/python/paddle/v2/tests/test_rnn_layer.py
+++ b/python/paddle/v2/tests/test_rnn_layer.py
--- a/python/paddle/v2/tests/test_topology.py
+++ b/python/paddle/v2/tests/test_topology.py
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
--- a/tools/manylinux1/build_scripts/manylinux1-check.py
+++ b/tools/manylinux1/build_scripts/manylinux1-check.py
--- a/tools/manylinux1/build_scripts/python-tag-abi-tag.py
+++ b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
--- a/tools/manylinux1/build_scripts/ssl-check.py
+++ b/tools/manylinux1/build_scripts/ssl-check.py