diff --git a/.clang-format b/.clang-format
index aff93435f58c522f5ed1090aef2005f76e91cf31..8b5830627348c6bff12260b7d9adbd357f074718 100644
--- a/.clang-format
+++ b/.clang-format
@@ -19,7 +19,7 @@ BasedOnStyle: Google
IndentWidth: 2
TabWidth: 2
ContinuationIndentWidth: 4
-AccessModifierOffset: -2 # The private/protected/public has no indent in class
+AccessModifierOffset: -1 # The private/protected/public has no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
diff --git a/.copyright.hook b/.copyright.hook
deleted file mode 100644
index 09afff2072df3384a429d01d06188218ae6e85d1..0000000000000000000000000000000000000000
--- a/.copyright.hook
+++ /dev/null
@@ -1,121 +0,0 @@
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import argparse
-import io, re
-import sys, os
-import subprocess
-import platform
-
-COPYRIGHT = '''
- Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-'''
-
-LANG_COMMENT_MARK = None
-
-NEW_LINE_MARK = None
-
-COPYRIGHT_HEADER = None
-
-if platform.system() == "Windows":
- NEW_LINE_MARK = "\r\n"
-else:
- NEW_LINE_MARK = '\n'
- COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
- p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
- process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
- date, err = process.communicate()
- date = date.decode("utf-8").rstrip("\n")
- COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
-
-
-def generate_copyright(template, lang='C'):
- if lang == 'Python':
- LANG_COMMENT_MARK = '#'
- else:
- LANG_COMMENT_MARK = "//"
-
- lines = template.split(NEW_LINE_MARK)
- BLANK = " "
- ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
- for lino, line in enumerate(lines):
- if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
- if len(line) == 0:
- BLANK = ""
- else:
- BLANK = " "
- ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
-
- return ans + "\n"
-
-
-def lang_type(filename):
- if filename.endswith(".py"):
- return "Python"
- elif filename.endswith(".h"):
- return "C"
- elif filename.endswith(".c"):
- return "C"
- elif filename.endswith(".hpp"):
- return "C"
- elif filename.endswith(".cc"):
- return "C"
- elif filename.endswith(".cpp"):
- return "C"
- elif filename.endswith(".cu"):
- return "C"
- elif filename.endswith(".cuh"):
- return "C"
- elif filename.endswith(".go"):
- return "C"
- elif filename.endswith(".proto"):
- return "C"
- else:
- print("Unsupported filetype %s", filename)
- exit(0)
-
-
-PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
-
-
-def main(argv=None):
- parser = argparse.ArgumentParser(
- description='Checker for copyright declaration.')
- parser.add_argument('filenames', nargs='*', help='Filenames to check')
- args = parser.parse_args(argv)
-
- retv = 0
- for filename in args.filenames:
- fd = io.open(filename, encoding="utf-8")
- first_line = fd.readline()
- second_line = fd.readline()
- if "COPYRIGHT (C)" in first_line.upper(): continue
- if first_line.startswith("#!") or PYTHON_ENCODE.match(
- second_line) != None or PYTHON_ENCODE.match(first_line) != None:
- continue
- original_contents = io.open(filename, encoding="utf-8").read()
- new_contents = generate_copyright(
- COPYRIGHT, lang_type(filename)) + original_contents
- print('Auto Insert Copyright Header {}'.format(filename))
- retv = 1
- with io.open(filename, 'w') as output_file:
- output_file.write(new_contents)
-
- return retv
-
-
-if __name__ == '__main__':
- exit(main())
diff --git a/.gitignore b/.gitignore
index 9e3a0b499f9f42856429f3a42bef313ea3df3699..b92bb9cc129659fa502b4a9b55548992412e5429 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store
+*.vs
build/
build_doc/
*.user
@@ -15,6 +16,7 @@ build_doc/
.cproject
.pydevproject
.settings/
+CMakeSettings.json
Makefile
.test_env/
third_party/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6140340890c0e5025eb08209e8ea78df918b4dc0..e718b32cb6c48d11e73600509a17db107f438708 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
- id: clang-format-with-version-check
name: clang-format
description: Format files with ClangFormat.
- entry: bash ./.clang_format.hook -i
+ entry: bash ./tools/codestyle/clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: local
@@ -34,6 +34,14 @@ repos:
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+- repo: local
+ hooks:
+ - id: pylint-doc-string
+ name: pylint
+ description: Check python docstring style using docstring_checker.
+ entry: bash ./tools/codestyle/pylint_pre_commit.hook
+ language: system
+ files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
@@ -44,7 +52,7 @@ repos:
hooks:
- id: copyright_checker
name: copyright_checker
- entry: python ./.copyright.hook
+ entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
diff --git a/.travis.yml b/.travis.yml
index 3391e2c3cab9938c9dc5705b51367c707d3bbe9d..361136ac2c8d899a0d7a4d7945083fcc489551b5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,8 @@ env:
addons:
ssh_known_hosts: 13.229.163.131
before_install:
+ # For pylint dockstring checker
+ - sudo pip install pylint pytest astroid isort
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
@@ -25,15 +27,6 @@ script:
# 43min timeout
paddle/scripts/paddle_docker_build.sh ${JOB}
if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
- - |
- if [[ "$JOB" != "doc" ]]; then exit 0; fi;
- # For document only
- if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
- if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
- export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
- export DOCS_DIR=`pwd`
- cd ..
- curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
diff --git a/AUTHORS.md b/AUTHORS.md
index 4ee05420982d13f686cf13e8957ce41dfcdd2cb8..41b7193677a0208ba2fa82b72862292572dcb6ef 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -4,6 +4,7 @@
| backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi |
+| ChengduoZH | Cheng-Duo Zhao|
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |
@@ -21,6 +22,7 @@
| jczaja | Jacek Czaja |
| JiayiFeng | Jia-Yi Feng |
| kbinias | Krzysztof Binias |
+| kexinzhao | Ke-Xin Zhao |
| kuke | Yi-Bing Liu |
| lcy-seso | Ying Cao |
| lipeng-unisound | Peng Li |
@@ -44,6 +46,7 @@
| tianbingsz | Tian-Bing Xu |
| tpatejko | Tomasz Patejko |
| typhoonzero | Yi Wu |
+| velconia | Qi-Yang Min |
| wanghaoshuang | Hao-Shuang Wang |
| wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23bbe829ac16180088bfa37df66e23f19b021ea3..b1d0abdf2ceb4cf338dde782a97a6df906149655 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,8 +24,10 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+if(WIN32)
+ set(CMAKE_STATIC_LIBRARY_PREFIX lib)
+endif(WIN32)
-find_package(Sphinx)
if(NOT CMAKE_CROSSCOMPILING)
find_package(CUDA QUIET)
endif(NOT CMAKE_CROSSCOMPILING)
@@ -42,7 +44,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
-option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
@@ -57,10 +58,25 @@ option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
-option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
+option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
+option(WITH_CONTRIB "Compile the third-party contributation" OFF)
+option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
+option(WITH_ANAKIN "Compile with Anakin library" OFF)
+option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
+option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
+option(WITH_INFERENCE "Compile fluid inference library" ON)
+option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
+option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
+
+# PY_VERSION
+if(NOT PY_VERSION)
+ set(PY_VERSION 2.7)
+endif()
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@@ -98,9 +114,17 @@ if(ANDROID OR IOS)
add_definitions(-DPADDLE_MOBILE_INFERENCE)
endif()
+if (APPLE OR WIN32)
+ set(WITH_MKL OFF CACHE STRING
+ "Disable MKL for building on mac and windows" FORCE)
+endif()
+
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
+set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
+ "A path setting fluid shared and static libraries")
+
if (WITH_C_API AND WITH_PYTHON)
message(WARNING "It is suggest not embedded a python interpreter in Paddle "
"when using C-API. It will give an unpredictable behavior when using a "
@@ -118,16 +142,23 @@ else()
endif()
set(WITH_MKLML ${WITH_MKL})
-if (WITH_MKL AND AVX2_FOUND)
- set(WITH_MKLDNN ON)
-else()
- message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
- set(WITH_MKLDNN OFF)
+if (NOT DEFINED WITH_MKLDNN)
+ if (WITH_MKL AND AVX2_FOUND)
+ set(WITH_MKLDNN ON)
+ else()
+ message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+ set(WITH_MKLDNN OFF)
+ endif()
endif()
+if (REPLACE_ENFORCE_GLOG)
+ add_definitions("-DREPLACE_ENFORCE_GLOG")
+endif()
########################################################################################
include(external/mklml) # download mklml package
+include(external/xbyak) # download xbyak package
+include(external/libxsmm) # download, build, install libxsmm
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
@@ -137,34 +168,68 @@ include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
-include(external/warpctc) # download, build, install warpctc
include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
-include(external/grpc)
+include(external/cub)
+
+if (NOT WIN32)
+# there is no official support of snappystream, warpctc, nccl, cupti in windows
include(external/snappy) # download snappy
-include(external/snappystream)
-include(external/threadpool)
+include(external/snappystream) # download snappystream
+include(external/warpctc) # download, build, install warpctc
+include(cupti)
+endif (NOT WIN32)
+
+if(WITH_DISTRIBUTE)
+ if(WITH_GRPC)
+ include(external/grpc)
+ message(STATUS "Use grpc framework.")
+ else()
+ message(STATUS "Use brpc framework.")
+ include(external/leveldb)
+ include(external/brpc)
+ endif()
+endif()
+
+if(WITH_BRPC_RDMA)
+ message(STATUS "Use brpc with rdma.")
+ if(WITH_GRPC)
+ message(FATAL_ERROR "Can't use grpc with brpc rdma.")
+ endif()
+ if(NOT WITH_DISTRIBUTE)
+ message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
+ endif()
+endif()
+
+include(external/threadpool)
+include(flags) # set paddle compile flags
include(cudnn) # set cudnn libraries, must before configure
-include(cupti)
include(configure) # add paddle env configuration
+
+if(WITH_GPU)
+ include(cuda)
+ include(tensorrt)
+ include(external/anakin)
+elseif()
+ set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
+endif()
+
include(generic) # simplify cmake module
include(package) # set paddle packages
-include(cpplint) # set paddle c++ style
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(rdma) # set rdma libraries
-include(flags) # set paddle compile flags
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
-include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
@@ -177,11 +242,6 @@ set(EXTERNAL_LIBS
${PYTHON_LIBRARIES}
)
-if(WITH_GPU)
- include(cuda)
- include(tensorrt)
-endif(WITH_GPU)
-
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
@@ -191,6 +251,10 @@ if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
+if(WITH_LIBXSMM)
+ list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
+endif()
+
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
@@ -202,10 +266,10 @@ endif(USE_NNPACK)
add_subdirectory(proto)
-if(NOT MOBILE_INFERENCE)
+if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer.
- add_subdirectory(paddle/optimizer)
+ add_subdirectory(paddle/legacy/optimizer)
endif()
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
@@ -226,5 +290,7 @@ if(WITH_PYTHON)
endif()
if(WITH_DOC)
+ find_package(Sphinx REQUIRED)
+ find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c36cffcb4eeaaf7f8cff5167777628dd2697e7d..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-
create mode 100644 233
```
+ NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
+
1. Build and test
Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
@@ -157,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
-- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
diff --git a/Dockerfile b/Dockerfile
index 85e3c5371125f65bb959e921c570683d9e8ae3e3..634be18a51bf61e96a8bf6f263b6674a7932d6e4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -23,13 +23,13 @@ ENV HOME /root
COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
- apt-get install -y --allow-downgrades \
- git python-pip python-dev openssh-server bison \
+ apt-get install -y --allow-downgrades patchelf \
+ git python-pip python-dev python-opencv openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
- automake locales clang-format swig doxygen cmake \
+ automake locales clang-format swig cmake \
liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool ccache && \
@@ -70,7 +70,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
-RUN pip install --upgrade pip==9.0.3 && \
+RUN easy_install -U pip && \
pip install -U wheel && \
pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
@@ -79,6 +79,9 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
+#For docstring checker
+RUN pip install pylint pytest astroid isort LinkChecker
+
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
@@ -101,6 +104,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
-
-# development image default do build work
-CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/Dockerfile.android b/Dockerfile.android
index 848a7eba6f1421432addae8acff407b611adb4ae..48db2efea21a648657e3f490c95429b9a29ede52 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/README.md b/README.md
index a3b13fe79cc33927e5cc3e091926b111688a941b..a67cb8ad439f462c361cb6bac2449c3a4b042126 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,6 @@
[](https://travis-ci.org/PaddlePaddle/Paddle)
[](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
[](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
-[](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
[](https://github.com/PaddlePaddle/Paddle/releases)
[](LICENSE)
@@ -19,6 +18,22 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
+
+### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
+### Install Latest Stable Release:
+```
+# Linux CPU
+pip install paddlepaddle
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu
+# Linux GPU cuda8cudnn7
+pip install paddlepaddle-gpu==0.14.0.post87
+# Linux GPU cuda8cudnn5
+pip install paddlepaddle-gpu==0.14.0.post85
+
+# For installation on other platform, refer to http://paddlepaddle.org/
+```
+
## Features
- **Flexibility**
@@ -62,9 +77,9 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
## Installation
It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
## Documentation
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
index 7b66e8a5b5020fd847982db401665d24ba3a069c..fb4114356d4f37efc8ad672316fd4f99443d9fcd 100644
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index 64816098a524f064ec12474a736cd4c721227a70..0000000000000000000000000000000000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
- - Kubernetes: v1.6.2
- - Linux Kernel: v3.10.0
-
-- Resource
- - CPU: 10 Cores per Pod
- - Memory: 5GB per Pod
-
-- Docker Image
-
- We use different base Docker Image to run the benchmark on Kubernetes:
- - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
- vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
- - Batch Size of training data.
- - PServer count of the training job.
- - The number of trainers.
-
-- Invariant
- - The resource of trainer/pserver Pod.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-
-
-
-
-Batch Size |
- 32 |
-64 |
-128 |
-256 |
-
-
-
-
- PaddlePaddle Fluid |
-- |
-- |
-- |
-- |
-
-
-PaddlePaddle v2 |
-- |
-- |
-- |
-- |
-
-
-TensorFlow |
-- |
-- |
-- |
-- |
-
-
-
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-
-
-
-
-PServer Count |
-10 |
-20 |
-40 |
-60 |
-
-
-
-
- PaddlePaddle Fluid |
-- |
-- |
-- |
-- |
-
-
-PaddlePaddle v2 |
-- |
-- |
-- |
-- |
-
-
-TensorFlow |
-- |
-- |
-- |
-- |
-
-
-
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = \div(T1, TN)$
-
-which S is the ratio of T1 over TN, training time of 1 and N trainers.
-The parallel efficiency is:
-
-$E = \div(S, N)$
-
-
-
-
-Trainer Counter |
-1 |
-10 |
-20 |
-30 |
-40 |
-50 |
-60 |
-70 |
-80 |
-90 |
-100 |
-
-
-
-
- PaddlePaddle Fluid |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-
-
-PaddlePaddle v2 |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-
-
-TensorFlow |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-- |
-
-
-
-
-
-## Reproduce the benchmark
-
-TODO
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# if network is slowly, you may need to add proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# unset proxy if it is setted.
-# ENV https_proxy=""
-
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# below lines may change a lot for debugging
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index d56a912b9b03986e32693363f82df05a34b779e9..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Infomation
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- cpu MHz : 2101.000
-- cache size : 20480 KB
-
-### Blas settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-
-
-
-Batch Size |
- 32 |
-64 |
-128 |
-256 |
-
-
-
-
- PaddlePaddle Fluid |
- 15.44 |
- 16.32 |
- 16.74 |
- 16.79 |
-
-
-PaddlePaddle v2 |
- 15.97 |
- 17.04 |
- 17.60 |
- 17.83 |
-
-
-TensorFlow |
- 9.09 |
- 9.10 |
- 9.24 |
- 8.66 |
-
-
-
-
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-
-
-
-Batch Size |
- 32 |
-64 |
-128 |
-256 |
-
-
-
-
- PaddlePaddle Fluid |
- 190.20 |
- 222.15 |
- 247.40 |
- 258.18 |
-
-
-PaddlePaddle v2 |
- 170.96 |
- 233.71 |
- 256.14 |
- 329.23 |
-
-
-TensorFlow |
- - |
- - |
- - |
- - |
-
-
-
-
-### Accelerate Rate
-
-- Pserver Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-
-
-
-Trainer Count |
-20 |
-40 |
-80 |
-100 |
-
-
-
-
- PaddlePaddle Fluid |
- 263.29 (78.64%) |
- 518.80 (77.47%) |
- 836.26 (62.44%) |
- 1019.29 (60.89%) |
-
-
-PaddlePaddle v2 (need more tests) |
- 326.85 (92.85%) |
- 534.58 (75.93%) |
- 853.30 (60.60%) |
- 1041.99 (59.20%) |
-
-
-TensorFlow |
- - |
- - |
- - |
- - |
-
-
-
-
-
-### Different Pserver Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples/ sec
-
-
-
-
-PServer Count |
-3 |
-6 |
-10 |
-20 |
-
-
-
-
- PaddlePaddle Fluid(should fix in next PR) |
- 589.1 |
- 592.6 |
- 656.4 |
- 655.8 |
-
-
-PaddlePaddle v2 (need more tests) |
- 593.4 |
- 791.3 |
- 729.7 |
- 821.7 |
-
-
-TensorFlow |
- - |
- - |
- - |
- - |
-
-
-
-
-
-*The performance gap between Fuild and v2 comes from the network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbos Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: MKL_NUM_THREADS
- value: "1"
- - name: TRAINING_ROLE
- value: "PSERVER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- command: ["paddle_k8s", "start_fluid"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_fluid"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: TRAINING_ROLE
- value: "TRAINER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
deleted file mode 100644
index 8c0501439e9d5fa175f5aa9b62d286e690a10904..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Update to point to the source file.
-VGG_SRC="vgg16_fluid.py"
-
-export TRAINING_ROLE=PSERVER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-
-# Need to wait for the ps to start first.
-sleep 10
-echo "done start ps"
-
-export TRAINING_ROLE=TRAINER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
- ret=$1
- stdbuf -oL echo "job returned $ret...setting pod return message..."
- stdbuf -oL echo "==============================="
-
- if [ $ret -eq 136 ] ; then
- echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
- elif [ $ret -eq 139 ] ; then
- echo "Segmentation Fault" > /dev/termination-log
- elif [ $ret -eq 1 ] ; then
- echo "General Error" > /dev/termination-log
- elif [ $ret -eq 134 ] ; then
- echo "Program Abort" > /dev/termination-log
- fi
- stdbuf -oL echo "termination log wroted..."
- exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
- pserver_label="tf-job-pserver=${JOB_NAME}"
- trainer_label="tf-job-trainer=${JOB_NAME}"
-
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
- g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
- g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
- wait_running_pods
-
- label="tf-job-pserver=${JOB_NAME}"
- pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
- wait_running_pods
-
- label="tf-job-trainer=${JOB_NAME}"
- trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
- check_trainer_ret $?
-}
-
-start_tf(){
- if [[ "${TF_JOB_NAME}" == "worker" ]]; then
- start_tf_trainer
- else
- start_tf_pserver
- fi
-}
-
-usage() {
- echo "usage: tf_k8s []:"
- echo " start_tf Start tensorflow jobs"
-}
-
-case "$1" in
- start_tf)
- start_tf
- ;;
- --help)
- usage
- ;;
- *)
- usage
- ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-tf-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- tf-job-pserver: vgg16job-tf
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: PSERVERS_NUM
- value: "10"
- - name: TF_JOB_NAME
- value: "ps"
- - name: TRAINERS_NUM
- value: "20"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-tf-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- tf-job-trainer: vgg16job-tf
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: TF_JOB_NAME
- value: "worker"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: PSERVERS_NUM
- value: "10"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINERS_NUM
- value: "20"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16v2job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16v2job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "python train.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- command: ["paddle_k8s", "start_pserver"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16v2job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16v2job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_trainer", "v2"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: BATCH_SIZE
- value: "256"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "2"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index 05b5f3977cbed2f08df73c6d8ba2fff687db3313..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,308 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import argparse
-import functools
-import os
-from paddle.fluid import debuger
-
-
-def str2bool(v):
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NCHW',
- choices=['NCHW', 'NHWC'],
- help='The data order, now only support NCHW.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='cifar10',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-parser.add_argument(
- '--local',
- type=str2bool,
- default=True,
- help='Whether to run as local mode.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--trainer_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--profile", action='store_true', help="If set, profile a few steps.")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
- def conv_block(input, num_filter, groups, dropouts):
- return fluid.nets.img_conv_group(
- input=input,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act='relu',
- conv_with_batchnorm=True,
- conv_batchnorm_drop_rate=dropouts,
- pool_type='max')
-
- conv1 = conv_block(input, 64, 2, [0.3, 0])
- conv2 = conv_block(conv1, 128, 2, [0.4, 0])
- conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
- conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
- conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
- drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
- fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
- bn = fluid.layers.batch_norm(input=fc1, act='relu')
- drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
- fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
- return fc2
-
-
-def main():
- if args.data_set == "cifar10":
- classdim = 10
- if args.data_format == 'NCHW':
- data_shape = [3, 32, 32]
- else:
- data_shape = [32, 32, 3]
- else:
- classdim = 102
- if args.data_format == 'NCHW':
- data_shape = [3, 224, 224]
- else:
- data_shape = [224, 224, 3]
-
- # Input data
- images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
- # Train program
- net = vgg16_bn_drop(images)
- predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size)
-
- # inference program
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(batch_acc)
-
- # Optimization
- optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
- optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
- # Initialize executor
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
- args.device_id)
- exe = fluid.Executor(place)
-
- # test
- def test(exe):
- test_pass_acc = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape(data_shape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- outs = exe.run(inference_program,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size])
- test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
- return test_pass_acc.eval()
-
- def train_loop(exe, trainer_prog):
- iters = 0
- ts = time.time()
- train_pass_acc = fluid.average.WeightedAverage()
- for pass_id in range(args.num_passes):
- # train
- start_time = time.time()
- num_samples = 0
- train_pass_acc.reset()
-
- def run_step(batch_id, data):
- img_data = np.array(
- map(lambda x: x[0].reshape(data_shape), data)).astype(
- "float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- loss, acc, b_size = exe.run(
- trainer_prog,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[avg_cost, batch_acc, batch_size])
- return loss, acc, b_size
-
- if args.profile and args.task_index == 0:
- # warmup.
- for batch_id, data in enumerate(train_reader()):
- if batch_id > 5: break
- run_step(batch_id, data)
- with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
- for batch_id, data in enumerate(train_reader()):
- if batch_id > 5: break
- run_step(batch_id, data)
-
- for batch_id, data in enumerate(train_reader()):
- ts = time.time()
- loss, acc, b_size = run_step(batch_id, data)
- iters += 1
- num_samples += len(data)
- train_pass_acc.add(value=acc, weight=b_size)
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
- "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
- len(data) / (time.time() - ts))
- ) # The accuracy is the accumulation of batches, but not the current batch.
-
- pass_elapsed = time.time() - start_time
- pass_train_acc = train_pass_acc.eval()
- pass_test_acc = test(exe)
- print("Task:%d Pass = %d, Training performance = %f imgs/s, "
- "Train accuracy = %f, Test accuracy = %f\n" %
- (args.task_index, pass_id, num_samples / pass_elapsed,
- pass_train_acc, pass_test_acc))
-
- if args.local:
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
- train_loop(exe, fluid.default_main_program())
- else:
- trainers = int(os.getenv("TRAINERS")) # total trainer count
- print("trainers total: ", trainers)
-
- training_role = os.getenv(
- "TRAINING_ROLE",
- "TRAINER") # get the training role: trainer/pserver
-
- t = fluid.DistributeTranspiler()
- t.transpile(
- trainer_id=args.task_index,
- pservers=args.ps_hosts,
- trainers=trainers)
-
- if training_role == "PSERVER":
- current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
- "PADDLE_INIT_PORT")
- if not current_endpoint:
- print("need env SERVER_ENDPOINT")
- exit(1)
- pserver_prog = t.get_pserver_program(current_endpoint)
- pserver_startup = t.get_startup_program(current_endpoint,
- pserver_prog)
- exe.run(pserver_startup)
- exe.run(pserver_prog)
- elif training_role == "TRAINER":
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
- paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
-
- trainer_prog = t.get_trainer_program()
- feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
- # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
- exe.run(fluid.default_startup_program())
- train_loop(exe, trainer_prog)
- else:
- print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == "__main__":
- print_arguments()
- main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 2d220478acae46566760209dbc012cff316946aa..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can get distribution example template structure here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NHWC',
- choices=['NCHW', 'NHWC'],
- help='The data order, NCHW=[batch, channels, height, width].'
- 'Only support NHWC right now.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='cifar10',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--worker_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
- def __init__(self):
- self.parameters = []
-
- def batch_norm_relu(self, inputs, is_training):
- """Performs a batch normalization followed by a ReLU."""
- # We set fused=True for a significant speed boost. See
- # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
- inputs = tf.layers.batch_normalization(
- inputs=inputs,
- axis=1 if args.data_format == 'NCHW' else -1,
- momentum=0.9,
- epsilon=1e-05,
- center=True,
- scale=True,
- training=is_training,
- fused=True)
- inputs = tf.nn.relu(inputs)
- return inputs
-
- def conv_bn_layer(self,
- name,
- images,
- kernel_shape,
- is_training,
- drop_rate=0.0):
- with tf.name_scope(name) as scope:
- kernel = tf.Variable(
- tf.truncated_normal(
- kernel_shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- conv = tf.nn.conv2d(
- images,
- kernel, [1, 1, 1, 1],
- data_format=args.data_format,
- padding='SAME')
- biases = tf.Variable(
- tf.constant(
- 0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(conv, biases)
- out = self.batch_norm_relu(out, is_training)
- out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
- return out
-
- def fc_layer(self, name, inputs, shape):
- with tf.name_scope(name) as scope:
- fc_w = tf.Variable(
- tf.truncated_normal(
- shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- fc_b = tf.Variable(
- tf.constant(
- 0.0, shape=[shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
- return out
-
- def network(self, images, class_dim, is_training):
- """ VGG16 model structure.
-
- TODO(kuke): enable this network to support the 'NCHW' data format
- """
-
- # conv1
- conv1_1 = self.conv_bn_layer(
- 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
- conv1_2 = self.conv_bn_layer(
- 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
- # pool1
- pool1 = tf.nn.max_pool(
- conv1_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool1')
- # conv2
- conv2_1 = self.conv_bn_layer(
- 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
- conv2_2 = self.conv_bn_layer(
- 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
- # pool2
- pool2 = tf.nn.max_pool(
- conv2_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool2')
- # conv3
- conv3_1 = self.conv_bn_layer(
- 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
- conv3_2 = self.conv_bn_layer(
- 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
- conv3_3 = self.conv_bn_layer(
- 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
- # pool3
- pool3 = tf.nn.max_pool(
- conv3_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool3')
- # conv4
- conv4_1 = self.conv_bn_layer(
- 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
- conv4_2 = self.conv_bn_layer(
- 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv4_3 = self.conv_bn_layer(
- 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool4
- pool4 = tf.nn.max_pool(
- conv4_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool4')
- # conv5
- conv5_1 = self.conv_bn_layer(
- 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_2 = self.conv_bn_layer(
- 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_3 = self.conv_bn_layer(
- 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool5
- pool5 = tf.nn.max_pool(
- conv5_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool4')
- # flatten
- shape = int(np.prod(pool5.get_shape()[1:]))
- pool5_flat = tf.reshape(pool5, [-1, shape])
- # fc1
- drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
- fc1 = self.fc_layer('fc1', drop, [shape, 512])
- # fc2
- bn = self.batch_norm_relu(fc1, is_training)
- drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
- fc2 = self.fc_layer('fc2', drop, [512, 512])
-
- fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
- return fc3
-
-
-def run_benchmark(cluster_spec, server):
- """Run benchmark on cifar10 or flowers."""
-
- if args.data_set == "cifar10":
- class_dim = 10
- raw_shape = (3, 32, 32)
- dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
- None, 3, 32, 32)
- else:
- class_dim = 102
- raw_shape = (3, 224, 224)
- dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
- None, 3, 224, 224)
-
- device = tf.train.replica_device_setter(
- worker_device="/job:worker/task:{}".format(args.task_index),
- cluster=cluster_spec)
-
- with tf.device(device):
- images = tf.placeholder(tf.float32, shape=dat_shape)
- labels = tf.placeholder(tf.int64, shape=(None, ))
- is_training = tf.placeholder('bool')
- onehot_labels = tf.one_hot(labels, depth=class_dim)
-
- vgg16 = VGG16Model()
- logits = vgg16.network(images, class_dim, is_training)
- loss = tf.losses.softmax_cross_entropy(
- onehot_labels=onehot_labels, logits=logits)
- avg_loss = tf.reduce_mean(loss)
-
- correct = tf.equal(tf.argmax(logits, 1), labels)
- accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
- optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
- update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- global_step = tf.Variable(0, name='global_step', trainable=False)
- with tf.control_dependencies(update_ops):
- train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
- summary_op = tf.summary.merge_all()
- init_op = tf.global_variables_initializer()
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- buf_size=5120),
- batch_size=args.batch_size)
-
- # test
- def test():
- test_accs = []
- for batch_id, data in enumerate(test_reader()):
- test_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
- test_accs.append(
- accuracy.eval(feed_dict={
- images: test_images,
- labels: test_labels,
- is_training: False
- }))
- return np.mean(test_accs)
-
- config = tf.ConfigProto(
- intra_op_parallelism_threads=1,
- inter_op_parallelism_threads=1,
- log_device_placement=True)
- config.gpu_options.allow_growth = True
-
- hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
- with tf.train.MonitoredTrainingSession(
- master=server.target,
- is_chief=(args.task_index == 0),
- hooks=hooks,
- config=config) as sess:
- iters, num_samples, start_time = 0, 0, 0.0
- for pass_id in range(args.num_passes):
- # train
- num_samples = 0
- start_time = time.time()
- for batch_id, data in enumerate(train_reader()):
- train_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- train_labels = np.array(map(lambda x: x[1], data)).astype(
- 'int64')
- iter_begin_time = time.time()
- _, loss, acc = sess.run([train_op, avg_loss, accuracy],
- feed_dict={
- images: train_images,
- labels: train_labels,
- is_training: True
- })
- iters += 1
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
- % (pass_id, iters, loss, acc,
- len(data) / (time.time() - iter_begin_time)))
- num_samples += len(data)
- train_elapsed = time.time() - start_time
- # test
- pass_test_acc = test()
- print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
- (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- print_arguments()
-
- ps_hosts = args.ps_hosts.split(",")
- worker_hosts = args.worker_hosts.split(",")
-
- # Create a cluster from the parameter server and worker hosts.
- cluster_spec = tf.train.ClusterSpec({
- "ps": ps_hosts,
- "worker": worker_hosts
- })
-
- # Create and start a server for the local task.
- server = tf.train.Server(
- cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
- if args.job_name == "ps":
- print("start pserver")
- server.join()
- elif args.job_name == "worker":
- print("start worker")
- run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
- BATCH_SIZE = int(BATCH_SIZE)
-else:
- BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
- def conv_block(input, num_filter, groups, num_channels=None):
- return paddle.networks.img_conv_group(
- input=input,
- num_channels=num_channels,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act=paddle.activation.Relu(),
- pool_type=paddle.pooling.Max())
-
- assert len(nums) == 5
- # the channel of input feature is 3
- conv1 = conv_block(input, 64, nums[0], 3)
- conv2 = conv_block(conv1, 128, nums[1])
- conv3 = conv_block(conv2, 256, nums[2])
- conv4 = conv_block(conv3, 512, nums[3])
- conv5 = conv_block(conv4, 512, nums[4])
-
- fc_dim = 512
- fc1 = paddle.layer.fc(input=conv5,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- fc2 = paddle.layer.fc(input=fc1,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- out = paddle.layer.fc(input=fc2,
- size=class_dim,
- act=paddle.activation.Softmax())
- return out
-
-
-def vgg13(input, class_dim):
- nums = [2, 2, 2, 2, 2]
- return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
- nums = [2, 2, 3, 3, 3]
- return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
- nums = [2, 2, 4, 4, 4]
- return vgg(input, nums, class_dim)
-
-
-def main():
- global ts
- paddle.init(use_gpu=False)
- image = paddle.layer.data(
- name="image", type=paddle.data_type.dense_vector(DATA_DIM))
- lbl = paddle.layer.data(
- name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
- extra_layers = None
- # NOTE: for v2 distributed training need averaging updates.
- learning_rate = 1e-3 / NODE_COUNT
- out = vgg16(image, class_dim=CLASS_DIM)
- cost = paddle.layer.classification_cost(input=out, label=lbl)
-
- # Create parameters
- parameters = paddle.parameters.create(cost)
-
- # Create optimizer
- optimizer = paddle.optimizer.Momentum(
- momentum=0.9,
- regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
- BATCH_SIZE),
- learning_rate=learning_rate / BATCH_SIZE,
- learning_rate_decay_a=0.1,
- learning_rate_decay_b=128000 * 35,
- learning_rate_schedule="discexp", )
-
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- cifar.train10(),
- # To use other data, replace the above line with:
- # reader.train_reader('train.list'),
- buf_size=1000),
- batch_size=BATCH_SIZE)
- test_reader = paddle.batch(
- cifar.test10(),
- # To use other data, replace the above line with:
- # reader.test_reader('val.list'),
- batch_size=BATCH_SIZE)
-
- # Create trainer
- trainer = paddle.trainer.SGD(cost=cost,
- parameters=parameters,
- update_equation=optimizer,
- extra_layers=extra_layers,
- is_local=False)
-
- # End batch and end pass event handler
- def event_handler(event):
- global ts, ts_pass
- if isinstance(event, paddle.event.BeginPass):
- ts_pass = time.time()
- if isinstance(event, paddle.event.BeginIteration):
- ts = time.time()
- if isinstance(event, paddle.event.EndIteration):
- if event.batch_id % 1 == 0:
- print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
- event.pass_id, event.batch_id, event.cost, event.metrics,
- time.time() - ts)
- if isinstance(event, paddle.event.EndPass):
- print "Pass %d end, spent: %f" % (event.pass_id,
- time.time() - ts_pass)
- result = trainer.test(reader=test_reader)
- print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
- trainer.train(
- reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
- main()
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..707fadb1fae97cefe8a41715cd57d71754abda41
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,31 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+
+# Use UBUNTU_MIRROR can speed up apt-get speed.
+# ARG UBUNTU_MIRROR
+# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
+
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle
+
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN chmod +x /usr/bin/paddle_k8s
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD models/ /workspace/models/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..28cade4634bb62723bf5120169e202657f548234
--- /dev/null
+++ b/benchmark/fluid/README.md
@@ -0,0 +1,99 @@
+# Fluid Benchmark
+
+This directory contains several models configurations and tools that used to run
+Fluid benchmarks for local and distributed training.
+
+
+## Run the Benchmark
+
+To start, run the following command to get the full help message:
+
+```bash
+python fluid_benchmark.py --help
+```
+
+Currently supported `--model` argument include:
+
+* mnist
+* resnet
+ * you can chose to use different dataset using `--data_set cifar10` or
+ `--data_set flowers`.
+* vgg
+* stacked_dynamic_lstm
+* machine_translation
+
+* Run the following command to start a benchmark job locally:
+ ```bash
+ python fluid_benchmark.py --model mnist --device GPU
+ ```
+ You can choose to use GPU/CPU training. With GPU training, you can specify
+ `--gpus ` to run multi GPU training.
+ You can set async mode parameter server. With async mode, you can specify
+ `--async_mode` to train model asynchronous.
+* Run distributed training with parameter servers:
+ * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
+ * start parameter servers:
+ ```bash
+ PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
+ sleep 15
+ ```
+ * start trainers:
+ ```bash
+ PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
+ ```
+* Run distributed training using NCCL2
+ ```bash
+ PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
+ ```
+
+## Prepare the RecordIO file to Achieve Better Performance
+
+Run the following command will generate RecordIO files like "mnist.recordio" under the path
+and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
+at any time using `fluid.batch`.
+
+```bash
+python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
+```
+
+## Run Distributed Benchmark on Kubernetes Cluster
+
+You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
+have to start all those processes mannually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with, you may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it by your own. Once you've got the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
+We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
+distributed benchmark jobs to your cluster. To generate a job yaml, just run:
+
+```bash
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
+```
+
+Then the yaml files are generated under directory `myjob`, you can run:
+
+```bash
+kubectl create -f myjob/
+```
+
+The job shall start.
+
+
+## Notes for Run Fluid Distributed with NCCL2 and RDMA
+
+Before running NCCL2 distributed jobs, please check that whether your node has multiple network
+interfaces, try to add the environment variable `export NCCL_SOCKET_IFNAME=eth0` to use your actual
+network device.
+
+To run high-performance distributed training, you must prepare your hardware environment to be
+able to run RDMA enabled network communication, please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
+note for details.
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
new file mode 100644
index 0000000000000000000000000000000000000000..a79f25ccc6ace1594f3f331633130eaace5e175b
--- /dev/null
+++ b/benchmark/fluid/args.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+__all__ = ['parse_args', ]
+
+BENCHMARK_MODELS = [
+ "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+ parser = argparse.ArgumentParser('Fluid model benchmarks.')
+ parser.add_argument(
+ '--model',
+ type=str,
+ choices=BENCHMARK_MODELS,
+ default='resnet',
+ help='The model to run benchmark with.')
+ parser.add_argument(
+ '--batch_size', type=int, default=32, help='The minibatch size.')
+ # args related to learning rate
+ parser.add_argument(
+ '--learning_rate', type=float, default=0.001, help='The learning rate.')
+ # TODO(wuyi): add "--use_fake_data" option back.
+ parser.add_argument(
+ '--skip_batch_num',
+ type=int,
+ default=5,
+ help='The first num of minibatch num to skip, for better performance test'
+ )
+ parser.add_argument(
+ '--iterations', type=int, default=80, help='The number of minibatches.')
+ parser.add_argument(
+ '--pass_num', type=int, default=100, help='The number of passes.')
+ parser.add_argument(
+ '--data_format',
+ type=str,
+ default='NCHW',
+ choices=['NCHW', 'NHWC'],
+ help='The data data_format, now only support NCHW.')
+ parser.add_argument(
+ '--device',
+ type=str,
+ default='GPU',
+ choices=['CPU', 'GPU'],
+ help='The device type.')
+ parser.add_argument(
+ '--gpus',
+ type=int,
+ default=1,
+ help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+ # this option is available only for vgg and resnet.
+ parser.add_argument(
+ '--cpus',
+ type=int,
+ default=1,
+ help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+ parser.add_argument(
+ '--data_set',
+ type=str,
+ default='flowers',
+ choices=['cifar10', 'flowers'],
+ help='Optional dataset for benchmark.')
+ parser.add_argument(
+ '--infer_only', action='store_true', help='If set, run forward only.')
+ parser.add_argument(
+ '--use_cprof', action='store_true', help='If set, use cProfile.')
+ parser.add_argument(
+ '--use_nvprof',
+ action='store_true',
+ help='If set, use nvprof for CUDA.')
+ parser.add_argument(
+ '--no_test',
+ action='store_true',
+ help='If set, do not test the testset during training.')
+ parser.add_argument(
+ '--memory_optimize',
+ action='store_true',
+ help='If set, optimize runtime memory before start.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+ help='If set ommit the actual read data operators.')
+ parser.add_argument(
+ '--profile', action='store_true', help='If set, profile a few steps.')
+ parser.add_argument(
+ '--update_method',
+ type=str,
+ default='local',
+ choices=['local', 'pserver', 'nccl2'],
+ help='Choose parameter update method, can be local, pserver, nccl2.')
+ parser.add_argument(
+ '--no_split_var',
+ action='store_true',
+ default=False,
+ help='Whether split variables into blocks when update_method is pserver')
+ parser.add_argument(
+ '--async_mode',
+ action='store_true',
+ default=False,
+ help='Whether start pserver in async mode to support ASGD')
+ parser.add_argument(
+ '--use_reader_op',
+ action='store_true',
+ help='Whether to use reader op, and must specify the data path if set this to true.'
+ )
+ parser.add_argument(
+ '--data_path',
+ type=str,
+ default="",
+ help='Directory that contains all the training recordio files.')
+ parser.add_argument(
+ '--use_inference_transpiler',
+ action='store_true',
+ help='If set, use inference transpiler to optimize the program.')
+ parser.add_argument(
+ '--no_random',
+ action='store_true',
+ help='If set, keep the random seed and do not shuffle the data.')
+ args = parser.parse_args()
+ return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..53d010434a8ebbe0184d84f588783f25186d606a
--- /dev/null
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -0,0 +1,372 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+from args import *
+
+
+def append_nccl2_prepare(trainer_id):
+ if trainer_id >= 0:
+ # append gen_nccl_id at the end of startup program
+ trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+ port = os.getenv("PADDLE_PSERVER_PORT")
+ worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+ worker_endpoints = []
+ for ip in worker_ips.split(","):
+ worker_endpoints.append(':'.join([ip, port]))
+ num_trainers = len(worker_endpoints)
+ current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+ worker_endpoints.remove(current_endpoint)
+
+ nccl_id_var = fluid.default_startup_program().global_block().create_var(
+ name="NCCLID",
+ persistable=True,
+ type=fluid.core.VarDesc.VarType.RAW)
+ fluid.default_startup_program().global_block().append_op(
+ type="gen_nccl_id",
+ inputs={},
+ outputs={"NCCLID": nccl_id_var},
+ attrs={
+ "endpoint": current_endpoint,
+ "endpoint_list": worker_endpoints,
+ "trainer_id": trainer_id
+ })
+ return nccl_id_var, num_trainers, trainer_id
+ else:
+ raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
+ "nccl-based dist train.")
+
+
+def dist_transpile(trainer_id, args):
+ if trainer_id < 0:
+ return None, None
+
+ # the port of all pservers, needed by both trainer and pserver
+ port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+ # comma separated ips of all pservers, needed by trainer and
+ # pserver
+ pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+ eplist = []
+ for ip in pserver_ips.split(","):
+ eplist.append(':'.join([ip, port]))
+ pserver_endpoints = ",".join(eplist)
+ # total number of workers/trainers in the job, needed by
+ # trainer and pserver
+ trainers = int(os.getenv("PADDLE_TRAINERS"))
+ # the IP of the local machine, needed by pserver only
+ current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+ # the role, should be either PSERVER or TRAINER
+ training_role = os.getenv("PADDLE_TRAINING_ROLE")
+
+ t = distribute_transpiler.DistributeTranspiler()
+ t.transpile(
+ trainer_id,
+ pservers=pserver_endpoints,
+ trainers=trainers,
+ sync_mode=not args.async_mode)
+ if training_role == "PSERVER":
+ pserver_program = t.get_pserver_program(current_endpoint)
+ pserver_startup_program = t.get_startup_program(current_endpoint,
+ pserver_program)
+ return pserver_program, pserver_startup_program
+ elif training_role == "TRAINER":
+ train_program = t.get_trainer_program()
+ return train_program, fluid.default_startup_program()
+ else:
+ raise ValueError(
+ 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+ )
+
+
+def test(exe, inference_program, test_reader, feeder, batch_acc):
+ accuracy_evaluator = fluid.metrics.Accuracy()
+ for batch_id, data in enumerate(test_reader()):
+ acc = exe.run(inference_program,
+ feed=feeder.feed(data),
+ fetch_list=[batch_acc])
+ accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+
+ return accuracy_evaluator.eval()
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
+ args, train_prog, startup_prog):
+ if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+ place = core.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(startup_prog)
+ exe.run(train_prog)
+ return
+
+ if args.use_fake_data:
+ raise Exception(
+ "fake data is not supported in single GPU test for now.")
+
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ exe = fluid.Executor(place)
+ exe.run(startup_prog)
+
+ # Use inference_transpiler to speedup
+ if not args.use_reader_op:
+ feed_var_list = [
+ var for var in train_prog.global_block().vars.itervalues()
+ if var.is_data
+ ]
+ feeder = fluid.DataFeeder(feed_var_list, place)
+
+ iters, num_samples, start_time = 0, 0, time.time()
+ for pass_id in range(args.pass_num):
+ train_losses = []
+ if not args.use_reader_op:
+ reader_generator = train_reader()
+ batch_id = 0
+ data = None
+ while True:
+ if not args.use_reader_op:
+ data = next(reader_generator, None)
+ if data == None:
+ break
+ if iters == args.iterations:
+ reader_generator.close()
+ break
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+
+ if args.use_reader_op:
+ try:
+ loss = exe.run(train_prog, fetch_list=[avg_loss])
+ except fluid.core.EnforceNotMet as ex:
+ break
+ else:
+ loss = exe.run(train_prog,
+ feed=feeder.feed(data),
+ fetch_list=[avg_loss])
+ iters += 1
+ batch_id += 1
+ # FIXME(wuyi): For use_reader_op, if the current
+ # pass is not the last, the last batch of this pass
+ # is also equal to args.batch_size.
+ if args.use_reader_op:
+ num_samples += args.batch_size * args.gpus
+ else:
+ num_samples += len(data)
+ train_losses.append(loss)
+ print("Pass: %d, Iter: %d, Loss: %f\n" %
+ (pass_id, iters, np.mean(train_losses)))
+ print_train_time(start_time, time.time(), num_samples)
+ print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
+ # evaluation
+ if not args.no_test and batch_acc and not args.use_reader_op:
+ if args.use_inference_transpiler:
+ t = fluid.InferenceTranspiler()
+ t.transpile(infer_prog, place)
+
+ pass_test_acc = test(exe, infer_prog, test_reader, feeder,
+ batch_acc)
+ print(", Test Accuracy: %f" % pass_test_acc)
+ print("\n")
+ # TODO(wuyi): add warmup passes to get better perf data.
+ exit(0)
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
+ batch_acc, args, train_prog, startup_prog, nccl_id_var,
+ num_trainers, trainer_id):
+ place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ if not args.use_reader_op:
+ feed_var_list = [
+ var for var in train_prog.global_block().vars.itervalues()
+ if var.is_data
+ ]
+ feeder = fluid.DataFeeder(feed_var_list, place)
+
+ # generate fake:
+ if args.use_fake_data:
+ for var in feed_var_list:
+ v = startup_prog.global_block()._clone_variable(var)
+ var.persistable = True
+ v.persistable = True
+
+ real_shape = list(var.shape)
+ real_shape[0] = args.batch_size / args.gpus
+ startup_prog.global_block().append_op(
+ outputs={"Out": v},
+ type="fill_constant",
+ attrs={"shape": real_shape,
+ "value": 1.0,
+ "dtype": var.dtype})
+
+ if nccl_id_var and trainer_id == 0:
+ #FIXME(wuyi): wait other trainer to start listening
+ time.sleep(30)
+
+ startup_exe = fluid.Executor(place)
+ startup_exe.run(startup_prog)
+ strategy = fluid.ExecutionStrategy()
+ strategy.num_threads = 1
+ strategy.allow_op_delay = False
+ exe = fluid.ParallelExecutor(
+ True,
+ avg_loss.name,
+ exec_strategy=strategy,
+ num_trainers=num_trainers,
+ trainer_id=trainer_id)
+
+ for pass_id in range(args.pass_num):
+ num_samples = 0
+ iters = 0
+ start_time = time.time()
+ if not args.use_reader_op:
+ reader_generator = train_reader()
+ batch_id = 0
+ data = None
+ while True:
+ if not args.use_reader_op:
+ data = next(reader_generator, None)
+ if data == None:
+ break
+ if iters == args.iterations:
+ reader_generator.close()
+ break
+ if args.profile and pass_id == 0 and batch_id == 5:
+ profiler.start_profiler("All")
+ elif args.profile and pass_id == 0 and batch_id == 10:
+ profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
+ if iters == args.skip_batch_num:
+ start_time = time.time()
+ num_samples = 0
+ if args.use_fake_data or args.use_reader_op:
+ try:
+ loss, = exe.run([avg_loss.name])
+ except fluid.core.EnforceNotMet as ex:
+ break
+ else:
+ loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+ if args.use_reader_op:
+ num_samples += args.batch_size * args.gpus
+ else:
+ num_samples += len(data)
+ iters += 1
+ if batch_id % 1 == 0:
+ print("Pass %d, batch %d, loss %s" %
+ (pass_id, batch_id, np.array(loss)))
+ batch_id += 1
+
+ print_train_time(start_time, time.time(), num_samples)
+ if not args.no_test and batch_acc and not args.use_reader_op:
+ # we have not implement record io for test
+ # skip test when use args.use_reader_op
+ test_acc = test(startup_exe, infer_prog, test_reader, feeder,
+ batch_acc)
+ print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+
+
+def print_arguments(args):
+ vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+ vars(args)['device'] == 'GPU')
+ print('----------- Configuration Arguments -----------')
+ for arg, value in sorted(vars(args).iteritems()):
+ print('%s: %s' % (arg, value))
+ print('------------------------------------------------')
+
+
+def print_train_time(start_time, end_time, num_samples):
+ train_elapsed = end_time - start_time
+ examples_per_sec = num_samples / train_elapsed
+ print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+
+
+def print_paddle_envs():
+ print('----------- Configuration envs -----------')
+ for k in os.environ:
+ if "PADDLE_" in k:
+ print "ENV %s:%s" % (k, os.environ[k])
+ print('------------------------------------------------')
+
+
+def main():
+ args = parse_args()
+ print_arguments(args)
+ print_paddle_envs()
+ if args.no_random:
+ fluid.default_startup_program().random_seed = 1
+
+ # the unique trainer id, starting from 0, needed by trainer
+ # only
+ nccl_id_var, num_trainers, trainer_id = (
+ None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
+
+ if args.use_cprof:
+ pr = cProfile.Profile()
+ pr.enable()
+ model_def = __import__("models.%s" % args.model, fromlist=["models"])
+ train_args = list(model_def.get_model(args))
+ train_args.append(args)
+ # Run optimizer.minimize(avg_loss)
+ train_args[2].minimize(train_args[0])
+ if args.memory_optimize:
+ fluid.memory_optimize(fluid.default_main_program())
+
+ if args.update_method == "pserver":
+ train_prog, startup_prog = dist_transpile(trainer_id, args)
+ if not train_prog:
+ raise Exception(
+ "Must configure correct environments to run dist train.")
+ train_args.extend([train_prog, startup_prog])
+ if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
+ train_args.extend([nccl_id_var, num_trainers, trainer_id])
+ train_parallel(*train_args)
+ train(*train_args)
+ exit(0)
+
+ # for other update methods, use default programs
+ train_args.append(fluid.default_main_program())
+ train_args.append(fluid.default_startup_program())
+
+ if args.update_method == "nccl2":
+ nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
+ if args.gpus == 1:
+ # NOTE: parallel executor use profiler interanlly
+ if args.use_nvprof and args.device == 'GPU':
+ with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+ train(*train_args)
+ else:
+ train(*train_args)
+ else:
+ if args.device == "CPU":
+ raise Exception("Only support GPU perf with parallel exe")
+ train_args.extend([nccl_id_var, num_trainers, trainer_id])
+ train_parallel(*train_args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfe8b5cdd58456902fa8ec355e9837dface3f7be
--- /dev/null
+++ b/benchmark/fluid/kube_gen_job.py
@@ -0,0 +1,197 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import copy
+import argparse
+import random
+import os
+import copy
+from kube_templates import pserver, trainer, envs
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Generate dist job yamls.')
+
+ parser.add_argument(
+ '--jobname', default="paddlejob", help='unique job name')
+ parser.add_argument(
+ '--cpu', default=1, type=int, help='CPU cores per trainer node')
+ parser.add_argument(
+ '--pscpu', default=1, type=int, help='CPU cores per pserver node')
+ parser.add_argument(
+ '--gpu', default=0, type=int, help='num of GPUs per node')
+ parser.add_argument(
+ '--image',
+ default="bootstrapper:5000/fluid_benchmark:gpu",
+ help='num of GPUs per node')
+ parser.add_argument(
+ '--pservers', default=1, type=int, help='num of pservers')
+ parser.add_argument(
+ '--trainers', default=1, type=int, help='num of trainers')
+ parser.add_argument('--memory', default=1, type=int, help='trainer memory')
+ parser.add_argument(
+ '--psmemory', default=1, type=int, help='pserver memory')
+ parser.add_argument(
+ '--port', default=30236, type=int, help='num of trainers')
+ parser.add_argument(
+ '--entry', default="python train.py", help='command to run')
+ parser.add_argument(
+ '--fluid', default=1, type=int, help='whether is fluid job')
+ parser.add_argument(
+ '--rdma', action='store_true', help='whether mount rdma libs')
+ parser.add_argument(
+ '--disttype',
+ default="pserver",
+ type=str,
+ choices=['pserver', 'nccl2', 'local'],
+ help='pserver or nccl2 or local')
+
+ args = parser.parse_args()
+ return args
+
+
+def gen_job():
+ ps = pserver
+ tn = trainer
+ args = parse_args()
+
+ ps_container = ps["spec"]["template"]["spec"]["containers"][0]
+ tn_container = tn["spec"]["template"]["spec"]["containers"][0]
+
+ if args.fluid == 1:
+ ps_container["command"] = \
+ ["paddle_k8s", "start_fluid"]
+ tn_container["command"] = \
+ ["paddle_k8s", "start_fluid"]
+ ps["metadata"]["name"] = args.jobname + "-pserver"
+ ps["spec"]["template"]["metadata"]["labels"][
+ "paddle-job-pserver"] = args.jobname
+ tn["metadata"]["name"] = args.jobname + "-trainer"
+ tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
+
+ ps_container["image"] = args.image
+ tn_container["image"] = args.image
+
+ ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
+ ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
+ ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
+ ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
+
+ tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
+ tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
+ tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
+ tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
+ if args.gpu > 0:
+ tn_container["resources"]["requests"][
+ "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+ tn_container["resources"]["limits"][
+ "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+
+ ps["spec"]["replicas"] = int(args.pservers)
+ tn["spec"]["parallelism"] = int(args.trainers)
+ tn["spec"]["completions"] = int(args.trainers)
+ ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
+ ps_container["ports"][0]["containerPort"] = args.port
+ spreadport = random.randint(40000, 60000)
+ tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
+ tn_container["ports"][0]["containerPort"] = spreadport
+
+ envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
+ envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
+ envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
+ envs.append({"name": "ENTRY", "value": args.entry})
+ envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
+ # NOTE: these directories below are cluster specific, please modify
+ # this settings before you run on your own cluster.
+ envs.append({
+ "name": "LD_LIBRARY_PATH",
+ "value":
+ "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
+ })
+
+ volumes = [{
+ "name": "nvidia-driver",
+ "hostPath": {
+ "path": "/usr/local/nvidia/lib64"
+ }
+ }]
+ volumeMounts = [{
+ "mountPath": "/usr/local/nvidia/lib64",
+ "name": "nvidia-driver"
+ }]
+
+ if args.rdma:
+ volumes.extend([{
+ "name": "ibetc",
+ "hostPath": {
+ "path": "/etc/libibverbs.d"
+ }
+ }, {
+ "name": "iblibs",
+ "hostPath": {
+ "path": "/usr/local/rdma"
+ }
+ }, {
+ "name": "valgrind",
+ "hostPath": {
+ "path": "/usr/lib64/mlnx_ofed/valgrind"
+ }
+ }])
+ volumeMounts.extend([{
+ "mountPath": "/etc/libibverbs.d",
+ "name": "ibetc"
+ }, {
+ "mountPath": "/usr/local/rdma",
+ "name": "iblibs"
+ }, {
+ "mountPath": "/usr/lib64/mlnx_ofed/valgrind",
+ "name": "valgrind"
+ }])
+ # append shm for NCCL2
+ volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
+ volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
+
+ tn["spec"]["template"]["spec"]["volumes"] = volumes
+ tn_container["volumeMounts"] = volumeMounts
+
+ ps_container["env"] = copy.deepcopy(envs)
+ ps_container["env"].append({
+ "name": "PADDLE_TRAINING_ROLE",
+ "value": "PSERVER"
+ })
+ tn_container["env"] = envs
+ if args.disttype == "pserver":
+ tn_container["env"].append({
+ "name": "PADDLE_TRAINING_ROLE",
+ "value": "TRAINER"
+ })
+ elif args.disttype == "nccl2" or args.disttype == "local":
+ # NCCL2 have no training role, set to plain WORKER
+ tn_container["env"].append({
+ "name": "PADDLE_TRAINING_ROLE",
+ "value": "WORKER"
+ })
+
+ os.mkdir(args.jobname)
+ if args.disttype == "pserver":
+ with open("%s/pserver.yaml" % args.jobname, "w") as fn:
+ yaml.dump(ps, fn)
+
+ with open("%s/trainer.yaml" % args.jobname, "w") as fn:
+ yaml.dump(tn, fn)
+
+
+if __name__ == "__main__":
+ gen_job()
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d09d940a5ee638e4b55405d05924e2d76006cfc
--- /dev/null
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pserver import pserver
+from trainer import trainer
+
+__all__ = ["pserver", "trainer", "envs"]
+
+envs = [
+ # envs that don't need to change
+ {
+ "name": "GLOG_v",
+ "value": "0"
+ },
+ {
+ "name": "GLOG_logtostderr",
+ "value": "1"
+ },
+ {
+ "name": "TOPOLOGY",
+ "value": ""
+ },
+ {
+ "name": "TRAINER_PACKAGE",
+ "value": "/workspace"
+ },
+ {
+ "name": "PADDLE_INIT_NICS",
+ "value": "eth2"
+ },
+ {
+ "name": "NAMESPACE",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "metadata.namespace"
+ }
+ }
+ },
+ {
+ "name": "POD_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ }
+ },
+ {
+ "name": "PADDLE_CURRENT_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ }
+ }
+]
diff --git a/benchmark/fluid/kube_templates/pserver.py b/benchmark/fluid/kube_templates/pserver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b54982c806ad4229fbd4bd7edf82a4e7eb4c5ad1
--- /dev/null
+++ b/benchmark/fluid/kube_templates/pserver.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pserver = {
+ "apiVersion": "extensions/v1beta1",
+ "kind": "ReplicaSet",
+ "metadata": {
+ "name": "jobname-pserver"
+ },
+ "spec": {
+ "replicas": 1,
+ "template": {
+ "metadata": {
+ "labels": {
+ "paddle-job-pserver": "jobname"
+ }
+ },
+ "spec": {
+ "hostNetwork": True,
+ "imagePullSecrets": [{
+ "name": "job-registry-secret"
+ }],
+ "containers": [{
+ "name": "pserver",
+ "image": "",
+ "imagePullPolicy": "Always",
+ "ports": [{
+ "name": "jobport-1",
+ "containerPort": 1
+ }],
+ "env": [],
+ "command": ["paddle_k8s", "start_pserver"],
+ "resources": {
+ "requests": {
+ "memory": "10Gi",
+ "cpu": "4"
+ },
+ "limits": {
+ "memory": "10Gi",
+ "cpu": "4"
+ }
+ }
+ }]
+ }
+ }
+ }
+}
diff --git a/benchmark/fluid/kube_templates/trainer.py b/benchmark/fluid/kube_templates/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b915d31e371d9d787ff64d705e32baf301e16abe
--- /dev/null
+++ b/benchmark/fluid/kube_templates/trainer.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+trainer = {
+ "apiVersion": "batch/v1",
+ "kind": "Job",
+ "metadata": {
+ "name": "jobname-pserver"
+ },
+ "spec": {
+ "parallelism": 4,
+ "completions": 4,
+ "template": {
+ "metadata": {
+ "labels": {
+ "paddle-job": "jobname"
+ }
+ },
+ "spec": {
+ "hostNetwork": True,
+ "imagePullSecrets": [{
+ "name": "job-registry-secret"
+ }],
+ "restartPolicy": "Never",
+ "containers": [{
+ "name": "trainer",
+ "image": "",
+ "imagePullPolicy": "Always",
+ # to let container set rlimit
+ "securityContext": {
+ "privileged": True
+ # TODO(wuyi): use below specific cap instead of privileged,
+ # using privileged will cause all GPU device are visible
+ # in the container.
+ # "capabilities": {
+ # "add": ["SYS_RESOURCE"]
+ # }
+ },
+ "ports": [{
+ "name": "jobport-1",
+ "containerPort": 1
+ }],
+ "env": [],
+ "command": ["paddle_k8s", "start_trainer", "v2"],
+ "resources": {
+ "requests": {
+ "memory": "10Gi",
+ "cpu": "4",
+ },
+ "limits": {
+ "memory": "10Gi",
+ "cpu": "4",
+ }
+ }
+ }]
+ }
+ }
+ }
+}
diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
deleted file mode 100644
index adde5f21acd4e77d58a453d6868abeccfca4bb5a..0000000000000000000000000000000000000000
--- a/benchmark/fluid/machine_translation.py
+++ /dev/null
@@ -1,379 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""seq2seq model for fluid."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-import distutils.util
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle.fluid.executor import Executor
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- "--embedding_dim",
- type=int,
- default=512,
- help="The dimension of embedding table. (default: %(default)d)")
-parser.add_argument(
- "--encoder_size",
- type=int,
- default=512,
- help="The size of encoder bi-rnn unit. (default: %(default)d)")
-parser.add_argument(
- "--decoder_size",
- type=int,
- default=512,
- help="The size of decoder rnn unit. (default: %(default)d)")
-parser.add_argument(
- "--batch_size",
- type=int,
- default=16,
- help="The sequence number of a mini-batch data. (default: %(default)d)")
-parser.add_argument(
- '--skip_batch_num',
- type=int,
- default=5,
- help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
- '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
- "--dict_size",
- type=int,
- default=30000,
- help="The dictionary capacity. Dictionaries of source sequence and "
- "target dictionary have same capacity. (default: %(default)d)")
-parser.add_argument(
- "--pass_num",
- type=int,
- default=2,
- help="The pass number to train. (default: %(default)d)")
-parser.add_argument(
- "--learning_rate",
- type=float,
- default=0.0002,
- help="Learning rate used to train the model. (default: %(default)f)")
-parser.add_argument(
- "--infer_only", action='store_true', help="If set, run forward only.")
-parser.add_argument(
- "--beam_size",
- type=int,
- default=3,
- help="The width for beam searching. (default: %(default)d)")
-parser.add_argument(
- '--device',
- type=str,
- default='GPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument(
- "--max_length",
- type=int,
- default=250,
- help="The maximum length of sequence when doing generation. "
- "(default: %(default)d)")
-parser.add_argument(
- '--with_test',
- action='store_true',
- help='If set, test the testset during training.')
-
-
-def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
- def linear(inputs):
- return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
-
- forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
- input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
- output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
- cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
-
- cell_t = fluid.layers.sums(input=[
- fluid.layers.elementwise_mul(
- x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
- x=input_gate, y=cell_tilde)
- ])
-
- hidden_t = fluid.layers.elementwise_mul(
- x=output_gate, y=fluid.layers.tanh(x=cell_t))
-
- return hidden_t, cell_t
-
-
-def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
- target_dict_dim, is_generating, beam_size, max_length):
- """Construct a seq2seq network."""
-
- def bi_lstm_encoder(input_seq, gate_size):
- # Linear transformation part for input gate, output gate, forget gate
- # and cell activation vectors need be done outside of dynamic_lstm.
- # So the output size is 4 times of gate_size.
- input_forward_proj = fluid.layers.fc(input=input_seq,
- size=gate_size * 4,
- act=None,
- bias_attr=False)
- forward, _ = fluid.layers.dynamic_lstm(
- input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
- input_reversed_proj = fluid.layers.fc(input=input_seq,
- size=gate_size * 4,
- act=None,
- bias_attr=False)
- reversed, _ = fluid.layers.dynamic_lstm(
- input=input_reversed_proj,
- size=gate_size * 4,
- is_reverse=True,
- use_peepholes=False)
- return forward, reversed
-
- src_word_idx = fluid.layers.data(
- name='source_sequence', shape=[1], dtype='int64', lod_level=1)
-
- src_embedding = fluid.layers.embedding(
- input=src_word_idx,
- size=[source_dict_dim, embedding_dim],
- dtype='float32')
-
- src_forward, src_reversed = bi_lstm_encoder(
- input_seq=src_embedding, gate_size=encoder_size)
-
- encoded_vector = fluid.layers.concat(
- input=[src_forward, src_reversed], axis=1)
-
- encoded_proj = fluid.layers.fc(input=encoded_vector,
- size=decoder_size,
- bias_attr=False)
-
- backward_first = fluid.layers.sequence_pool(
- input=src_reversed, pool_type='first')
-
- decoder_boot = fluid.layers.fc(input=backward_first,
- size=decoder_size,
- bias_attr=False,
- act='tanh')
-
- def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
- decoder_boot, decoder_size):
- def simple_attention(encoder_vec, encoder_proj, decoder_state):
- decoder_state_proj = fluid.layers.fc(input=decoder_state,
- size=decoder_size,
- bias_attr=False)
- decoder_state_expand = fluid.layers.sequence_expand(
- x=decoder_state_proj, y=encoder_proj)
- concated = fluid.layers.concat(
- input=[encoder_proj, decoder_state_expand], axis=1)
- attention_weights = fluid.layers.fc(input=concated,
- size=1,
- act='tanh',
- bias_attr=False)
- attention_weights = fluid.layers.sequence_softmax(
- input=attention_weights)
- weigths_reshape = fluid.layers.reshape(
- x=attention_weights, shape=[-1])
- scaled = fluid.layers.elementwise_mul(
- x=encoder_vec, y=weigths_reshape, axis=0)
- context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
- return context
-
- rnn = fluid.layers.DynamicRNN()
-
- cell_init = fluid.layers.fill_constant_batch_size_like(
- input=decoder_boot,
- value=0.0,
- shape=[-1, decoder_size],
- dtype='float32')
- cell_init.stop_gradient = False
-
- with rnn.block():
- current_word = rnn.step_input(target_embedding)
- encoder_vec = rnn.static_input(encoder_vec)
- encoder_proj = rnn.static_input(encoder_proj)
- hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
- cell_mem = rnn.memory(init=cell_init)
- context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
- decoder_inputs = fluid.layers.concat(
- input=[context, current_word], axis=1)
- h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
- rnn.update_memory(hidden_mem, h)
- rnn.update_memory(cell_mem, c)
- out = fluid.layers.fc(input=h,
- size=target_dict_dim,
- bias_attr=True,
- act='softmax')
- rnn.output(out)
- return rnn()
-
- if not is_generating:
- trg_word_idx = fluid.layers.data(
- name='target_sequence', shape=[1], dtype='int64', lod_level=1)
-
- trg_embedding = fluid.layers.embedding(
- input=trg_word_idx,
- size=[target_dict_dim, embedding_dim],
- dtype='float32')
-
- prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
- encoded_proj, decoder_boot,
- decoder_size)
- label = fluid.layers.data(
- name='label_sequence', shape=[1], dtype='int64', lod_level=1)
- cost = fluid.layers.cross_entropy(input=prediction, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
-
- return avg_cost, feeding_list
-
-
-def to_lodtensor(data, place):
- seq_lens = [len(seq) for seq in data]
- cur_len = 0
- lod = [cur_len]
- for l in seq_lens:
- cur_len += l
- lod.append(cur_len)
- flattened_data = np.concatenate(data, axis=0).astype("int64")
- flattened_data = flattened_data.reshape([len(flattened_data), 1])
- lod_t = core.LoDTensor()
- lod_t.set(flattened_data, place)
- lod_t.set_lod([lod])
- return lod_t, lod[-1]
-
-
-def lodtensor_to_ndarray(lod_tensor):
- dims = lod_tensor.get_dims()
- ndarray = np.zeros(shape=dims).astype('float32')
- for i in xrange(np.product(dims)):
- ndarray.ravel()[i] = lod_tensor.get_float_element(i)
- return ndarray
-
-
-def train():
- avg_cost, feeding_list = seq_to_seq_net(
- args.embedding_dim,
- args.encoder_size,
- args.decoder_size,
- args.dict_size,
- args.dict_size,
- False,
- beam_size=args.beam_size,
- max_length=args.max_length)
-
- # clone from default main program
- inference_program = fluid.default_main_program().clone()
-
- optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
- optimizer.minimize(avg_cost)
-
- fluid.memory_optimize(fluid.default_main_program())
-
- train_batch_generator = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
- batch_size=args.batch_size)
-
- test_batch_generator = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
- batch_size=args.batch_size)
-
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
- exe = Executor(place)
- exe.run(framework.default_startup_program())
-
- def do_validation():
- total_loss = 0.0
- count = 0
- for batch_id, data in enumerate(test_batch_generator()):
- src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
- trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
- lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
-
- fetch_outs = exe.run(inference_program,
- feed={
- feeding_list[0]: src_seq,
- feeding_list[1]: trg_seq,
- feeding_list[2]: lbl_seq
- },
- fetch_list=[avg_cost],
- return_numpy=False)
-
- total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
- count += 1
-
- return total_loss / count
-
- iters, num_samples, start_time = 0, 0, time.time()
- for pass_id in xrange(args.pass_num):
- train_accs = []
- train_losses = []
- for batch_id, data in enumerate(train_batch_generator()):
- if iters == args.skip_batch_num:
- start_time = time.time()
- num_samples = 0
- if iters == args.iterations:
- break
- src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
- num_samples += word_num
- trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
- num_samples += word_num
- lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
-
- fetch_outs = exe.run(framework.default_main_program(),
- feed={
- feeding_list[0]: src_seq,
- feeding_list[1]: trg_seq,
- feeding_list[2]: lbl_seq
- },
- fetch_list=[avg_cost])
-
- iters += 1
- loss = np.array(fetch_outs[0])
- print(
- "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
- ) # The accuracy is the accumulation of batches, but not the current batch.
-
- train_elapsed = time.time() - start_time
- examples_per_sec = num_samples / train_elapsed
- print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
- (num_samples, train_elapsed, examples_per_sec))
- # evaluation
- if args.with_test:
- test_loss = do_validation()
- exit(0)
-
-
-def infer():
- pass
-
-
-def print_arguments(args):
- print('----------- seq2seq Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- args = parser.parse_args()
- print_arguments(args)
- if args.infer_only:
- infer()
- else:
- train()
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
deleted file mode 100644
index 1e2185dfac1072d1f1046f4616a9d53a8fc76061..0000000000000000000000000000000000000000
--- a/benchmark/fluid/mnist.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-
-SEED = 1
-DTYPE = "float32"
-
-# random seed must set before configuring the network.
-# fluid.default_startup_program().random_seed = SEED
-
-
-def parse_args():
- parser = argparse.ArgumentParser("mnist model benchmark.")
- parser.add_argument(
- '--batch_size', type=int, default=128, help='The minibatch size.')
- parser.add_argument(
- '--skip_batch_num',
- type=int,
- default=5,
- help='The first num of minibatch num to skip, for better performance test'
- )
- parser.add_argument(
- '--iterations', type=int, default=35, help='The number of minibatches.')
- parser.add_argument(
- '--pass_num', type=int, default=5, help='The number of passes.')
- parser.add_argument(
- '--device',
- type=str,
- default='GPU',
- choices=['CPU', 'GPU'],
- help='The device type.')
- parser.add_argument(
- '--infer_only', action='store_true', help='If set, run forward only.')
- parser.add_argument(
- '--use_cprof', action='store_true', help='If set, use cProfile.')
- parser.add_argument(
- '--use_nvprof',
- action='store_true',
- help='If set, use nvprof for CUDA.')
- parser.add_argument(
- '--with_test',
- action='store_true',
- help='If set, test the testset during training.')
- args = parser.parse_args()
- return args
-
-
-def cnn_model(data):
- conv_pool_1 = fluid.nets.simple_img_conv_pool(
- input=data,
- filter_size=5,
- num_filters=20,
- pool_size=2,
- pool_stride=2,
- act="relu")
- conv_pool_2 = fluid.nets.simple_img_conv_pool(
- input=conv_pool_1,
- filter_size=5,
- num_filters=50,
- pool_size=2,
- pool_stride=2,
- act="relu")
-
- # TODO(dzhwinter) : refine the initializer and random seed settting
- SIZE = 10
- input_shape = conv_pool_2.shape
- param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
- scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
-
- predict = fluid.layers.fc(
- input=conv_pool_2,
- size=SIZE,
- act="softmax",
- param_attr=fluid.param_attr.ParamAttr(
- initializer=fluid.initializer.NormalInitializer(
- loc=0.0, scale=scale)))
- return predict
-
-
-def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
- test_reader = paddle.batch(
- paddle.dataset.mnist.test(), batch_size=args.batch_size)
- test_pass_acc = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
- data)).astype(DTYPE)
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([len(y_data), 1])
-
- acc, weight = exe.run(inference_program,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size_tensor])
- test_pass_acc.add(value=acc, weight=weight)
- pass_acc = test_pass_acc.eval()
- return pass_acc
-
-
-def run_benchmark(model, args):
- if args.use_cprof:
- pr = cProfile.Profile()
- pr.enable()
- start_time = time.time()
- # Input data
- images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
- # Train program
- predict = model(images)
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size_tensor)
-
- # inference program
- inference_program = fluid.default_main_program().clone()
-
- # Optimization
- opt = fluid.optimizer.AdamOptimizer(
- learning_rate=0.001, beta1=0.9, beta2=0.999)
- opt.minimize(avg_cost)
-
- fluid.memory_optimize(fluid.default_main_program())
-
- # Initialize executor
- place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
- exe = fluid.Executor(place)
-
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # Reader
- train_reader = paddle.batch(
- paddle.dataset.mnist.train(), batch_size=args.batch_size)
-
- accuracy = fluid.metrics.Accuracy()
- iters, num_samples, start_time = 0, 0, time.time()
- for pass_id in range(args.pass_num):
- accuracy.reset()
- train_accs = []
- train_losses = []
- for batch_id, data in enumerate(train_reader()):
- if iters == args.skip_batch_num:
- start_time = time.time()
- num_samples = 0
- if iters == args.iterations:
- break
- img_data = np.array(
- map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([len(y_data), 1])
-
- outs = exe.run(
- fluid.default_main_program(),
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[avg_cost, batch_acc, batch_size_tensor]
- ) # The accuracy is the accumulation of batches, but not the current batch.
- accuracy.update(value=outs[1], weight=outs[2])
- iters += 1
- num_samples += len(y_data)
- loss = np.array(outs[0])
- acc = np.array(outs[1])
- train_losses.append(loss)
- train_accs.append(acc)
- print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
- (pass_id, iters, loss, acc))
-
- print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
- (pass_id, np.mean(train_losses), np.mean(train_accs)))
- train_elapsed = time.time() - start_time
- examples_per_sec = num_samples / train_elapsed
-
- print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
- (num_samples, train_elapsed, examples_per_sec))
- # evaluation
- if args.with_test:
- test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
- inference_program)
- exit(0)
-
-
-def print_arguments(args):
- vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
- vars(args)['device'] == 'GPU')
- print('----------- mnist Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- args = parse_args()
- print_arguments(args)
- if args.use_nvprof and args.device == 'GPU':
- with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
- run_benchmark(cnn_model, args)
- else:
- run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c3fcac8dd4a1ba0496ef013bd4eb468a0075125
--- /dev/null
+++ b/benchmark/fluid/models/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+ "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..17f6b03826ae818a3671ea7f9355a8e8c04b50be
--- /dev/null
+++ b/benchmark/fluid/models/machine_translation.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+ def linear(inputs):
+ return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+ forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+ cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+ cell_t = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_tilde)
+ ])
+
+ hidden_t = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+ return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+ target_dict_dim, is_generating, beam_size, max_length):
+ """Construct a seq2seq network."""
+
+ def bi_lstm_encoder(input_seq, gate_size):
+ # Linear transformation part for input gate, output gate, forget gate
+ # and cell activation vectors need be done outside of dynamic_lstm.
+ # So the output size is 4 times of gate_size.
+ input_forward_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ forward, _ = fluid.layers.dynamic_lstm(
+ input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+ input_reversed_proj = fluid.layers.fc(input=input_seq,
+ size=gate_size * 4,
+ act=None,
+ bias_attr=False)
+ reversed, _ = fluid.layers.dynamic_lstm(
+ input=input_reversed_proj,
+ size=gate_size * 4,
+ is_reverse=True,
+ use_peepholes=False)
+ return forward, reversed
+
+ src_word_idx = fluid.layers.data(
+ name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ src_embedding = fluid.layers.embedding(
+ input=src_word_idx,
+ size=[source_dict_dim, embedding_dim],
+ dtype='float32')
+
+ src_forward, src_reversed = bi_lstm_encoder(
+ input_seq=src_embedding, gate_size=encoder_size)
+
+ encoded_vector = fluid.layers.concat(
+ input=[src_forward, src_reversed], axis=1)
+
+ encoded_proj = fluid.layers.fc(input=encoded_vector,
+ size=decoder_size,
+ bias_attr=False)
+
+ backward_first = fluid.layers.sequence_pool(
+ input=src_reversed, pool_type='first')
+
+ decoder_boot = fluid.layers.fc(input=backward_first,
+ size=decoder_size,
+ bias_attr=False,
+ act='tanh')
+
+ def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+ decoder_boot, decoder_size):
+ def simple_attention(encoder_vec, encoder_proj, decoder_state):
+ decoder_state_proj = fluid.layers.fc(input=decoder_state,
+ size=decoder_size,
+ bias_attr=False)
+ decoder_state_expand = fluid.layers.sequence_expand(
+ x=decoder_state_proj, y=encoder_proj)
+ concated = fluid.layers.concat(
+ input=[encoder_proj, decoder_state_expand], axis=1)
+ attention_weights = fluid.layers.fc(input=concated,
+ size=1,
+ act='tanh',
+ bias_attr=False)
+ attention_weights = fluid.layers.sequence_softmax(
+ input=attention_weights)
+ weigths_reshape = fluid.layers.reshape(
+ x=attention_weights, shape=[-1])
+ scaled = fluid.layers.elementwise_mul(
+ x=encoder_vec, y=weigths_reshape, axis=0)
+ context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+ return context
+
+ rnn = fluid.layers.DynamicRNN()
+
+ cell_init = fluid.layers.fill_constant_batch_size_like(
+ input=decoder_boot,
+ value=0.0,
+ shape=[-1, decoder_size],
+ dtype='float32')
+ cell_init.stop_gradient = False
+
+ with rnn.block():
+ current_word = rnn.step_input(target_embedding)
+ encoder_vec = rnn.static_input(encoder_vec)
+ encoder_proj = rnn.static_input(encoder_proj)
+ hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+ cell_mem = rnn.memory(init=cell_init)
+ context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+ decoder_inputs = fluid.layers.concat(
+ input=[context, current_word], axis=1)
+ h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+ rnn.update_memory(hidden_mem, h)
+ rnn.update_memory(cell_mem, c)
+ out = fluid.layers.fc(input=h,
+ size=target_dict_dim,
+ bias_attr=True,
+ act='softmax')
+ rnn.output(out)
+ return rnn()
+
+ if not is_generating:
+ trg_word_idx = fluid.layers.data(
+ name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+ trg_embedding = fluid.layers.embedding(
+ input=trg_word_idx,
+ size=[target_dict_dim, embedding_dim],
+ dtype='float32')
+
+ prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+ encoded_proj, decoder_boot,
+ decoder_size)
+ label = fluid.layers.data(
+ name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+ cost = fluid.layers.cross_entropy(input=prediction, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+ return avg_cost, feeding_list
+
+
+def lodtensor_to_ndarray(lod_tensor):
+ dims = lod_tensor.get_dims()
+ ndarray = np.zeros(shape=dims).astype('float32')
+ for i in xrange(np.product(dims)):
+ ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+ return ndarray
+
+
+def get_model(args):
+ if args.use_reader_op:
+ raise Exception("machine_translation do not support reader op for now.")
+ embedding_dim = 512
+ encoder_size = 512
+ decoder_size = 512
+ dict_size = 30000
+ beam_size = 3
+ max_length = 250
+ avg_cost, feeding_list = seq_to_seq_net(
+ embedding_dim,
+ encoder_size,
+ decoder_size,
+ dict_size,
+ dict_size,
+ False,
+ beam_size=beam_size,
+ max_length=max_length)
+
+ # clone from default main program
+ inference_program = fluid.default_main_program().clone()
+
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+
+ train_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+ batch_size=args.batch_size * args.gpus)
+
+ test_batch_generator = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.wmt14.test(dict_size), buf_size=1000),
+ batch_size=args.batch_size)
+
+ return avg_cost, inference_program, optimizer, train_batch_generator, \
+ test_batch_generator, None
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e740dc6896b7eeeb82170aa13d32987c4df5c48
--- /dev/null
+++ b/benchmark/fluid/models/mnist.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import cProfile
+import os
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def cnn_model(data):
+ conv_pool_1 = fluid.nets.simple_img_conv_pool(
+ input=data,
+ filter_size=5,
+ num_filters=20,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+ conv_pool_2 = fluid.nets.simple_img_conv_pool(
+ input=conv_pool_1,
+ filter_size=5,
+ num_filters=50,
+ pool_size=2,
+ pool_stride=2,
+ act="relu")
+
+ # TODO(dzhwinter) : refine the initializer and random seed settting
+ SIZE = 10
+ input_shape = conv_pool_2.shape
+ param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+ scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+ predict = fluid.layers.fc(
+ input=conv_pool_2,
+ size=SIZE,
+ act="softmax",
+ param_attr=fluid.param_attr.ParamAttr(
+ initializer=fluid.initializer.NormalInitializer(
+ loc=0.0, scale=scale)))
+ return predict
+
+
+def get_model(args):
+ if args.use_reader_op:
+ filelist = [
+ os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+ ]
+ data_file = fluid.layers.open_files(
+ filenames=filelist,
+ shapes=[[-1, 1, 28, 28], (-1, 1)],
+ lod_levels=[0, 0],
+ dtypes=["float32", "int64"],
+ thread_num=args.gpus,
+ pass_num=args.pass_num)
+ data_file = fluid.layers.double_buffer(
+ fluid.layers.batch(
+ data_file, batch_size=args.batch_size))
+ images, label = fluid.layers.read_file(data_file)
+ else:
+ images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ if args.device == 'CPU' and args.cpus > 1:
+ places = fluid.layers.get_places(args.cpus)
+ pd = fluid.layers.ParallelDo(places)
+ with pd.do():
+ predict = cnn_model(pd.read_input(images))
+ label = pd.read_input(label)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+ pd.write_output(avg_cost)
+ pd.write_output(batch_acc)
+
+ avg_cost, batch_acc = pd()
+ avg_cost = fluid.layers.mean(avg_cost)
+ batch_acc = fluid.layers.mean(batch_acc)
+ else:
+ # Train program
+ predict = cnn_model(images)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+
+ # Optimization
+ opt = fluid.optimizer.AdamOptimizer(
+ learning_rate=0.001, beta1=0.9, beta2=0.999)
+
+ # Reader
+ train_reader = paddle.batch(
+ paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
+ test_reader = paddle.batch(
+ paddle.dataset.mnist.test(), batch_size=args.batch_size)
+ return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d44a9c07d31cfae9d54ad5949b85c77e60eae258
--- /dev/null
+++ b/benchmark/fluid/models/resnet.py
@@ -0,0 +1,208 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+import time
+import os
+
+import cProfile, pstats, StringIO
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+from recordio_converter import imagenet_train, imagenet_test
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+ conv1 = fluid.layers.conv2d(
+ input=input,
+ filter_size=filter_size,
+ num_filters=ch_out,
+ stride=stride,
+ padding=padding,
+ act=None,
+ bias_attr=False)
+ return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+ ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
+ if ch_in != ch_out:
+ return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+ else:
+ return input
+
+
+def basicblock(input, ch_out, stride):
+ short = shortcut(input, ch_out, stride)
+ conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+ short = shortcut(input, ch_out * 4, stride)
+ conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+ conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+ conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+ return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+ res_out = block_func(input, ch_out, stride)
+ for i in range(1, count):
+ res_out = block_func(res_out, ch_out, 1)
+ return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
+ cfg = {
+ 18: ([2, 2, 2, 1], basicblock),
+ 34: ([3, 4, 6, 3], basicblock),
+ 50: ([3, 4, 6, 3], bottleneck),
+ 101: ([3, 4, 23, 3], bottleneck),
+ 152: ([3, 8, 36, 3], bottleneck)
+ }
+ stages, block_func = cfg[depth]
+ conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+ pool1 = fluid.layers.pool2d(
+ input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+ res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+ res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+ res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+ res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+ pool2 = fluid.layers.pool2d(
+ input=res4,
+ pool_size=7,
+ pool_type='avg',
+ pool_stride=1,
+ global_pooling=True)
+ out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+ return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+ assert (depth - 2) % 6 == 0
+
+ n = (depth - 2) // 6
+
+ conv1 = conv_bn_layer(
+ input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+ res1 = layer_warp(basicblock, conv1, 16, n, 1)
+ res2 = layer_warp(basicblock, res1, 32, n, 2)
+ res3 = layer_warp(basicblock, res2, 64, n, 2)
+ pool = fluid.layers.pool2d(
+ input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+ out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+ return out
+
+
+def get_model(args):
+ model = resnet_cifar10
+ if args.data_set == "cifar10":
+ class_dim = 10
+ if args.data_format == 'NCHW':
+ dshape = [3, 32, 32]
+ else:
+ dshape = [32, 32, 3]
+ model = resnet_cifar10
+ train_reader = paddle.dataset.cifar.train10()
+ test_reader = paddle.dataset.cifar.test10()
+ elif args.data_set == "flowers":
+ class_dim = 102
+ if args.data_format == 'NCHW':
+ dshape = [3, 224, 224]
+ else:
+ dshape = [224, 224, 3]
+ model = resnet_imagenet
+ train_reader = paddle.dataset.flowers.train()
+ test_reader = paddle.dataset.flowers.test()
+ elif args.data_set == "imagenet":
+ class_dim = 1000
+ if args.data_format == 'NCHW':
+ dshape = [3, 224, 224]
+ else:
+ dshape = [224, 224, 3]
+ model = resnet_imagenet
+ if not args.data_path:
+ raise Exception(
+ "Must specify --data_path when training with imagenet")
+ train_reader = imagenet_train(args.data_path)
+ test_reader = imagenet_test(args.data_path)
+
+ if args.use_reader_op:
+ filelist = [
+ os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+ ]
+ data_file = fluid.layers.open_files(
+ filenames=filelist,
+ shapes=[[-1] + dshape, (-1, 1)],
+ lod_levels=[0, 0],
+ dtypes=["float32", "int64"],
+ thread_num=args.gpus,
+ pass_num=args.pass_num)
+ data_file = fluid.layers.double_buffer(
+ fluid.layers.batch(
+ data_file, batch_size=args.batch_size))
+ input, label = fluid.layers.read_file(data_file)
+ else:
+ input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ if args.device == 'CPU' and args.cpus > 1:
+ places = fluid.layers.get_places(args.cpus)
+ pd = fluid.layers.ParallelDo(places)
+ with pd.do():
+ predict = model(pd.read_input(input), class_dim)
+ label = pd.read_input(label)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+ pd.write_output(avg_cost)
+ pd.write_output(batch_acc)
+
+ avg_cost, batch_acc = pd()
+ avg_cost = fluid.layers.mean(avg_cost)
+ batch_acc = fluid.layers.mean(batch_acc)
+ else:
+ predict = model(input, class_dim)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc])
+
+ optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+
+ batched_train_reader = paddle.batch(
+ train_reader if args.no_random else paddle.reader.shuffle(
+ train_reader, buf_size=5120),
+ batch_size=args.batch_size * args.gpus,
+ drop_last=True)
+ batched_test_reader = paddle.batch(
+ test_reader, batch_size=args.batch_size, drop_last=True)
+
+ return avg_cost, inference_program, optimizer, batched_train_reader,\
+ batched_test_reader, batch_acc
diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3231542a17ace99a17c9f9b9bdb3c2527637d9ef
--- /dev/null
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle
+import paddle.dataset.imdb as imdb
+import paddle.fluid as fluid
+import paddle.batch as batch
+import paddle.fluid.profiler as profiler
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+ unk_value = word_dict['']
+
+ def __impl__():
+ for item in reader():
+ if len([x for x in item[0] if x != unk_value]) < crop_size:
+ yield item
+
+ return __impl__
+
+
+def get_model(args):
+ if args.use_reader_op:
+ raise Exception(
+ "stacked_dynamic_lstm do not support reader op for now.")
+ lstm_size = 512
+ emb_dim = 512
+ crop_size = 1500
+
+ data = fluid.layers.data(
+ name="words", shape=[1], lod_level=1, dtype='int64')
+ sentence = fluid.layers.embedding(
+ input=data, size=[len(word_dict), emb_dim])
+
+ sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+ rnn = fluid.layers.DynamicRNN()
+ with rnn.block():
+ word = rnn.step_input(sentence)
+ prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+ prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+ def gate_common(
+ ipt,
+ hidden,
+ size, ):
+ gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+ gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+ gate = fluid.layers.sums(input=[gate0, gate1])
+ return gate
+
+ forget_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ input_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ output_gate = fluid.layers.sigmoid(
+ x=gate_common(word, prev_hidden, lstm_size))
+ cell_gate = fluid.layers.tanh(
+ x=gate_common(word, prev_hidden, lstm_size))
+
+ cell = fluid.layers.sums(input=[
+ fluid.layers.elementwise_mul(
+ x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+ x=input_gate, y=cell_gate)
+ ])
+
+ hidden = fluid.layers.elementwise_mul(
+ x=output_gate, y=fluid.layers.tanh(x=cell))
+
+ rnn.update_memory(prev_cell, cell)
+ rnn.update_memory(prev_hidden, hidden)
+ rnn.output(hidden)
+
+ last = fluid.layers.sequence_pool(rnn(), 'last')
+ logit = fluid.layers.fc(input=last, size=2, act='softmax')
+ loss = fluid.layers.cross_entropy(
+ input=logit,
+ label=fluid.layers.data(
+ name='label', shape=[1], dtype='int64'))
+ loss = fluid.layers.mean(x=loss)
+
+ # add acc
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+ shape=[1], dtype='int64'), total=batch_size_tensor)
+
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ adam = fluid.optimizer.Adam()
+
+ train_reader = batch(
+ paddle.reader.shuffle(
+ crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+ batch_size=args.batch_size * args.gpus)
+ test_reader = batch(
+ paddle.reader.shuffle(
+ crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
+ batch_size=args.batch_size)
+
+ return loss, inference_program, adam, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..932601302d2f5d56b53e3462af886429034d8989
--- /dev/null
+++ b/benchmark/fluid/models/vgg.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+import os
+
+
+def vgg16_bn_drop(input):
+ def conv_block(input, num_filter, groups, dropouts):
+ return fluid.nets.img_conv_group(
+ input=input,
+ pool_size=2,
+ pool_stride=2,
+ conv_num_filter=[num_filter] * groups,
+ conv_filter_size=3,
+ conv_act='relu',
+ conv_with_batchnorm=True,
+ conv_batchnorm_drop_rate=dropouts,
+ pool_type='max')
+
+ conv1 = conv_block(input, 64, 2, [0.3, 0])
+ conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+ conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+ conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+ conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+ drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+ fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+ bn = fluid.layers.batch_norm(input=fc1, act='relu')
+ drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+ fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+ return fc2
+
+
+def get_model(args):
+ if args.data_set == "cifar10":
+ classdim = 10
+ if args.data_format == 'NCHW':
+ data_shape = [3, 32, 32]
+ else:
+ data_shape = [32, 32, 3]
+ else:
+ classdim = 102
+ if args.data_format == 'NCHW':
+ data_shape = [3, 224, 224]
+ else:
+ data_shape = [224, 224, 3]
+
+ if args.use_reader_op:
+ filelist = [
+ os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+ ]
+ data_file = fluid.layers.open_files(
+ filenames=filelist,
+ shapes=[[-1] + data_shape, (-1, 1)],
+ lod_levels=[0, 0],
+ dtypes=["float32", "int64"],
+ thread_num=args.gpus,
+ pass_num=args.pass_num)
+ data_file = fluid.layers.double_buffer(
+ fluid.layers.batch(
+ data_file, batch_size=args.batch_size))
+ images, label = fluid.layers.read_file(data_file)
+ else:
+ images = fluid.layers.data(
+ name='data', shape=data_shape, dtype='float32')
+ label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+ # Train program
+ net = vgg16_bn_drop(images)
+ predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+ batch_acc = fluid.layers.accuracy(
+ input=predict, label=label, total=batch_size_tensor)
+
+ # inference program
+ inference_program = fluid.default_main_program().clone()
+ with fluid.program_guard(inference_program):
+ inference_program = fluid.io.get_inference_program(
+ target_vars=[batch_acc, batch_size_tensor])
+
+ # Optimization
+ optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+
+ # data reader
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.cifar.train10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+ buf_size=5120),
+ batch_size=args.batch_size * args.gpus)
+ test_reader = paddle.batch(
+ paddle.dataset.cifar.test10()
+ if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+ batch_size=args.batch_size)
+
+ return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2dc39109bf1beaf147b046560c92fbd2416d8e6
--- /dev/null
+++ b/benchmark/fluid/recordio_converter.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.dataset import mnist, cifar, flowers, image
+
+
+def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
+ shape_label):
+ num_batches = 0
+ with fluid.program_guard(fluid.Program(), fluid.Program()):
+ reader = paddle.batch(py_reader(), batch_size=batch_size)
+ feeder = fluid.DataFeeder(
+ feed_list=[ # order is image and label
+ fluid.layers.data(
+ name='image', shape=shape_data),
+ fluid.layers.data(
+ name='label', shape=shape_label, dtype='int64'),
+ ],
+ place=fluid.CPUPlace())
+ num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+ outfilepath, reader, feeder)
+ return num_batches
+
+
+def prepare_mnist(outpath, batch_size):
+ outfilepath = os.path.join(outpath, "mnist.recordio")
+ convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
+
+
+def prepare_cifar10(outpath, batch_size):
+ outfilepath = os.path.join(outpath, "cifar.recordio")
+ convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
+
+
+def prepare_flowers(outpath, batch_size):
+ outfilepath = os.path.join(outpath, "flowers.recordio")
+ convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
+ [1])
+
+
+def default_mapper(sample):
+ img, label = sample
+ img = image.simple_transform(
+ img, 256, 224, True, mean=[103.94, 116.78, 123.68])
+ return img.flatten().astype('float32'), label
+
+
+def imagenet_train(data_dir):
+ contents = os.listdir(data_dir)
+ if set(contents) != set(
+ ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+ raise Exception("Imagenet data contents error!")
+ img2label = dict()
+ imgfilelist = []
+ with open(os.path.join(data_dir, "train.txt")) as fn:
+ while 1:
+ l = fn.readline()
+ if not l:
+ break
+ img, lbl = l[:-1].split(" ")
+ img2label[img] = int(lbl)
+ imgfilelist.append(img)
+ # shuffle all, this is slow
+ random.shuffle(imgfilelist)
+
+ def train_reader():
+ for idx, imgfile in enumerate(imgfilelist):
+ data = image.load_image(
+ os.path.join(data_dir, "train", imgfile.lower()))
+ label = [img2label[imgfile], ]
+ yield [data, label]
+
+ return paddle.reader.map_readers(default_mapper, train_reader)
+
+
+def imagenet_test(data_dir):
+ contents = os.listdir(data_dir)
+ if set(contents) != set(
+ ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+ raise Exception("Imagenet data contents error!")
+ img2label = dict()
+ imgfilelist = []
+ with open(os.path.join(data_dir, "val.txt")) as fn:
+ while 1:
+ l = fn.readline()
+ if not l:
+ break
+ img, lbl = l[:-1].split(" ")
+ img2label[img] = int(lbl)
+ imgfilelist.append(img)
+
+ def test_reader():
+ for idx, imgfile in enumerate(imgfilelist):
+ base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
+ image_path = ".".join([base_path, "jpeg"])
+ data = image.load_image(image_path)
+ label = [img2label[imgfile], ]
+ yield [data, label]
+
+ return paddle.reader.map_readers(default_mapper, test_reader)
+
+
+# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
+def convert_reader_to_recordio_files(
+ filename,
+ batch_per_file,
+ reader_creator,
+ feeder,
+ compressor=core.RecordIOWriter.Compressor.Snappy,
+ max_num_records=1000,
+ feed_order=None):
+ if feed_order is None:
+ feed_order = feeder.feed_names
+ f_name, f_ext = os.path.splitext(filename)
+ assert (f_ext == ".recordio")
+
+ lines = []
+ f_idx = 0
+ counter = 0
+ for idx, batch in enumerate(reader_creator()):
+ lines.append(batch)
+ if idx >= batch_per_file and idx % batch_per_file == 0:
+ filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+ with fluid.recordio_writer.create_recordio_writer(
+ filename, compressor, max_num_records) as writer:
+ for l in lines:
+ res = feeder.feed(l)
+ for each in feed_order:
+ writer.append_tensor(res[each])
+ writer.complete_append_tensor()
+ counter += 1
+ lines = []
+ f_idx += 1
+ print("written file: ", filename)
+ return counter
+
+
+def prepare_imagenet(inpath, outpath, batch_size):
+ r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
+ feeder = fluid.DataFeeder(
+ feed_list=[
+ fluid.layers.data(
+ name="image", shape=[3, 224, 224]), fluid.layers.data(
+ name="label", shape=[1], dtype='int64')
+ ],
+ place=fluid.CPUPlace())
+ outpath = os.path.join(outpath, "imagenet.recordio")
+ convert_reader_to_recordio_files(outpath, 10000, r, feeder)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
deleted file mode 100644
index 831fa2c019fc2868cd85b1ca7b2c8c76a2f1628c..0000000000000000000000000000000000000000
--- a/benchmark/fluid/resnet.py
+++ /dev/null
@@ -1,313 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import functools
-import numpy as np
-import time
-
-import cProfile, pstats, StringIO
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-
-
-def parse_args():
- parser = argparse.ArgumentParser('Convolution model benchmark.')
- parser.add_argument(
- '--model',
- type=str,
- choices=['resnet_imagenet', 'resnet_cifar10'],
- default='resnet_imagenet',
- help='The model architecture.')
- parser.add_argument(
- '--batch_size', type=int, default=32, help='The minibatch size.')
- parser.add_argument(
- '--use_fake_data',
- action='store_true',
- help='use real data or fake data')
- parser.add_argument(
- '--skip_batch_num',
- type=int,
- default=5,
- help='The first num of minibatch num to skip, for better performance test'
- )
- parser.add_argument(
- '--iterations', type=int, default=80, help='The number of minibatches.')
- parser.add_argument(
- '--pass_num', type=int, default=100, help='The number of passes.')
- parser.add_argument(
- '--data_format',
- type=str,
- default='NCHW',
- choices=['NCHW', 'NHWC'],
- help='The data data_format, now only support NCHW.')
- parser.add_argument(
- '--device',
- type=str,
- default='GPU',
- choices=['CPU', 'GPU'],
- help='The device type.')
- parser.add_argument(
- '--data_set',
- type=str,
- default='flowers',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
- parser.add_argument(
- '--infer_only', action='store_true', help='If set, run forward only.')
- parser.add_argument(
- '--use_cprof', action='store_true', help='If set, use cProfile.')
- parser.add_argument(
- '--use_nvprof',
- action='store_true',
- help='If set, use nvprof for CUDA.')
- parser.add_argument(
- '--with_test',
- action='store_true',
- help='If set, test the testset during training.')
- args = parser.parse_args()
- return args
-
-
-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
- conv1 = fluid.layers.conv2d(
- input=input,
- filter_size=filter_size,
- num_filters=ch_out,
- stride=stride,
- padding=padding,
- act=None,
- bias_attr=False)
- return fluid.layers.batch_norm(input=conv1, act=act)
-
-
-def shortcut(input, ch_out, stride):
- ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
- if ch_in != ch_out:
- return conv_bn_layer(input, ch_out, 1, stride, 0, None)
- else:
- return input
-
-
-def basicblock(input, ch_out, stride):
- short = shortcut(input, ch_out, stride)
- conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
- conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
- return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
-
-
-def bottleneck(input, ch_out, stride):
- short = shortcut(input, ch_out * 4, stride)
- conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
- conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
- conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
- return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
-
-
-def layer_warp(block_func, input, ch_out, count, stride):
- res_out = block_func(input, ch_out, stride)
- for i in range(1, count):
- res_out = block_func(res_out, ch_out, 1)
- return res_out
-
-
-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
-
- cfg = {
- 18: ([2, 2, 2, 1], basicblock),
- 34: ([3, 4, 6, 3], basicblock),
- 50: ([3, 4, 6, 3], bottleneck),
- 101: ([3, 4, 23, 3], bottleneck),
- 152: ([3, 8, 36, 3], bottleneck)
- }
- stages, block_func = cfg[depth]
- conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
- pool1 = fluid.layers.pool2d(
- input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
- res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
- res2 = layer_warp(block_func, res1, 128, stages[1], 2)
- res3 = layer_warp(block_func, res2, 256, stages[2], 2)
- res4 = layer_warp(block_func, res3, 512, stages[3], 2)
- pool2 = fluid.layers.pool2d(
- input=res4,
- pool_size=7,
- pool_type='avg',
- pool_stride=1,
- global_pooling=True)
- out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
- return out
-
-
-def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
- assert (depth - 2) % 6 == 0
-
- n = (depth - 2) // 6
-
- conv1 = conv_bn_layer(
- input=input, ch_out=16, filter_size=3, stride=1, padding=1)
- res1 = layer_warp(basicblock, conv1, 16, n, 1)
- res2 = layer_warp(basicblock, res1, 32, n, 2)
- res3 = layer_warp(basicblock, res2, 64, n, 2)
- pool = fluid.layers.pool2d(
- input=res3, pool_size=8, pool_type='avg', pool_stride=1)
- out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
- return out
-
-
-def run_benchmark(model, args):
- if args.use_cprof:
- pr = cProfile.Profile()
- pr.enable()
-
- if args.data_set == "cifar10":
- class_dim = 10
- if args.data_format == 'NCHW':
- dshape = [3, 32, 32]
- else:
- dshape = [32, 32, 3]
- else:
- class_dim = 102
- if args.data_format == 'NCHW':
- dshape = [3, 224, 224]
- else:
- dshape = [224, 224, 3]
-
- input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
- predict = model(input, class_dim)
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size_tensor)
-
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(
- target_vars=[batch_acc, batch_size_tensor])
-
- optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
- opts = optimizer.minimize(avg_cost)
-
- fluid.memory_optimize(fluid.default_main_program())
-
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
-
- def test(exe):
- test_accuracy = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape(dshape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- acc, weight = exe.run(inference_program,
- feed={"data": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size_tensor])
- test_accuracy.add(value=acc, weight=weight)
-
- return test_accuracy.eval()
-
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
- exe = fluid.Executor(place)
- exe.run(fluid.default_startup_program())
- accuracy = fluid.average.WeightedAverage()
- if args.use_fake_data:
- data = train_reader().next()
- image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
- 'float32')
- label = np.array(map(lambda x: x[1], data)).astype('int64')
- label = label.reshape([-1, 1])
-
- iters, num_samples, start_time = 0, 0, time.time()
- for pass_id in range(args.pass_num):
- accuracy.reset()
- train_accs = []
- train_losses = []
- for batch_id, data in enumerate(train_reader()):
- if iters == args.skip_batch_num:
- start_time = time.time()
- num_samples = 0
- if iters == args.iterations:
- break
- if not args.use_fake_data:
- image = np.array(map(lambda x: x[0].reshape(dshape),
- data)).astype('float32')
- label = np.array(map(lambda x: x[1], data)).astype('int64')
- label = label.reshape([-1, 1])
- loss, acc, weight = exe.run(
- fluid.default_main_program(),
- feed={'data': image,
- 'label': label},
- fetch_list=[avg_cost, batch_acc, batch_size_tensor])
- iters += 1
- num_samples += len(label)
- accuracy.add(value=acc, weight=weight)
- train_losses.append(loss)
- train_accs.append(acc)
- print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
- (pass_id, iters, loss, acc))
- print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
- (pass_id, np.mean(train_losses), np.mean(train_accs)))
- train_elapsed = time.time() - start_time
- examples_per_sec = num_samples / train_elapsed
- print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
- (num_samples, train_elapsed, examples_per_sec))
- # evaluation
- if args.with_test:
- pass_test_acc = test(exe)
- exit(0)
-
-
-def print_arguments(args):
- vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
- vars(args)['device'] == 'GPU')
- print('----------- resnet Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- model_map = {
- 'resnet_imagenet': resnet_imagenet,
- 'resnet_cifar10': resnet_cifar10
- }
- args = parse_args()
- print_arguments(args)
- if args.data_format == 'NHWC':
- raise ValueError('Only support NCHW data_format now.')
- if args.use_nvprof and args.device == 'GPU':
- with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
- run_benchmark(model_map[args.model], args)
- else:
- run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index f6dfd20bf2ee0b668b6d4238d4511253b2233035..5d9b2db87135e53470b106dcd11a6bcfdc5dbda9 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -2,6 +2,7 @@
# This script benchmarking the PaddlePaddle Fluid on
# single thread single GPU.
+mkdir -p logs
#export FLAGS_fraction_of_gpu_memory_to_use=0.0
export CUDNN_PATH=/paddle/cudnn_v5
@@ -35,71 +36,74 @@ nohup stdbuf -oL nvidia-smi \
--format=csv \
--filename=mem.log \
-l 1 &
+
# mnist
# mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=mnist \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
- 2>&1 | tee -a mnist_gpu_128.log
+ 2>&1 | tee -a logs/mnist_gpu_128.log
# vgg16
# gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a vgg16_gpu_128.log
+ 2>&1 | tee -a logs/vgg16_gpu_128.log
# flowers gpu 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a vgg16_gpu_flowers_32.log
+ 2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
# resnet50
# resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
- --model=resnet_cifar10 \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a resnet50_gpu_128.log
+ 2>&1 | tee -a logs/resnet50_gpu_128.log
# resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
- --model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a resnet50_gpu_flowers_64.log
+ 2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=stacked_dynamic_lstm \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
- --hidden_dim=512 \
- --emb_dim=512 \
- --crop_size=1500 \
- 2>&1 | tee -a lstm_gpu_32.log
+ 2>&1 | tee -a logs/lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=machine_translation \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a lstm_gpu_128.log
+ 2>&1 | tee -a logs/lstm_gpu_128.log
diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4309a3126c1d72fe1eb2d5ec423075aea4d3ec88
--- /dev/null
+++ b/benchmark/fluid/run_fluid_benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
+
+sleep 15
+
+CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
+
+CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
deleted file mode 100644
index 73bcc47b4d404af2c01d61ca3dfb11971bbcfe9c..0000000000000000000000000000000000000000
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import cPickle
-import os
-import random
-import time
-
-import numpy
-import paddle
-import paddle.dataset.imdb as imdb
-import paddle.fluid as fluid
-import paddle.batch as batch
-import paddle.fluid.profiler as profiler
-
-
-def parse_args():
- parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
- parser.add_argument(
- '--batch_size',
- type=int,
- default=32,
- help='The sequence number of a batch data. (default: %(default)d)')
- parser.add_argument(
- '--skip_batch_num',
- type=int,
- default=5,
- help='The first num of minibatch num to skip, for better performance test'
- )
- parser.add_argument(
- '--iterations', type=int, default=80, help='The number of minibatches.')
- parser.add_argument(
- '--emb_dim',
- type=int,
- default=512,
- help='Dimension of embedding table. (default: %(default)d)')
- parser.add_argument(
- '--hidden_dim',
- type=int,
- default=512,
- help='Hidden size of lstm unit. (default: %(default)d)')
- parser.add_argument(
- '--pass_num',
- type=int,
- default=100,
- help='Epoch number to train. (default: %(default)d)')
- parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help='The device type.')
- parser.add_argument(
- '--crop_size',
- type=int,
- default=int(os.environ.get('CROP_SIZE', '1500')),
- help='The max sentence length of input. Since this model use plain RNN,'
- ' Gradient could be explored if sentence is too long')
- parser.add_argument(
- '--with_test',
- action='store_true',
- help='If set, test the testset during training.')
- args = parser.parse_args()
- return args
-
-
-word_dict = imdb.word_dict()
-
-
-def crop_sentence(reader, crop_size):
- unk_value = word_dict['']
-
- def __impl__():
- for item in reader():
- if len([x for x in item[0] if x != unk_value]) < crop_size:
- yield item
-
- return __impl__
-
-
-def main():
- args = parse_args()
- lstm_size = args.hidden_dim
-
- data = fluid.layers.data(
- name="words", shape=[1], lod_level=1, dtype='int64')
- sentence = fluid.layers.embedding(
- input=data, size=[len(word_dict), args.emb_dim])
-
- sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
-
- rnn = fluid.layers.DynamicRNN()
- with rnn.block():
- word = rnn.step_input(sentence)
- prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
- prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
-
- def gate_common(
- ipt,
- hidden,
- size, ):
- gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
- gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
- gate = fluid.layers.sums(input=[gate0, gate1])
- return gate
-
- forget_gate = fluid.layers.sigmoid(
- x=gate_common(word, prev_hidden, lstm_size))
- input_gate = fluid.layers.sigmoid(
- x=gate_common(word, prev_hidden, lstm_size))
- output_gate = fluid.layers.sigmoid(
- x=gate_common(word, prev_hidden, lstm_size))
- cell_gate = fluid.layers.tanh(
- x=gate_common(word, prev_hidden, lstm_size))
-
- cell = fluid.layers.sums(input=[
- fluid.layers.elementwise_mul(
- x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
- x=input_gate, y=cell_gate)
- ])
-
- hidden = fluid.layers.elementwise_mul(
- x=output_gate, y=fluid.layers.tanh(x=cell))
-
- rnn.update_memory(prev_cell, cell)
- rnn.update_memory(prev_hidden, hidden)
- rnn.output(hidden)
-
- last = fluid.layers.sequence_pool(rnn(), 'last')
- logit = fluid.layers.fc(input=last, size=2, act='softmax')
- loss = fluid.layers.cross_entropy(
- input=logit,
- label=fluid.layers.data(
- name='label', shape=[1], dtype='int64'))
- loss = fluid.layers.mean(x=loss)
-
- # add acc
- batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
- shape=[1], dtype='int64'), total=batch_size_tensor)
-
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(
- target_vars=[batch_acc, batch_size_tensor])
-
- adam = fluid.optimizer.Adam()
- adam.minimize(loss)
-
- fluid.memory_optimize(fluid.default_main_program())
-
- place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
- exe = fluid.Executor(place)
- exe.run(fluid.default_startup_program())
-
- train_reader = batch(
- paddle.reader.shuffle(
- crop_sentence(imdb.train(word_dict), args.crop_size),
- buf_size=25000),
- batch_size=args.batch_size)
-
- iters, num_samples, start_time = 0, 0, time.time()
- for pass_id in range(args.pass_num):
- train_accs = []
- train_losses = []
- for batch_id, data in enumerate(train_reader()):
- if iters == args.skip_batch_num:
- start_time = time.time()
- num_samples = 0
- if iters == args.iterations:
- break
- tensor_words = to_lodtensor([x[0] for x in data], place)
- label = numpy.array([x[1] for x in data]).astype("int64")
- label = label.reshape((-1, 1))
- loss_np, acc, weight = exe.run(
- fluid.default_main_program(),
- feed={"words": tensor_words,
- "label": label},
- fetch_list=[loss, batch_acc, batch_size_tensor])
- iters += 1
- for x in data:
- num_samples += len(x[0])
- print(
- "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
- (pass_id, iters, loss_np, acc)
- ) # The accuracy is the accumulation of batches, but not the current batch.
-
- train_elapsed = time.time() - start_time
- examples_per_sec = num_samples / train_elapsed
- print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
- (num_samples, train_elapsed, examples_per_sec))
- exit(0)
-
-
-def to_lodtensor(data, place):
- seq_lens = [len(seq) for seq in data]
- cur_len = 0
- lod = [cur_len]
- for l in seq_lens:
- cur_len += l
- lod.append(cur_len)
- flattened_data = numpy.concatenate(data, axis=0).astype("int64")
- flattened_data = flattened_data.reshape([len(flattened_data), 1])
- res = fluid.LoDTensor()
- res.set(flattened_data, place)
- res.set_lod([lod])
- return res
-
-
-def print_arguments(args):
- print('----------- lstm Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- args = parse_args()
- print_arguments(args)
- main()
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
deleted file mode 100644
index 53e34e0cbd15914791c305db6797f826ebfae34e..0000000000000000000000000000000000000000
--- a/benchmark/fluid/vgg.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import argparse
-import functools
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
- '--skip_batch_num',
- type=int,
- default=5,
- help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
- '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='GPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NCHW',
- choices=['NCHW', 'NHWC'],
- help='The data order, now only support NCHW.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='cifar10',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-parser.add_argument(
- '--with_test',
- action='store_true',
- help='If set, test the testset during training.')
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
- def conv_block(input, num_filter, groups, dropouts):
- return fluid.nets.img_conv_group(
- input=input,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act='relu',
- conv_with_batchnorm=True,
- conv_batchnorm_drop_rate=dropouts,
- pool_type='max')
-
- conv1 = conv_block(input, 64, 2, [0.3, 0])
- conv2 = conv_block(conv1, 128, 2, [0.4, 0])
- conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
- conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
- conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
- drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
- fc1 = fluid.layers.fc(input=drop, size=512, act=None)
- bn = fluid.layers.batch_norm(input=fc1, act='relu')
- drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
- fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
- return fc2
-
-
-def main():
- if args.data_set == "cifar10":
- classdim = 10
- if args.data_format == 'NCHW':
- data_shape = [3, 32, 32]
- else:
- data_shape = [32, 32, 3]
- else:
- classdim = 102
- if args.data_format == 'NCHW':
- data_shape = [3, 224, 224]
- else:
- data_shape = [224, 224, 3]
-
- # Input data
- images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
- # Train program
- net = vgg16_bn_drop(images)
- predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size_tensor)
-
- # inference program
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(
- target_vars=[batch_acc, batch_size_tensor])
-
- # Optimization
- optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
- opts = optimizer.minimize(avg_cost)
-
- fluid.memory_optimize(fluid.default_main_program())
-
- # Initialize executor
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
- exe = fluid.Executor(place)
-
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
-
- # test
- def test(exe):
- test_accuracy = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape(data_shape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- acc, weight = exe.run(inference_program,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size_tensor])
- test_accuracy.add(value=acc, weight=weight)
- return test_accuracy.eval()
-
- iters, num_samples, start_time = 0, 0, time.time()
- accuracy = fluid.average.WeightedAverage()
- for pass_id in range(args.pass_num):
- accuracy.reset()
- train_accs = []
- train_losses = []
- for batch_id, data in enumerate(train_reader()):
- if iters == args.skip_batch_num:
- start_time = time.time()
- num_samples = 0
- if iters == args.iterations:
- break
- img_data = np.array(map(lambda x: x[0].reshape(data_shape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- loss, acc, weight = exe.run(
- fluid.default_main_program(),
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[avg_cost, batch_acc, batch_size_tensor])
- accuracy.add(value=acc, weight=weight)
- iters += 1
- num_samples += len(y_data)
- print(
- "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
- (pass_id, iters, loss, acc)
- ) # The accuracy is the accumulation of batches, but not the current batch.
-
- # pass_train_acc = accuracy.eval()
- train_losses.append(loss)
- train_accs.append(acc)
- print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
- (pass_id, np.mean(train_losses), np.mean(train_accs)))
- train_elapsed = time.time() - start_time
- examples_per_sec = num_samples / train_elapsed
- print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
- (num_samples, train_elapsed, examples_per_sec))
- # evaluation
- if args.with_test:
- pass_test_acc = test(exe)
- exit(0)
-
-
-def print_arguments():
- print('----------- vgg Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == "__main__":
- print_arguments()
- main()
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
index 717ed487ba7657db6535efcb1128a355a0f15eaf..5b58a8d773aab795e5439b0f0e5d81bec66b5f56 100755
--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function train() {
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
index 62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7..0fad5e04cc992a3ec97591d3833957bb7517a8f3 100755
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function clock_to_seconds() {
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
index 03d2d378fb72e36f765d89af788f6ee96fe21d4e..1583bf134a276a08aa2f8e84dc63adbb205a83d6 100755
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function train() {
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
index a9a7b8a66717c4be0543c3fe2db293fe199e3dc4..987381cabc2e793886099212660723c122b73bb0 100755
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function clock_to_seconds() {
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
index 935cff6f2c97d25d6de556cfee25e27dbe49b5b6..cc64e1d09da02087b1737190a0b75dc7758600a6 100755
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function train() {
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
index e9dfeb2e525979f47e4ef48f7610dc1007900f2c..f99a562b3f88a98560f4bf7aee98ceee9daefe67 100755
--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function train() {
diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh
index eade36beb9df5f8d3978939216e058203e024c1a..cf894fe3f2dca24e3acf863d625b3a7008793b83 100755
--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function test() {
diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh
index 69faa4331744f2276e7706185ae10bc507f95764..bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2 100755
--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function test() {
diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh
index bb4c69cb95f965eff35f1c5a60376bf1e84f841b..db10eefdea8676ad34fb84a161f0fc1309147824 100755
--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function test() {
diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh
index c2d7dd597e6da54cd5c4cda311fbbd18486b4647..ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd 100755
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
function test() {
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index e3b9d94215a858c5c9a34e1b7e97540f1876801d..6ed51c648478efb9784d0c43b169c285e740e0f3 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -83,18 +83,20 @@ else()
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
endif()
-find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
+if(WITH_SYSTEM_BLAS)
+ find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
+ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER REFERENCE)
- set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
- set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
- add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
- message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+ if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+ set(CBLAS_FOUND ON)
+ set(CBLAS_PROVIDER REFERENCE)
+ set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
+ set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+ add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
+ message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+ endif()
endif()
if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e490397cc0624c310949a4b571bd00cac6e8953b..ce1857582bd3e8ab3077158384beaae36a83a4b2 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -41,18 +41,31 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
+if(EIGEN_USE_THREADS)
+ add_definitions(-DEIGEN_USE_THREADS)
+endif(EIGEN_USE_THREADS)
+
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
if(NOT CMAKE_CROSSCOMPILING)
- if(WITH_AVX AND AVX_FOUND)
+ if(WITH_AVX AND AVX512F_FOUND)
+ set(SIMD_FLAG ${AVX512F_FLAG})
+ elseif(WITH_AVX AND AVX2_FOUND)
+ set(SIMD_FLAG ${AVX2_FLAG})
+ elseif(WITH_AVX AND AVX_FOUND)
set(SIMD_FLAG ${AVX_FLAG})
elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
endif()
+if(WIN32)
+ # windows stupid compile option for all targets.
+ add_definitions(-D_XKEYCHECK_H)
+endif(WIN32)
+
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
@@ -88,8 +101,28 @@ if(WITH_GPU)
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
endif()
+ if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+ message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+ endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
+ if(WITH_ANAKIN)
+ if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+ message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF")
+ set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE)
+ endif()
+ if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+ message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
+ set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
+ endif()
+ endif()
+ if(WITH_ANAKIN)
+ # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
+ # is a softlink to real cudnn.h directory
+ set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/")
+ get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY)
+ set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY_DIR})
+ endif()
elseif(WITH_AMD_GPU)
add_definitions(-DPADDLE_WITH_HIP)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
@@ -111,6 +144,10 @@ endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
+if(WITH_DISTRIBUTE)
+ add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
+
if(WITH_GOLANG)
# we need to symlink Paddle directory into GOPATH. If we
# don't do it and we have code that depends on Paddle, go
@@ -159,3 +196,11 @@ if(WITH_GOLANG)
endif()
endif(WITH_GOLANG)
+
+if(WITH_GRPC)
+ add_definitions(-DPADDLE_WITH_GRPC)
+endif(WITH_GRPC)
+
+if(WITH_BRPC_RDMA)
+ add_definitions(-DPADDLE_WITH_BRPC_RDMA)
+endif(WITH_BRPC_RDMA)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
deleted file mode 100644
index 4823dc3e91390002aefac70f7931b4197db05789..0000000000000000000000000000000000000000
--- a/cmake/cpplint.cmake
+++ /dev/null
@@ -1,62 +0,0 @@
-# util to check C++ file style
-# * it basically use google cpplint.py.
-# * It provide "add_style_check_target" for cmake.
-# Usage see add_style_check_target's document
-#
-# TODO(yuyang18): Add python style check.
-
-set(STYLE_FILTER)
-
-# diable unwanted filters
-
-# paddle do not indent public/potected/private in class
-set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,")
-# paddle use mutable reference. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,")
-# paddle use relative path for include.
-set(STYLE_FILTER "${STYLE_FILTER}-build/include,")
-# paddle use , , etc.
-set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,")
-# paddle use c style casting. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
-
-
-# IGNORE SOME FILES
-set(IGNORE_PATTERN
- .*ImportanceSampler.*
- .*cblas\\.h.*
- .*\\.pb\\.txt
- .*MultiDataProvider.*
- .*pb.*
- .*pybind.h)
-
-# add_style_check_target
-#
-# attach check code style step for target.
-#
-# first argument: target name to attach
-# rest arguments: source list to check code style.
-#
-# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
-macro(add_style_check_target TARGET_NAME)
- if(WITH_STYLE_CHECK)
- set(SOURCES_LIST ${ARGN})
- list(REMOVE_DUPLICATES SOURCES_LIST)
- foreach(filename ${SOURCES_LIST})
- foreach(pattern ${IGNORE_PATTERN})
- if(filename MATCHES ${pattern})
- list(REMOVE_ITEM SOURCES_LIST ${filename})
- endif()
- endforeach()
- endforeach()
-
- if(SOURCES_LIST)
- add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
- COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
- "--filter=${STYLE_FILTER}"
- ${SOURCES_LIST}
- COMMENT "cpplint: Checking source code style"
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
- endif()
- endif()
-endmacro()
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index b520c03a836a9e3f263ba050f151877ffe0d071d..03c73786a6c31868b1893bfcb319e43e37db1a3d 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
+if (NOT WIN32) # windows msvc2015 support c++11 natively.
+# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+endif(NOT WIN32)
+
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
# in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
+else(NOT WIN32)
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+ list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+else()
+ message(FATAL "Windows only support Release build now. Please set visual studio build type to Release, x64 build.")
+endif()
+endif(NOT WIN32)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 2c84061ff572de4687b4d496f8ded6deee8d1011..cd51533926de7bb132ab7bfab1686d664a331410 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -21,11 +21,29 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+ ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
- /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
+ /usr/lib
+ ${CUDA_TOOLKIT_ROOT_DIR}
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+ )
+set(CUDNN_LIB_NAME "")
+if (LINUX)
+set(CUDNN_LIB_NAME "libcudnn.so")
+endif(LINUX)
+
+if(WIN32)
+# only support cudnn7
+set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
+endif(WIN32)
+
+if(Apple)
+set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
+endif(Apple)
+
+find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
NO_DEFAULT_PATH
DOC "Path to cuDNN library.")
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..dc6730662f0b888f1981ac9c086320acc52d0a50
--- /dev/null
+++ b/cmake/external/anakin.cmake
@@ -0,0 +1,82 @@
+if (NOT WITH_ANAKIN)
+ return()
+endif()
+
+option(ANAKIN_ENABLE_OP_TIMER "Get more detailed information with Anakin op time" OFF)
+if(ANAKIN_ENABLE_OP_TIMER)
+ add_definitions(-DPADDLE_ANAKIN_ENABLE_OP_TIMER)
+endif()
+
+INCLUDE(ExternalProject)
+set(ANAKIN_SOURCE_DIR ${THIRD_PARTY_PATH}/anakin)
+# the anakin install dir is only default one now
+set(ANAKIN_INSTALL_DIR ${THIRD_PARTY_PATH}/anakin/src/extern_anakin/output)
+set(ANAKIN_INCLUDE ${ANAKIN_INSTALL_DIR})
+set(ANAKIN_LIBRARY ${ANAKIN_INSTALL_DIR})
+set(ANAKIN_SHARED_LIB ${ANAKIN_LIBRARY}/libanakin.so)
+set(ANAKIN_SABER_LIB ${ANAKIN_LIBRARY}/libanakin_saber_common.so)
+
+# TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
+set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
+set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
+set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
+execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
+execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
+execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
+execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
+
+include_directories(${ANAKIN_INCLUDE})
+include_directories(${ANAKIN_INCLUDE}/saber/)
+include_directories(${ANAKIN_INCLUDE}/saber/core/)
+include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/x86/)
+include_directories(${ANAKIN_INCLUDE}/saber/funcs/impl/cuda/base/cuda_c/)
+
+set(ANAKIN_COMPILE_EXTRA_FLAGS
+ -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
+ -Wno-error=unused-variable -Wno-unused-variable
+ -Wno-error=format-extra-args -Wno-format-extra-args
+ -Wno-error=comment -Wno-comment
+ -Wno-error=format -Wno-format
+ -Wno-error=maybe-uninitialized -Wno-maybe-uninitialized
+ -Wno-error=switch -Wno-switch
+ -Wno-error=return-type -Wno-return-type
+ -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor
+ -Wno-error=ignored-qualifiers
+ -Wno-ignored-qualifiers
+ -Wno-sign-compare
+ -Wno-reorder
+ -Wno-error=cpp)
+
+ExternalProject_Add(
+ extern_anakin
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ DEPENDS ${MKLML_PROJECT}
+ GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
+ GIT_TAG "9424277cf9ae180a14aff09560d3cd60a49c76d2"
+ PREFIX ${ANAKIN_SOURCE_DIR}
+ UPDATE_COMMAND ""
+ CMAKE_ARGS -DUSE_GPU_PLACE=YES
+ -DUSE_X86_PLACE=YES
+ -DBUILD_WITH_UNIT_TEST=NO
+ -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
+ -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
+ -DCUDNN_ROOT=${CUDNN_ROOT}
+ -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
+ -DENABLE_OP_TIMER=${ANAKIN_ENABLE_OP_TIMER}
+ ${EXTERNAL_OPTIONAL_ARGS}
+ CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
+)
+
+message(STATUS "Anakin for inference is enabled")
+message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+
+add_library(anakin_shared SHARED IMPORTED GLOBAL)
+set_property(TARGET anakin_shared PROPERTY IMPORTED_LOCATION ${ANAKIN_SHARED_LIB})
+add_dependencies(anakin_shared extern_anakin protobuf mklml)
+
+add_library(anakin_saber SHARED IMPORTED GLOBAL)
+set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
+add_dependencies(anakin_saber extern_anakin protobuf mklml)
+
+list(APPEND external_project_dependencies anakin_shared anakin_saber)
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 10662fc96704685f030a5d76c6857d4bc20a63d9..ada61de8eb15ae10288ac54f588e9adf84acee37 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -23,8 +23,17 @@ set(BOOST_PROJECT "extern_boost")
# checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
-set(BOOST_TAR "boost_1_41_0")
-set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
+if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
+ message(STATUS "use pre defined download url")
+ set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+ set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+endif()
+IF (WIN32)
+ MESSAGE(WARNING, "In windows, boost can not be downloaded automaticlly, please build it manually and put it at " ${THIRD_PARTY_PATH}install/boost)
+else()
+ MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+ENDIF(WIN32)
+
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
@@ -32,12 +41,13 @@ set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR})
+if (NOT WIN32)
ExternalProject_Add(
${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
- && tar zxf ${BOOST_TAR}.tar.gz
+ && tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND ""
@@ -45,8 +55,9 @@ ExternalProject_Add(
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
+endif(NOT WIN32)
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(boost STATIC ${dummyfile})
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..30b227b6452abf44171a1a4e04569e66b16e67a4
--- /dev/null
+++ b/cmake/external/brpc.cmake
@@ -0,0 +1,69 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+find_library(SSL_LIBRARY NAMES ssl)
+ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
+
+find_library(CRYPTO_LIBRARY NAMES crypto)
+ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
+
+
+SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
+SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
+SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
+SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE)
+
+INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
+
+# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
+
+# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
+ExternalProject_Add(
+ extern_brpc
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ GIT_REPOSITORY "https://github.com/gongweibao/brpc"
+ GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a"
+ PREFIX ${BRPC_SOURCES_DIR}
+ UPDATE_COMMAND ""
+ CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+ -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+ -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+ -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+ -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
+ -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+ -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+ -DCMAKE_PREFIX_PATH=${prefix_path}
+ -DBRPC_WITH_GLOG=ON
+ -DIOBUF_WITH_HUGE_BLOCK=ON
+ -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
+ ${EXTERNAL_OPTIONAL_ARGS}
+ LIST_SEPARATOR |
+ CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
+ -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
+ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+ -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
+ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
+ADD_DEPENDENCIES(brpc extern_brpc)
+
+
+LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c94849cf4b96746e6c507db2a6310c2f305dacf5
--- /dev/null
+++ b/cmake/external/cub.cmake
@@ -0,0 +1,35 @@
+if(NOT WITH_GPU)
+ return()
+endif()
+
+include(ExternalProject)
+
+set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
+set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
+
+include_directories(${CUB_INCLUDE_DIR})
+
+ExternalProject_Add(
+ extern_cub
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
+ GIT_TAG "v1.8.0"
+ PREFIX ${CUB_SOURCE_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ INSTALL_COMMAND ""
+ TEST_COMMAND ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+ set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
+ file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+ add_library(cub STATIC ${dummyfile})
+else()
+ add_library(cub INTERFACE)
+endif()
+
+add_dependencies(cub extern_cub)
+
+LIST(APPEND externl_project_dependencies cub)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index edc93c2773f46ec9e0bf898557c55c93274e6a01..e029300eee9b99582f085f6b650e03f7dacc091a 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -21,11 +21,12 @@ else()
ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
- GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
+ GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror"
# eigen on cuda9.1 missing header of math_funtions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
PREFIX ${EIGEN_SOURCE_DIR}
+ DOWNLOAD_NAME "eigen"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index a1d2d0f44685c342db9d868da716809b49575c01..cf58cc39762351f8b37d073bcd218d249285bf52 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -18,7 +18,7 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
IF(WIN32)
- set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+ set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ELSE(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ENDIF(WIN32)
@@ -45,7 +45,13 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
-
+IF(WIN32)
+ IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
+ add_custom_command(TARGET extern_gflags POST_BUILD
+ COMMAND cmake -E rename ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib
+ )
+ ENDIF()
+ENDIF(WIN32)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
@@ -60,3 +66,4 @@ IF(WITH_C_API)
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
ENDIF()
ENDIF()
+
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index ac0181e69cbf5efeee44c5ca801b2710eefb3e6d..25ef2970ac52f12f961c9c6d3a589fec4c80983f 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -60,6 +60,13 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
+IF(WIN32)
+ IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib")
+ add_custom_command(TARGET extern_glog POST_BUILD
+ COMMAND cmake -E rename ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib
+ )
+ ENDIF()
+ENDIF(WIN32)
ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index e90948782bb5e333bbdb47ef9d61c1e37e3cf9e4..7fb67afbe15a5a019c978092d5ba3a4a0f66d996 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -23,21 +23,34 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+
+include(ProcessorCount)
+ProcessorCount(NUM_OF_PROCESSOR)
+
IF(APPLE)
- SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+ SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
ELSE()
- SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
+ SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
ENDIF()
+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
- GIT_REPOSITORY "https://github.com/grpc/grpc.git"
- GIT_TAG "v1.10.x"
+ # NOTE(wuyi):
+ # this package is generated by following steps:
+ # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+ # 2. git submodule update --init
+ # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+ # checkout and clean other dirs under third_party
+ # 4. remove .git, and package the directory.
+ URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+ URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1
+ PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h
# NOTE(yuyang18):
# Disable -Werror, otherwise the compile will fail in MacOS.
# It seems that we cannot configure that by make command.
@@ -46,7 +59,6 @@ ExternalProject_Add(
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..fb5091731da02b497a14f119e944905eee4979d5
--- /dev/null
+++ b/cmake/external/leveldb.cmake
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb)
+SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
+SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE)
+SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE)
+INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
+
+ExternalProject_Add(
+ extern_leveldb
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ PREFIX ${LEVELDB_SOURCES_DIR}
+ URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
+ URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
+ INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
+ && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
+ && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/
+ BUILD_IN_SOURCE 1
+)
+
+ADD_DEPENDENCIES(extern_leveldb snappy)
+
+ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
+ADD_DEPENDENCIES(leveldb extern_leveldb)
+
+LIST(APPEND external_project_dependencies leveldb)
+
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..530f7ebe2813fb2f00c6b5b4d1f7b2f04fe650b0
--- /dev/null
+++ b/cmake/external/libxsmm.cmake
@@ -0,0 +1,57 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF)
+
+IF(NOT WITH_LIBXSMM)
+ return()
+ENDIF()
+
+IF(WIN32 OR APPLE OR ANDROID OR IOS)
+ MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
+ SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
+ return()
+ENDIF()
+
+INCLUDE (ExternalProject)
+
+SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm)
+SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
+SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
+SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
+SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a"
+ "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
+
+ExternalProject_Add(
+ extern_libxsmm
+ GIT_REPOSITORY "https://github.com/hfp/libxsmm.git"
+ GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
+ PREFIX ${LIBXSMM_SOURCES_DIR}
+ UPDATE_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_IN_SOURCE 1
+ BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
+ INSTALL_COMMAND ""
+)
+ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
+SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
+
+MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
+include_directories(${LIBXSMM_INCLUDE_DIR})
+ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
+ADD_DEPENDENCIES(libxsmm extern_libxsmm)
+LIST(APPEND external_project_dependencies libxsmm)
+
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 5759e5c489724332793bf103b7aacf7ffb068611..baf253df2755657b01b67c410f63b7d8422d4df3 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -24,7 +24,7 @@ SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
IF(WIN32 OR APPLE)
- MESSAGE(WARNING
+ MESSAGE(WARNING
"Windows or Mac is not supported with MKLDNN in Paddle yet."
"Force WITH_MKLDNN=OFF")
SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
@@ -45,22 +45,26 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF()
-
-SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
-SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
+SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
- GIT_TAG "v0.11"
+ GIT_TAG "64e03a1939e0d526aa8e9f2e3f7dc0ad8d372944"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
+ CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+ CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
- CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+ CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
+ CMAKE_ARGS -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLML_ROOT}
)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 796bcf28a1dfb308ccb7a2f839742c5c2fcf2002..82c424fb79d5596c31891bc395699bf9ff4e7e7e 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,12 @@ ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
-SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
+IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
+ MESSAGE(STATUS "use pre defined download url")
+ SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
+ SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 8af2765f58717408e3a1ef6b500bb01511bfd8d3..c3fbe4dbdb28f1008bb274ee18293db348bfc6ed 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -17,18 +17,29 @@ IF(USE_EIGEN_FOR_BLAS)
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas)
+# IF(WIN32 AND NOT ${CBLAS_FOUND})
+
+
IF(NOT ${CBLAS_FOUND})
+
INCLUDE(ExternalProject)
SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
- SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+ SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
SET(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
+ ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+ IF (WIN32)
+ SET(CBLAS_FOUND true)
+ MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR})
+ ENDIF(WIN32)
+
+ IF (NOT WIN32)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20")
@@ -67,7 +78,6 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
@@ -82,9 +92,11 @@ IF(NOT ${CBLAS_FOUND})
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
+ ELSE()
+ ENDIF(NOT WIN32)
SET(CBLAS_PROVIDER openblas)
IF(WITH_C_API)
- INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
+ INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
# Because libopenblas.a is a symbolic link of another library, thus need to
# install the whole directory.
IF(ANDROID)
@@ -105,14 +117,25 @@ IF(NOT ${CBLAS_FOUND})
ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
+INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
-TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+ TARGET_LINK_LIBRARIES(cblas dynload_mklml)
+ELSE()
+ TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+
+IF(WITH_LIBXSMM)
+ TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
+ ADD_DEPENDENCIES(cblas extern_libxsmm)
+ENDIF()
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 0fde4373a4be58e71ff1a305bd4991cc554d7a34..550b0dada8e90c1e2b33705fd53c065672113b45 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,11 +14,14 @@
INCLUDE(ExternalProject)
# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
+IF(NOT WIN32)
FIND_PACKAGE(Protobuf QUIET)
+ENDIF(NOT WIN32)
macro(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME} CACHE)
UNSET(${VAR_NAME})
endmacro()
+
UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(PROTOBUF_FOUND)
UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
@@ -94,12 +97,14 @@ macro(PROMPT_PROTOBUF_LIB)
SET(protobuf_DEPS ${ARGN})
MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+ MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
+ MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
# Assuming that all the protobuf libraries are of the same type.
- IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+ IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
SET(protobuf_LIBTYPE STATIC)
ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
SET(protobuf_LIBTYPE SHARED)
@@ -137,18 +142,25 @@ macro(SET_PROTOBUF_VERSION)
endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
+IF (WIN32)
+ SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
+ MESSAGE(WARNING, "In windows, protobuf only support msvc build, please build it manually and put it at " ${PROTOBUF_ROOT})
+ENDIF(WIN32)
+
if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
+
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
- find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
- find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
- find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+ find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
+ SET(PROTOBUF_FOUND true)
SET_PROTOBUF_VERSION()
PROMPT_PROTOBUF_LIB()
else()
- message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
+ message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
endif()
endif()
@@ -212,6 +224,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
+ -DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
@@ -238,6 +251,7 @@ IF(CMAKE_CROSSCOMPILING)
CACHE FILEPATH "protobuf executable." FORCE)
ENDIF()
+
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index d7e5571bdbd8ba58d8a08c9426971f1c7b186413..f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,8 +18,9 @@ ENDIF()
INCLUDE(python_module)
-FIND_PACKAGE(PythonInterp 2.7)
-FIND_PACKAGE(PythonLibs 2.7)
+FIND_PACKAGE(PythonInterp ${PY_VERSION})
+FIND_PACKAGE(PythonLibs ${PY_VERSION})
+
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index 80282329c6ac65fbd1493a6838efca4bd9cadaad..af09ed4d5d6e21cc50aba5198a7e9ea56f49451a 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -47,8 +47,6 @@ ExternalProject_Add(
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
- BUILD_COMMAND make -j8
- INSTALL_COMMAND make install
)
add_library(snappy STATIC IMPORTED GLOBAL)
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 20a96430823d07a07d4bb4602e7fc0cfe55c3bf2..6df636d7fa8757ade73892bda03a80ba9767472b 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -46,8 +46,6 @@ ExternalProject_Add(
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
- BUILD_COMMAND make -j8
- INSTALL_COMMAND make install
DEPENDS snappy
)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..384c2f9328296ce6a8a6293be6cc47e5063dd3c4
--- /dev/null
+++ b/cmake/external/xbyak.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(WITH_XBYAK ON)
+if(WIN32 OR APPLE)
+ SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
+ return()
+endif()
+
+include(ExternalProject)
+
+set(XBYAK_PROJECT extern_xbyak)
+set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
+set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
+set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include)
+
+include_directories(${XBYAK_INC_DIR})
+include_directories(${XBYAK_INC_DIR}/xbyak)
+
+add_definitions(-DPADDLE_WITH_XBYAK)
+
+# xbyak options
+add_definitions(-DXBYAK64)
+add_definitions(-DXBYAK_NO_OP_NAMES)
+
+ExternalProject_Add(
+ ${XBYAK_PROJECT}
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ DEPENDS ""
+ GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
+ GIT_TAG "v5.661" # Jul 26th
+ PREFIX ${XBYAK_PREFIX_DIR}
+ UPDATE_COMMAND ""
+ CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
+ CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+ set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
+ file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
+ add_library(xbyak STATIC ${dummyfile})
+else()
+ add_library(xbyak INTERFACE)
+endif()
+
+add_dependencies(xbyak ${XBYAK_PROJECT})
+list(APPEND external_project_dependencies xbyak)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 1120677a37e0d44163816b66600121c8f0d545af..e0556a0babc74ba6efa0a190d4f7b77416bef3bf 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -102,7 +102,6 @@ set(COMMON_FLAGS
-fno-omit-frame-pointer
-Wall
-Wextra
- -Werror
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
@@ -115,6 +114,11 @@ set(COMMON_FLAGS
-Wno-error=terminate # Warning in PADDLE_ENFORCE
)
+# https://github.com/PaddlePaddle/Paddle/issues/12773
+if (NOT WIN32)
+list(APPEND COMMON_FLAGS -Werror)
+endif()
+
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
@@ -142,6 +146,11 @@ else()
${GPU_COMMON_FLAGS})
endif()
+if(UNIX AND NOT APPLE)
+ # except apple from nix*Os family
+ set(LINUX TRUE)
+endif(UNIX AND NOT APPLE)
+
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 1d3e2ade6d393c6e4c37eea0dc1064cdb18808a5..6d230942321f8d82a14f5c58037134deb0ab222d 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules is used for paddle fluid static library
+# for building inference libs
+function(find_fluid_modules TARGET_NAME)
+ get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+ string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+ string(FIND "${__target_path}" "fluid" pos)
+ if(pos GREATER 1)
+ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+ set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+ set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+ endif()
+endfunction(find_fluid_modules)
+
function(merge_static_libs TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
@@ -134,7 +148,8 @@ function(merge_static_libs TARGET_NAME)
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
)
- else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+ endif(APPLE)
+ if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
foreach(lib ${libs})
@@ -173,7 +188,36 @@ function(merge_static_libs TARGET_NAME)
COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
WORKING_DIRECTORY ${target_DIR})
- endif()
+ endif(LINUX)
+ if(WIN32) # windows do not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs.
+ # Make the generated dummy source file depended on all static input
+ # libs. If input lib changes,the source file is touched
+ # which causes the desired effect (relink).
+ add_custom_command(OUTPUT ${target_SRCS}
+ COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+ DEPENDS ${libs})
+
+ # Generate dummy staic lib
+ file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+ add_library(${TARGET_NAME} STATIC ${target_SRCS})
+ target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+ foreach(lib ${libs})
+ # Get the file names of the libraries to be merged
+ #if(NOT $ MATCHES "lib.*\\.lib")
+ # message("library" ${lib})
+ # set(libfiles ${libfiles} lib$)
+ #else()
+ set(libfiles ${libfiles} $)
+ #endif()
+ endforeach()
+
+ # windows cmd return error in clean env.
+ # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
+ add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+ COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles}
+ )
+ endif(WIN32)
endfunction(merge_static_libs)
function(cc_library TARGET_NAME)
@@ -181,6 +225,10 @@ function(cc_library TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ if(WIN32)
+ # add libxxx.lib prefix in windows
+ set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
+ endif(WIN32)
if(cc_library_SRCS)
if(cc_library_SHARED OR cc_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
@@ -195,6 +243,15 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
+ # Only deps libmklml.so, not link
+ if("${cc_library_DEPS};" MATCHES "mklml;")
+ list(REMOVE_ITEM cc_library_DEPS mklml)
+ if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
+ list(APPEND cc_library_DEPS dynload_mklml)
+ endif()
+ add_dependencies(${TARGET_NAME} mklml)
+ target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+ endif()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
@@ -206,8 +263,6 @@ function(cc_library TARGET_NAME)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
- add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
-
else(cc_library_SRCS)
if(cc_library_DEPS)
merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -231,16 +286,23 @@ endfunction(cc_binary)
function(cc_test TARGET_NAME)
if(WITH_TESTING)
- set(options "")
+ set(options SERIAL)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
- add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+ target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+ add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+ if (${cc_test_SERIAL})
+ set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+ endif()
endif()
endfunction(cc_test)
@@ -268,7 +330,6 @@ function(nv_library TARGET_NAME)
list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
- add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
else(nv_library_SRCS)
if (nv_library_DEPS)
merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
@@ -295,14 +356,21 @@ endfunction(nv_binary)
function(nv_test TARGET_NAME)
if (WITH_GPU AND WITH_TESTING)
- set(options "")
+ set(options SERIAL)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
- add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+ target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+ add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
+ if (nv_test_SERIAL)
+ set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+ set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+ endif()
endif()
endfunction(nv_test)
@@ -338,7 +406,6 @@ function(hip_library TARGET_NAME)
list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
- add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
else(hip_library_SRCS)
if (hip_library_DEPS)
merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
@@ -550,7 +617,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+ COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+ FLAGS_cpu_deterministic=true
+ PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
@@ -608,3 +677,21 @@ function(grpc_library TARGET_NAME)
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()
+
+
+function(brpc_library TARGET_NAME)
+ set(oneValueArgs PROTO)
+ set(multiValueArgs SRCS DEPS)
+ set(options "")
+ cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+ message(STATUS "generating brpc ${brpc_library_PROTO}")
+
+ get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE)
+ get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
+ get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+ protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
+ cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
+ cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
+endfunction()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index cc758019827b9a5416a801e4da43d754d4492a73..bc36683a9facc253e7b9feb0c5a56e79491fb9b0 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -12,19 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
- get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
- string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
- string(FIND "${__target_path}" "fluid" pos)
- if(pos GREATER 1)
- get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
- set(fluid_modules ${fluid_modules} ${TARGET_NAME})
- set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
- endif()
-endfunction(find_fluid_modules)
-
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
@@ -39,7 +26,7 @@ function(copy TARGET)
message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
endif()
math(EXPR len "${copy_lib_SRCS_len} - 1")
-
+
add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
foreach(index RANGE ${len})
list(GET copy_lib_SRCS ${index} src)
@@ -52,71 +39,105 @@ function(copy TARGET)
endfunction()
# third party
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+ DEPS eigen3
)
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS gflags
)
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS glog
+)
+
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
+copy(boost_lib
+ SRCS ${BOOST_INCLUDE_DIR}/boost
+ DSTS ${dst_dir}
+ DEPS boost
)
if(NOT PROTOBUF_FOUND)
- set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS extern_protobuf
)
endif()
if(NOT CBLAS_FOUND)
- set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
+ DEPS extern_openblas
)
elseif (WITH_MKLML)
- set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+ DEPS mklml
)
endif()
+if(WITH_MKLDNN)
+ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
+ copy(mkldnn_lib
+ SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
+ DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS mkldnn
+ )
+endif()
+
+if (NOT WIN32)
if(NOT MOBILE_INFERENCE AND NOT RPI)
- set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
- DSTS ${dst_dir} ${dst_dir}/lib)
+ DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS snappy)
- set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
- DSTS ${dst_dir} ${dst_dir}/lib)
+ DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS snappystream)
- set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
- DSTS ${dst_dir} ${dst_dir}/lib)
+ DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS zlib)
endif()
+endif(NOT WIN32)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
+set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
+if (NOT WIN32)
copy(framework_lib DEPS framework_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
)
+else()
+copy(framework_lib
+ SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+)
+endif(NOT WIN32)
set(module "memory")
copy(memory_lib
@@ -124,10 +145,23 @@ copy(memory_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
)
+set(inference_deps paddle_fluid_shared paddle_fluid)
+
+set(module "inference/api")
+if (WITH_ANAKIN AND WITH_GPU)
+ copy(anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
+ SRCS
+ ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
+ ${ANAKIN_INSTALL_DIR} # anakin release
+ DSTS ${dst_dir}/inference/anakin ${dst_dir}/inference/anakin)
+ list(APPEND inference_deps anakin_inference_lib)
+endif()
+
set(module "inference")
-copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
+copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
- DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+ ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
@@ -142,4 +176,31 @@ copy(string_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
+set(module "pybind")
+copy(pybind_lib
+ SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
+ DSTS ${dst_dir}/${module}
+)
+
+# CMakeCache Info
+copy(cmake_cache
+ SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+ DSTS ${FLUID_INSTALL_DIR})
+
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
+
+# paddle fluid version
+execute_process(
+ COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+ WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+ OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+set(version_file ${FLUID_INSTALL_DIR}/version.txt)
+file(WRITE ${version_file}
+ "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+ "WITH_MKL: ${WITH_MKL}\n"
+ "WITH_GPU: ${WITH_GPU}\n")
+if(WITH_GPU)
+ file(APPEND ${version_file}
+ "CUDA version: ${CUDA_VERSION}\n"
+ "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+endif()
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 53c2de332ea74b06d1bd6e5bb119cad6af27ed01..3eacf4d86aa0385eddb690d72e85e3384929bb99 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID
set(SSE3_FLAG "-msse3")
set(AVX_FLAG "-mavx")
set(AVX2_FLAG "-mavx2")
+ set(AVX512F_FLAG "-mavx512f")
elseif(MSVC)
set(MMX_FLAG "/arch:MMX")
set(SSE2_FLAG "/arch:SSE2")
@@ -81,5 +82,16 @@ int main()
return 0;
}" AVX2_FOUND)
+# Check AVX512F
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include
+int main()
+{
+ __m512i a = _mm512_undefined_epi32();
+ return 0;
+}" AVX512F_FOUND)
+
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/cmake/version.cmake b/cmake/version.cmake
index cde650128a068faf32f4abfff5cdfdeb656d8577..ac10bdf067be549fe90112aef73fd6e1fbe0ac48 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -1,23 +1,46 @@
# Get the latest git tag.
set(PADDLE_VERSION $ENV{PADDLE_VERSION})
set(tmp_version "HEAD")
+set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
while ("${PADDLE_VERSION}" STREQUAL "")
+ # Check current branch name
execute_process(
- COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
+ COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
- OUTPUT_VARIABLE GIT_TAG_NAME
- RESULT_VARIABLE GIT_RESULT
+ OUTPUT_VARIABLE GIT_BRANCH_NAME
+ RESULT_VARIABLE GIT_BRANCH_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
- if (NOT ${GIT_RESULT})
- # Check the tag is a correct version
- if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
- string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
- else() # otherwise, get the previous git tag name.
- set(tmp_version "${GIT_TAG_NAME}~1")
+ if (NOT ${GIT_BRANCH_RESULT})
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
+ WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+ OUTPUT_VARIABLE GIT_TAG_NAME
+ RESULT_VARIABLE GIT_RESULT
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if (NOT ${GIT_RESULT})
+ # Check if current branch is release branch
+ if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
+ # Check the tag is a correct version
+ if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
+ # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest
+ set(PADDLE_VERSION "0.0.0")
+ elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+ string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
+ else() # otherwise, get the previous git tag name.
+ set(tmp_version "${GIT_TAG_NAME}~1")
+ endif()
+ else()
+ # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
+ set(PADDLE_VERSION "0.0.0")
+ endif()
+ else()
+ set(PADDLE_VERSION "0.0.0")
+ message(WARNING "Cannot add paddle version from git tag")
endif()
else()
set(PADDLE_VERSION "0.0.0")
- message(WARNING "Cannot add paddle version from git tag")
+ message(WARNING "Cannot add paddle version for wrong git branch result")
endif()
endwhile()
diff --git a/contrib/float16/float16_inference_report.md b/contrib/float16/float16_inference_report.md
deleted file mode 100644
index 67623a4d8d58b2ab84031ac4b290931ab540973b..0000000000000000000000000000000000000000
--- a/contrib/float16/float16_inference_report.md
+++ /dev/null
@@ -1,163 +0,0 @@
-## Introduction
-Working with deep neural networks (DNN) is a two-stage process. First we train DNN using labeled examples of inputs and desired outputs to obtain the model parameters (weights), then we deploy DNN along with the trained weights to run inference on unknown inputs. Typically, these weights are in float data type and hence we run inference in float mode using these weights. This post focuses on the discussion of how to use low precision float16 data type to represent these trained weights and run inference in float16 mode as well as the advantages of float16 inference over its float counterpart by showing some experiment results.
-
-## What is float16?
-float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has high tolerance against the loss of precision and range when using float16 to represent the weights and the inference accuracy will only be minimally affected in most cases. This gives us the opportunity to use float16 data type to speedup the inference.
-
-Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
-
-## Why float16?
-The trend in today's deep learning community is to use bigger and deeper model. This translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float are correspondingly three-fold:
-
-1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of float16 data type. This makes the whole memory footprint of float16 inference roughly about half of its float counterpart. This is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
-
-2. Because float16 occupies less memory than float, in theory hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, an outstanding example of hardware devices that actually deliver such advantages is Nvidia's latest Volta architecture GPUs, including Tesla V100 and Titan V. Moreover float16 takes less time to read from or write to memory and hence float16 can make inference more efficient especially in memory-bound applications where the performance is largely affected by how fast it is to read and write data.
-
-3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less that its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers.
-
-## Fluid implementation of float16 inference
-### Overview
-Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
-
-### Basic requirement
-When an operator is run by an executor, it uses a kernel to perform computations on tensors contained in the input variables, and then write the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
-
-This means that if we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will makes the program run in float mode and gives us a final output of float data type.
-
-The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator and every subsequent operator will invoke the float16 kernel until we get the final output in float16 data type. So the preliminary requirements for float16 inference is to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
-
-### float16 transpiler
-Furthermore, we need a float16 transpiler to achieve the following usage code:
-
-```python
-# Get the float32 inference program and load the associated float32 weights
-[inference_program, feed_target_names,
- fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-# Prepare the float input data
-batch_size = 1
-tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype(numpy.float32)
-
-# Running inference_program in float mode
-float_results = exe.run(inference_program,
- feed={feed_target_names[0]: tensor_img},
- fetch_list=fetch_targets)
-
-# Use float16 transpiler to speedup
-float16_inference_program = float_inference_program.clone()
-t = Float16Transpiler()
-t.transpile(float16_inference_program, GPUPlace)
-
-# Running float16_inference_program in float16 mode using the same input data
-float16_results = exe.run(float16_inference_program,
- feed={feed_target_names[0]: tensor_img},
- fetch_list=fetch_targets)
-
-# Do some tests to verify the correctness of float16 inference
-...
-np.testing.assert_almost_equal(float_results, float16_results, ...)
-...
-
-# Save the float16 inference program and float16 weights for future deployment
-fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
- fetch_targets, exe,
- float16_inference_program)
-```
-
-In this scenario, we already have a float32 inference program and some associated float32 weights that can do float32 inference. We can easily use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
-
-We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. This requires us to add some cast operators in the program to convert between float16 tensor and float32 tensor.
-
-The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
-
-### Experiment results
-We provide demo codes that can be used to reproduce the experiment results by doing:
-```bash
-git clone https://github.com/PaddlePaddle/Paddle.git
-cd Paddle
-# This line will generate a paddle development docker image with cuda 8 and cudnn 7
-# If you want test on cuda 9 instead, change the line 5 in Paddle/Dockerfile
-# from `FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
-# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
-nvidia-docker build -t paddle:float16 .
-# After running this, different results will be written to different log files in Paddle/contrib/float16/
-nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
-```
-
-#### Correctness
-As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incured by float16 and we want to see how good this tolerance is.
-
-We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively.
-
-We repeat the test ten times and get the following results:
-
-| | float16 | float32 |
-|--------|--------:|--------: |
-| # 1 | 62.75% | 62.72% |
-| # 2 | 61.27% | 61.28% |
-| # 3 | 62.24% | 62.23% |
-| # 4 | 64.16% | 64.17% |
-| # 5 | 60.75% | 60.77% |
-| # 6 | 63.25% | 63.24% |
-| # 7 | 62.15% | 62.13% |
-| # 8 | 62.05% | 62.02% |
-| # 9 | 65.19% | 65.20% |
-| #10 | 62.53% | 62.48% |
-| average| 62.63% | 62.62% |
-
-We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over 10 tests.
-
-#### Performance benchmark
-Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart.
-
-Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
-
-Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7 make float16 truly superior to float in certain deep learning applications.
-
-We thus benchmark the float16 inference performance on a single Nvidia Tesla V100 GPU (volta architecture and with tensor cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respective to different mini-batch(mb) sizes.
-
-Average inference time for one mini-batch on Vgg16 model tested on imagenet data set:
-
-| total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
-|float32| 14.01 | 9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 |
-|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 |
-|Speedup| 4.22 | 2.36 | 3.91 | 3.00 | 3.26 | 2.77 | 2.97 |
-
-We can see that float16 inference provides 2x ~ 4x speedup on different batch sizes.
-
-Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
-
-|conv op| mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
-|float32| 11.95 | 6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 |
-|float16| 1.78 | 2.10 | 2.93 | 4.55 | 7.99 | 14.63 | 28.67 |
-|Speedup| 6.71 | 3.31 | 6.37 | 4.71 | 5.18 | 4.14 | 4.54 |
-
-Fluid convolution operator uses cuDNN 7 to implement the kernel and we can see that with the help of tensor core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
-
-Similarly, we also list the benchmark results of Resnet50 model tested on imagenet data set:
-
-| total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
-|float32| 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 |
-|float16| 6.13 | 6.32 | 6.24 | 7.40 | 10.90 | 18.18 | 33.20 | 64.52 |
-|Speedup| 1.15 | 1.17 | 1.47 | 1.70 | 1.94 | 2.11 | 2.05 | 1.97 |
-
-|conv op| mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 |
-|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
-|float32| 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 | 73.44 |
-|float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63 | 8.77 | 15.24 | 28.40 |
-|Speedup| 1.30 | 1.27 | 1.64 | 1.99 | 2.45 | 2.79 | 2.70 | 2.59 |
-
-We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increase to about 2x for larger batch sizes. Similar trend can be found for the time spent on the convolution operator. Note that right now the tensor core will only be utilized in the convolution operation when certain dimentional requirements are met for the input data and filter. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than the Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg.
-
-We also did the same benchmark on a Nvidia GeForce GTX 1080 Ti GPU that does not support tensor core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then deliver around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that tensor core, which is specialized for float16 computations, is a critical component for high performance float16 inference.
-
-Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for comprehensive benchmark results.
-
-### Summary
-1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
-2. The accuracy of float16 inference is verified to be almost identical to the float32 counterpart at least on CNNs.
-3. float16 inference provides significant speedup on large and computationally intensive Vgg16 network on image net data set. For the much smaller and simpler Resnet50, the speedup provided by float16 inference is less significant than on Vgg16 but still favorable especially for large batch size.
-4. We cannot achieve the superior float16 inference performance without the help of the newly introduced tensor cores on the Nvidia Volta architecture GPUs.
diff --git a/contrib/float16/float16_transpiler.py b/contrib/float16/float16_transpiler.py
deleted file mode 100644
index 91ba101edb65cd45bd5e37a0c6ad25e515593a81..0000000000000000000000000000000000000000
--- a/contrib/float16/float16_transpiler.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.framework import Program
-from paddle.fluid.executor import global_scope
-
-
-class Float16Transpiler:
- def transpile(self, program, place, scope=None):
- '''
- Transpile the program desc and cast the weights to float16 data type to
- enable float16 inference.
-
- Since the operator in a program desc will automatically choose the
- right compute kernel to run based on the data type of the input tensor.
- We actually don't need to change the program desc to run in float16 mode.
-
- However, in this way, users who are used to feeding and fetching tensors
- of float32 data type when running typical inference may find it confusing
- and difficult to run inference in float16 mode as they need to convert
- input data to float16 dtype and then convert the results back to float32
- dtype to match the rest of code.
-
- So this function appends cast ops to the program desc where necessary so
- that users are able to run inference in float16 mode while providing input
- tensor (feed_holder) of float data type and obtaining output tensor
- (fetch_holder) of float data type.
-
- Moreover, it is desired that when we have the scope and program desc to run
- inference in float32 mode, we can use a single API to do the necessary
- modification and then user can run float16 inference on the fly. To make
- this happen, this function also create new parameters in the scope to have the
- converted float16 weights and change the operators in program desc to use
- these new parameters.
-
- :param program: program to transpile
- :type program: Program
- :param place: inference place
- :type place: Place
- :param scope: inference scope
- :type scope: Scope
- '''
- if not isinstance(program, Program):
- raise TypeError("program should be as Program type")
- if not isinstance(place, core.CPUPlace) and not isinstance(
- place, core.CUDAPlace):
- raise TypeError("place should be as CPUPlace/CUDAPlace type")
- if scope is None:
- scope = global_scope()
- if not isinstance(scope, core.Scope):
- raise TypeError("scope should be as Scope type or None")
-
- self.scope = scope
- self.place = place
- self.block = program.block(0)
- self.input_map = {} # store the input names should be adjusted
-
- self._modify_feed_fetch()
- self._convert_param_to_float16()
- self._adjust_input(skip=True)
- self._remove_unused_var()
-
- # TODO(luotao): use clone() method to flush the program.desc in force,
- # since some large program.desc will not be flushed immediately.
- # And a better solution will be considered later.
- program = program.clone()
-
- # ====================== private transpiler functions =====================
- def _adjust_input(self, skip=False):
- '''
- Change the input variable name in operators.
-
- When we are in the process of modifying a program desc, we usually
- replace some variables with some other variables, where we create
- a dictionary input_map to record the one-to-one correspondence
- between each old variable and the new one.
-
- After that, this function will search all the operators that use the
- old variables and change the info in op to use the new variables. There
- maybe some exceptions to this rule when we are using the float16 transpiler
- and insert cast ops to cast float32 variable to float16 one. After we
- insert the cast op to cast var_1 to var_1_fp16, we don't want to change
- the input of cast op to var_1_fp16 after using this function.
- '''
- skip_ops = {"cast"}
- for i in range(len(self.block.ops)):
- current_op = self.block.ops[i]
- if skip and current_op.type in skip_ops:
- continue
- for input_arg in current_op.input_arg_names:
- if input_arg in self.input_map:
- current_op.rename_input(input_arg,
- self.input_map[input_arg])
-
- def _remove_unused_var(self):
- '''
- remove unused varibles in program
- '''
- args = []
- for i in range(len(self.block.ops)):
- current_op = self.block.ops[i]
- args += current_op.input_arg_names
- args += current_op.output_arg_names
- args = list(set(args)) # unique the input and output arguments
-
- for var in self.block.vars.keys():
- if var not in args:
- self.block.remove_var(var)
-
- def _modify_feed_fetch(self):
- '''
- Modify feed fetch op/vars for float16 inference.
-
- For each feed op:
- feed_op->feed_target_var
-
- Change it to:
- feed_op->feed_target_var->cast_op(from other dtype to float16)->tmp_var
-
- For each fetch op:
- fetch_target_var->fetch_op
-
- Change it to:
- tmp_var->cast_op(from float16 to other dtype)->fetch_target_var->fetch_op
-
- :return: None
- '''
-
- def find_op(var):
- # It is possible that var.op is not up to date after some
- # modifications to program desc. Here we force to make it up to date.
- var.op = None
- for op in self.block.ops:
- if var.name in op.output_arg_names:
- var.op = op
- break
-
- if var.op is None:
- raise ValueError("The target variable must have an "
- "associated operator that generates it.")
-
- i = 0
- while i < len(self.block.ops):
- cur_op = self.block.ops[i]
- if cur_op.type == "feed":
- var_name = cur_op.output("Out")[0]
- tmp_var_name = var_name + ".fp16"
- var = self.block.vars[var_name]
- tmp_var = self.block.create_var(
- name=tmp_var_name.encode('ascii'),
- type=var.type,
- dtype=core.VarDesc.VarType.FP16,
- shape=var.shape,
- persistable=var.persistable)
- self.block.insert_op(
- i + 1,
- type="cast",
- inputs={"X": var},
- outputs={"Out": tmp_var},
- attrs={
- 'in_dtype': int(var.dtype),
- 'out_dtype': int(tmp_var.dtype)
- })
- self.input_map[var_name] = tmp_var_name
- i = i + 1
- elif cur_op.type == "fetch":
- var_name = cur_op.input("X")[0]
- tmp_var_name = var_name + ".fp16"
- var = self.block.vars[var_name]
- tmp_var = self.block.create_var(
- name=tmp_var_name.encode('ascii'),
- type=var.type,
- dtype=core.VarDesc.VarType.FP16,
- shape=var.shape,
- persistable=var.persistable)
- find_op(var)
- var.op.rename_output(var_name, tmp_var_name)
- self.block.insert_op(
- i,
- type="cast",
- inputs={"X": tmp_var},
- outputs={"Out": var},
- attrs={
- 'in_dtype': int(tmp_var.dtype),
- 'out_dtype': int(var.dtype)
- })
- i = i + 1
- i = i + 1
-
- def _convert_param_to_float16(self):
- def _get_no_fp16_conversion_var_names():
- '''
- Get the set of input variable names that shouldn't be converted to float16.
-
- When we want to run inference in float16 mode, most parameters need to be
- firstly converted to float16. However, there are some parameters that
- shouldn't be converted to float16 because the corresponding operator
- requires float32 parameters even in float16 mode (when the input data is
- of float16 data type). Currently, the only operator that has this exclusion
- is the batch norm op.
-
- :return: set of input variable names
- :type var_names: set
- '''
- op_names = {'batch_norm'}
- var_names = []
- for op in self.block.ops:
- if op.type in op_names:
- var_names += op.input_arg_names
- return set(var_names)
-
- def _should_be_converted(var):
- return var.persistable and \
- var.name not in self.no_conversion_vars and \
- var.type != core.VarDesc.VarType.FEED_MINIBATCH and \
- var.type != core.VarDesc.VarType.FETCH_LIST
-
- self.no_conversion_vars = _get_no_fp16_conversion_var_names()
- conversion_var_list = filter(_should_be_converted,
- self.block.vars.values())
- for var in conversion_var_list:
- fp16_var_name = var.name + ".fp16"
- fp16_var = self.block.create_parameter(
- name=fp16_var_name.encode('ascii'),
- type=var.type,
- dtype=core.VarDesc.VarType.FP16,
- shape=var.shape)
-
- # cast the data in the tensor of the original var to float16
- # data type and store it in the tensor of the new float16 var
- self.scope.var(fp16_var_name)
- fp16_tensor = self.scope.find_var(fp16_var_name).get_tensor()
- tensor = np.array(self.scope.find_var(var.name).get_tensor())
- # After the old tensor data is converted to np.float16, view(np.uint16)
- # is used so that the internal memory of the numpy array will be
- # reinterpreted to be of np.uint16 data type, which is binded to fluid
- # float16 data type via the help of pybind in tensor_py.h.
- fp16_tensor.set(
- tensor.astype(np.float16).view(np.uint16), self.place)
-
- # old var will be replaced by the fp16 var in program desc
- self.input_map[var.name] = fp16_var_name
- self.block.remove_var(var.name)
diff --git a/contrib/float16/run_float16_demo.sh b/contrib/float16/run_float16_demo.sh
deleted file mode 100755
index d8a34ee67b8fab214fa6e96104304689211f84da..0000000000000000000000000000000000000000
--- a/contrib/float16/run_float16_demo.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/bin/bash
-
-BUILD_PATH=/paddle/fp16_build
-WHEEL_PATH=$BUILD_PATH/python/dist
-INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
-DEMO_PATH=/paddle/contrib/float16
-
-# Use the single most powerful CUDA GPU on your machine
-export CUDA_VISIBLE_DEVICES=0
-
-# Build the PaddlePaddle Fluid wheel package and install it.
-mkdir -p $BUILD_PATH && cd $BUILD_PATH
-cmake .. -DWITH_AVX=OFF \
- -DWITH_MKL=OFF \
- -DWITH_GPU=ON \
- -DWITH_TESTING=ON \
- -DWITH_TIMER=ON \
- -DWITH_PROFILER=ON \
- -DWITH_FLUID_ONLY=ON
-make -j `nproc`
-pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
-
-cd $DEMO_PATH
-# Clear previous log results
-rm -f *.log
-
-# Test the float16 inference accuracy of resnet32 on cifar10 data set
-stdbuf -oL python float16_inference_demo.py \
- --data_set=cifar10 \
- --model=resnet \
- --threshold=0.6 \
- --repeat=10 \
- 2>&1 | tee -a float16_inference_accuracy.log
-
-# Sleep to cool down the GPU for consistent benchmarking
-sleep 2m
-
-# benchmarking parameters
-REPEAT=1000
-MAXIMUM_BATCH_SIZE=512
-
-for ((batch_size = 1; batch_size <= MAXIMUM_BATCH_SIZE; batch_size *= 2));
-do
-
- # Test inference benchmark of vgg16 on imagenet
- stdbuf -oL python float16_inference_demo.py \
- --data_set=imagenet \
- --model=vgg \
- --threshold=0.001 \
- --repeat=1 \
-
- $INFER_PATH/test_inference_image_classification_vgg \
- --data_set=imagenet \
- --dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
- --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
- --repeat=$REPEAT \
- --batch_size=$batch_size \
- --skip_cpu=true \
- 2>&1 | tee -a imagenet_vgg16_benchmark.log
-
- sleep 2m
-
- # Test inference benchmark of resnet50 on imagenet
- stdbuf -oL python float16_inference_demo.py \
- --data_set=imagenet \
- --model=resnet \
- --threshold=0.001 \
- --repeat=1 \
-
- $INFER_PATH/test_inference_image_classification_resnet \
- --data_set=imagenet \
- --dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
- --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
- --repeat=$REPEAT \
- --batch_size=$batch_size \
- --skip_cpu=true \
- 2>&1 | tee -a imagenet_resnet50_benchmark.log
-
- sleep 2m
-
- # Test inference benchmark of vgg16 on cifar10
- stdbuf -oL python float16_inference_demo.py \
- --data_set=cifar10 \
- --model=vgg \
- --threshold=0.001 \
- --repeat=1 \
-
- $INFER_PATH/test_inference_image_classification_vgg \
- --data_set=cifar10 \
- --dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
- --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
- --repeat=$REPEAT \
- --batch_size=$batch_size \
- --skip_cpu=true \
- 2>&1 | tee -a cifar10_vgg16_benchmark.log
-
- sleep 1m
-
- # Test inference benchmark of resnet32 on cifar10
- stdbuf -oL python float16_inference_demo.py \
- --data_set=cifar10 \
- --model=resnet \
- --threshold=0.001 \
- --repeat=1 \
-
- $INFER_PATH/test_inference_image_classification_vgg \
- --data_set=cifar10 \
- --dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
- --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
- --repeat=$REPEAT \
- --batch_size=$batch_size \
- --skip_cpu=true \
- 2>&1 | tee -a cifar10_resnet32_benchmark.log
-
- sleep 1m
-
-done
diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a
--- /dev/null
+++ b/doc/about/about_us.rst
@@ -0,0 +1,53 @@
+=========
+关于我们
+=========
+
+什么是PaddlePaddle
+--------------------
+
+- PaddlePaddle是百度自主研发并开源的深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法
+
+- 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验
+
+- 框架具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具
+
+PaddlePaddle的技术特色
+-------------------------
+
+- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在保证性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型
+
+- 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练
+
+- 提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程
+
+提供基于PaddlePaddle的教育体系
+--------------------------------
+
+- 深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习
+
+- 深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持
+
+- 线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流
+
+
+提供基于PaddlePaddle的AI服务
+------------------------------
+
+- EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型
+
+- AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务
+
+- 深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案
+
+你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们
+-----------------------------------------------------------
+
+- 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈
+
+- 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com
+
+我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步
+
+
+
+PaddlePaddle团队
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index 8086507bb4b7e870ad6d6091945ed07a00b5100b..be92af3902769a65c77953c9f3cb1f3aa3738d79 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
-
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +51,4 @@ sphinx_add_target(paddle_fluid_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
-
add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
index 48b396f0786adad1ba6cd41f72497f853e54bc38..435d6e10fb02e9b2a8147f37da33e8848cc9b98a 100644
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
# HTML output director
set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
"${BINARY_BUILD_DIR_EN}/conf.py"
diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst
new file mode 100644
index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12
--- /dev/null
+++ b/doc/fluid/api/average.rst
@@ -0,0 +1,16 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.average
+=============
+
+.. _api_fluid_average_WeightedAverage:
+
+WeightedAverage
+---------------
+
+.. autoclass:: paddle.fluid.average.WeightedAverage
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst
new file mode 100644
index 0000000000000000000000000000000000000000..115e0d24b39928cfc349f72e0a21d6374cd8cd75
--- /dev/null
+++ b/doc/fluid/api/backward.rst
@@ -0,0 +1,23 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.backward
+==============
+
+.. _api_fluid_backward_append_backward:
+
+append_backward
+---------------
+
+.. autofunction:: paddle.fluid.backward.append_backward
+ :noindex:
+
+.. _api_fluid_backward_calc_gradient:
+
+calc_gradient
+-------------
+
+.. autofunction:: paddle.fluid.backward.calc_gradient
+ :noindex:
+
diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aeefbb95a46e5d5ed46375e388a720fad2711779
--- /dev/null
+++ b/doc/fluid/api/clip.rst
@@ -0,0 +1,43 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+fluid.clip
+==========
+
+.. _api_fluid_clip_ErrorClipByValue:
+
+ErrorClipByValue
+----------------
+
+.. autoclass:: paddle.fluid.clip.ErrorClipByValue
+ :members:
+ :noindex:
+
+.. _api_fluid_clip_GradientClipByValue:
+
+GradientClipByValue
+-------------------
+
+.. autoclass:: paddle.fluid.clip.GradientClipByValue
+ :members:
+ :noindex:
+
+.. _api_fluid_clip_GradientClipByNorm:
+
+GradientClipByNorm
+------------------
+
+.. autoclass:: paddle.fluid.clip.GradientClipByNorm
+ :members:
+ :noindex:
+
+.. _api_fluid_clip_GradientClipByGlobalNorm:
+
+GradientClipByGlobalNorm
+------------------------
+
+.. autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/data.rst b/doc/fluid/api/data.rst
deleted file mode 100644
index b56c7332cc284649c7e04328e51a7faa78593a39..0000000000000000000000000000000000000000
--- a/doc/fluid/api/data.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-==================================
-Data Reader Interface and DataSets
-==================================
-
-.. toctree::
- :maxdepth: 1
-
- data/data_reader.rst
- data/image.rst
- data/dataset.rst
diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst
index 3df5c0307ffed9d101da58b385840b115920e906..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8 100644
--- a/doc/fluid/api/data_feeder.rst
+++ b/doc/fluid/api/data_feeder.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-===========
-data_feeder
-===========
+=================
+fluid.data_feeder
+=================
+
+.. _api_fluid_data_feeder_DataFeeder:
DataFeeder
----------
diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst
deleted file mode 100644
index f80b87c7d2704a144c02028c4925530a67d11289..0000000000000000000000000000000000000000
--- a/doc/fluid/api/evaluator.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
- !DO NOT EDIT THIS FILE MANUALLY!
-
-=========
-evaluator
-=========
-
-ChunkEvaluator
---------------
-
-.. autoclass:: paddle.fluid.evaluator.ChunkEvaluator
- :members:
- :noindex:
-
-EditDistance
---------------
-
-.. autoclass:: paddle.fluid.evaluator.EditDistance
- :members:
- :noindex:
-
-DetectionMAP
---------------
-
-.. autoclass:: paddle.fluid.evaluator.DetectionMAP
- :members:
- :noindex:
-
diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst
index a9cdf264e49691afc4b9425b7bfe54f8157ae6c2..f23ecc1f80030f20359ce9675130a167722606c9 100644
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-========
-executor
-========
+==============
+fluid.executor
+==============
+
+.. _api_fluid_executor_Executor:
Executor
--------
@@ -12,21 +14,27 @@ Executor
:members:
:noindex:
+.. _api_fluid_executor_global_scope:
+
global_scope
------------
.. autofunction:: paddle.fluid.executor.global_scope
:noindex:
+.. _api_fluid_executor_scope_guard:
+
scope_guard
-----------
.. autofunction:: paddle.fluid.executor.scope_guard
:noindex:
-switch_scope
-------------
+.. _api_fluid_executor__switch_scope:
+
+_switch_scope
+-------------
-.. autofunction:: paddle.fluid.executor.switch_scope
+.. autofunction:: paddle.fluid.executor._switch_scope
:noindex:
diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7eab58355c3648d929d3b5d98984adce9034f016
--- /dev/null
+++ b/doc/fluid/api/fluid.rst
@@ -0,0 +1,362 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=====
+fluid
+=====
+
+.. _api_fluid_Block:
+
+Block
+-----
+
+.. autoclass:: paddle.fluid.Block
+ :members:
+ :noindex:
+
+.. _api_fluid_Variable:
+
+Variable
+--------
+
+.. autoclass:: paddle.fluid.Variable
+ :members:
+ :noindex:
+
+.. _api_fluid_Program:
+
+Program
+-------
+
+.. autoclass:: paddle.fluid.Program
+ :members:
+ :noindex:
+
+.. _api_fluid_Operator:
+
+Operator
+--------
+
+.. autoclass:: paddle.fluid.Operator
+ :members:
+ :noindex:
+
+.. _api_fluid_default_startup_program:
+
+default_startup_program
+-----------------------
+
+.. autofunction:: paddle.fluid.default_startup_program
+ :noindex:
+
+.. _api_fluid_default_main_program:
+
+default_main_program
+--------------------
+
+.. autofunction:: paddle.fluid.default_main_program
+ :noindex:
+
+.. _api_fluid_program_guard:
+
+program_guard
+-------------
+
+.. autofunction:: paddle.fluid.program_guard
+ :noindex:
+
+.. _api_fluid_get_var:
+
+get_var
+-------
+
+.. autofunction:: paddle.fluid.get_var
+ :noindex:
+
+.. _api_fluid_Executor:
+
+Executor
+--------
+
+.. autoclass:: paddle.fluid.Executor
+ :members:
+ :noindex:
+
+.. _api_fluid_global_scope:
+
+global_scope
+------------
+
+.. autofunction:: paddle.fluid.global_scope
+ :noindex:
+
+.. _api_fluid_scope_guard:
+
+scope_guard
+-----------
+
+.. autofunction:: paddle.fluid.scope_guard
+ :noindex:
+
+.. _api_fluid__switch_scope:
+
+_switch_scope
+-------------
+
+.. autofunction:: paddle.fluid._switch_scope
+ :noindex:
+
+
+.. _api_fluid_make_channel:
+
+make_channel
+------------
+
+.. autofunction:: paddle.fluid.make_channel
+ :noindex:
+
+.. _api_fluid_channel_send:
+
+channel_send
+------------
+
+.. autofunction:: paddle.fluid.channel_send
+ :noindex:
+
+.. _api_fluid_channel_recv:
+
+channel_recv
+------------
+
+.. autofunction:: paddle.fluid.channel_recv
+ :noindex:
+
+.. _api_fluid_channel_close:
+
+channel_close
+-------------
+
+.. autofunction:: paddle.fluid.channel_close
+ :noindex:
+
+.. _api_fluid_Select:
+
+Select
+------
+
+.. autoclass:: paddle.fluid.Select
+ :members:
+ :noindex:
+
+.. _api_fluid_Trainer:
+
+Trainer
+-------
+
+.. autoclass:: paddle.fluid.Trainer
+ :members:
+ :noindex:
+
+.. _api_fluid_BeginEpochEvent:
+
+BeginEpochEvent
+---------------
+
+.. autoclass:: paddle.fluid.BeginEpochEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_EndEpochEvent:
+
+EndEpochEvent
+-------------
+
+.. autoclass:: paddle.fluid.EndEpochEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_BeginStepEvent:
+
+BeginStepEvent
+--------------
+
+.. autoclass:: paddle.fluid.BeginStepEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_EndStepEvent:
+
+EndStepEvent
+------------
+
+.. autoclass:: paddle.fluid.EndStepEvent
+ :members:
+ :noindex:
+
+.. _api_fluid_CheckpointConfig:
+
+CheckpointConfig
+----------------
+
+.. autoclass:: paddle.fluid.CheckpointConfig
+ :members:
+ :noindex:
+
+.. _api_fluid_Inferencer:
+
+Inferencer
+----------
+
+.. autoclass:: paddle.fluid.Inferencer
+ :members:
+ :noindex:
+
+.. _api_fluid_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+.. autoclass:: paddle.fluid.DistributeTranspiler
+ :members:
+ :noindex:
+
+.. _api_fluid_memory_optimize:
+
+memory_optimize
+---------------
+
+.. autofunction:: paddle.fluid.memory_optimize
+ :noindex:
+
+.. _api_fluid_release_memory:
+
+release_memory
+--------------
+
+.. autofunction:: paddle.fluid.release_memory
+ :noindex:
+
+.. _api_fluid_ParallelExecutor:
+
+ParallelExecutor
+----------------
+
+.. autoclass:: paddle.fluid.ParallelExecutor
+ :members:
+ :noindex:
+
+.. _api_fluid_ExecutionStrategy:
+
+ExecutionStrategy
+-----------------
+
+.. autoclass:: paddle.fluid.ExecutionStrategy
+ :members:
+ :noindex:
+
+.. _api_fluid_BuildStrategy:
+
+BuildStrategy
+-------------
+
+.. autoclass:: paddle.fluid.BuildStrategy
+ :members:
+ :noindex:
+
+.. _api_fluid_create_lod_tensor:
+
+create_lod_tensor
+-----------------
+
+.. autofunction:: paddle.fluid.create_lod_tensor
+ :noindex:
+
+.. _api_fluid_create_random_int_lodtensor:
+
+create_random_int_lodtensor
+---------------------------
+
+.. autofunction:: paddle.fluid.create_random_int_lodtensor
+ :noindex:
+
+.. _api_fluid_LoDTensor:
+
+LoDTensor
+---------
+
+.. autoclass:: paddle.fluid.LoDTensor
+ :members:
+ :noindex:
+
+.. _api_fluid_CPUPlace:
+
+CPUPlace
+--------
+
+.. autoclass:: paddle.fluid.CPUPlace
+ :members:
+ :noindex:
+
+.. _api_fluid_CUDAPlace:
+
+CUDAPlace
+---------
+
+.. autoclass:: paddle.fluid.CUDAPlace
+ :members:
+ :noindex:
+
+.. _api_fluid_CUDAPinnedPlace:
+
+CUDAPinnedPlace
+---------------
+
+.. autoclass:: paddle.fluid.CUDAPinnedPlace
+ :members:
+ :noindex:
+
+.. _api_fluid_Tensor:
+
+Tensor
+------
+
+.. autoclass:: paddle.fluid.Tensor
+ :members:
+ :noindex:
+
+.. _api_fluid_ParamAttr:
+
+ParamAttr
+---------
+
+.. autoclass:: paddle.fluid.ParamAttr
+ :members:
+ :noindex:
+
+.. _api_fluid_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+.. autoclass:: paddle.fluid.WeightNormParamAttr
+ :members:
+ :noindex:
+
+.. _api_fluid_DataFeeder:
+
+DataFeeder
+----------
+
+.. autoclass:: paddle.fluid.DataFeeder
+ :members:
+ :noindex:
+
+.. _api_fluid_Scope:
+
+Scope
+-----
+
+.. autoclass:: paddle.fluid.Scope
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py
index 89ab880301b6ac687fd61f556f87f03792c37da3..02efce2bf8392c62a7600c272bedcadc6563f927 100644
--- a/doc/fluid/api/gen_doc.py
+++ b/doc/fluid/api/gen_doc.py
@@ -29,19 +29,27 @@ def parse_arg():
class DocGenerator(object):
- def __init__(self, module_name, stream=sys.stdout):
+ def __init__(self, module_name=None, stream=sys.stdout):
+ if module_name == "":
+ module_name = None
self.stream = stream
- self.module_name = module_name
- if not hasattr(fluid, module_name):
- raise ValueError("Cannot find fluid.{0}".format(module_name))
+ if module_name is None:
+ self.module_name = "fluid"
else:
- self.module = getattr(fluid, module_name)
+ self.module_name = "fluid." + module_name
+ if module_name is None:
+ self.module = fluid
+ else:
+ if not hasattr(fluid, module_name):
+ raise ValueError("Cannot find fluid.{0}".format(module_name))
+ else:
+ self.module = getattr(fluid, module_name)
self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
''')
- self._print_header_(module_name, dot='=', is_title=True)
+ self._print_header_(self.module_name, dot='=', is_title=True)
def print_submodule(self, submodule_name):
submodule = getattr(self.module, submodule_name)
@@ -60,25 +68,29 @@ class DocGenerator(object):
self._print_header_(name, dot='=', is_title=False)
def print_item(self, name):
- item = getattr(self.module, name)
+ item = getattr(self.module, name, None)
+ if item is None:
+ return
if isinstance(item, types.TypeType):
self.print_class(name)
elif isinstance(item, types.FunctionType):
self.print_method(name)
else:
- raise RuntimeError("Unsupported item {0}".format(name))
+ pass
def print_class(self, name):
+ self._print_ref_(name)
self._print_header_(name, dot='-', is_title=False)
- self.stream.write('''.. autoclass:: paddle.fluid.{0}.{1}
+ self.stream.write('''.. autoclass:: paddle.{0}.{1}
:members:
:noindex:
'''.format(self.module_name, name))
def print_method(self, name):
+ self._print_ref_(name)
self._print_header_(name, dot='-', is_title=False)
- self.stream.write('''.. autofunction:: paddle.fluid.{0}.{1}
+ self.stream.write('''.. autofunction:: paddle.{0}.{1}
:noindex:
'''.format(self.module_name, name))
@@ -94,6 +106,10 @@ class DocGenerator(object):
self.stream.write('\n')
self.stream.write('\n')
+ def _print_ref_(self, name):
+ self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
+ self.module_name.split(".")), name))
+
def main():
args = parse_arg()
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index ba7b7ba8e51399deb852b0a7c8ddd3128f521e85..b14ee29873c50fd011f6c48b754767ac8918252a 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,7 +1,9 @@
#!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst
-for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
do
python gen_doc.py ${module} > ${module}.rst
done
+
+python gen_doc.py "" > fluid.rst
diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst
index 06c686d9508635abd41571983e00be174e94743e..359406819a993e7eaf2155c839373df44d97b103 100644
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
@@ -1,19 +1,26 @@
-======================
-Fluid
-======================
+=============
+API Reference
+=============
.. toctree::
:maxdepth: 1
+ fluid.rst
layers.rst
data_feeder.rst
executor.rst
initializer.rst
- evaluator.rst
+ metrics.rst
nets.rst
+ clip.rst
optimizer.rst
param_attr.rst
profiler.rst
regularizer.rst
io.rst
data.rst
+ transpiler.rst
+ recordio_writer.rst
+ backward.rst
+ average.rst
+ profiler.rst
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index 2f02c5de097945a45a3e053427104bd17bea1279..dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-===========
-initializer
-===========
+=================
+fluid.initializer
+=================
+
+.. _api_fluid_initializer_Constant:
Constant
--------
@@ -12,6 +14,8 @@ Constant
:members:
:noindex:
+.. _api_fluid_initializer_Uniform:
+
Uniform
-------
@@ -19,6 +23,8 @@ Uniform
:members:
:noindex:
+.. _api_fluid_initializer_Normal:
+
Normal
------
@@ -26,6 +32,8 @@ Normal
:members:
:noindex:
+.. _api_fluid_initializer_Xavier:
+
Xavier
------
@@ -33,13 +41,42 @@ Xavier
:members:
:noindex:
+.. _api_fluid_initializer_Bilinear:
+
+Bilinear
+--------
+
+.. autoclass:: paddle.fluid.initializer.Bilinear
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_MSRA:
+
MSRA
-------
+----
.. autoclass:: paddle.fluid.initializer.MSRA
:members:
:noindex:
+.. _api_fluid_initializer_force_init_on_cpu:
+
+force_init_on_cpu
+-----------------
+
+.. autofunction:: paddle.fluid.initializer.force_init_on_cpu
+ :noindex:
+
+.. _api_fluid_initializer_init_on_cpu:
+
+init_on_cpu
+-----------
+
+.. autofunction:: paddle.fluid.initializer.init_on_cpu
+ :noindex:
+
+.. _api_fluid_initializer_ConstantInitializer:
+
ConstantInitializer
-------------------
@@ -47,6 +84,8 @@ ConstantInitializer
:members:
:noindex:
+.. _api_fluid_initializer_UniformInitializer:
+
UniformInitializer
------------------
@@ -54,6 +93,8 @@ UniformInitializer
:members:
:noindex:
+.. _api_fluid_initializer_NormalInitializer:
+
NormalInitializer
-----------------
@@ -61,6 +102,8 @@ NormalInitializer
:members:
:noindex:
+.. _api_fluid_initializer_XavierInitializer:
+
XavierInitializer
-----------------
@@ -68,9 +111,21 @@ XavierInitializer
:members:
:noindex:
+.. _api_fluid_initializer_BilinearInitializer:
+
+BilinearInitializer
+-------------------
+
+.. autoclass:: paddle.fluid.initializer.BilinearInitializer
+ :members:
+ :noindex:
+
+.. _api_fluid_initializer_MSRAInitializer:
MSRAInitializer
------------------
+---------------
+
.. autoclass:: paddle.fluid.initializer.MSRAInitializer
:members:
:noindex:
+
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61..7cee0bc4d9aa2c51517d23a381f14a8f63cc3681 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-==
-io
-==
+========
+fluid.io
+========
+
+.. _api_fluid_io_save_vars:
save_vars
---------
@@ -11,51 +13,115 @@ save_vars
.. autofunction:: paddle.fluid.io.save_vars
:noindex:
+.. _api_fluid_io_save_params:
+
save_params
-----------
.. autofunction:: paddle.fluid.io.save_params
:noindex:
+.. _api_fluid_io_save_persistables:
+
save_persistables
-----------------
.. autofunction:: paddle.fluid.io.save_persistables
:noindex:
+.. _api_fluid_io_load_vars:
+
load_vars
---------
.. autofunction:: paddle.fluid.io.load_vars
:noindex:
+.. _api_fluid_io_load_params:
+
load_params
-----------
.. autofunction:: paddle.fluid.io.load_params
:noindex:
+.. _api_fluid_io_load_persistables:
+
load_persistables
-----------------
.. autofunction:: paddle.fluid.io.load_persistables
:noindex:
+.. _api_fluid_io_save_inference_model:
+
save_inference_model
--------------------
.. autofunction:: paddle.fluid.io.save_inference_model
:noindex:
+.. _api_fluid_io_load_inference_model:
+
load_inference_model
--------------------
.. autofunction:: paddle.fluid.io.load_inference_model
:noindex:
+.. _api_fluid_io_get_inference_program:
+
get_inference_program
---------------------
.. autofunction:: paddle.fluid.io.get_inference_program
:noindex:
+.. _api_fluid_io_save_checkpoint:
+
+save_checkpoint
+---------------
+
+.. autofunction:: paddle.fluid.io.save_checkpoint
+ :noindex:
+
+.. _api_fluid_io_load_checkpoint:
+
+load_checkpoint
+---------------
+
+.. autofunction:: paddle.fluid.io.load_checkpoint
+ :noindex:
+
+.. _api_fluid_io_clean_checkpoint:
+
+clean_checkpoint
+----------------
+
+.. autofunction:: paddle.fluid.io.clean_checkpoint
+ :noindex:
+
+.. _api_fluid_io_load_persist_vars_without_grad:
+
+load_persist_vars_without_grad
+------------------------------
+
+.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad
+ :noindex:
+
+.. _api_fluid_io_save_persist_vars_without_grad:
+
+save_persist_vars_without_grad
+------------------------------
+
+.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad
+ :noindex:
+
+.. _api_fluid_io_get_latest_checkpoint_serial:
+
+get_latest_checkpoint_serial
+----------------------------
+
+.. autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
+ :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index ff3c9346a2cd777a5294d536911f39de9032fe52..ecbd8191ccf5aa6046e7875fe8afa2ed0105e4a0 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -1,25 +1,31 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-======
-layers
-======
+============
+fluid.layers
+============
control_flow
============
+.. _api_fluid_layers_split_lod_tensor:
+
split_lod_tensor
----------------
.. autofunction:: paddle.fluid.layers.split_lod_tensor
:noindex:
+.. _api_fluid_layers_merge_lod_tensor:
+
merge_lod_tensor
----------------
.. autofunction:: paddle.fluid.layers.merge_lod_tensor
:noindex:
+.. _api_fluid_layers_BlockGuard:
+
BlockGuard
----------
@@ -27,6 +33,8 @@ BlockGuard
:members:
:noindex:
+.. _api_fluid_layers_BlockGuardWithCompletion:
+
BlockGuardWithCompletion
------------------------
@@ -34,12 +42,7 @@ BlockGuardWithCompletion
:members:
:noindex:
-StaticRNNMemoryLink
--------------------
-
-.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink
- :members:
- :noindex:
+.. _api_fluid_layers_WhileGuard:
WhileGuard
----------
@@ -48,6 +51,8 @@ WhileGuard
:members:
:noindex:
+.. _api_fluid_layers_While:
+
While
-----
@@ -55,23 +60,32 @@ While
:members:
:noindex:
+.. _api_fluid_layers_Switch:
+
+Switch
+------
+
+.. autoclass:: paddle.fluid.layers.Switch
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_lod_rank_table:
+
lod_rank_table
--------------
.. autofunction:: paddle.fluid.layers.lod_rank_table
:noindex:
+.. _api_fluid_layers_max_sequence_len:
+
max_sequence_len
----------------
.. autofunction:: paddle.fluid.layers.max_sequence_len
:noindex:
-topk
-----
-
-.. autofunction:: paddle.fluid.layers.topk
- :noindex:
+.. _api_fluid_layers_lod_tensor_to_array:
lod_tensor_to_array
-------------------
@@ -79,54 +93,80 @@ lod_tensor_to_array
.. autofunction:: paddle.fluid.layers.lod_tensor_to_array
:noindex:
+.. _api_fluid_layers_array_to_lod_tensor:
+
array_to_lod_tensor
-------------------
.. autofunction:: paddle.fluid.layers.array_to_lod_tensor
:noindex:
+.. _api_fluid_layers_increment:
+
increment
---------
.. autofunction:: paddle.fluid.layers.increment
:noindex:
+.. _api_fluid_layers_array_write:
+
array_write
-----------
.. autofunction:: paddle.fluid.layers.array_write
:noindex:
+.. _api_fluid_layers_create_array:
+
create_array
------------
.. autofunction:: paddle.fluid.layers.create_array
:noindex:
+.. _api_fluid_layers_less_than:
+
less_than
---------
.. autofunction:: paddle.fluid.layers.less_than
:noindex:
+.. _api_fluid_layers_equal:
+
+equal
+-----
+
+.. autofunction:: paddle.fluid.layers.equal
+ :noindex:
+
+.. _api_fluid_layers_array_read:
+
array_read
----------
.. autofunction:: paddle.fluid.layers.array_read
:noindex:
+.. _api_fluid_layers_shrink_memory:
+
shrink_memory
-------------
.. autofunction:: paddle.fluid.layers.shrink_memory
:noindex:
+.. _api_fluid_layers_array_length:
+
array_length
------------
.. autofunction:: paddle.fluid.layers.array_length
:noindex:
+.. _api_fluid_layers_IfElse:
+
IfElse
------
@@ -134,6 +174,8 @@ IfElse
:members:
:noindex:
+.. _api_fluid_layers_DynamicRNN:
+
DynamicRNN
----------
@@ -141,6 +183,8 @@ DynamicRNN
:members:
:noindex:
+.. _api_fluid_layers_ConditionalBlock:
+
ConditionalBlock
----------------
@@ -148,6 +192,8 @@ ConditionalBlock
:members:
:noindex:
+.. _api_fluid_layers_StaticRNN:
+
StaticRNN
---------
@@ -155,12 +201,16 @@ StaticRNN
:members:
:noindex:
+.. _api_fluid_layers_reorder_lod_tensor_by_rank:
+
reorder_lod_tensor_by_rank
--------------------------
.. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
:noindex:
+.. _api_fluid_layers_ParallelDo:
+
ParallelDo
----------
@@ -168,15 +218,27 @@ ParallelDo
:members:
:noindex:
+.. _api_fluid_layers_Print:
+
Print
-----
.. autofunction:: paddle.fluid.layers.Print
:noindex:
+.. _api_fluid_layers_is_empty:
+
+is_empty
+--------
+
+.. autofunction:: paddle.fluid.layers.is_empty
+ :noindex:
+
device
======
+.. _api_fluid_layers_get_places:
+
get_places
----------
@@ -186,12 +248,16 @@ get_places
io
==
+.. _api_fluid_layers_data:
+
data
----
.. autofunction:: paddle.fluid.layers.data
:noindex:
+.. _api_fluid_layers_BlockGuardServ:
+
BlockGuardServ
--------------
@@ -199,6 +265,8 @@ BlockGuardServ
:members:
:noindex:
+.. _api_fluid_layers_ListenAndServ:
+
ListenAndServ
-------------
@@ -206,86 +274,187 @@ ListenAndServ
:members:
:noindex:
+.. _api_fluid_layers_Send:
+
Send
----
.. autofunction:: paddle.fluid.layers.Send
:noindex:
+.. _api_fluid_layers_Recv:
+
+Recv
+----
+
+.. autofunction:: paddle.fluid.layers.Recv
+ :noindex:
+
+.. _api_fluid_layers_open_recordio_file:
+
+open_recordio_file
+------------------
+
+.. autofunction:: paddle.fluid.layers.open_recordio_file
+ :noindex:
+
+.. _api_fluid_layers_open_files:
+
+open_files
+----------
+
+.. autofunction:: paddle.fluid.layers.open_files
+ :noindex:
+
+.. _api_fluid_layers_read_file:
+
+read_file
+---------
+
+.. autofunction:: paddle.fluid.layers.read_file
+ :noindex:
+
+.. _api_fluid_layers_shuffle:
+
+shuffle
+-------
+
+.. autofunction:: paddle.fluid.layers.shuffle
+ :noindex:
+
+.. _api_fluid_layers_batch:
+
+batch
+-----
+
+.. autofunction:: paddle.fluid.layers.batch
+ :noindex:
+
+.. _api_fluid_layers_double_buffer:
+
+double_buffer
+-------------
+
+.. autofunction:: paddle.fluid.layers.double_buffer
+ :noindex:
+
+.. _api_fluid_layers_random_data_generator:
+
+random_data_generator
+---------------------
+
+.. autofunction:: paddle.fluid.layers.random_data_generator
+ :noindex:
+
+.. _api_fluid_layers_Preprocessor:
+
+Preprocessor
+------------
+
+.. autoclass:: paddle.fluid.layers.Preprocessor
+ :members:
+ :noindex:
+
+.. _api_fluid_layers_load:
+
+load
+----
+
+.. autofunction:: paddle.fluid.layers.load
+ :noindex:
+
nn
==
+.. _api_fluid_layers_fc:
+
fc
--
.. autofunction:: paddle.fluid.layers.fc
:noindex:
+.. _api_fluid_layers_embedding:
+
embedding
---------
.. autofunction:: paddle.fluid.layers.embedding
:noindex:
+.. _api_fluid_layers_dynamic_lstm:
+
dynamic_lstm
------------
.. autofunction:: paddle.fluid.layers.dynamic_lstm
:noindex:
+.. _api_fluid_layers_dynamic_lstmp:
+
dynamic_lstmp
-------------
.. autofunction:: paddle.fluid.layers.dynamic_lstmp
:noindex:
+.. _api_fluid_layers_dynamic_gru:
+
dynamic_gru
-----------
.. autofunction:: paddle.fluid.layers.dynamic_gru
:noindex:
+.. _api_fluid_layers_gru_unit:
+
gru_unit
--------
.. autofunction:: paddle.fluid.layers.gru_unit
:noindex:
+.. _api_fluid_layers_linear_chain_crf:
+
linear_chain_crf
----------------
.. autofunction:: paddle.fluid.layers.linear_chain_crf
:noindex:
+.. _api_fluid_layers_crf_decoding:
+
crf_decoding
------------
.. autofunction:: paddle.fluid.layers.crf_decoding
:noindex:
+.. _api_fluid_layers_cos_sim:
+
cos_sim
-------
.. autofunction:: paddle.fluid.layers.cos_sim
:noindex:
+.. _api_fluid_layers_cross_entropy:
+
cross_entropy
-------------
.. autofunction:: paddle.fluid.layers.cross_entropy
:noindex:
+.. _api_fluid_layers_square_error_cost:
+
square_error_cost
-----------------
.. autofunction:: paddle.fluid.layers.square_error_cost
:noindex:
-accuracy
---------
-
-.. autofunction:: paddle.fluid.layers.accuracy
- :noindex:
+.. _api_fluid_layers_chunk_eval:
chunk_eval
----------
@@ -293,321 +462,706 @@ chunk_eval
.. autofunction:: paddle.fluid.layers.chunk_eval
:noindex:
+.. _api_fluid_layers_sequence_conv:
+
sequence_conv
-------------
.. autofunction:: paddle.fluid.layers.sequence_conv
:noindex:
+.. _api_fluid_layers_conv2d:
+
conv2d
------
.. autofunction:: paddle.fluid.layers.conv2d
:noindex:
+.. _api_fluid_layers_conv3d:
+
+conv3d
+------
+
+.. autofunction:: paddle.fluid.layers.conv3d
+ :noindex:
+
+.. _api_fluid_layers_sequence_pool:
+
sequence_pool
-------------
.. autofunction:: paddle.fluid.layers.sequence_pool
:noindex:
+.. _api_fluid_layers_sequence_softmax:
+
+sequence_softmax
+----------------
+
+.. autofunction:: paddle.fluid.layers.sequence_softmax
+ :noindex:
+
+.. _api_fluid_layers_softmax:
+
+softmax
+-------
+
+.. autofunction:: paddle.fluid.layers.softmax
+ :noindex:
+
+.. _api_fluid_layers_pool2d:
+
pool2d
------
.. autofunction:: paddle.fluid.layers.pool2d
:noindex:
-batch_norm
-----------
+.. _api_fluid_layers_pool3d:
-.. autofunction:: paddle.fluid.layers.batch_norm
+pool3d
+------
+
+.. autofunction:: paddle.fluid.layers.pool3d
:noindex:
-layer_norm
+.. _api_fluid_layers_batch_norm:
+
+batch_norm
----------
-.. autofunction:: paddle.fluid.layers.layer_norm
+.. autofunction:: paddle.fluid.layers.batch_norm
:noindex:
+.. _api_fluid_layers_beam_search_decode:
+
beam_search_decode
------------------
.. autofunction:: paddle.fluid.layers.beam_search_decode
:noindex:
+.. _api_fluid_layers_conv2d_transpose:
+
conv2d_transpose
----------------
.. autofunction:: paddle.fluid.layers.conv2d_transpose
:noindex:
+.. _api_fluid_layers_conv3d_transpose:
+
+conv3d_transpose
+----------------
+
+.. autofunction:: paddle.fluid.layers.conv3d_transpose
+ :noindex:
+
+.. _api_fluid_layers_sequence_expand:
+
sequence_expand
---------------
.. autofunction:: paddle.fluid.layers.sequence_expand
:noindex:
+.. _api_fluid_layers_lstm_unit:
+
lstm_unit
---------
.. autofunction:: paddle.fluid.layers.lstm_unit
:noindex:
+.. _api_fluid_layers_reduce_sum:
+
reduce_sum
----------
.. autofunction:: paddle.fluid.layers.reduce_sum
:noindex:
+.. _api_fluid_layers_reduce_mean:
+
reduce_mean
-----------
.. autofunction:: paddle.fluid.layers.reduce_mean
:noindex:
+.. _api_fluid_layers_reduce_max:
+
reduce_max
----------
.. autofunction:: paddle.fluid.layers.reduce_max
:noindex:
+.. _api_fluid_layers_reduce_min:
+
reduce_min
----------
.. autofunction:: paddle.fluid.layers.reduce_min
:noindex:
+.. _api_fluid_layers_reduce_prod:
+
+reduce_prod
+-----------
+
+.. autofunction:: paddle.fluid.layers.reduce_prod
+ :noindex:
+
+.. _api_fluid_layers_sequence_first_step:
+
sequence_first_step
-------------------
.. autofunction:: paddle.fluid.layers.sequence_first_step
:noindex:
+.. _api_fluid_layers_sequence_last_step:
+
sequence_last_step
------------------
.. autofunction:: paddle.fluid.layers.sequence_last_step
:noindex:
+.. _api_fluid_layers_dropout:
+
dropout
-------
.. autofunction:: paddle.fluid.layers.dropout
:noindex:
+.. _api_fluid_layers_split:
+
split
-----
.. autofunction:: paddle.fluid.layers.split
:noindex:
+.. _api_fluid_layers_ctc_greedy_decoder:
+
ctc_greedy_decoder
------------------
.. autofunction:: paddle.fluid.layers.ctc_greedy_decoder
:noindex:
+.. _api_fluid_layers_edit_distance:
+
edit_distance
-------------
.. autofunction:: paddle.fluid.layers.edit_distance
:noindex:
+.. _api_fluid_layers_l2_normalize:
+
l2_normalize
------------
.. autofunction:: paddle.fluid.layers.l2_normalize
:noindex:
+.. _api_fluid_layers_matmul:
+
matmul
------
.. autofunction:: paddle.fluid.layers.matmul
:noindex:
+.. _api_fluid_layers_topk:
+
+topk
+----
+
+.. autofunction:: paddle.fluid.layers.topk
+ :noindex:
+
+.. _api_fluid_layers_warpctc:
+
warpctc
-------
.. autofunction:: paddle.fluid.layers.warpctc
:noindex:
+.. _api_fluid_layers_sequence_reshape:
+
sequence_reshape
----------------
.. autofunction:: paddle.fluid.layers.sequence_reshape
:noindex:
+.. _api_fluid_layers_transpose:
+
transpose
---------
.. autofunction:: paddle.fluid.layers.transpose
:noindex:
+.. _api_fluid_layers_im2sequence:
+
im2sequence
-----------
.. autofunction:: paddle.fluid.layers.im2sequence
:noindex:
+.. _api_fluid_layers_nce:
+
nce
---
.. autofunction:: paddle.fluid.layers.nce
:noindex:
+.. _api_fluid_layers_beam_search:
+
beam_search
-----------
.. autofunction:: paddle.fluid.layers.beam_search
:noindex:
+.. _api_fluid_layers_row_conv:
+
row_conv
--------
.. autofunction:: paddle.fluid.layers.row_conv
:noindex:
+.. _api_fluid_layers_multiplex:
+
multiplex
---------
.. autofunction:: paddle.fluid.layers.multiplex
:noindex:
-label_smooth
-------------
+.. _api_fluid_layers_layer_norm:
-.. autofunction:: paddle.fluid.layers.label_smooth
+layer_norm
+----------
+
+.. autofunction:: paddle.fluid.layers.layer_norm
:noindex:
-roi_pool
+.. _api_fluid_layers_softmax_with_cross_entropy:
+
+softmax_with_cross_entropy
+--------------------------
+
+.. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
+ :noindex:
+
+.. _api_fluid_layers_smooth_l1:
+
+smooth_l1
---------
-.. autofunction:: paddle.fluid.layers.roi_pool
+.. autofunction:: paddle.fluid.layers.smooth_l1
:noindex:
-
-ops
-===
+.. _api_fluid_layers_one_hot:
-mean
-----
+one_hot
+-------
-.. autofunction:: paddle.fluid.layers.mean
+.. autofunction:: paddle.fluid.layers.one_hot
:noindex:
-mul
----
+.. _api_fluid_layers_autoincreased_step_counter:
-.. autofunction:: paddle.fluid.layers.mul
+autoincreased_step_counter
+--------------------------
+
+.. autofunction:: paddle.fluid.layers.autoincreased_step_counter
:noindex:
+.. _api_fluid_layers_reshape:
+
reshape
-------
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
+.. _api_fluid_layers_lod_reset:
+
+lod_reset
+---------
+
+.. autofunction:: paddle.fluid.layers.lod_reset
+ :noindex:
+
+.. _api_fluid_layers_lrn:
+
+lrn
+---
+
+.. autofunction:: paddle.fluid.layers.lrn
+ :noindex:
+
+.. _api_fluid_layers_pad:
+
pad
---
.. autofunction:: paddle.fluid.layers.pad
:noindex:
-scale
------
+.. _api_fluid_layers_label_smooth:
-.. autofunction:: paddle.fluid.layers.scale
+label_smooth
+------------
+
+.. autofunction:: paddle.fluid.layers.label_smooth
:noindex:
-sigmoid_cross_entropy_with_logits
----------------------------------
+.. _api_fluid_layers_roi_pool:
-.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
+roi_pool
+--------
+
+.. autofunction:: paddle.fluid.layers.roi_pool
:noindex:
-elementwise_add
----------------
+.. _api_fluid_layers_dice_loss:
-.. autofunction:: paddle.fluid.layers.elementwise_add
+dice_loss
+---------
+
+.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
-elementwise_div
----------------
+.. _api_fluid_layers_image_resize:
-.. autofunction:: paddle.fluid.layers.elementwise_div
+image_resize
+------------
+
+.. autofunction:: paddle.fluid.layers.image_resize
:noindex:
+.. _api_fluid_layers_image_resize_short:
+
+image_resize_short
+------------------
+
+.. autofunction:: paddle.fluid.layers.image_resize_short
+ :noindex:
+
+.. _api_fluid_layers_resize_bilinear:
+
+resize_bilinear
+---------------
+
+.. autofunction:: paddle.fluid.layers.resize_bilinear
+ :noindex:
+
+.. _api_fluid_layers_gather:
+
+gather
+------
+
+.. autofunction:: paddle.fluid.layers.gather
+ :noindex:
+
+.. _api_fluid_layers_random_crop:
+
+random_crop
+-----------
+
+.. autofunction:: paddle.fluid.layers.random_crop
+ :noindex:
+
+.. _api_fluid_layers_mean_iou:
+
+mean_iou
+--------
+
+.. autofunction:: paddle.fluid.layers.mean_iou
+ :noindex:
+
+.. _api_fluid_layers_relu:
+
+relu
+----
+
+.. autofunction:: paddle.fluid.layers.relu
+ :noindex:
+
+.. _api_fluid_layers_log:
+
+log
+---
+
+.. autofunction:: paddle.fluid.layers.log
+ :noindex:
+
+.. _api_fluid_layers_crop:
+
+crop
+----
+
+.. autofunction:: paddle.fluid.layers.crop
+ :noindex:
+
+ops
+===
+
+.. _api_fluid_layers_mean:
+
+mean
+----
+
+.. autofunction:: paddle.fluid.layers.mean
+ :noindex:
+
+.. _api_fluid_layers_mul:
+
+mul
+---
+
+.. autofunction:: paddle.fluid.layers.mul
+ :noindex:
+
+.. _api_fluid_layers_scale:
+
+scale
+-----
+
+.. autofunction:: paddle.fluid.layers.scale
+ :noindex:
+
+.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
+
+sigmoid_cross_entropy_with_logits
+---------------------------------
+
+.. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
+ :noindex:
+
+.. _api_fluid_layers_elementwise_add:
+
+elementwise_add
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_add
+ :noindex:
+
+.. _api_fluid_layers_elementwise_div:
+
+elementwise_div
+---------------
+
+.. autofunction:: paddle.fluid.layers.elementwise_div
+ :noindex:
+
+.. _api_fluid_layers_elementwise_sub:
+
elementwise_sub
---------------
.. autofunction:: paddle.fluid.layers.elementwise_sub
:noindex:
+.. _api_fluid_layers_elementwise_mul:
+
elementwise_mul
---------------
.. autofunction:: paddle.fluid.layers.elementwise_mul
:noindex:
+.. _api_fluid_layers_elementwise_max:
+
elementwise_max
---------------
.. autofunction:: paddle.fluid.layers.elementwise_max
:noindex:
+.. _api_fluid_layers_elementwise_min:
+
elementwise_min
---------------
.. autofunction:: paddle.fluid.layers.elementwise_min
:noindex:
+.. _api_fluid_layers_elementwise_pow:
+
elementwise_pow
---------------
.. autofunction:: paddle.fluid.layers.elementwise_pow
:noindex:
+.. _api_fluid_layers_clip:
+
clip
----
.. autofunction:: paddle.fluid.layers.clip
:noindex:
+.. _api_fluid_layers_clip_by_norm:
+
clip_by_norm
------------
.. autofunction:: paddle.fluid.layers.clip_by_norm
:noindex:
-sequence_softmax
-----------------
+.. _api_fluid_layers_logical_and:
-.. autofunction:: paddle.fluid.layers.sequence_softmax
+logical_and
+-----------
+
+.. autofunction:: paddle.fluid.layers.logical_and
+ :noindex:
+
+.. _api_fluid_layers_logical_or:
+
+logical_or
+----------
+
+.. autofunction:: paddle.fluid.layers.logical_or
+ :noindex:
+
+.. _api_fluid_layers_logical_xor:
+
+logical_xor
+-----------
+
+.. autofunction:: paddle.fluid.layers.logical_xor
+ :noindex:
+
+.. _api_fluid_layers_logical_not:
+
+logical_not
+-----------
+
+.. autofunction:: paddle.fluid.layers.logical_not
+ :noindex:
+
+.. _api_fluid_layers_uniform_random_batch_size_like:
+
+uniform_random_batch_size_like
+------------------------------
+
+.. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
:noindex:
+.. _api_fluid_layers_gaussian_random:
+
+gaussian_random
+---------------
+
+.. autofunction:: paddle.fluid.layers.gaussian_random
+ :noindex:
+
+.. _api_fluid_layers_gaussian_random_batch_size_like:
+
+gaussian_random_batch_size_like
+-------------------------------
+
+.. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
+ :noindex:
+
+.. _api_fluid_layers_scatter:
+
+scatter
+-------
+
+.. autofunction:: paddle.fluid.layers.scatter
+ :noindex:
+
+.. _api_fluid_layers_sum:
+
+sum
+---
+
+.. autofunction:: paddle.fluid.layers.sum
+ :noindex:
+
+.. _api_fluid_layers_slice:
+
+slice
+-----
+
+.. autofunction:: paddle.fluid.layers.slice
+ :noindex:
+
+.. _api_fluid_layers_polygon_box_transform:
+
+polygon_box_transform
+---------------------
+
+.. autofunction:: paddle.fluid.layers.polygon_box_transform
+ :noindex:
+
+.. _api_fluid_layers_shape:
+
+shape
+-----
+
+.. autofunction:: paddle.fluid.layers.shape
+ :noindex:
+
+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+.. autofunction:: paddle.fluid.layers.iou_similarity
+ :noindex:
+
+.. _api_fluid_layers_maxout:
+
+maxout
+------
+
+.. autofunction:: paddle.fluid.layers.maxout
+ :noindex:
+
+.. _api_fluid_layers_sigmoid:
+
sigmoid
-------
.. autofunction:: paddle.fluid.layers.sigmoid
:noindex:
+.. _api_fluid_layers_logsigmoid:
+
logsigmoid
----------
.. autofunction:: paddle.fluid.layers.logsigmoid
:noindex:
+.. _api_fluid_layers_exp:
+
exp
---
.. autofunction:: paddle.fluid.layers.exp
:noindex:
-relu
-----
-
-.. autofunction:: paddle.fluid.layers.relu
- :noindex:
+.. _api_fluid_layers_tanh:
tanh
----
@@ -615,59 +1169,87 @@ tanh
.. autofunction:: paddle.fluid.layers.tanh
:noindex:
+.. _api_fluid_layers_tanh_shrink:
+
tanh_shrink
-----------
.. autofunction:: paddle.fluid.layers.tanh_shrink
:noindex:
+.. _api_fluid_layers_softshrink:
+
softshrink
----------
.. autofunction:: paddle.fluid.layers.softshrink
:noindex:
+.. _api_fluid_layers_sqrt:
+
sqrt
----
.. autofunction:: paddle.fluid.layers.sqrt
:noindex:
+.. _api_fluid_layers_abs:
+
abs
---
.. autofunction:: paddle.fluid.layers.abs
:noindex:
+.. _api_fluid_layers_ceil:
+
ceil
----
.. autofunction:: paddle.fluid.layers.ceil
:noindex:
+.. _api_fluid_layers_floor:
+
floor
-----
.. autofunction:: paddle.fluid.layers.floor
:noindex:
+.. _api_fluid_layers_cos:
+
+cos
+---
+
+.. autofunction:: paddle.fluid.layers.cos
+ :noindex:
+
+.. _api_fluid_layers_sin:
+
+sin
+---
+
+.. autofunction:: paddle.fluid.layers.sin
+ :noindex:
+
+.. _api_fluid_layers_round:
+
round
-----
.. autofunction:: paddle.fluid.layers.round
:noindex:
+.. _api_fluid_layers_reciprocal:
+
reciprocal
----------
.. autofunction:: paddle.fluid.layers.reciprocal
:noindex:
-log
----
-
-.. autofunction:: paddle.fluid.layers.log
- :noindex:
+.. _api_fluid_layers_square:
square
------
@@ -675,157 +1257,522 @@ square
.. autofunction:: paddle.fluid.layers.square
:noindex:
+.. _api_fluid_layers_softplus:
+
softplus
--------
.. autofunction:: paddle.fluid.layers.softplus
:noindex:
+.. _api_fluid_layers_softsign:
+
softsign
--------
.. autofunction:: paddle.fluid.layers.softsign
:noindex:
+.. _api_fluid_layers_brelu:
+
brelu
-----
.. autofunction:: paddle.fluid.layers.brelu
:noindex:
+.. _api_fluid_layers_leaky_relu:
+
leaky_relu
----------
.. autofunction:: paddle.fluid.layers.leaky_relu
:noindex:
+.. _api_fluid_layers_soft_relu:
+
soft_relu
---------
.. autofunction:: paddle.fluid.layers.soft_relu
:noindex:
+.. _api_fluid_layers_elu:
+
elu
---
.. autofunction:: paddle.fluid.layers.elu
:noindex:
+.. _api_fluid_layers_relu6:
+
relu6
-----
.. autofunction:: paddle.fluid.layers.relu6
:noindex:
+.. _api_fluid_layers_pow:
+
pow
---
.. autofunction:: paddle.fluid.layers.pow
:noindex:
+.. _api_fluid_layers_stanh:
+
stanh
-----
.. autofunction:: paddle.fluid.layers.stanh
:noindex:
+.. _api_fluid_layers_hard_sigmoid:
+
+hard_sigmoid
+------------
+
+.. autofunction:: paddle.fluid.layers.hard_sigmoid
+ :noindex:
+
+.. _api_fluid_layers_swish:
+
+swish
+-----
+
+.. autofunction:: paddle.fluid.layers.swish
+ :noindex:
+
+.. _api_fluid_layers_uniform_random:
+
+uniform_random
+--------------
+
+.. autofunction:: paddle.fluid.layers.uniform_random
+ :noindex:
+
+.. _api_fluid_layers_hard_shrink:
+
hard_shrink
-----------
.. autofunction:: paddle.fluid.layers.hard_shrink
:noindex:
+.. _api_fluid_layers_cumsum:
+
+cumsum
+------
+
+.. autofunction:: paddle.fluid.layers.cumsum
+ :noindex:
+
+.. _api_fluid_layers_thresholded_relu:
+
thresholded_relu
----------------
.. autofunction:: paddle.fluid.layers.thresholded_relu
:noindex:
-hard_sigmoid
-------------
+tensor
+======
-.. autofunction:: paddle.fluid.layers.hard_sigmoid
+.. _api_fluid_layers_create_tensor:
+
+create_tensor
+-------------
+
+.. autofunction:: paddle.fluid.layers.create_tensor
:noindex:
-swish
+.. _api_fluid_layers_create_parameter:
+
+create_parameter
+----------------
+
+.. autofunction:: paddle.fluid.layers.create_parameter
+ :noindex:
+
+.. _api_fluid_layers_create_global_var:
+
+create_global_var
+-----------------
+
+.. autofunction:: paddle.fluid.layers.create_global_var
+ :noindex:
+
+.. _api_fluid_layers_cast:
+
+cast
+----
+
+.. autofunction:: paddle.fluid.layers.cast
+ :noindex:
+
+.. _api_fluid_layers_concat:
+
+concat
+------
+
+.. autofunction:: paddle.fluid.layers.concat
+ :noindex:
+
+.. _api_fluid_layers_sums:
+
+sums
+----
+
+.. autofunction:: paddle.fluid.layers.sums
+ :noindex:
+
+.. _api_fluid_layers_assign:
+
+assign
+------
+
+.. autofunction:: paddle.fluid.layers.assign
+ :noindex:
+
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
+fill_constant_batch_size_like
+-----------------------------
+
+.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+ :noindex:
+
+.. _api_fluid_layers_fill_constant:
+
+fill_constant
+-------------
+
+.. autofunction:: paddle.fluid.layers.fill_constant
+ :noindex:
+
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+.. autofunction:: paddle.fluid.layers.argmin
+ :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+.. autofunction:: paddle.fluid.layers.argmax
+ :noindex:
+
+.. _api_fluid_layers_argsort:
+
+argsort
+-------
+
+.. autofunction:: paddle.fluid.layers.argsort
+ :noindex:
+
+.. _api_fluid_layers_ones:
+
+ones
+----
+
+.. autofunction:: paddle.fluid.layers.ones
+ :noindex:
+
+.. _api_fluid_layers_zeros:
+
+zeros
-----
-.. autofunction:: paddle.fluid.layers.swish
+.. autofunction:: paddle.fluid.layers.zeros
+ :noindex:
+
+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+.. autofunction:: paddle.fluid.layers.reverse
+ :noindex:
+
+learning_rate_scheduler
+=======================
+
+.. _api_fluid_layers_exponential_decay:
+
+exponential_decay
+-----------------
+
+.. autofunction:: paddle.fluid.layers.exponential_decay
+ :noindex:
+
+.. _api_fluid_layers_natural_exp_decay:
+
+natural_exp_decay
+-----------------
+
+.. autofunction:: paddle.fluid.layers.natural_exp_decay
+ :noindex:
+
+.. _api_fluid_layers_inverse_time_decay:
+
+inverse_time_decay
+------------------
+
+.. autofunction:: paddle.fluid.layers.inverse_time_decay
+ :noindex:
+
+.. _api_fluid_layers_polynomial_decay:
+
+polynomial_decay
+----------------
+
+.. autofunction:: paddle.fluid.layers.polynomial_decay
+ :noindex:
+
+.. _api_fluid_layers_piecewise_decay:
+
+piecewise_decay
+---------------
+
+.. autofunction:: paddle.fluid.layers.piecewise_decay
+ :noindex:
+
+.. _api_fluid_layers_noam_decay:
+
+noam_decay
+----------
+
+.. autofunction:: paddle.fluid.layers.noam_decay
+ :noindex:
+
+.. _api_fluid_layers_append_LARS:
+
+append_LARS
+-----------
+
+.. autofunction:: paddle.fluid.layers.append_LARS
+ :noindex:
+
+detection
+=========
+
+.. _api_fluid_layers_prior_box:
+
+prior_box
+---------
+
+.. autofunction:: paddle.fluid.layers.prior_box
+ :noindex:
+
+.. _api_fluid_layers_multi_box_head:
+
+multi_box_head
+--------------
+
+.. autofunction:: paddle.fluid.layers.multi_box_head
+ :noindex:
+
+.. _api_fluid_layers_bipartite_match:
+
+bipartite_match
+---------------
+
+.. autofunction:: paddle.fluid.layers.bipartite_match
+ :noindex:
+
+.. _api_fluid_layers_target_assign:
+
+target_assign
+-------------
+
+.. autofunction:: paddle.fluid.layers.target_assign
+ :noindex:
+
+.. _api_fluid_layers_detection_output:
+
+detection_output
+----------------
+
+.. autofunction:: paddle.fluid.layers.detection_output
+ :noindex:
+
+.. _api_fluid_layers_ssd_loss:
+
+ssd_loss
+--------
+
+.. autofunction:: paddle.fluid.layers.ssd_loss
+ :noindex:
+
+.. _api_fluid_layers_detection_map:
+
+detection_map
+-------------
+
+.. autofunction:: paddle.fluid.layers.detection_map
+ :noindex:
+
+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+.. autofunction:: paddle.fluid.layers.iou_similarity
+ :noindex:
+
+.. _api_fluid_layers_box_coder:
+
+box_coder
+---------
+
+.. autofunction:: paddle.fluid.layers.box_coder
+ :noindex:
+
+metric_op
+=========
+
+.. _api_fluid_layers_accuracy:
+
+accuracy
+--------
+
+.. autofunction:: paddle.fluid.layers.accuracy
+ :noindex:
+
+.. _api_fluid_layers_auc:
+
+auc
+---
+
+.. autofunction:: paddle.fluid.layers.auc
:noindex:
tensor
======
+.. _api_fluid_layers_create_tensor:
+
create_tensor
-------------
.. autofunction:: paddle.fluid.layers.create_tensor
:noindex:
+.. _api_fluid_layers_create_parameter:
+
create_parameter
----------------
.. autofunction:: paddle.fluid.layers.create_parameter
:noindex:
+.. _api_fluid_layers_create_global_var:
+
create_global_var
-----------------
.. autofunction:: paddle.fluid.layers.create_global_var
:noindex:
+.. _api_fluid_layers_cast:
+
cast
----
.. autofunction:: paddle.fluid.layers.cast
:noindex:
+.. _api_fluid_layers_concat:
+
concat
------
.. autofunction:: paddle.fluid.layers.concat
:noindex:
+.. _api_fluid_layers_sums:
+
sums
----
.. autofunction:: paddle.fluid.layers.sums
:noindex:
+.. _api_fluid_layers_assign:
+
assign
------
.. autofunction:: paddle.fluid.layers.assign
:noindex:
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
fill_constant_batch_size_like
-----------------------------
.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
:noindex:
+.. _api_fluid_layers_fill_constant:
+
fill_constant
-------------
.. autofunction:: paddle.fluid.layers.fill_constant
:noindex:
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+.. autofunction:: paddle.fluid.layers.argmin
+ :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+.. autofunction:: paddle.fluid.layers.argmax
+ :noindex:
+
+.. _api_fluid_layers_ones:
+
ones
----
.. autofunction:: paddle.fluid.layers.ones
:noindex:
+.. _api_fluid_layers_zeros:
+
zeros
-----
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
-topk
-----
+.. _api_fluid_layers_reverse:
-.. autofunction:: paddle.fluid.layers.topk
+reverse
+-------
+
+.. autofunction:: paddle.fluid.layers.reverse
:noindex:
+.. _api_fluid_layers_rank_loss:
+
+rank_loss
+-------
+
+.. autofunction:: paddle.fluid.layers.rank_loss
+ :noindex:
diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f54b2e2eb7ead353215c5dbd529293794e37123
--- /dev/null
+++ b/doc/fluid/api/metrics.rst
@@ -0,0 +1,88 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.metrics
+=============
+
+.. _api_fluid_metrics_MetricBase:
+
+MetricBase
+----------
+
+.. autoclass:: paddle.fluid.metrics.MetricBase
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_CompositeMetric:
+
+CompositeMetric
+---------------
+
+.. autoclass:: paddle.fluid.metrics.CompositeMetric
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Precision:
+
+Precision
+---------
+
+.. autoclass:: paddle.fluid.metrics.Precision
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Recall:
+
+Recall
+------
+
+.. autoclass:: paddle.fluid.metrics.Recall
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Accuracy:
+
+Accuracy
+--------
+
+.. autoclass:: paddle.fluid.metrics.Accuracy
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_ChunkEvaluator:
+
+ChunkEvaluator
+--------------
+
+.. autoclass:: paddle.fluid.metrics.ChunkEvaluator
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_EditDistance:
+
+EditDistance
+------------
+
+.. autoclass:: paddle.fluid.metrics.EditDistance
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_DetectionMAP:
+
+DetectionMAP
+------------
+
+.. autoclass:: paddle.fluid.metrics.DetectionMAP
+ :members:
+ :noindex:
+
+.. _api_fluid_metrics_Auc:
+
+Auc
+---
+
+.. autoclass:: paddle.fluid.metrics.Auc
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst
index 7ae3187304f386a08c5cb8a4ba093423a58a7f36..059733af18517257b6821d95fd628a9e13e6e98e 100644
--- a/doc/fluid/api/nets.rst
+++ b/doc/fluid/api/nets.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-====
-nets
-====
+==========
+fluid.nets
+==========
+
+.. _api_fluid_nets_simple_img_conv_pool:
simple_img_conv_pool
--------------------
@@ -11,18 +13,24 @@ simple_img_conv_pool
.. autofunction:: paddle.fluid.nets.simple_img_conv_pool
:noindex:
+.. _api_fluid_nets_sequence_conv_pool:
+
sequence_conv_pool
------------------
.. autofunction:: paddle.fluid.nets.sequence_conv_pool
:noindex:
+.. _api_fluid_nets_glu:
+
glu
---
.. autofunction:: paddle.fluid.nets.glu
:noindex:
+.. _api_fluid_nets_scaled_dot_product_attention:
+
scaled_dot_product_attention
----------------------------
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index 7a92caf9b7139cf091eff834dbed3586b23ac3af..8d792120f2f16a8c92606b343eb4c3d4368bed14 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-=========
-optimizer
-=========
+===============
+fluid.optimizer
+===============
+
+.. _api_fluid_optimizer_SGD:
SGD
---
@@ -12,6 +14,8 @@ SGD
:members:
:noindex:
+.. _api_fluid_optimizer_Momentum:
+
Momentum
--------
@@ -19,6 +23,8 @@ Momentum
:members:
:noindex:
+.. _api_fluid_optimizer_Adagrad:
+
Adagrad
-------
@@ -26,6 +32,8 @@ Adagrad
:members:
:noindex:
+.. _api_fluid_optimizer_Adam:
+
Adam
----
@@ -33,6 +41,8 @@ Adam
:members:
:noindex:
+.. _api_fluid_optimizer_Adamax:
+
Adamax
------
@@ -40,6 +50,8 @@ Adamax
:members:
:noindex:
+.. _api_fluid_optimizer_DecayedAdagrad:
+
DecayedAdagrad
--------------
@@ -47,6 +59,17 @@ DecayedAdagrad
:members:
:noindex:
+.. _api_fluid_optimizer_Ftrl:
+
+Ftrl
+----
+
+.. autoclass:: paddle.fluid.optimizer.Ftrl
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_SGDOptimizer:
+
SGDOptimizer
------------
@@ -54,6 +77,8 @@ SGDOptimizer
:members:
:noindex:
+.. _api_fluid_optimizer_MomentumOptimizer:
+
MomentumOptimizer
-----------------
@@ -61,6 +86,8 @@ MomentumOptimizer
:members:
:noindex:
+.. _api_fluid_optimizer_AdagradOptimizer:
+
AdagradOptimizer
----------------
@@ -68,6 +95,8 @@ AdagradOptimizer
:members:
:noindex:
+.. _api_fluid_optimizer_AdamOptimizer:
+
AdamOptimizer
-------------
@@ -75,6 +104,8 @@ AdamOptimizer
:members:
:noindex:
+.. _api_fluid_optimizer_AdamaxOptimizer:
+
AdamaxOptimizer
---------------
@@ -82,6 +113,8 @@ AdamaxOptimizer
:members:
:noindex:
+.. _api_fluid_optimizer_DecayedAdagradOptimizer:
+
DecayedAdagradOptimizer
-----------------------
@@ -89,9 +122,57 @@ DecayedAdagradOptimizer
:members:
:noindex:
+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_FtrlOptimizer:
+
+FtrlOptimizer
+-------------
+
+.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_Adadelta:
+
Adadelta
---------------
+--------
+
+.. autoclass:: paddle.fluid.optimizer.Adadelta
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_ModelAverage:
+
+ModelAverage
+------------
-.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
+.. autoclass:: paddle.fluid.optimizer.ModelAverage
:members:
:noindex:
+
+.. _api_fluid_optimizer_Optimizer:
+
+Optimizer
+---------
+
+.. autoclass:: paddle.fluid.optimizer.Optimizer
+ :members:
+ :noindex:
+
+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst
index 8e4ddb2b0492d0fcfcade199fdd6dfe43faa7075..33035bbc7ca5c8d000adeaf1cb79806a3ea64604 100644
--- a/doc/fluid/api/param_attr.rst
+++ b/doc/fluid/api/param_attr.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-==========
-param_attr
-==========
+================
+fluid.param_attr
+================
+
+.. _api_fluid_param_attr_ParamAttr:
ParamAttr
---------
@@ -12,6 +14,8 @@ ParamAttr
:members:
:noindex:
+.. _api_fluid_param_attr_WeightNormParamAttr:
+
WeightNormParamAttr
-------------------
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
index 74d102dcb0db35766c34e3d14939a8aa5861686b..c750a2d588df56728ac7f73051ab7a9e44dee232 100644
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-========
-profiler
-========
+==============
+fluid.profiler
+==============
+
+.. _api_fluid_profiler_cuda_profiler:
cuda_profiler
-------------
@@ -11,15 +13,35 @@ cuda_profiler
.. autofunction:: paddle.fluid.profiler.cuda_profiler
:noindex:
+.. _api_fluid_profiler_reset_profiler:
+
reset_profiler
--------------
.. autofunction:: paddle.fluid.profiler.reset_profiler
:noindex:
+.. _api_fluid_profiler_profiler:
+
profiler
--------
.. autofunction:: paddle.fluid.profiler.profiler
:noindex:
+.. _api_fluid_profiler_start_profiler:
+
+start_profiler
+--------------
+
+.. autofunction:: paddle.fluid.profiler.start_profiler
+ :noindex:
+
+.. _api_fluid_profiler_stop_profiler:
+
+stop_profiler
+-------------
+
+.. autofunction:: paddle.fluid.profiler.stop_profiler
+ :noindex:
+
diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f0c12fd115478a29fbd178b533b7490b2f663717
--- /dev/null
+++ b/doc/fluid/api/recordio_writer.rst
@@ -0,0 +1,23 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=====================
+fluid.recordio_writer
+=====================
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
+
+convert_reader_to_recordio_file
+-------------------------------
+
+.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
+ :noindex:
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
+
+convert_reader_to_recordio_files
+--------------------------------
+
+.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
+ :noindex:
+
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
index 837c67111c6e98e6a3859be802addc20a1c64f2b..987eaea903520d91c284c8da7a8cb066a1648069 100644
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
@@ -1,9 +1,11 @@
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-===========
-regularizer
-===========
+=================
+fluid.regularizer
+=================
+
+.. _api_fluid_regularizer_append_regularization_ops:
append_regularization_ops
-------------------------
@@ -11,6 +13,8 @@ append_regularization_ops
.. autofunction:: paddle.fluid.regularizer.append_regularization_ops
:noindex:
+.. _api_fluid_regularizer_L1Decay:
+
L1Decay
-------
@@ -18,6 +22,8 @@ L1Decay
:members:
:noindex:
+.. _api_fluid_regularizer_L2Decay:
+
L2Decay
-------
@@ -25,16 +31,21 @@ L2Decay
:members:
:noindex:
+.. _api_fluid_regularizer_L1DecayRegularizer:
+
L1DecayRegularizer
----------------------
+------------------
.. autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
:members:
:noindex:
+.. _api_fluid_regularizer_L2DecayRegularizer:
+
L2DecayRegularizer
----------------------
+------------------
.. autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
:members:
:noindex:
+
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d2ac04f1449c32cb414cea1b76d7469bbe9ccb85
--- /dev/null
+++ b/doc/fluid/api/transpiler.rst
@@ -0,0 +1,59 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+================
+fluid.transpiler
+================
+
+.. _api_fluid_transpiler_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
+ :members:
+ :noindex:
+
+.. _api_fluid_transpiler_InferenceTranspiler:
+
+InferenceTranspiler
+-------------------
+
+.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
+ :members:
+ :noindex:
+
+.. _api_fluid_transpiler_memory_optimize:
+
+memory_optimize
+---------------
+
+.. autofunction:: paddle.fluid.transpiler.memory_optimize
+ :noindex:
+
+.. _api_fluid_transpiler_release_memory:
+
+release_memory
+--------------
+
+.. autofunction:: paddle.fluid.transpiler.release_memory
+ :noindex:
+
+.. _api_fluid_transpiler_HashName:
+
+HashName
+--------
+
+.. autoclass:: paddle.fluid.transpiler.HashName
+ :members:
+ :noindex:
+
+.. _api_fluid_transpiler_RoundRobin:
+
+RoundRobin
+----------
+
+.. autoclass:: paddle.fluid.transpiler.RoundRobin
+ :members:
+ :noindex:
+
diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png
new file mode 120000
index 0000000000000000000000000000000000000000..c3eb1457acc77cab9360e654240d1e8f548035b4
--- /dev/null
+++ b/doc/fluid/build_and_install/paddleci.png
@@ -0,0 +1 @@
+../../v2/build_and_install/paddleci.png
\ No newline at end of file
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 30bc488a18a28d349645d9d2502aae6691a69931..1f86b99e5197c3e0b85fd76fe704520ef21b06d3 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -40,7 +40,7 @@ template
class FCOp : public OperatorBase {
public:
void Run(...) {
- add(mul(Input("X"), Input("W")), Input("b");
+ add(mul(Input("X"), Input("W")), Input("b"));
}
};
REGISTER_OP(FCOp, "fc");
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index a88292e7888d0ebc64ee89ca315dfea38a12c71d..748488f6d5f2f1272e87b89047570632418da8dc 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -155,7 +155,7 @@ into offsets
3 2+3 4+5 1+9 2+10 3+12
```
-so we know that the first sentence is from word 0 to word 3, and the second sentence from work 3 to word 5.
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
Similarly, the lengths in the top level LoD
@@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows:
## Slicing of LoD Tensors
+
When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch as the **-slice**.
For example, the <2>-slice of above example is
@@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is
10 12
||
```
+
+## Length Representation vs Offset Representation
+
+The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
+Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API.
+Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
+```Python
+# length representation of lod called recursive_sequence_lengths
+recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
+# Create a LoDTensor that has the above recursive_sequence_lengths info.
+# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
+tensor = fluid.LoDTensor(lod)
+
+# Set/Change the recursive_sequence_lengths info of LoDTensor
+tensor.set_recursive_sequence_lengths([[3, 1, 2]])
+# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted
+# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
+new_recursive_seq_lens = tensor.recursive_sequence_lengths()
+```
diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778
--- /dev/null
+++ b/doc/fluid/design/concepts/python_data_feeding.md
@@ -0,0 +1,130 @@
+# Python Data Feeding
+
+In the former implementation of Paddle Fluid, there are two ways to feed data:
+
+- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.
+
+- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance.
+
+In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue in Python side and `reader_op` in C++ side reads out the data from the queue.
+
+
+## Design of LoDTensorBlockingQueue
+`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`.
+
+```C++
+class LoDTensorBlockingQueueHolder;
+
+class LoDTensorBlockingQueue {
+ friend class LoDTensorBlockingQueueHolder;
+ private:
+ // `LoDTensorBlockingQueue` can only be constructed by
+ // `LoDTensorBlockingQueueHolder::InitOnce()`
+ LoDTensorBlockingQueue(size_t capacity, const std::vector& dims);
+
+ public:
+ size_t Size() const { return queue_.Size(); } // Get the current size of the queue
+
+ size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue
+
+ void Close() { return queue_.Close(); }
+
+ bool IsClosed() const { return queue_.IsClosed(); }
+
+ // Block if Size() == Cap()
+ // Return false only when queue_.IsClosed() == true
+ bool Push(const std::vector &lod_tensor_vec);
+
+ // Block if Size() == 0.
+ // *Success == false when queue_.IsClosed() == true
+ std::vector Pop(bool *success = nullptr);
+
+ private:
+ // Use reader::BlockingQueue as the inner data structure
+ BlockingQueue> queue_;
+ std::vector dims_;
+};
+
+class LoDTensorBlockingQueueHolder {
+ public:
+ // Call the constructor of `LoDTensorBlockingQueue` to create queue_
+ // `InitOnce` can only called once, otherwise an exception would raise
+ void InitOnce(size_t capacity, const std::vector& dims) {
+ PADDLE_ENFORCE(queue_ == nullptr);
+ queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+ }
+
+ const std::shared_ptr& GetQueue() const { return queue_; }
+
+ private:
+ std::shared_ptr queue_;
+};
+```
+
+There are some major things that must be concerned:
+- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data.
+- A `Variable` of `LoDTensorBlockingQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called.
+- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.
+
+
+## Release of the GIL in pybind
+`Pybind11::gil_scoped_release` is used to release GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` method are invoked in Python side, making `LoDTensorBlockingQueue::Push()` and `Executor::Run()` run in parallel.
+
+
+## Design of PyReader
+`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
+```C++
+class PyReader : public ReaderBase {
+ public:
+ explicit PyReader(const std::shared_ptr& queue);
+
+ void ReadNext(std::vector* out) override {
+ bool success;
+ *out = queue_->Pop(&success);
+ if (!success) out->clear();
+ }
+
+ void ReInit() override { return; }
+
+ private:
+ std::shared_ptr queue_;
+};
+```
+
+
+## Design of CreatePyReaderOp
+`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
+```C++
+class CreatePyReaderOp : public framework::OperatorBase {
+ public:
+ using framework::OperatorBase::OperatorBase;
+ private:
+ void RunImpl(const framework::Scope& scope,
+ const platform::Place& dev_place) const override {
+ auto* out = scope.FindVar(Output("Out"))
+ ->template GetMutable();
+ if (out->Get() != nullptr) return;
+
+ const std::string& queue_name = Input("blocking_queue");
+ auto* queue_holder_var = scope.FindVar(queue_name);
+ PADDLE_ENFORCE(queue_holder_var != nullptr);
+ auto* queue_holder = queue_holder_var
+ ->template GetMutable();
+ out->Reset(new PyReader(queue_holder->GetQueue()));
+ }
+};
+```
+
+## Design of Python codes
+The design of Python codes are as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and init it with given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and result of the layer are both returned.
+```Python
+def py_reader(capacity, shapes):
+ queue_name = unique_name.generate("lod_tensor_blocking_queue")
+ var = global_scope().var(feeder_name) # create LoDTensorBlockingQueueHolder Variable
+ feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue
+ out = create_var()
+ create_py_reader_op_with_queue_name(
+ inputs={'blocking_queue': queue_name},
+ outputs={'Out':[out]})
+ return out, feed_queue
+```
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index 6750323c0167bf1efbde6ef4fd670e88a5aa502a..8db67f6703d142da71cf06bd4f7e2cb13556f9b0 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -35,7 +35,7 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist
## Definition of VarType
-A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
+A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
```proto
message VarDesc {
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
index 6a0835b761b69030ba30697e6e8863928efbf57f..248d2ec18dafdecac9184527638754b6ba4d85b8 100644
--- a/doc/fluid/design/dist_train/async_update.md
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -4,34 +4,37 @@
For the typical synchronous distributed training, some significant steps are as follows:
-1. A Trainer will compute the gradients and SEND them to the Parameter Server(PServer) nodes.
-1. After the PServer node received gradients came from all the Trainers, It will aggregate the
+1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes.
+1. After the PS node received gradients came from all the Trainers, It will aggregate the
gradient variables for the same parameter into one gradient variable and then apply the aggregated
gradient to the respective parameter, finally using an optimize algorithms(SGD, Monument...)
to update the parameters.
-1. The Trainer would wait for the PServers finished the optimize stage, and GET the parameters from PServer,
+1. The Trainer would wait for the PS finished the optimize stage, and GET the parameters from PS,
so all the Trainers would get the same parameters.
-In the synchronously distributed training, there should be a `Barrier` to synchronise the
-parameters after the optimizing stage. The performance of a distributed training job would
-depend on the slowest node if there were hundreds or thousands of training nodes in a
-Job, the performance of synchronously distributed training might be very poor because of
-the slow node. So this design doc would introduce an approach to implement
-*asynchronously* distributed training in PaddlePaddle Fluid.
+In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainers processes
+have completed running current mini-batch. After that, all trainers can continue to run the next
+mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends
+on the slowest node.
+
+In Asynchronous Distributed Training, we don't need to wait for a global mini-bach, the optimizer on
+the PS will run immediately when the gradient is uploaded to the PS from one trainer. This mode would
+train such models that achieve scaling, better throughput. In this design doc, we will introduce how to
+implement the Asynchronous Distributed Training base on PaddlePaddle Fluid.
## Design
-As the figure above, we describe a global view of asynchronously update process and use
+As the figure above, we describe a global view of the asynchronous update process and use
the parameter `w1` as an example to introduce the steps:
1. For each gradient variables, they may distribute on different GPU card and aggregate
them while they are all calculated.
-1. Split the gradient variable into multiple blocks according to the number of PServer
+1. Split the gradient variable into multiple blocks according to the number of PS
instances and then send them.
-1. PServer would run an `Optimize Block` using a specified optimize algorithm to update
+1. PS would run an `Optimize Block` using a specified optimize algorithm to update
the specified parameter.
-1. The trainer will fetch latest parameter from PServer before running forward Op which depends
+1. The trainer will fetch the latest parameter from PS before running forward Op which depends
on the specified parameter.
1. Broadcast the received variable into multiple GPU cards and continue to run the next
mini-batch.
@@ -40,8 +43,8 @@ mini-batch.
- For the multiple devices distributed training, we need to aggregate the gradient
variables which placed on different devices firstly and then schedule a `SendVars` Operator to
-send the gradient variables to the multiple PServer instances.
-- Schedule `FetchVars` operator to fetch the latest parameter from PServer before running
+send the gradient variables to the multiple PS instances.
+- Schedule `FetchVars` operator to fetch the latest parameter from PS before running
the forward ops.
- There could be a large number of gradient variables to be sent, so we need to use another
thread pool(IO Threadpool) whose a number of the schedulable threads is larger than the
diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md
new file mode 100644
index 0000000000000000000000000000000000000000..b8b8427811cddcddf872db5badfd37c96a76c3e3
--- /dev/null
+++ b/doc/fluid/design/dist_train/dist_train_nccl2.md
@@ -0,0 +1,35 @@
+# Distributed Training with NCCL2
+
+We design a pattern that can enable training with `ParallelExecutor` and
+use [NCCL2](https://developer.nvidia.com/nccl) as it's collective
+communication library.
+
+In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast`
+to do multi GPU training. And if we initialize NCCL2 communicators as
+ranks in a distributed environment, we can simply run the `ParallelExecutor`
+as a distributed program! The only thing that may be different than in
+the single node version is that we need to broadcast the NCCL unique ID
+to all the nodes and initialize communicators using that ID, so NCCL2
+can know each other as ranks.
+
+To achieve this feature, we introduce a new operator: `gen_nccl_id` op,
+so we are ***not*** "bind to" running NCCL2 with MPI, we can run it in
+whatever platform you like.
+
+It has two running modes:
+
+1. Generate and broadcast mode, which should be used on trainer 0;
+1. Listen and fetch mode, which should be used on trainers other than 0.
+
+In both two modes, this op can save the NCCL ID into current scope as a
+persistable variable, Then we can insert this op at the end of
+"startup program" of fluid, so that all workers can get the same ID to
+initialize NCCL communicator objects.
+
+
+
+The above figure indicates the general process when training with NCCL2
+distributed. Each trainer has the number of communicators equal to the
+number of GPUs, but the ranks should match the global ranks number: here
+we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should
+be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1.
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
index 988729138926f035750b59eb245dde82502a3ad2..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48 100644
--- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -1,6 +1,6 @@
# Design Doc: Distributed Lookup Table Operator
-A lookup table operator in PaddlePaddle where the table could be out
+A distribute lookup table operator in PaddlePaddle where the table could be out
of the memory of a computer.
## Background
@@ -24,14 +24,14 @@ memory, so we'd need a distributed storage service, which supports the
lookup of rows.
The following figure illustrates the multiplication of x with two
-non-zero elements, or say, two symbols, and a lookup table W:
+non-zero elements, or say two symbols, and a lookup table W:

### The Backward Algorithm
The backward algorithm computes W'(x) using W(x). W'(x) has the same
-scale of size as W(x) and is much smaller than W.
+the scale of size as W(x) and is much smaller than W.
To optimize W given W', we can do simple SGD update:
@@ -44,85 +44,46 @@ $$W = f(W, W')$$
The following figure illustrates the backward pass of the lookup
operator: 
-## Distributed Storage Service
-
-The forward algorithm requires a distributed storage service for W.
-The backward algorithm prefers that the storage system can apply the
-optimization algorithm on W. The following two sections describe two
-solutions -- the former doesn't require that the storage service can
-do optimization, the latter does.
-
-### Storage Service Doesn't Optimize
-
-In this design, we use highly-optimized distributed storage, e.g.,
-memcached, as the storage service, and we run the optimization
-algorithm on parameter servers of PaddlePaddle. The following figure
-illustrates the training process.
-
-
-
-
-
-Each trainer runs the forward and backward passes using their local
-data:
-
-1. In the forward pass, when a trainer runs the forward algorithm of a
- lookup operator, it retrieves W(x) from the storage service.
-1. The trainer computes W'(x) in the backward pass using W(x).
-
-During the global update process:
-
-1. Each trainer uploads its W'(x) to parameter servers.
-1. The parameter server runs the optimization algorithm, e.g., the
- Adam optimization algorithm, which requires that
- 1. The parameter server retrieves W(x) from memcached, and
- 1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j
- W'(x))$ to memcached, where $f$ denotes the optimization
- algorithm.
-
-### Storage Service Does Optimize
-
-This design is very similar to the above one, except that the
-optimization algorithm $f$ runs on the storage service.
-
-- Pro: parameter servers do not retrieve W(x) from the storage
- service, thus saves half network communication.
-- Con: the storage service needs to be able to run the optimization
- algorithm.
-
-## Conclusion
-
-Let us do the "storage service does not optimize" solution first, as a
-baseline at least, because it is easier to use a well-optimized
-distributed storage service like memcached. We can do the "storage
-service does optimize" solution later or at the same time, which, if
-implemented carefully, should have better performance than the former.
+## Distributed Lookup Table
+### Problem 1: The lookup table may be very large.
+
+ In the condition like the search engine and recommendation system, the number of feature Id may be very large, say 100,000,000,000, then for a float value lookup table of size 8, the total size of the table is:
+
+ ```
+ 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB
+ ```
+
+### Solution: Distributed storage
+
+1. Paddle use [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table, the lookup table parameter will be split to multi-machine according to the hash of the feature ID, and data will also be split and send to the same machine to prefetch the parameter.
+
+1. For common parameters, the trainer will get the whole parameter for training, but for the big lookup table, the trainer can not store the whole parameter. Because the input data feature is very sparse, every time we only need a few parameters for training, so we use `prefetch_op` to only prefetch the parameter needed to trainer.
+
+### Problem 2. The Id in the lookup table is not sure before training.
+
+ The feature Id is calculated by the hash function because the feature data source is so large, we can not get all the Id before training. So we can not initialize the table before training.
+
+### Solution: Id auto growth
+
+At the beginning of training, paddle only malloc the memory for the lookup table at parameter server side, the Id and it's value will not be initialized. During training, when a parameter server received an Id, if it is already in the lookup table, it will return the existing parameter, if the Id does not exist, paddle will add it into the lookup table and initialize the value for it.
+
+### Problem 3: parameter load and save
+
+For common parameters, paddle use trainer to save and load them. But for distributed lookup table, trainer cannot do this because it's large size.
+
+### Solution: Parameter server side save and load
+
+Paddle support parameter server side save and load for distribute lookup table. Each machine of parameter servers will only save and load part of the whole table.
+
+## Architecture
+The whole architecture of the distribute lookup table is as below:
+
+### Training steps:
+1. Read a batch of data, the data is feature ids.
+1. The input ids will be split by `split_ids_op` with the same hash function of the lookup table.
+1. The `prefetch_op` use the split result to prefetch parameters back from the lookup table.
+1. Run forward-backward to get the gradient of the lookup table.
+1. `split_ids_op` split the gradient and then use `send_op` to the parameter server.
+1. parameter server update the table with the received gradient.
+
+
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..96ca6d48f43bd9f49c6861dab006e2037873db87
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle differ
diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..afa25ab3b4e427bc595a855b12ab966478e01ed0
Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..7d2753bbb03bc28c7a0054bb0aa424deb072ffbf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.graffle differ
diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png
new file mode 100644
index 0000000000000000000000000000000000000000..da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6
Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.png differ
diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..83ef97c99efeaf27a27f93f0cd3857c0f1bc812e
--- /dev/null
+++ b/doc/fluid/design/ir/overview.md
@@ -0,0 +1,185 @@
+## Motivation
+
+There is a `gap` between the `Program` defined by
+user and the `Executable` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or distributedly.
+
+Usually, the `gap` is bridged by
+
+* A serious transformations with defined order.
+
+* These transformations usually involve
+`insert, delete, clustering, split, dependency analysis`.
+
+* Has a simple way to verify and debug each transformation.
+
+* Flexible to add, remove or customize transformations to fit
+the requirements of various algorithms (models) and hardware secenarios.
+
+Some other events also push us to a better unified pattern.
+
+* The deep learning framework is built around the concepts of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need a intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit both training, inference
+and other offline serielized model transformations.
+Learned from LLVM and other deep learning framework, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+`Node` represents an operation that performs some computation or
+a variable that is input or output of operation.
+
+`Node`s are connected to other `Node`s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to `Node` in the future if it's a
+common requirement of many other `Pass`es. Otherwise, it should live
+in a `Node` wrapper class that is private to some `Pass` or be
+a local member of a `Pass`.
+
+#### Graph
+
+`Graph` contains a list of `Node`s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+`Graph` can also contain `Attribute`s. `Attribute`s
+can be `any` thing. For example, it can be a list of "wraper"
+nodes. The `wrapper` nodes compose `Node`s and provide
+helper method for execution or transformation. `Attribute`
+can also contain other things that describe some properties of
+the `Graph` or `Graph` nodes. `Attribute` can be passed
+across `Pass`. However, it should be used with care.
+
+```cpp
+class Graph {
+ public:
+ explicit Graph(const ProgramDesc &program);
+
+ bool Has(const std::string &attr_name) const;
+
+ template
+ AttrType &Get(const std::string &attr_name) const;
+
+ template
+ void Set(const std::string &attr_name, AttrType *attr);
+ const std::unordered_set &Nodes() const;
+
+ // Create a normal variable with non-null VarDesc.
+ ir::Node *CreateVarNode(VarDesc *var_desc);
+
+ // Create a normal runnable operator with OpDesc.
+ ir::Node *CreateOpNode(OpDesc *op_desc);
+
+ // Create a control dependency var that connects 2 operations. The
+ // var doesn't hold any data. Other than that, it's no different from
+ // other var, considering dependency analysis.
+ ir::Node *CreateControlDepVar();
+
+ // A more free style way of creating a graph node. Mostly use for test
+ // or "copy" from another node. Avoid using it if possible.
+ ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
+
+ // Clear all node information of the graph and return the ownership of the
+ // nodes.
+ std::vector> ReleaseNodes();
+};
+```
+
+#### Pass
+
+`Pass` represents a transformation of `Graph`. Its input
+is a `Graph` and its output is also a `Graph`. For example,
+a `Pass` can simply print out the `Graph`. A `Pass`
+can also fuse some `Graph`'s `Node`s.
+
+```cpp
+class Pass {
+ public:
+
+ std::unique_ptr Apply(std::unique_ptr graph) const {
+ // Some correctness check.
+ auto new_graph = ApplyImpl(std::move(graph));
+ // Some correctness check.
+ return new_graph;
+ }
+
+ // Get a reference to the attributed previously set.
+ template
+ AttrType &Get(const std::string &attr_name) const;
+
+ // Set a pointer to the attribute. Pass takes ownership of the attribute.
+ template
+ void Set(const std::string &attr_name, AttrType *attr) ;
+
+ // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+ // should delete the attribute.
+ template
+ void SetNotOwned(const std::string &attr_name, AttrType *attr);
+
+ protected:
+ virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const = 0;
+};
+
+// In my_pass.cc
+class MyPass : public Pass {
+ protected:
+ std::unique_ptr ApplyImpl(std::unique_ptr graph) const override {
+ // do something.
+ return graph;
+ }
+}
+REGISTER_PASS(my_pass, MyPass)
+.RequirePassAttr("places")
+.RequireGraphAttr("dep_vars");
+
+
+// To use the pass.
+auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
+graph = my_pass->Apply(std::move(graph));
+// Note: to force link my_pass.cc, in the code:
+USE_PASS(my_pass);
+```
+
+#### Optimize
+
+`Optimize` contains a series of `Pass` with defined order.
+`Optimize` transforms a `Graph` that only contains raw
+modeling logic to a `Graph` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Pass
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+```
+// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+auto graph = Graph(program);
+graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah));
+// For more complex Pass, Optimize Process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+mem_opt_pass.SetNotOwned("optimize_level", 1);
+mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(grah));
+graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(grah));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index 265732a348ea77d21005e335390d99abcdfbd045..83af4e55485c079265d3f2b1e15070825b532c02 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -98,13 +98,13 @@ class Block(objects):
def append_operator(self, ...):
self.ops.append(Operator(self, ...))
- def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+ def _prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
self.ops.prepend(Operator(self, ...))
```
`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
-`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+`_prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
### Operator
diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md
index e6a4638d9100d9b07c3ee6b92b530a17eae1c162..bc222564e3ec28e306ca0572b6a23104f6e9cbc5 100644
--- a/doc/fluid/design/motivation/api.md
+++ b/doc/fluid/design/motivation/api.md
@@ -77,8 +77,7 @@ print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
### Example 2. Sharing Parameters between "Models"
-We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
-this example. In the following example program, `d0` and `d1`
+We use GAN in this example. In the following example program, `d0` and `d1`
correspond to the two networks in the following figure:
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index 4e1d660cef6369f04db8e1e83360f6af25259f96..ad9d0f6d3f3ad9884f108826e8410871fffd51bf 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -125,12 +125,12 @@ Compile Time -> IR -> Runtime
## Operator/OpWithKernel/OpKernel
-
+
---
## Operator
-
+
* `Operator` is the fundamental building block of the user interface.
* Operator stores input/output variable names and attributes.
@@ -141,7 +141,7 @@ Compile Time -> IR -> Runtime
## OpWithKernel/Kernel
-
+
* `OpWithKernel` inherits `Operator`.
* `OpWithKernel` contains a Kernel map.
diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
index 967317d5d2eeb818ab14faabca342cc8c4ed717e..4d2aab87b8cf30d03075e96cc4c67070efaf963a 100644
--- a/doc/fluid/design/multi_devices/kernel_selection.md
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
@@ -74,10 +74,10 @@ void OperatorWithKernel::Run(
auto kernel_type_for_var = this->GetKernelTypeForVar(...);
if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
auto* trans_var = new_scope.Var(var_name);
- auto* out = DataTransform(expected_kernel_key,
+ auto* out = TransformData(expected_kernel_key,
kernel_type_for_var,
*tensor_in);
- CopyVariableWithTensor(...);
+ SetTensorToVariable(...);
}
}
diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
index 8c1bc8f76a337006497e5ab5e5a710f9f49261b8..5e391bd62b4f4e123a9a6f35b7adf5726f205635 100644
--- a/doc/fluid/design/multi_devices/operator_kernel_type.md
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
@@ -75,7 +75,7 @@ Different layout leads to different implementation of the operator kernel. There
- The inference of Layout is at run-time, not at compile-time.
-- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
`Layout` is also defined as a enum variable:
diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md
index 6c6db08f463ae0a2b94fc4546f123a1d7c151870..97f395133b48a1d0ed5136f0ebc8720b8ca87ded 100644
--- a/doc/fluid/design/others/graph_survey.md
+++ b/doc/fluid/design/others/graph_survey.md
@@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs):
-Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null.
+Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null.
Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph.
diff --git a/doc/fluid/design/quantization/fixed_point_quantization.md b/doc/fluid/design/quantization/fixed_point_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..085352fc5614d693e63a2f7241e868a9649456af
--- /dev/null
+++ b/doc/fluid/design/quantization/fixed_point_quantization.md
@@ -0,0 +1,110 @@
+Fixed-point quantization uses lower bits, for example, 2-bit, 3-bit or 8-bit fixed point to represent weights and activations, which usually are in singe-precision float-point with 32 bits. The fixed-point representation has advantages in reducing memory bandwidth, lowering power consumption and computational resources as well as the model storage requirements. It is especially important for the inference in embedded-device deployment.
+
+According to some experiments, the apporach to quantize the model trained in float point directly works effectively on the large models, like the VGG model having many parameters. But the accuracy drops a lot for the small model. In order to improve the tradeoff between accuracy and latency, many quantized training apporaches are proposed.
+
+This document is to design a quantized training framework on Fluid. The first part will introduce how to quantize, The second part will describe the quantized training framework. The last part will illustrate how to calculate the quantization scale.
+
+
+### How to quantize
+
+There are many ways to quantize the float value to fixed-point value. For example:
+
+$$ r = min(max(x, a), b)$$
+$$ s = \frac{b - a}{n - 1} $$
+$$ q = \left \lfloor \frac{r - a}{s} \right \rceil $$
+
+where, $x$ is the float value to be quantized, $[a, b]$ is the quantization range, $a$ is the minimum value and $b$ is the maximal value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. If the quantization level is $k$, $n$ is $2^k$, for example, $k$ is 8 and $n$ is 256. $q$ is the quantized integer.
+
+
+The quantization we applied is parameterized by the number of quantization levels and maximum absolute value:
+
+$$ M = max(abs(x)) $$
+$$ q = \left \lfloor \frac{x}{M} * (n - 1) \right \rceil $$
+
+where, $x$ is the float value to be quantized, $M$ is maximum absolute value. $\left \lfloor \right \rceil$ denotes rounding to the nearest integer. For 8 bit quantization, $n=2^{8}=256$. $q$ is the quantized integer.
+
+
+Wether the *min-max* quantization or *max-abs* quantization, they also can be represent:
+
+$q = scale * r + b$
+
+We call *min-max*, *max-abs* as the quantization arguments, also call them quantization scale or quantization range.
+
+
+How to calculate the quantization scale (or maximum absolute value) for inference will be described in the last part.
+
+
+### Training Framework
+
+#### Forward pass
+
+The forward pass is simulated quantization, see Figure 1.
+
+The training framework is as following figure.
+
+
+
+Figure 1. Forward in training with simulated quantization.
+
+
+- Firstly, both input and weight will be quantized to 8-bit integers.
+- Second, do the multiplication (or convolution) operation with integers.
+- Third, dequantize the multiplication (or convolution) results to 32-bit float point.
+- Finally, do bias-addition in float type of 32 bit. Here, the bias is not quantized.
+
+For general matrix multiplication (GEMM), quantize for $X$ and $W$:
+
+$$ X_q = \left \lfloor \frac{X}{X_m} * (n - 1) \right \rceil $$
+$$ W_q = \left \lfloor \frac{W}{W_m} * (n - 1) \right \rceil $$
+
+Do GEMM:
+
+$$ Y = X_q * W_q $$
+
+
+Dequantize $Y$:
+
+$$
+\begin{align}
+Y_{dq} &=\frac{Y}{(n - 1) * (n - 1)} * X_m * W_m \\\
+ &=\frac{X_q * W_q}{(n - 1) * (n - 1)} * X_m * W_m \\\
+ &=(\frac{X_q}{n - 1} * X_m) * (\frac{W_q}{n - 1} * W_m)
+\end{align}
+$$
+
+From these formulas, dequantization also can be moved before GEMM, do dequantization for $Xq$ and $Wq$ at first, then do GEMM. The forward workflow in training is equivalent to following framework.
+
+
+
+Figure 2. Equivalent forward in training with simulated quantization.
+
+
+We use this equivalent workflow in the training. In our desigin, there is a quantization transpiler to insert the quantization operator and the de-quantization operator in the Fluid `ProgramDesc`. Since the outputs of quantization and de-quantization operator are still in floating point, they are called faked quantization and de-quantization operator. And the training framework is called simulated quantization.
+
+#### Backward pass
+
+See Figure 3. The gradients are calculated by dequantized weights and activations. All inputs and outputs are float point with 32-bit. And in the weight updating process, the gradients will be added to the original weight, not the quantized or dequantized weights.
+
+
+
+Figure 3. Backward and weight updating in training with simulated quantization.
+
+
+So the quantization transipler will change some inputs of the corresponding backward operators.
+
+### How to calculate quantization scale
+
+There are two strategies to calculate quantization scale, we call them dynamic and static strategy. The dynamic strategy calculates the quantization scale value each iteration. The static strategy keeps the quantization scale for different inputs.
+
+For weights, we apply the dynamic strategy in the training, that is to say, the quantization scale will be recalculated during each iteration until the traning is finished.
+
+For activations, the quantization scales are estimated during training, then used in inference. There are several different ways to estimate them:
+
+
+1. Calculate the mean of maximum absolute during a window.
+2. Calculate the max of maximum absolute during a window.
+3. Calculate the running mean of maximum absolute during a window, as follows:
+
+ $$ Vt = (1 - k) * V + k * V_{t-1} $$
+
+ where, $V$ is the maximum absolute value of current batch, $Vt$ is the running mean value. $k$ is a factor, such as 0.9.
diff --git a/doc/fluid/design/quantization/quantization_backward_and_optimization.png b/doc/fluid/design/quantization/quantization_backward_and_optimization.png
new file mode 100644
index 0000000000000000000000000000000000000000..84f8235ab87cb631992b691f8e05b9c0b6c93da2
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_backward_and_optimization.png differ
diff --git a/doc/fluid/design/quantization/quantization_equivalent_forward.png b/doc/fluid/design/quantization/quantization_equivalent_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..df49c864537c047c785da12d24893e54ce0a5341
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_equivalent_forward.png differ
diff --git a/doc/fluid/design/quantization/quantization_forward.png b/doc/fluid/design/quantization/quantization_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..0913f61621bb6533bcb10bd1d18120ccaaa96cff
Binary files /dev/null and b/doc/fluid/design/quantization/quantization_forward.png differ
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
index b50f18f21df0787b9761bf0935ed7f4384ff0f98..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549 100644
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -1,8 +1,9 @@
# API注释撰写标准
-- [API注释模块](#API注释模块)
-- [格式及示例](#格式及示例)
-- [完整示例](#完整示例)
+- [API注释撰写标准](#api)
+ - [API注释模块](#api)
+ - [格式及示例](#)
+ - [完整示例](#)
## API注释模块
@@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接
## 完整示例
-fc 的完整注释见[示例](src/fc.py)。
+fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
index e57072d52fd162e92a3482aef33f99ab9394c532..f175b219750d1c765a6a111c2ec3aa732fa46175 100644
--- a/doc/fluid/dev/api_doc_std_en.md
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -1,8 +1,9 @@
# API Doc Standard
-- [API Doc Structure](#API Doc Structure)
-- [Format and Examples](#Format and Examples)
-- [Complete Example](#Complete Example)
+- [API Doc Standard](#api-doc-standard)
+ - [API Doc Structure](#api-doc-structure)
+ - [Format and Examples](#format-and-examples)
+ - [Complete Example](#complete-example)
## API Doc Structure
@@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f
## Complete Example
-Complete Example of fc please see [here](src/fc.py)。
+Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 587d819f79fcf82549826359fbf04ad3af404446..ff7408111fa20a7a6a3a2fe9f9ba20835918f399 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -36,19 +36,19 @@
OpProtoMake定义 |
-`.cc`文件,Backward Op不需要定义OpProtoMake |
+.cc 文件,Backward Op不需要定义OpProtoMake |
Op定义 |
- `.cc`文件 |
+ .cc 文件 |
Kernel实现 |
- CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 |
+ CPU、CUDA共享Kernel实现在.h 文件中,否则,CPU 实现在.cc 文件中,CUDA 实现在.cu 文件中。 |
注册Op |
- Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 |
+ Op注册实现在.cc 文件;Kernel注册CPU实现在.cc 文件中,CUDA实现在.cu 文件中 |
@@ -119,10 +119,29 @@ $$Out = scale*X$$
这个例子有`AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
+### 定义GradProtoMaker类
+每个Op的必须有一个对应的GraProtoMaker,若未定制对应前向Op的GradProtoMaker,fluid提供了DefaultGradProtoMaker,默认注册会使用全部输入输出,包括Input, Output, Output@Grad等,使用不需要的变量的会造成显存浪费。
+下面示例定义了ScaleOp的GradProtoMaker。
+
+```cpp
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ std::unique_ptr Apply() const override {
+ auto *grad_op = new framework::OpDesc();
+ grad_op->SetType("scale");
+ grad_op->SetInput("X", OutputGrad("Out"));
+ grad_op->SetOutput("Out", InputGrad("X"));
+ grad_op->SetAttr("scale", GetAttr("scale"));
+ return std::unique_ptr(grad_op);
+ }
+};
+```
### 定义Operator类
-下面的点实现了MulOp的定义:
+下面实现了MulOp的定义:
```cpp
class MulOp : public framework::OperatorWithKernel {
@@ -334,3 +353,83 @@ ctest -R test_mul_op
- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OPERATOR(B, ...)`等,这将会导致单元测试出错。
- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。
- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。
+
+### PADDLE_ENFORCE使用注意
+
+实现Op时检查数据的合法性需要使用PADDLE_ENFORCE以及PADDLE_ENFORCE_EQ等宏定义,基本格式如下:
+
+```
+PADDLE_ENFORCE(表达式, 错误提示信息)
+PADDLE_ENFORCE_EQ(比较对象A, 比较对象B, 错误提示信息)
+```
+
+如果表达式为真,或者比较对象A=B,则检查通过,否则会终止程序运行,向用户反馈相应的错误提示信息。
+为了确保提示友好易懂,开发者需要注意其使用方法。
+
+#### 总体原则
+
+任何使用了PADDLE_ENFORCE与PADDLE_ENFORCE_**检查的地方,必须有详略得当的备注解释!**错误提示信息**不能为空!
+
+#### 提示信息书写标准
+
+1. [required] 哪里错了?为什么错了?
+ - 例如:`ValueError: Mismatched label shape`
+2. [optional] 期望的输入是什么样的?实际的输入是怎样的?
+ - 例如:`Expected labels dimension=1. Received 4.`
+3. [optional] 能否给出修改意见?
+ - 例如:`Suggested Fix:If your classifier expects one-hot encoding label,check your n_classes argument to the estimatorand/or the shape of your label.Otherwise, check the shape of your label.`
+
+如果并非必要或者简洁的描述即可表达清楚以上要点,根据情况书写亦可。
+
+##### FAQ 典型问题
+
+1. 无报错信息或报错信息过于简单,不能给用户提供有效的提示!
+
+问题示例1 :未写提示信息
+```
+PADDLE_ENFORCE(ctx->HasInput("X"), "");
+```
+问题示例2 :提示信息过于简单
+```
+PADDLE_ENFORCE(i != nullptr, "i must be set"); // i是什么?
+```
+
+2. 在报错信息中使用开发人员定义的变量缩写,不易理解!
+
+问题示例:
+```
+PADDLE_ENFORCE(forward_pd != nullptr,
+ "Fail to find eltwise_fwd_pd in device context"); //eltwise_fwd_pd用户可能看不懂
+```
+
+3. OP内部调用非法接口:Op内部如果出现Output = ShareDataWith(Input)
+问题示例:
+```cpp
+auto *out = ctx.Output("Out");
+auto *in = ctx.Input("X");
+out->ShareDataWith(*in);
+```
+Op内部如果出现Output = ShareDataWith(Input),相当于operator图的中有一条隐藏边,连接了Input和Output,这条边无法在图分析中表达,引发基于图优化的错误。
+
+4. OP实现的性能实践
+调用了eigen的broadcast, chop等操作,性能会比手写cuda kernel差几倍以上。此时cpu的实现可以复用eigen,gpu实现可以实现cuda kernel.
+
+
+#### OP InferShape检查提示信息特别说明
+
+- 检查输入输出变量,请统一遵循以下格式
+`Input(变量名) of OP名 operator should not be null.`
+
+正确示例:
+```
+PADDLE_ENFORCE(ctx->HasInput("Input"),
+ "Input(Input) of LSTMP operator should not be null.");
+```
+
+- 反向Op的输入输出检查,要写明反向Op的名字
+
+正确示例:
+```
+PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of LoDResetGrad opreator should not be null.");
+```
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
index f989b964d6d1a329bbe31adc7ec10db017acaefa..2c1c30c1eddfde6d9a8e2637be86537c43cc1b00 100644
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -50,6 +50,33 @@ pop-up box, choose the current release branch and click "Run Build" button. You
* pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the
old version. you must change the version number before upload a new one.
+### Publish wheel Packages for MacOS
+
+You need to build the binary wheel package for MacOS before publishing, to
+make sure that the package can be used by many versions of MacOS
+(10.11, 10.12, 10.13) and different python installs (python.org, homebrew, etc.),
+you must build the package ***exactly*** following below steps:
+
+Build steps:
+
+1. install python from python.org downloads, and make sure it's currently in use
+ in your system.
+1. `export MACOSX_DEPLOYMENT_TARGET=10.11`, use `10.11` is enough for recent versions.
+1. `git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle && mkdir build && cd build`
+1. `cmake -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_SYSTEM_BLAS=OFF ..`, make sure the output of `cmake` command is using the correct python interpreter installed from python.org
+1. `make -j`
+1. `pip install delocate`
+1. `mkdir fixed_wheel && delocate-wheel -w fixed_wheel python/dist/*.whl`
+
+Then the whl under `fixed_wheel` is ready to upload.
+
+Install steps:
+
+1. run `pip install paddlepaddle...whl`
+1. find the `libpython.dylib` that are currently in use:
+ - for python.org package installs, do nothing.
+ - for other python installs, find the path of `libpython*.dylib` and `export LD_LIBRARY_PATH=you path && DYLD_LIBRARY_PATH=your path`
+
## Publish Docker Images
Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
index 75922e7d85a13e53ce94619a48d8da8b960e6c9a..56203d6fad444f61ef1be187ad0d149b2aa99ba4 100644
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -7,7 +7,7 @@
Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。
-关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
+关于Eigen Tensor模块的详细介绍请参考[Eigen文档](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
## paddle::framework::Tensor
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..79df6c59578e2acf495a3453ab61f069c3f09a49
--- /dev/null
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -0,0 +1,1819 @@
+
+# Paddle Fluid 开发者指南
+
+---
+
+### ==1==. 为什么需要 PaddlePaddle Fluid?
+
+---
+
+### 两个基础问题
+
+
+
+1. 如何描述机器学习模型和优化过程?
+ - 完备自洽,表达能力足以支持潜在出现的各种计算需求
+1. 如何充分利用资源高效计算?
+ - 支持异步设备、多卡、分布式计算
+ - 降低计算/计算优化的开发成本
+ - ……
+
+
+
+---
+
+### 如何描述模型和优化过程?
+
+
+
+
+
+
+ |
+一组连续执行的layers |
+variable和operator构成的计算图 |
+不再有模型的概念 |
+
+
+
+
+ 2013 |
+ Caffe,Theano, Torch, PaddlePaddle |
+ |
+ |
+
+
+
+ 2015 |
+ |
+ TensorFlow, MxNet, Caffe2, ONNX, n-graph |
+ |
+
+
+2016 |
+ |
+ |
+ PyTorch, TensorFlow Eager Execution, **==PaddlePaddle Fluid==** |
+
+
+
+
+
+---
+
+
+### 目标
+
+
+
+- 提高对各类机器学习任务的描述能力:能够描述潜在出现的任意机器学习模型。
+- 代码结构逻辑清晰,各模块充分解耦:内外部贡献者能够专注于自己所需的功能模块,基于框架进行再次开发。
+- 从设计上,留下技术优化的空间和潜力。
+- 代码解耦后降低多设备支持、计算优化等的开发成本。
+- 在统一的设计理念下,实现自动可伸缩,自动容错的分布式计算。
+
+
+
+---
+
+## ==2.== Design Overview
+
+---
+
+# Fluid: 系统形态
+
+- [编译器式的执行流程,区分编译时和运行时](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+
+
+
+
+
+
+---
+
+#### 让我们在Fluid程序实例中,区分编译时和运行时
+
+---
+### Fluid 编译时
+
+
+
+- ==**定义前向计算**==
+
+ ```python
+ x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+ y_predict = fluid.layers.fc(input=x, size=1, act=None)
+ y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+ cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+ avg_cost = fluid.layers.mean(x=cost)
+ ```
+
+- ==**添加反向、正则、优化**==
+ ```python
+ learning_rate = 0.01
+ sgd_optimizer = fluid.optimizer.SGD(learning_rate)
+ sgd_optimizer.minimize(avg_cost)
+ ```
+
+
+---
+
+### `Program` vs. 计算图
+
+
+
+- 在科学计算领域,计算图是一种描述计算的经典方式。下图展示了从前向计算图(蓝色)开始,通过添加反向(红色)和优化算法相关(绿色)操作,构建出整个计算图的过程:
+-
+
+
+
+
+
+- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成,相关概念会在后文详细展开。
+- 编译时 Fluid 接受前向计算(这里可以先简单的理解为是一段有序的计算流)`Program`,为这段前向计算按照:前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序,添加相关 `Operator`和`Variable`到`Program`到完整的计算。
+
+
+
+---
+
+### Fluid 运行时
+
+
+
+- ==**读入数据**==
+
+ ```python
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+ batch_size=20)
+ feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+ ```
+- ==**定义执行程序的设备**==
+ ```python
+ place = fluid.CPUPlace()
+ feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
+ ```
+
+- ==创建执行器(Executor),执行初始化 `Program`和训练`Program`==
+
+ ```python
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ PASS_NUM = 100
+ for pass_id in range(PASS_NUM):
+ for data in train_reader():
+ avg_loss_value, = exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+ print(avg_loss_value)
+ ```
+
+
+---
+
+### 总结:框架做什么?用户做什么?
+
+
+
+
+
+
+构建训练 |
+执行训练 |
+
+
+
+
+
+用户:描述前向运算 框架:添加反向运算 框架:添加优化运算 框架:添加内存优化 框架:添加并行/多设备/分布式相关的计算单元
+ |
+
+
+框架:创建Operator(计算)+ Variable(数据) 框架:创建`Block` 框架:内存管理/设备管理 框架:执行计算
+ |
+
+
+
+
+
+---
+
+### 总结:编译时
+
+
+**用户编写一段Python程序,描述模型的前向计算**
+1. 创建变量描述 `VarDesc`
+1. 创建operators的描述 `OpDesc`
+1. 创建operators的属性
+1. 推断变量的类型和形状,进行静态检查:`inferShape`
+1. 规划变量的内存复用
+1. 创建反向计算
+1. 添加优化相关的Operators
+1. (可选)添加多卡/多机相关的Operator,生成在多卡/多机上运行的程序
+
+
+
+---
+
+### 总结:运行时
+
+
+**执行规划好的计算**
+1. 创建`Executor`
+1. 为将要执行的一段计算,在层级式的`Scope`空间中创建`Scope`
+1. 创建`Block`,依次执行`Block`
+
+
+
+ Figure. 编译时运行时概览
+
+
+
+
+---
+
+## ==3==. 用户如何描述计算?
+---
+
+### Fluid:==像写程序一样==定义计算
+
+
+- 顺序执行
+ ```python
+ x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+ y_predict = fluid.layers.fc(input=x, size=1, act=None)
+ y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+ cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+ ```
+
+- 条件分支: [swith](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)、[ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
+
+ ```python
+ a = fluid.Var(10)
+ b = fluid.Var(0)
+
+ switch = fluid.switch()
+ with switch.block():
+ with switch.case(fluid.less_equal(a, 10)):
+ fluid.print("Case 1")
+ with switch.case(fluid.larger(a, 0)):
+ fluid.print("Case 2")
+ with switch.default():
+ fluid.print("Case 3")
+ ```
+
+>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
+
+
+
+---
+
+### Fluid: ==像写程序一样==定义计算
+
+
+
+- 循环:[while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+ ```python
+ d0 = layers.data("d0", shape=[10], dtype='float32')
+ data_array = layers.array_write(x=d0, i=i)
+ array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
+
+ cond = layers.less_than(x=i, y=array_len)
+ while_op = layers.While(cond=cond)
+ with while_op.block():
+ d = layers.array_read(array=data_array, i=i)
+ i = layers.increment(x=i, in_place=True)
+ layers.array_write(result, i=i, array=d)
+ layers.less_than(x=i, y=array_len, cond=cond)
+ ```
+
+- 完整实例请点查看 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
+- beam search [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+
+
+---
+
+#### 总结
+
+
+
+1. 用户层提供的描述语法具有完备性、自洽性,有能力支持对复杂计算过程描述
+1. 使用方式和核心概念可以类比编程语言,认知能够直接迁移
+1. 能够支持:定义问题,逐步求解
+
+
+
+---
+
+## ==3.== 核心概念
+
+---
+### 编译时概念 :==变量和计算的描述==
+
+
+
+- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
+ - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
+
+- 什么是 Fluid Program
+
+ - 在Fluid中,一个神经网络任务(训练/预测)被描述为一段`Program`
+ - `Program`包含对`Variable`(数据)和 `Operator`(对数据的操作)的描述
+ - `Variable` 和 `Operator` 被组织为多个可以嵌套的`Block`,构成一段完整的`Fluid Program`
+
+
+>编译阶段最终,经过 Transpiler 的执行规划,变换处理,生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行
+
+
+
+---
+
+### 编译时概念 :==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
+
+
+1. 接受一段`ProgramDesc`作为输入,生成一段新的`ProgramDesc`
+
+ - *Memory optimization transpiler*:向原始`ProgramDesc` 中插入 `FreeMemoryOps`,在一次迭代优化结束前提前释放内存,使得能够维持较小的 memory footprint
+
+ - *Distributed training transpiler*:将原始的`ProgramDesc`中转化为对应的分布式版本,生成两段新的`ProgramDesc`:
+ 1. trainer进程执行的`ProgramDesc`
+ 1. parameter server执行的`ProgramDesc`
+
+1. ==**WIP**==: 接受一段`ProgramDesc`,生成可直接被`gcc`, `nvcc`, `icc`等编译的代码,编译后得到可执行文件
+
+
+
+---
+### Transplier
+
+
+
+
+
+---
+
+### 打印 `ProgramDesc`
+
+
+
+
+
+
+
+- `default_startup_program`:创建可学习参数,对参数进行初始化
+- `default_main_program`:由用户定义的模型,包括了前向、反向、优化及所有必要的计算
+
+- 打印可读的 `Program`
+ ```python
+ from paddle.v2.fluid import debuger
+ print debuger.pprint_program_codes(framework.default_main_program().desc)
+ ```
+
+
+---
+### 输出效果
+
+
+
+
+
+variable in block 0 |
+variable in block 0 |
+
+
+
+ |
+ |
+
+
+
+
+
+---
+
+### 运行时概念
+
+
+
+- 数据相关
+ - `Tensor` / `LoDTensor` / `Variable`
+ - `Scope`
+
+- 计算相关
+ - `Block`
+ - `Kernel`、`OpWithKernel`、`OpWithoutKernel`
+
+
+
+ |
+protobuf messages |
+C++ class objects |
+
+
+
+Data |
+[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107)
+ |
+[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24)
+ |
+
+
+
+Operation |
+[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35)
+ |
+[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64)
+ |
+
+
+Block |
+BlockDesc
+ |
+Block
+ |
+
+
+
+
+
+
+- 执行相关 :`Executor`
+
+
+
+---
+#### Tensor 和 LoD(Level-of-Detail) Tensor
+
+
+- Tensor 是$n$-dimensional arry的推广,LoDTensor是在Tensor基础上附加了序列信息
+- Fluid中输入、输出,网络中的可学习参数全部统一使用LoDTensor(n-dimension array)表示
+- 一个mini-batch输入数据是一个LoDTensor
+ - 在Fluid中,RNN 处理变长序列无需padding,得益于 `LoDTensor`表示
+ - 可以简单将 LoD 理解为:`std::vector>`
+ - 对非序列数据,LoD 信息为空
+
+
+
+ |
+TensorFlow |
+PaddlePaddle |
+
+
+
+RNN |
+Support
+ |
+Support
+ |
+
+
+
+recursive RNN |
+Support
+ |
+Support
+ |
+
+
+padding zeros |
+Must
+ |
+No need
+ |
+
+blob data type |
+Tensor
+ |
+LODTensor
+ |
+
+
+
+
+
+
+
+---
+#### LoD 信息实例
+
+
+
+
+
+
+
+- 图(a)的LoD 信息
+ ```cpp
+ [0, 5, 8, 10, 14]
+ ```
+- 图(b)的 LoD 信息
+ ```cpp
+ [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
+ ```
+
+
+---
+#### Tensor, Variable, Scope 之间的关系
+
+
+
+
+
+
+1. `Block` 是一个实现层的概念,不在应用层暴露给用户。目前用户无法自行创建并利用`Block`,用户能够感知的只有`Program`这个概念。
+1. 逻辑上,可以将 `Block` 类比为编程语言中的大括号:定义了一段作用域,其中运行一段代码
+1. `Executor`会为每一个`Block`创建一个`Scope`,`Block`是可嵌套的,因此`Scope`也是可嵌套的
+
+
+
+---
+### Executor
+
+
+
+
+
+接口 |
+说明 |
+
+
+
+
+
+ |
+输入 1. `ProgramDesc` 2. `Scope` 3.`block_id`
解释执行步骤 1. 创建所有 Variables 2. 逐一创建 Operator 并运行
+ |
+
+
+
+
+---
+### Operator/OpWithKernel/Kernel
+
+
+
+
+
+
+- operator 无状态,Operator的核心是==Run==方法
+- 一个operator可以注册多个kernel
+- operator 可以无 kernel:while_op 、ifelse op
+
+
+
+---
+#### Fluid Operator vs. PaddlePaddle layers
+
+
+
+
+Layer |
+Operator |
+
+
+
+
+
+ |
+
+
+ |
+
+
+
+1. 内部维护状态 2. 包含forward和backward方法 |
+1. 内部无状态 2. 只有Run方法 |
+
+
+
+
+
+
+---
+
+### ==4.== 内存管理
+
+---
+### 目标
+
+- 为异构设备提供统一的内存分配、回收接口
+- 最小化管理内存所需的时间,最小化管理开销
+- 减少内存碎片
+- 将内存管理与计算(Operators/Kernels)完全剥离
+- 统一内存管理是内存优化的基础
+
+---
+
+
+
+### Memory 接口
+
+- 内存管理模块向上层应用逻辑提供三个基础接口:
+ ```cpp
+ template
+ void* Alloc(Place place, size_t size);
+
+ template
+ void Free(Place place, void* ptr);
+
+ template
+ size_t Used(Place place);
+
+ struct Usage : public boost::static_visitor {
+ size_t operator()(const platform::CPUPlace& cpu) const;
+ size_t operator()(const platform::CUDAPlace& gpu) const;
+ };
+ ```
+- 模板参数 `Place` 指示内存分配发生的设备
+- 实现时,需特化支持的 `Place`, 提供以上三个接口的实现
+
+
+
+---
+### 代码结构
+
+
+
+内存管理模块可以理解为由以下两部分构成:
+
+1. SystemAllocator:实际从物理设备上分配、释放的内存的接口
+1. BuddyAllocator:内存管理算法
+
+
+
+---
+### System Allocator
+
+
+
+- SystemAllocator 是实现物理内存分配、回收的基类
+ - 不同设备上的内存分配和回收终将转化为标准接口调用
+ - 为不同设备实现MemoryAllocator,继承自SystemAllocator
+
+ ```cpp
+ class SystemAllocator {
+ public:
+ virtual ~SystemAllocator() {}
+ virtual void* Alloc(size_t& index, size_t size) = 0;
+ virtual void Free(void* p, size_t size, size_t index) = 0;
+ virtual bool UseGpu() const = 0;
+ };
+ ```
+
+
+---
+
+### CPU/GPU Allocator
+
+
+
+```cpp
+class CPUAllocator : public SystemAllocator {
+ public:
+ virtual void* Alloc(size_t& index, size_t size);
+ virtual void Free(void* p, size_t size, size_t index);
+ virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+ virtual void* Alloc(size_t& index, size_t size);
+ virtual void Free(void* p, size_t size, size_t index);
+ virtual bool UseGpu() const;
+ private:
+ size_t gpu_alloc_size_ = 0;
+ size_t fallback_alloc_size_ = 0;
+};
+#endif
+```
+- CPUAllocator和GPUAllocator分别继承自SystemAllocator,分别调用相应的标准库函数实现物理内存的分配和释放。
+- 一旦大块、连续的物理内存分配之后,将通过内存管理算法实现内存的按块分配、回收、重用等。
+
+
+
+---
+### CPU Allocator
+
+
+
+- CPU 内存的分配提供两种选项:
+ 1. non-pinned memory:可分页内存
+ 2. pinned memory:页锁定内存
+ - 分配过大的页锁定内存有可能因为系统可使用的分页内存减少,影响系统性能,默认CPU下分配的是可分页内存
+
+- 通过gflags进行设置一次性分配内存的大小以及是否使用页锁定内存。
+
+ ```cpp
+ DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+ "Default use 100% of CPU memory for PaddlePaddle,"
+ "reserve the rest for page tables, etc");
+ ```
+
+
+
+---
+### GPU Allocator
+
+
+
+- 通过 cudaMalloc 分配GPU显存
+- GPUAllocator::Alloc 首先会计算指定GPU device上的可用显存
+ - 如果可用显存小于请求分配大小,调用cudaMalloc进行分配
+ - 如果可用显存不足,目前会报错退出。
+- 通过gflags控制GPU下一次性分配显存的大小:
+
+ ```cpp
+ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+ "Default use 92% of GPU memory for PaddlePaddle,"
+ "reserve the rest for page tables, etc");
+ ```
+
+
+
+---
+#### 内存管理算法: [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
+
+
+
+- Memory Arena:一次性分配大块连续内存,之后会基于这块内存进行内存管理:动态分配、释放、重用内存块。
+- 伙伴内存分配:
+ - 将内存划分为 2 的幂次方个分区,使用 best-fit 方法来分配内存请求。
+ - 当释放内存时,检查 buddy 块,查看相邻的内存块是否也已被释放。如果是,将内存块合并,以最小化内存碎片。
+ - 分配的内存在物理内存的自然边界对齐,提高内存访问效率。
+ - 算法的时间效率高,单使用 best-fit 方法的缘故,会产生一定的内存浪费
+
+
+
+---
+
+### Buddy Allocator
+
+
+
+- BuddyAllocator 是一个单例,每个设备(如: GPU/CPU(0)/GPU(1)) 拥有一个BuddyAllocator
+- BuddyAllocator 内部拥有一个私有成员变量 SystemAllocator
+- 当请求的内存超过BuddyAllocator管理的空余内存时,将会调用SystemAllocator去指定的设备上分配物理内存
+
+
+
+---
+### 实例:CPU 下内存管理接口的实现
+
+
+
+- 对上层应用,统一通过BuddyAllocator来实现内存的分配、释放以及用量查询
+ ```cpp
+ template <>
+ void* Alloc(platform::CPUPlace place, size_t size) {
+ VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+ void* p = GetCPUBuddyAllocator()->Alloc(size);
+ VLOG(10) << " pointer=" << p;
+ return p;
+ }
+
+ template <>
+ void Free(platform::CPUPlace place, void* p) {
+ VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+ GetCPUBuddyAllocator()->Free(p);
+ }
+
+ template <>
+ size_t Used(platform::CPUPlace place) {
+ return GetCPUBuddyAllocator()->Used();
+ }
+ ```
+
+
+---
+### ==5.== 多设备支持
+
+---
+### 多设备支持(一)
+
+
+
+- step 1:添加Place类型,由用户实现添加到框架
+ - 可以将Place类型理解为一个整数加上一个枚举型,包括:设备号 + 设备类型
+
+
+
+
+- DeviceContext
+ - 不同的Place会对应一个相应的DeviceContext,用于组织管理与设备相关的信息
+ - 例如,GpuDeviceContext中会管理Cuda stream
+ - 目前实现中一些特殊的库也会对应有自己的DeviceContext:例如:
+ ```cpp
+ class MKLDNNDeviceContext : public CPUDeviceContext {……}
+ ```
+ - 每种设备对应的DeviceContext需要管理的内容不尽相同,视具体需求来实现
+
+
+
+---
+
+### 多设备支持(二)
+
+
+
+- step 2: 增加KernelType,为相应的KernelType注册Kernel对象,由用户实现注册给框架 可以按照:
+ 1. Place 执行设备
+ 1. DataType 执行数据类型 FP32/FP64/INT32/INT64
+ 1. Memory layout: 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC
+ 1. 使用的库
+
+ 来区分Kernel,为同一个operator注册多个 Kernel。
+
+ ```cpp
+ struct OpKernelType {
+ proto::DataType data_type_;
+ DataLayout data_layout_;
+ platform::Place place_;
+ LibraryType library_type_;
+ }
+ ```
+
+
+
+---
+
+### 多设备支持(三)
+
+
+
+step 3: 运行时的 KernelType 推断和Kernel切换,按需要修改Kernel推断和Kernel切换规则
+- Expected Kernel:期待调用的Kernel:由(1)`Place`和计算精度决定;或(2)用户在配置中显示指定使用的计算库,如`cudnn`、`mkldnn`等。
+- Actual Kernel:运行时从`Operator`的输入(`Variable`)可以推断出实际需要的`KernelType`
+- 当Expected Kernel和Actual Kernel不一致的时候,框架会插入`data_transformer`或者`data_layerout_transform`等,保证Expected Kernel可以执行,包括:
+ - CPUPlace -> GPUPlace :跨设备内存复制
+ - NCHW -> nChw8c :Layout转换
+ - FP32 -> FP16 :精度转换 _**尚未支持**_
+ - ……
+- 以上过程实现在OperatorWithKernel类的Run方法中 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
+
+
+
+---
+## ==6.== while_op
+
+---
+### while_op
+
+
+
+- 循环执行一段`Program`,直到条件operator判断循环条件不满足时终止循环
+- while_op 的特殊之处:
+ 1. while_op 没有 kernel
+ 1. while_op 拥有自己的`Block`,会形成一段嵌套的`Block`
+ 1. ==while_op 内部创建了一个 Executor,来循环执行`Block`==
+
+- while_op 输入输出 : LoDTensorArray
+ ```cpp
+ namespace paddle {
+ namespace framework {
+ using LoDTensorArray = std::vector;
+ }
+ }
+ ```
+ - 每一次循环,从原始输入中“切出”一个片段
+ - LoDTensorArray 在Python端暴露,是Fluid支持的基础数据结构之一,用户可以直接创建并使用
+
+
+
+---
+### while_op [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) 方法概览
+
+
+
+```cpp
+
+void Run(const framework::Scope &scope,
+ const platform::Place &dev_place) const override {
+ PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+ auto &cond = scope.FindVar(Input(kCondition))->Get();
+ PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+ framework::Executor executor(dev_place);
+ auto *block = Attr(kStepBlock);
+
+ auto *program = block->Program();
+ auto step_scopes =
+ scope.FindVar(Output(kStepScopes))->GetMutable();
+
+ while (cond.data()[0]) {
+ auto ¤t_scope = scope.NewScope();
+ step_scopes->push_back(¤t_scope);
+ executor.Run(*program, ¤t_scope, block->ID(),
+ false /*create_local_scope*/);
+ }
+}
+
+```
+
+
+
+---
+### while_op 的重要应用:Dynamic RNN
+
+---
+
+### 什么是 `dynamicRNN` ?
+
+
+
+
+1. 用户可以自定义在一个时间步之内的计算, 框架接受序列输入数据,在其上循环调用用户定义的单步计算
+1. 可学习参数在多个时间步之间共享
+1. `dynamicRNN` 由 `while_op` 实现
+1. 如果`dynamicRNN`中定义了`memory`,将会构成一个循环神经网络,否则其行为就等于在输入序列上循环调用预定义的单步计算
+
+
+
+---
+
+#### `dynamic RNN` 用户接口
+
+
+
+
+
+
+- `dynamicRNN` 中的重要元素
+ 1. **step input**: `dynamicRNN` 每个时间步的输入
+ 1. **step function**: 用户定义的单步计算
+ 1. **memory**: 用于形成循环连接
+ 1. **external/static memory**:单步计算的每一步都可以全部读取到的外部输入
+
+
+
+---
+
+#### dynamicRNN 中的 Memory
+
+
+
+`dynamicRNN`中`memory`的行为非常类似于 C++ 中的引用变量
+ - `memory` “指向” 一个operator的输出变量,记作: A
+ - `memory` 可以被 LoDTensor 初始化(当LoD信息为空时,为非序列,否则为序列),默认`memory`被初始化为零
+ - `memory` 在 operator A 前向计算之后,进行前向计算
+ - 当 `memory` 的前向计算会 "指向" A 的输出 LoDTensor
+ - `memory` 的输出可以是另一个 operator 的输入,于是形成了“循环”连接
+
+
+
+---
+
+### DynamicRNN 实现细节
+
+
+
+- `while_op` 无法独立构成dynamicRNN,必须和一组相关的 operator 及数据结构配合
+ - 依赖的 operators (这里仅列出最重要的,并非全部):
+ - `lod_rank_table` operator
+ - `lod_tensor_to_array` operator
+ - `array_to_lod_tensor` operator
+ - `shrink_memory` operator
+ - 依赖的数据结构
+ - `TensorArray`
+ - `LoDRankTable`
+
+- 在Fluid中,RNN接受变长序列输入,无需填充,以上数据结构和相关的operator配合工作,实现了对变长输入以batch计算
+
+
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+
+
+- 问题:
+ - RNN 可以看作是一个展开的前向网络,前向网络的深度是最长序列的长度
+ - 如果不对变长序列进行填充,将它们填充到一样长度,每个mini-batch输入将会不等长,每个样本展开长度不一致,导致前向和反向计算实现困难
+
+
+
+----
+##### 实例 :RNN encoder-decoder with attention
+
+
+
+- 以机器翻译的RNN encoder-decoder 模型(涉及了`dynamicRNN`的所有设计要素)为例,下图是 RNN encoder-decoder 的原始输入:
+
+ 
Figure. RNN encoder-decoder 原始batch 输入数据
+
+
+- source word sequences 是encoder RNN的输出,是一个LoDTensor
+- target word sequences 是look_uptable的输入,是一个LoDTensor
+- 上图中一个矩形方块是CPU/GPU内存中一片连续的内存空间,表示一个dense vector
+
+
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+
+
+1. 对一个mini batch中不等长样本进行排序,最长样本变成batch中的第一个,最短样本是batch中最后一个
+ - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table operaator`
+ - 可以将`LoDRankTable`理解为对LoDTensor中的多个序列按照长度排序LoDRankTable 存储了排序之后的index
+
+2. 构建每个时间步的batch输入:随着时间步增加,每个时间步的batch输入可能会逐渐缩小
+ - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
+3. 每个时间步输出写入一个输出 `LoDTensorArray`
+3. `dynamicRNN`循环结束后, 按照`LoDRankTable`中记录的信息对输出`LoDTensorArray`重排序,还原会原始输入顺序
+ - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
+
+
+
+---
+
+### 运行实例
+
+
+
+
+
+---
+### 运行实例
+
+
+
+
+
+
+
+- 执行到第5~7个batch时,batch size将会缩小
+
+
+
+---
+### 运行实例
+
+
+
+
+
+
+
+- 第5 ~ 7个batch时RNN的`memory`会发生什么?
+ - `memory` 指向某个operator的输出Tensor,在该operator前向计算之后,“取回”其计算结果
+ - 5 ~ 7时,遇到了序列的结束,==下一个时间步计算不再需要在已经结束的序列上展开==
+ - 在`dynamicRNN`中`shrink_memory` operator 用来缩小`memory`的batch输入
+
+
+
+---
+### 运行实例:batch 1 ~ 2
+
+
+
Figure. 第1、2个batch输入dynamicRNN的batch输入
+
+
+---
+### 运行实例:batch 3 ~ 4
+
+
+
Figure. 第3、4个batch输入dynamicRNN的batch输入
+
+
+---
+
+### 运行实例:batch 5 ~ 7
+
+
+
Figure. 第5、6、7个batch输入dynamicRNN的batch输入
+
+
+---
+### ==7.== Fluid 代码结构
+
+---
+### Fluid 代码结构
+
+
+
+
+代码结构 |
+模块结构 |
+
+
+
+
+
+
+
+
+ |
+
+
+
+
+ |
+
+
+
+
+
+---
+
+### ==8.== 文档总结
+
+---
+
+
+- 设计概览
+ - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+ - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+ - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+- 核心概念
+ - variable 描述 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
+ - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
+ - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+ - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
+ - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
+ - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
+ - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
+
+---
+
+- 重要功能模块
+ - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
+ - 内存优化 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
+ - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
+ - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
+ - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
+
+- 开发指南
+ - 支持新设硬件设备库 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
+ - 添加新的Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
+ - 添加新的Kernel [->](
+https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+
+
+
+---
+
+### ==9.== 开发指南
+
+---
+
+#### 建议开发环境:使用 Docker 编译和测试
+
+
+
+Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
+
+PaddlePaddle 在 Dockerhub 地址:[->](
+ https://hub.docker.com/r/paddlepaddle/paddle/tags/)
+
+1. 获取PaddlePaddle的Docker镜像
+ ```bash
+ docker pull paddlepaddle/paddle:latest-dev
+ ```
+
+1. 启动 docker container
+
+ ```bash
+ docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+ ```
+
+1. 进入docker container后,从源码编译,请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
+
+
+
+---
+
+### 一些说明
+
+
+
+1. PaddlePaddle的Docker镜像为了减小体积,默认没有安装vim,可以在容器中执行`apt-get install -y vim`来安装vim。
+1. 开发推荐使用tag为`latest-dev`的镜像,其中打包了所有编译依赖。`latest`及`lastest-gpu`是production镜像,主要用于运行PaddlePaddle程序。
+2. 在Docker中运行GPU程序,推荐使用nvidia-docker,[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。
+
+
+ ```bash
+ nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+ ```
+
+
+
+
+
+---
+
+### [如何贡献](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+
+
+
+- ==提交PullRequest前请务必阅读==: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+- 代码要求
+ 1. 代码注释遵守 Doxygen 的样式
+ 1. 确保编译器选项 WITH_STYLE_CHECK 已打开,并且编译能通过代码样式检查
+ 1. 所有代码必须具有单元测试,且能够通过所有单元测试
+- 使用 `pre-commit` 钩子提交Pull Request
+ 1. 帮助格式化源代码(C++,Python)
+ 1. 在提交前自动检查一些基本事宜:如每个文件只有一个 EOL,Git 中不要添加大文件等
+ 1. 安装pre-commit,并在PaddlePaddle根目录运行:
+ ```bash
+ ➜ pip install pre-commit
+ ➜ pre-commit install
+ ```
+
+
+---
+
+### 如何贡献
+
+
+
+1. 开始开发之前请先建立issue。
+ - 让其它同学知道某项工作已经有人在进行,以避免多人开发同一功能的情况。
+1. 提交PR必须关联相关的issue。做法请参考:[->](https://help.github.com/articles/closing-issues-using-keywords/)
+ - 目的:为了在提交的版本中留有记录描述这个PR是为了开发什么样的功能,为了解决什么样的问题。
+ - 当PR被merge后,关联的issue会被自动关闭。
+1. PR review 中,reviewer的每条comment都必须回复。
+ - 如修改完可直接回复:Done。
+ - 目的:review comment 中可能会有(1)询问类型的问题;(2)可以在下一个PR修改的问题;(3)comment意见不合理等。需要明确回复,以便reviewer和其他人有历史可查,便于区分是否已经进行修改,或者准备下一个PR修改,或者意见不合理可以不用进行修改。
+
+
+
+---
+
+### ==10.== 添加新的 Operator
+
+---
+
+### 概念简介
+
+
+
+添加一个新的operator,会涉及实现以下C++类的派生类:
+
+1. `framework::OperatorBase`: Operator(简写,Op)基类。
+1. `framework::OpKernel`: Op计算函数的基类,称作Kernel。
+1. `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。
+1. `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel,可以将Op分为两种:
+1. 包含Kernel的Op:继承自OperatorWithKernel,==绝大多数operator都属于这一类==
+1. 不包含kernel的Op,继承自OperatorBase,只有少量Op属于这一类,例如while_op,ifelse_op
+
+这里主要介绍带Kernel的Op如何编写。
+
+
+
+---
+
+#### 添加新的Operator需要修改/添加哪些文件?
+
+
+
+
+
+
+内容 |
+定义位置 |
+
+
+
+
+
+OpProtoMake定义
+ |
+
+`.cc`文件,Backward Op不需要OpProtoMaker
+ |
+
+
+
+Op定义
+ |
+
+`.cc`文件
+ |
+
+
+
+Kernel实现
+ |
+
+CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
+ |
+
+
+
+
+注册Op
+ |
+
+Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
+ |
+
+
+
+
+
+- 添加 Operator 之前请阅读:[Operator 命名规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md)及[Operator Markdown注释规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)。
+- 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。
+- 根据文件名自动构建op和Python端绑定,请务必遵守以上命名,否则需要进一步修改PyBind相关文件及CMakeLists.txt。
+
+
+---
+
+###### 实现带Kernel的Operator step1: 定义ProtoMaker类
+
+
+
+下面均以[clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h)为例进行介绍
+
+- clip_op计算公式:$Out = \min(\max(X, min), max)$
+- 首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释(*下面代码段的中注释进行了简化,实现时需按照规范添加注释*):
+
+ ```cpp
+ template
+ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X","(Tensor)The input of clip op.");
+ AddOutput("Out", "(Tensor),The output of clip op.");
+ AddAttr(
+ "min", "(float),Minimum value.");
+ AddAttr(
+ "max", "(float),Maximum value.");
+ AddComment(R"DOC(
+ ……
+ )DOC");
+ }
+ };
+ ```
+
+
+
+---
+
+###### 实现带Kernel的Operator step2: 定义Operator类
+
+
+
+下面的代码段实现了`clip_op`的定义:
+
+```cpp
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of ClipOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of ClipOp should not be null.");
+ auto x_dims = ctx->GetInputDim("X");
+ auto max = ctx->Attrs().Get("max");
+ auto min = ctx->Attrs().Get("min");
+ PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+ ctx->SetOutputDim("Out", x_dims);
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+};
+```
+
+
+---
+
+### Operator 类中需要完成的工作
+
+
+
+1. clip_op 继承自`OperatorWithKernel`,
+
+ ```cpp
+ using framework::OperatorWithKernel::OperatorWithKernel;
+ ```
+ 表示使用基类`OperatorWithKernel`的构造函数。
+
+1. 重写`InferShape`接口。
+ - `InferShape` 为const函数,不能修改Op的成员变
+ - `InferShape` 的参数为 `const framework::InferShapeContext &ctx`,从中可获取到输入输出以及属性
+ - `InferShape` 会被调用两次,一次是编译时(创建op),一次是运行时(调用op的`Run`方法时),需要完成以下功能:
+ 1. 做检查, 尽早报错:检查输入数据维度、类型等是否合法
+ 2. 设置输出Tensor的形状
+
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中。
+
+
+
+---
+
+### 补充说明
+
+
+
+1. `InferShape`目前支持两种实现方式,二者最后都会生成一个functor注册给OpInfo结构体。
+ 1. 继承framework::InferShapeBase,实现为一个functor(参考 [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22))
+ 2. override InferShape函数(参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24))
+
+1. 什么是`functor` ?
+
+ - 类或结构体仅重载了`()`,一般是可被多个kernel复用的计算函数。
+
+
+
+ ```cpp
+ template
+ class CrossEntropyFunctor {
+ public:
+ void operator()(const platform::CPUDeviceContext& ctx,
+ framework::Tensor* out,
+ const framework::Tensor* prob,
+ const framework::Tensor* labels, const bool softLabel) {
+ ……
+ }
+ };
+ ```
+
+
+ - 在 clip_op 内也会看到将一段计算函数抽象为functor的使用法: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。
+
+
+
+---
+
+###### 实现带Kernel的Operator step3: 定义OpKernel类
+
+
+
+- `ClipKernel`继承自`framework::OpKernel`,带有下面两个模板参数:
+ 1. `typename DeviceContext`: 表示设备类型,不同设备共享同一个Kernel时,需添加该模板参数。不共享时,需要提供针对不同设备的特化实现。
+ 1. `typename T` : 表示支持的数据类型,如`float`, `double`等
+
+- 在`ClipKernel`类中重写`Compute`方法
+ 1. `Compute`接受输入参数:`const framework::ExecutionContext& context`
+ - `ExecutionContext` 是从 `Scope`中将运行时Op的输入、输出`Variable`组织在一起,使得Op在调用`Compute`方法时,能够简单地通过名字拿到需要的输入输出`Variable`
+ - 与`InferShapeContext`相比,`ExecutionContext` 中增加了设备类型
+ 1. 在`Compute`函数里实现`OpKernel`的具体计算逻辑
+
+
+
+---
+#### ClipKernel 代码概览
+
+
+
+```cpp
+template
+class ClipKernel : public framework::OpKernel {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto max = context.Attr("max");
+ auto min = context.Attr("min");
+ auto* x = context.Input("X");
+ auto* out = context.Output("Out");
+ T* out_data = out->mutable_data(context.GetPlace());
+ const T* x_data = x->data();
+ int64_t numel = x->numel();
+ Transform trans;
+ trans(context.template device_context(), x_data,
+ x_data + numel, out_data, ClipFunctor(min, max));
+ }
+};
+```
+
+- 为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用, Fluid 使用 Eigen 作为基础的矩阵运算库
+- Fluid对Eigen unsupported Tensor提供了一些基本的封装,可以在`Compute`接口中直接调用
+ - 关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
+
+
+
+---
+###### 实现带Kernel的Operator step4: 实现反向Op
+
+
+
+- ==**反向Op没有`ProtoMaker`**==,除此之外定义与实现方式前向Op完全一致,不再赘述
+- 这里仅对反向Op的输入输出进行说明:
+ 1. 反向Op的输入
+ - 前向Op的输出
+ - 反向传播过程中传递给当前Op的梯度
+ - 需要注意,Fluid中,不区分Cost Op和中间层Op,所有Op都必须正确处理接收到的梯度
+ 2. 反向Op的输出
+ - 对可学习参数的求导结果
+ - 对所有输入的求导结果
+
+
+
+
+---
+
+###### 实现带Kernel的Operator step5: 注册Op及Kernel
+
+
+
+至此Op和Op kernel都已经实现完毕,接下来,需要在`.cc`和`cu`文件中注册op和kernel
+
+1. 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
+
+
+
+ ```cpp
+ namespace ops = paddle::operators;
+ REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad,
+ ops::ClipOpGrad);
+ REGISTER_OP_CPU_KERNEL(
+ clip, ops::ClipKernel);
+ REGISTER_OP_CPU_KERNEL(
+ clip_grad, ops::ClipGradKernel);
+ ```
+
+ - 在上面的代码片段中:
+
+ 1. `REGISTER_OP` : 注册`ops::ClipOp`类,类型名为`clip`,该类的`ProtoMaker`为`ops::ClipOpMaker`,注册`ops::ClipOpGrad`,类型名为`clip_grad`
+ 1. `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op,例如:优化算法相关的Op
+ 1. `REGISTER_OP_CPU_KERNEL` :注册`ops::ClipKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::ClipGradKernel`类
+
+
+1. 按照同样方法,在`.cu`文件中注册GPU Kernel
+ - 如果CUDA Kernel的实现基于Eigen,需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU`
+
+
+
+---
+
+##### 编译和Python端绑定
+
+
+
+- 运行下面命令可以仅编译新添加的Op:
+
+ ```
+ make mul_op
+ ```
+ - 需注意,运行单元测试需要编译整个工程
+
+- 如果遵循前文的文件命名规则,构建过程中,会自动为新增的op添加Python端绑定,并链接到生成的lib库中
+
+
+
+---
+
+###### 实现带Kernel的Operator step6: 添加前向单测及梯度检测
+
+
+
+- 新增Op的单元测试统一添加至:[python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)目录
+- 前向Operator单测
+
+ 1. Op单元测试继承自`OpTest`,各项具体的单元测试在`TestClipOp`里完成,所有单测case都以`TestXX`命名
+ 1. 单元测试Operator,需要:
+ 1. 在`setUp`函数定义输入、输出,以及相关的属性参数
+ 1. 生成随机的输入数据
+ 1. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比
+ 1. 反向梯度检测流程测试框架已经实现,直接调用相应接口`check_grad`即可
+
+- `clip_op` 单测代码请参考 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py),这里不再展开
+
+
+
+---
+#### 编译执行单测
+
+
+
+- `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译
+
+ - 运行单元测试测时需要编译整个工程,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`
+- 编译成功后,执行下面的命令来运行单元测试:
+
+ ```bash
+ make test ARGS="-R test_mul_op -V"
+ ```
+
+ 或者:
+
+ ```
+ ctest -R test_mul_op
+ ```
+
+
+---
+
+### 添加Op的一些注意事项
+
+
+
+- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,将会导致编译出错。
+- 注册Op时的类型名,需要和该Op的名字一样。不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`,会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel,不要创建空的`*_op.cu`,会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。
+
+
+
+---
+
+### ==10.== 使用相关问题
+
+---
+
+### 定义前向计算
+
+
+
+- 当在python端执行时:
+ ```python
+ import paddle.v2.fluid as fluid
+ ```
+ [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040)定义了两个全局`Program`:
+ ```python
+ # program is a global instance.
+ _main_program_ = Program()
+ _startup_program_ = Program()
+ ```
+
+- 前向定义的过程就是不断往`mian_program`中添加Op和Variable
+- 如果需要执行一个新的`mian_program`时,可以调用调用:
+ ```python
+ def switch_main_program(program):
+ """
+ Switch the main program to a new program.
+ This funtion returns the previous main program.
+ """
+ ……
+ ```
+
+
+---
+
+### 自定义参数的初始化
+
+
+
+- 调用`fluid.ParamAttr(……)`接口,自定义参数的初始化
+
+ ```python
+ w_param_attrs = ParamAttr(name=None,
+ initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+ learning_rate=1.0,
+ regularizer=L1Decay(1.0),
+ trainable=True,
+ clip=GradientClipByValue(-1.0, 1.0),
+ )
+ y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+ ```
+
+- 补充问题:如何创建 `Variable`
+ ```python
+ cur_program = Program()
+ cur_block = cur_program.current_block()
+ new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
+ ```
+
+
+
+---
+
+### 添加反向Op
+
+
+
+- 调用`fluid.backward.append_backward(X)`(`X`是一个Variable),来为一段前向`ProgramDesc`添加反Op
+
+ ```python
+ data = fluid.layers.data(name="data", shape=(2,3,4))
+ out = fluid.layers.fc(input=data,size=128,act=None)
+ loss = fluid.layers.reduce_sum(out)
+ fluid.backward.append_backward(loss=loss)
+ ```
+
+- 添加优化相关的Op
+ ```python
+ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+ sgd_optimizer.minimize(loss)
+ ```
+
+- 可以随时调用`print(fluid.default_main_program())`来输出当前的`main_program`
+
+- 当构建完成整个`Program`后,调用下面的接口执行内存优化:
+ ```python
+ fluid.memory_optimize(fluid.default_main_program())
+ ```
+ - _注:内存优化目前仍在持续开发中,有可能不够稳定。_
+
+
+
+---
+
+### 总结:编译时执行流程
+
+
+
+- 用户定义前向计算
+- 添加反向Op到`default_main_program`
+- 添加 gradient clipping Op 到
+- 添加 regularization Op 到`default_main_program`
+- 为指定的优化算法,添加相关的状态 variable of optimizer 到`default_startup_program`
+ - 状态相关 variable是指如学习率, 历史 momentum, 二阶momentum等
+- 添加初始化 variable 的Op 到 `default_startup_program`
+- 为整个网络最后一个op,添加设置其接受到的梯度的Op到`default_main_program`
+- 进行内存优化规划
+
+
+
+---
+
+### Feed 数据 (一):通过 feed 字典
+
+
+
+- 执行executor的run方法时,指定feed字典,feed op 会将指定的数据放到`x`和`y`两个Variable中
+ ```python
+ y_data = np.random.randint(0, 8, [1]).astype("int32")
+ y_tensor = core.Tensor()
+ y_tensor.set(y_data, place)
+
+ x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
+ x_tensor = core.Tensor()
+ x_tensor.set(x_data, place)
+ ……
+ cost = exe.run(
+ fluid.default_main_program(),
+ feed={'x': x_tensor,
+ 'y': y_tensor},
+ fetchlist=[avg_cost])
+ ```
+
+- 这种方法较为底层,一般用于单测中
+
+
+
+---
+
+### Feed 数据 (二):使用 DataFeeder接口
+
+
+
+- 编写一个data_reader函数,data_reader是一个Python generator
+
+ ```python
+ def demo_reader():
+ def random_generator():
+ yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
+ return random_generator
+ ```
+- 在训练任务中使用 DataFeeder 接口
+ ```python
+ cost = exe.run(
+ fluid.default_main_program(),
+ feed={'x': x_tensor,
+ 'y': y_tensor},
+ fetchlist=[avg_cost])
+
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
+ feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+ for data in train_reader():
+ cost = exe.run(
+ fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[cost])
+ ```
+
+
+
+---
+
+### 常见问题
+
+
+
+- 如何使用 evaluator ? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
+
+ ```python
+ accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+ for pass_id in range(PASS_NUM):
+ accuracy.reset()
+ for data in train_reader():
+ loss, acc = exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost] + accuracy.metrics)
+ pass_acc = accuracy.eval(exe)
+ # acc 当前一个batch 的 accuracy
+ # pass_acc 当前batch 的 accuracy
+ pass_total_acc = accuracy.eval(exe) # 整个pass的accuracy
+ ```
+
+- 如何在训练中测试?[->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
+- 如何保存训练好的模型?[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
+- 如何加载训练好的模型进行预测?[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
+- 如何在同一个训练任务中定义多个Program,并交替运行? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
+- 如何profile?Fluid 实现了profile 工具,可以直接调用。请参考示例 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
+
+
+
+
+---
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
index 75af7354be93a6eeabfa9ccf86903505402a7ca6..3daea71d0933a2774227ff2b5e744392ca6b1765 100644
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -17,3 +17,4 @@
:maxdepth: 1
concepts/use_concepts_cn.rst
+ developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
index 75a43f4af87c34830ec940068196e6ca72640501..fb20bb4f245281c3acf67c417979dc63c144fef3 100644
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
@@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
:maxdepth: 1
concepts/index_en.rst
+ developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
index 135beb75d0330f39d062753aa2aa83a077f36bb1..6a964d4f8561f30aa10936d2399698c51583442c 100644
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.
pip install paddlepaddle
-如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
+如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行:
.. code-block:: bash
@@ -28,18 +28,18 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.
import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid
-
+
with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace())
- # load inference model
+ # load inference model
[inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference
- result = exe.run(inference_program,
- feed={feed_target_names[0]: uci_housing.predict_reader()},
+ result = exe.run(inference_program,
+ feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets)
- # print predicted price is $12,273.97
+ # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
index df6619cfd039fc1fdca8cde57db9cc6aebf8f029..680122f25893a5a48fac103266bda4788f891f6d 100644
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
pip install paddlepaddle
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash
@@ -31,18 +31,18 @@ code:
import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid
-
+
with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace())
- # load inference model
+ # load inference model
[inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference
- result = exe.run(inference_program,
- feed={feed_target_names[0]: uci_housing.predict_reader()},
+ result = exe.run(inference_program,
+ feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets)
- # print predicted price is $12,273.97
+ # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
index b99b90056b0a2e51f2668a6d27d94857bdc09c37..55326940ce7c7dbaa5bf19f1950f470527ddf4f0 100644
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
第二步,启动Parameter Server:
```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
```
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
第三步,启动Trainer:
```bash
-PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
```
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md
new file mode 100644
index 0000000000000000000000000000000000000000..92859e8f622d0c155128821c54252113c5016989
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -0,0 +1,127 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert to your training data
+to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some
+interface to deal with the RecordIO files.
+
+## Generate RecordIO File
+
+Before start training with RecordIO files, you need to convert your training data
+to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes
+as follows:
+
+```python
+ reader = paddle.batch(mnist.train(), batch_size=1)
+ feeder = fluid.DataFeeder(
+ feed_list=[ # order is image and label
+ fluid.layers.data(
+ name='image', shape=[784]),
+ fluid.layers.data(
+ name='label', shape=[1], dtype='int64'),
+ ],
+ place=fluid.CPUPlace())
+ fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code snippet would generate a RecordIO `./mnist.recordio` on your host.
+
+**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can
+adjust it flexibly while reading it.
+
+## Use the RecordIO file in a Local Training Job
+
+PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file
+and then you can use them as a Layer in your network configuration, the sample codes as follows:
+
+```python
+ data_file = fluid.layers.io.open_recordio_file(
+ filename="./mnist.recordio",
+ shapes=[(-1, 784),(-1, 1)],
+ lod_levels=[0, 0],
+ dtypes=["float32", "int32"])
+ data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+ img, label = fluid.layers.io.read_file(data_file)
+ hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+ prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+ loss = fluid.layers.cross_entropy(input=prediction, label=label)
+ avg_loss = fluid.layers.mean(loss)
+
+ fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+ place = fluid.CPUPlace()
+
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ avg_loss_np = []
+
+ # train a pass
+ batch_id = 0
+ while True:
+ tmp, = exe.run(fetch_list=[avg_loss])
+
+ avg_loss_np.append(tmp)
+ print(batch_id)
+ batch_id += 1
+```
+
+## Use the RecordIO files in Distributed Training
+
+1. generate multiple RecordIO files
+
+For a distributed training job, you may have multiple trainer nodes,
+and one or more RecordIO files for one trainer node, you can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files, the sample codes as follows:
+
+```python
+ reader = paddle.batch(mnist.train(), batch_size=1)
+ feeder = fluid.DataFeeder(
+ feed_list=[ # order is image and label
+ fluid.layers.data(
+ name='image', shape=[784]),
+ fluid.layers.data(
+ name='label', shape=[1], dtype='int64'),
+ ],
+ place=fluid.CPUPlace())
+ fluid.recordio_writer.convert_reader_to_recordio_files(
+ filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder)
+```
+
+The above codes would generate multiple RecordIO files on your host like:
+
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+
+2. open multiple RecordIO files by `fluid.layers.io.open_files`
+
+For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes,
+each trainer process reads parts of the whole training data, we usually take the following approach to make the training
+data allocated by each trainer process as uniform as possiable:
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+ file_list = glob.glob(file_pattern)
+ ret_list = []
+ for idx, f in enumerate(file_list):
+ if (idx + trainers) % trainers == trainer_id:
+ ret_list.append(f)
+ return ret_list
+
+trainers = int(os.getenv("PADDLE_TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+ filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
+ thread_num=1,
+ shapes=[(-1, 784),(-1, 1)],
+ lod_levels=[0, 0],
+ dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_files)
+...
+```
diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..8adaf324fccb4cda7af16b9bace559c0642ae444
--- /dev/null
+++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md
@@ -0,0 +1,110 @@
+# Distributed Training with NCCL2 and RDMA
+
+When doing distributed multi-GPU training, network bandwidth often becomes the
+bottleneck. We introduce a way to use NCCL2 to do such training job to
+achieve best performance.
+
+## Prepare Hardware with RDMA and Multiple GPUs
+
+I'm using two Linux servers each of them installed with 8 GPUs and
+one 100Gb RDMA card.
+Base environment is:
+
+* OS: CentOS 7.4
+* RDMA device: "Mellanox Technologies MT27700 Family [ConnectX-4]"
+* Kernel version: `4.4.88-1.el7.elrepo.x86_64`
+* Docker version: `1.12.6`
+* Docker storage driver: `overlay2`
+* IP addresses: 192.168.16.30,192.168.16.34
+
+In general, the steps including:
+
+1. Install GPU drivers
+1. Install RDMA drivers
+1. Install "InfiniBand Support"
+1. Use docker to run tests and make sure GPUs and RDMA can work inside
+ the container.
+
+I'll omit the section "Install GPU drivers" because we can find it easily
+somewhere else.
+
+### Install RDMA drivers
+
+For my case, I've got two machines with device
+"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was
+"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can
+work with the latest overlay2 filesystem.
+
+***NOTE: before you start, make sure you have a way to get a console
+of the server other than ssh because we may need to re-configure the
+network device.***
+
+1. Go to http://www.mellanox.com/page/products_dyn?product_family=26,
+ download `MLNX_OFED` software in the bottom of the page, and upload it
+ onto the server.
+1. Run `./mlnxofedinstall --add-kernel-support` in the software package.
+1. Run `/etc/init.d/openibd restart` to make everything work, note that
+ this operation may cause the network goes down if you are using this
+ RDMA device as default network device and use ssh to log in the server.
+1. Re-configure the network interface, for example:
+ `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed:
+ `ip route add default via 192.168.16.1 dev eth2`.
+1. Do the same thing on the other node.
+1. Use `ping` to test if the two nodes have typical ICMP connection.
+1. Use either `udaddy` or `ib_write_bw` to test the network connection is
+ ready and have the desired bandwidth.
+
+### Prepare Docker Image to Run RDMA Programs
+
+1. Build a docker image using cuda base image like: `nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04` and install paddlepaddle whl
+ package in it.
+1. Start a docker container and mount GPU driver libs into it (you can
+ skip this step if you are using nvidia-docker).
+1. Mount RDMA drivers and libs into the docker image (see below section),
+ also `udaddy` and `ib_write_bw` if needed.
+1. Mount GPU devices and RDMA devices into the container using `--device`
+ or just use privileged mode `--privileged`.
+1. Start the container using host network mode: `--net=host`
+
+### RDMA Library Files Needed
+
+Usually, `MLNX_OFED` install latest supported libs under
+`/usr/lib64/mlnx_ofed/valgrind`. Other libs also needed to run RDMA programs
+is listed below. These libs must be mounted into the docker container.
+
+* Libs under `/usr/lib64/mlnx_ofed/valgrind`
+ * libibcm.so
+ * libibverbs.so
+ * libmlx4.so
+ * libmlx5.so
+ * libmlx5-rdmav2.so
+ * librdmacm.so
+* Other libs:
+ * libnl-3.so.200
+ * libnl-route-3.so.200
+ * libnuma.so.1
+
+## Start to Run the Training Job
+
+Setting NCCL environment variables to turn NCCL switches on and off:
+
+
+| Env Name | Description |
+| --- | --- |
+| NCCL_SOCKET_IFNAME | The RDMA device, e.g. eth2 |
+| NCCL_P2P_DISABLE | Set to 1 to disable P2P transfer between GPUs |
+| NCCL_IB_DISABLE | Set to 1 to disable using RDMA |
+| NCCL_IB_CUDA_SUPPORT | Set to 1 to enable GPU Direct if supported |
+| NCCL_DEBUG | Set debug level: VERSION, WARN, INFO |
+
+My two servers are: `192.168.16.30,192.168.16.34`, On node 1, Run :
+
+```bash
+PADDLE_TRAINER_ID=0 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.30 stdbuf -oL python vgg16.py
+```
+
+On node 2, Run:
+
+```bash
+PADDLE_TRAINER_ID=1 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.34 stdbuf -oL python vgg16.py
+```
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
index 97aeaf167d329529f2b120b5a3d4085e0510fe16..b57af64f44da82926c4862578f3072960ca5aa92 100644
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -3,5 +3,6 @@
.. toctree::
:maxdepth: 1
-
+
+ inference/index_cn.rst
optimization/index_cn.rst
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..91357dd8c8da19f2f33c6f285ed7eb234428b1ab
--- /dev/null
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -0,0 +1,97 @@
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+====================== ========================================
+版本说明 C++预测库
+====================== ========================================
+cpu_avx_mkl `fluid.tgz `_
+cpu_avx_openblas `fluid.tgz `_
+cpu_noavx_openblas `fluid.tgz `_
+cuda7.5_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn7_avx_mkl `fluid.tgz `_
+cuda9.0_cudnn7_avx_mkl `fluid.tgz `_
+====================== ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项:
+
+================= =========
+选项 值
+================= =========
+CMAKE_BUILD_TYPE Release
+FLUID_INSTALL_DIR 安装路径
+WITH_FLUID_ONLY ON(推荐)
+WITH_SWIG_PY OFF(推荐
+WITH_PYTHON OFF(推荐)
+WITH_GPU ON/OFF
+WITH_MKL ON/OFF
+================= =========
+
+建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径):
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+ PADDLE_ROOT=/path/of/capi
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+ mkdir build
+ cd build
+ cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DWITH_FLUID_ONLY=ON \
+ -DWITH_SWIG_PY=OFF \
+ -DWITH_PYTHON=OFF \
+ -DWITH_MKL=OFF \
+ -DWITH_GPU=OFF \
+ ..
+ make
+ make inference_lib_dist
+
+成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息)
+均会存放于PADDLE_ROOT目录中。目录结构如下:
+
+ .. code-block:: text
+
+ PaddleRoot/
+ ├── CMakeCache.txt
+ ├── paddle
+ │ └── fluid
+ │ ├── framework
+ │ ├── inference
+ │ ├── memory
+ │ ├── platform
+ │ ├── pybind
+ │ └── string
+ ├── third_party
+ │ ├── boost
+ │ │ └── boost
+ │ ├── eigen3
+ │ │ ├── Eigen
+ │ │ └── unsupported
+ │ └── install
+ │ ├── gflags
+ │ ├── glog
+ │ ├── mklml
+ │ ├── protobuf
+ │ ├── snappy
+ │ ├── snappystream
+ │ └── zlib
+ └── version.txt
+
+version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如:
+
+ .. code-block:: text
+
+ GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+ WITH_MKL: ON
+ WITH_GPU: ON
+ CUDA version: 8.0
+ CUDNN version: v5
diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a903423548decd0992bf19772fb2cb143f6a12b5
--- /dev/null
+++ b/doc/fluid/howto/inference/index_cn.rst
@@ -0,0 +1,8 @@
+预测库
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ build_and_install_lib_cn.rst
+ inference_support_in_fluid_cn.md
diff --git a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..309b17fccd5c461c9c22beb64eb4c6792b7e4a7a
--- /dev/null
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
@@ -0,0 +1,304 @@
+# 使用指南
+
+## 目录:
+
+- Python Inference API
+- Inference C++ API
+- Inference实例
+- Inference计算优化
+
+## Python Inference API **[改进中]**
+- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295))
+
+ ```python
+ def save_inference_model(dirname,
+ feeded_var_names,
+ target_vars,
+ executor,
+ main_program=None,
+ model_filename=None,
+ params_filename=None):
+ ```
+ Inference模型和参数将会保存到`dirname`目录下:
+ - 序列化的模型
+ - `model_filename`为`None`,保存到`dirname/__model__`
+ - `model_filename`非`None`,保存到`dirname/model_filename`
+ - 参数
+ - `params_filename`为`None`,单独保存到各个独立的文件,各文件以参数变量的名字命名
+ - `params_filename`非`None`,保存到`dirname/params_filename`
+
+- 两种存储格式
+ - 参数保存到各个独立的文件
+ - 如,设置`model_filename`为`None`、`params_filename`为`None`
+
+ ```bash
+ $ cd recognize_digits_conv.inference.model
+ $ ls
+ $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0
+ ```
+ - 参数保存到同一个文件
+ - 如,设置`model_filename`为`None`、`params_filename`为`__params__`
+
+ ```bash
+ $ cd recognize_digits_conv.inference.model
+ $ ls
+ $ __model__ __params__
+ ```
+- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380))
+ ```python
+ def load_inference_model(dirname,
+ executor,
+ model_filename=None,
+ params_filename=None):
+ ...
+ return [program, feed_target_names, fetch_targets]
+ ```
+
+## 链接Fluid Inference库
+- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))
+
+ - GCC配置
+ ```bash
+ $ g++ -o a.out -std=c++11 main.cc \
+ -I${PADDLE_ROOT}/ \
+ -I${PADDLE_ROOT}/third_party/install/gflags/include \
+ -I${PADDLE_ROOT}/third_party/install/glog/include \
+ -I${PADDLE_ROOT}/third_party/install/protobuf/include \
+ -I${PADDLE_ROOT}/third_party/eigen3 \
+ -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \
+ -lrt -ldl -lpthread
+ ```
+
+ - CMake配置
+ ```cmake
+ include_directories(${PADDLE_ROOT}/)
+ include_directories(${PADDLE_ROOT}/third_party/install/gflags/include)
+ include_directories(${PADDLE_ROOT}/third_party/install/glog/include)
+ include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include)
+ include_directories(${PADDLE_ROOT}/third_party/eigen3)
+ target_link_libraries(${TARGET_NAME}
+ ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so
+ -lrt -ldl -lpthread)
+ ```
+
+ - 设置环境变量:
+ `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH`
+
+
+
+## C++ Inference API
+
+- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91))
+
+ - 1、 初始化设备
+ ```cpp
+ #include "paddle/fluid/framework/init.h"
+ paddle::framework::InitDevices(false);
+ ```
+
+ - 2、 定义place,executor,scope
+ ```cpp
+ auto place = paddle::platform::CPUPlace();
+ auto executor = paddle::framework::Executor(place);
+ auto* scope = new paddle::framework::Scope();
+ ```
+
+ - 3、 加载模型
+ ```cpp
+ #include "paddle/fluid/inference/io.h"
+ auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+ // or
+ auto inference_program = paddle::inference::Load(executor,
+ *scope,
+ dirname + "/" + model_filename,
+ dirname + "/" + params_filename);
+ ```
+
+ - 4、 获取`feed_target_names`和`fetch_target_names`
+ ```cpp
+ const std::vector& feed_target_names = inference_program->GetFeedTargetNames();
+ const std::vector& fetch_target_names = inference_program->GetFetchTargetNames();
+ ```
+
+ - 5、 准备`feed`数据
+ ```cpp
+ #include "paddle/fluid/framework/lod_tensor.h"
+ std::vector cpu_feeds;
+ ...
+ std::map feed_targets;
+ for (size_t i = 0; i < feed_target_names.size(); ++i) {
+ // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+ feed_targets[feed_target_names[i]] = cpu_feeds[i];
+ }
+ ```
+
+ - 6、 定义`Tensor`来`fetch`结果
+ ```cpp
+ std::vector cpu_fetchs;
+ std::map fetch_targets;
+ for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+ fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+ }
+ ```
+
+ - 7、 执行`inference_program`
+ ```cpp
+ executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+ ```
+
+ - 8、 使用`fetch`数据
+ ```cpp
+ for (size_t i = 0; i < cpu_fetchs.size(); ++i) {
+ std::cout << "lod_i: " << cpu_fetchs[i]->lod();
+ std::cout << "dims_i: " << cpu_fetchs[i]->dims();
+ std::cout << "result:";
+ float* output_ptr = cpu_fetchs[i]->data();
+ for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) {
+ std::cout << " " << output_ptr[j];
+ }
+ std::cout << std::endl;
+ }
+ ```
+ 针对不同的数据,4. - 8.可执行多次。
+
+ - 9、 释放内存
+ ```cpp
+ delete scope;
+ ```
+
+
+- 接口说明
+
+ ```cpp
+ void Run(const ProgramDesc& program, Scope* scope,
+ std::map& feed_targets,
+ std::map& fetch_targets,
+ bool create_vars = true,
+ const std::string& feed_holder_name = "feed",
+ const std::string& fetch_holder_name = "fetch");
+ ```
+ - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`,用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。
+ - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致,可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program`
+ - 默认情况下,除了`persistable`属性设置为`True`的`Variable`之外,每次执行`executor.Run`会创建一个局部`Scope`,并且在这个局部`Scope`中创建和销毁所有的`Variable`,以最小化空闲时的内存占用。
+ - `persistable`属性为`True`的`Variable`有:
+ - Operators的参数`w`、`b`等
+ - `feed_op`的输入变量
+ - `fetch_op`的输出变量
+
+
+- **不在每次执行时创建和销毁变量
+ ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))**
+ - 执行`inference_program`
+ ```cpp
+ // Call once
+ executor.CreateVariables(*inference_program, scope, 0);
+ // Call as many times as you like
+ executor.Run(
+ *inference_program, scope, feed_targets, fetch_targets, false);
+ ```
+ - **优点**
+ - 节省了频繁创建、销毁变量的时间(约占每次`Run`总时间的1% ~ 12%)
+ - 执行结束后可获取所有Operators的计算结果
+ - **缺点**
+ - 空闲时也会占用大量的内存
+ - 在同一个`Scope`中,相同的变量名是公用同一块内存的,容易引起意想不到的错误
+
+
+- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))**
+ - 执行`inference_program`
+ ```cpp
+ // Call once
+ auto ctx = executor.Prepare(*inference_program, 0);
+ // Call as many times as you like if you have no need to change the inference_program
+ executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets);
+ ```
+ - **优点**
+ - 节省了频繁创建、销毁Op的时间
+ - **缺点**
+ - 一旦修改了`inference_program`,则需要重新创建`ctx`
+
+
+- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))**
+ - 主线程
+ - 1、 初始化设备
+ - 2、 定义`place`,`executor`,`scope`
+ - 3、 加载模型,得到`inference_program`
+ - 从线程
+ - **复制`inference_program`得到`copy_program`,修改`copy_program`的`feed_holder_name`和`fetch_holder_name`**
+ ```cpp
+ auto copy_program = std::unique_ptr(
+ new paddle::framework::ProgramDesc(*inference_program));
+ std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+ std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id);
+ copy_program->SetFeedHolderName(feed_holder_name);
+ copy_program->SetFetchHolderName(fetch_holder_name);
+ ```
+ - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names`
+ - 5、 准备feed数据,定义Tensor来fetch结果
+ - 6、 执行`copy_program`
+ ```cpp
+ executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name);
+ ```
+ - 7、 使用fetch数据
+ - 主线程
+ - 8、 释放资源
+
+
+- 基本概念
+ - 数据相关:
+ - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md),一个N维数组,数据可以是任意类型(int,float,double等)
+ - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md),带LoD(Level-of-Detail)即序列信息的Tensor
+ - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md),记录了变量Variable
+ - 执行相关:
+ - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md),无状态执行器,只跟设备相关
+ - Place
+ - CPUPlace,CPU设备
+ - CUDAPlace,CUDA GPU设备
+ - 神经网络表示:
+ - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md).
+
+ 详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md)
+
+
+
+## Inference实例
+
+ 1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc)
+ 1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc)
+ 1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc)
+ 1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc)
+ 1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc)
+ 1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc)
+ 1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc)
+
+
+## Inference计算优化
+- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py))
+ ```python
+ class InferenceTranspiler:
+ def transpile(self, program, place, scope=None):
+ ...
+ if scope is None:
+ scope = global_scope()
+ ...
+ ```
+ - 使用`InferenceTranspiler`将会直接修改`program`。
+ - 使用`InferenceTranspiler`会修改参数的值,请确保`program`的参数在`scope`内。
+- 支持的优化
+ - 融合batch_norm op的计算
+- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py))
+ ```python
+ import paddle.fluid as fluid
+ # NOTE: Applying the inference transpiler will change the inference_program.
+ t = fluid.InferenceTranspiler()
+ t.transpile(inference_program, place, inference_scope)
+ ```
+
+
+
+
+## 内存使用优化
+- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py))
+ ```python
+ fluid.memory_optimize(inference_program)
+ ```
diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md
deleted file mode 120000
index db30af7f53231c687f9ad61ad961a685733cbad0..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/benchmark/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../benchmark/cluster/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
deleted file mode 120000
index ca963ef5f06aa0c2fe507ba7548dca8017358120..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index 8266dec3c6125a09b90ac0ccd4aa5464f5c7db31..198a05a79e19227e90eaafe116217a164cd51a7d 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -1,3 +1,5 @@
+# CPU性能调优
+
此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。
Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
@@ -8,7 +10,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大
* Python 与 C++ 混合代码的性能分析
-# Python代码的性能分析
+## Python代码的性能分析
### 生成性能分析文件
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index e95556dd608b7ff0a3eb18873df0015a2da94e7c..216694965b3c878a8a5f3ccd2a0cba8d21d9ce05 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -1,3 +1,5 @@
+# Tune CPU performance
+
This tutorial introduces techniques we use to profile and tune the
CPU performance of PaddlePaddle. We will use Python packages
`cProfile` and `yep`, and Google's `perftools`.
@@ -14,7 +16,7 @@ the profiling and tuning of
1. the Python code and
1. the mixture of Python and C++ code.
-# Profiling the Python Code
+## Profiling the Python Code
### Generate the Performance Profiling File
diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7fb0883dd937465d15479b29df95078edb50e069
--- /dev/null
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+# 堆内存分析和优化
+
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。
+
+
+目前有很多内存泄漏分析工具,比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+
+因为Fluid是用Python驱动C++ core来运行,valgrind直接分析非常困难,需要自己编译debug版本的、带valgrind支持的专用Python版本,而且输出的信息中大部分是Python自己的符号和调用信息,分析起来很困难,另外使用valgrind会让程序运行速度变得非常慢,所以不建议使用。
+
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+
+gperftool主要支持以下四个功能:
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+
+对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。
+
+## 环境
+
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。
+
+## 使用流程
+
+- 安装google-perftools
+
+```
+apt-get install libunwind-dev
+apt-get install google-perftools
+```
+
+- 安装pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- 设置运行环境
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump,默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+随着程序的运行,会在perf_log这个文件夹下生成很多文件,如下:
+
+```
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap
+```
+
+- 使用pprof对heap文件进行分析。分析有两种模式:
+ - 完整模式。会对当前heap做一个分析,显示目前分配内存一些调用路径。
+
+ ```
+ pprof --pdf python test.log.0012.heap
+ ```
+ 上述命令会生成一个profile00x.pdf的文件,可以直接打开,例如:[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出,在CPU版本fluid的运行过程中,分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少,所以被忽略了,这对于分配内存泄漏是很不方便的,因为泄漏是一个缓慢的过程,在这种图中是无法看到的。
+
+ 
+
+ - Diff模式。可以对两个时刻的heap做diff,把一些内存分配没有发生变化的模块去掉,而把增量部分显示出来。
+ ```
+ pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+ ```
+ 生成的结果为:[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+
+ 从图中可以看出:ProgramDesc这个结构,在两个版本之间增长了200MB+,所以这里有很大的内存泄漏的可能性,最终结果也确实证明是这里造成了泄漏。
+
+ 
+ 
+
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md
deleted file mode 100644
index 96481ae2a6e4442d40803f8d5361e5f942502df3..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/timeline.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# how to use timeline tool to do profile
-
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
-
- ```python
- with profiler.profiler('All', 'total', '/tmp/profile') as prof:
- for pass_id in range(pass_num):
- for batch_id, data in enumerate(train_reader()):
- exe.run(fluid.default_main_program(),
- feed=feeder.feed(data),
- fetch_list=[],
- use_program_cache=True)
- ...
- ```
-
-1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
-file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
-[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
-
-1. Open chrome and visit , use `load` button to load the generated `timeline` file.
-
- 
-
-1. The resulting timeline should be like:
-
-
- 
diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..faf39f276dbddcd4961407ba2d082c9826051cbe
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -0,0 +1,32 @@
+# 如何使用timeline工具做性能分析
+
+1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+
+ **提示:**
+ 请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。
+
+ ```python
+ for pass_id in range(pass_num):
+ for batch_id, data in enumerate(train_reader()):
+ if pass_id == 0 and batch_id == 5:
+ profiler.start_profiler("All")
+ elif pass_id == 0 and batch_id == 10:
+ profiler.stop_profiler("total", "/tmp/profile")
+ exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[])
+ ...
+ ```
+
+1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. 打开chrome浏览器,访问,用`load`按钮来加载生成的`timeline`文件。
+
+ 
+
+1. 结果如下图所示,可以放到来查看timetime的细节信息。
+
+ 
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..6f963c6b4da6967fb2f493ada917a4b08917fa4c
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -0,0 +1,33 @@
+# how to use timeline tool to do profile
+
+1. Add `profiler.start_profiler(...)`和`profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+
+ ```python
+ for pass_id in range(pass_num):
+ for batch_id, data in enumerate(train_reader()):
+ if pass_id == 0 and batch_id == 5:
+ profiler.start_profiler("All")
+ elif pass_id == 0 and batch_id == 10:
+ profiler.stop_profiler("total", "/tmp/profile")
+ exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[])
+ ...
+ ```
+
+1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
+file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
+[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
+1. Open chrome and visit , use `load` button to load the generated `timeline` file.
+
+ 
+
+1. The resulting timeline should be like:
+
+
+ 
diff --git a/doc/fluid/howto/performance/error_clip.md b/doc/fluid/howto/performance/error_clip.md
index 58aa73b8cd38d01e2426278a3479714e4fb6a3b0..749cf7693c75696feb17f8556224ed03649baa80 100644
--- a/doc/fluid/howto/performance/error_clip.md
+++ b/doc/fluid/howto/performance/error_clip.md
@@ -78,7 +78,7 @@ def error_clip_callback(block, context):
op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()):
- fwd_var = block.var_recursive(grad_to_var[grad_n])
+ fwd_var = block.__var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None or isinstance(error_clip,
BaseErrorClipAttr)):
diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..67daf566f91aab570e60971c4ea8e2be876e214d
Binary files /dev/null and b/doc/fluid/images/1.png differ
diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..43367777f41449a666e7a3b571f09ac5d5dfb1ae
Binary files /dev/null and b/doc/fluid/images/2.png differ
diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png
new file mode 100644
index 0000000000000000000000000000000000000000..481021ef306e2596818aab7fe17a570754f63635
Binary files /dev/null and b/doc/fluid/images/3.png differ
diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..4279f41e06de459f18b9a622539511d555e9a0af
Binary files /dev/null and b/doc/fluid/images/4.png differ
diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..75369f5378309e0f304b83f6bb69bdb195eac079
Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ
diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48
Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ
diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..b29c0d779e3d46b779b5baeabe3176adaeb00a6d
Binary files /dev/null and b/doc/fluid/images/executor.png differ
diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa99472c0f914cde128fd7b3bd8dc29ac24f94b6
Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ
diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..554782ba54e43efc3d6babbb94e3cac3530ac649
Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ
diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..4219efccbb1e87839adf6b5720fe46808b7d2fcf
Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ
diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e46db4c9c6f5b65ff274b498b716b11de343a8b0
Binary files /dev/null and b/doc/fluid/images/layer.png differ
diff --git a/doc/fluid/images/op.dot b/doc/fluid/images/op.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c8ad839cb88788e9b5906402257cc7bbc3ddcb54
--- /dev/null
+++ b/doc/fluid/images/op.dot
@@ -0,0 +1,4 @@
+digraph sample {
+ graph [rankdir=TD]; node [shape=record];
+ op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"];
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_op_with_kern_class_diagram.dot b/doc/fluid/images/op_op_with_kern_class_diagram.dot
new file mode 100644
index 0000000000000000000000000000000000000000..8f24e9ea83acf879c7008f2d97113c0a4cc111c3
--- /dev/null
+++ b/doc/fluid/images/op_op_with_kern_class_diagram.dot
@@ -0,0 +1,38 @@
+digraph sample {
+ graph [rankdir=TD]; node [shape=record];
+ op [label="{Operator| InferShape()=0\lRun()=0\l | map<string, string[]> inputs_\lmap<string, string[]> outputs_ \l AttributeMap attrs_\l}"];
+ op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"]
+ op_kernel [label="{OpKernel | Compute()=0}"]
+ op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+ op -> op_with_kern [dir=back, arrowtail=onormal]
+ op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+ {
+ rank=same;
+ op_with_kern
+ op_kernel
+ }
+
+ op_kernel -> op_kernel_key [style=invis]
+
+ {
+ rank=same;
+ op_kernel
+ op_kernel_key
+ }
+
+ op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+
+ mul_op [label="MulOp"]
+ op_with_kern -> mul_op [dir=back, arrowtail=onormal]
+ mul_kernel [label="template <typename Place>\lclass MulOpKernel\l"]
+ op_kernel -> mul_kernel [dir=back, arrowtail=onormal]
+ mul_op -> mul_kernel [arrowhead=vee, label="register many"]
+
+ {
+ rank=same;
+ mul_op;
+ mul_kernel;
+ }
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_with_kernel.dot b/doc/fluid/images/op_with_kernel.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4f5af4f7b5f5a69693a058c99eb658900136077a
--- /dev/null
+++ b/doc/fluid/images/op_with_kernel.dot
@@ -0,0 +1,26 @@
+digraph sample {
+ graph [rankdir=TD]; node [shape=record];
+ op [label="{Operator}"];
+ op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map<OpKernelKey,OpKernel>kernels_ }"]
+ op_kernel [label="{OpKernel | Compute()=0}"]
+ op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+ op -> op_with_kern [dir=back, arrowtail=onormal]
+ op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+ {
+ rank=same;
+ op_with_kern
+ op_kernel
+ }
+
+ op_kernel -> op_kernel_key [style=invis]
+
+ {
+ rank=same;
+ op_kernel
+ op_kernel_key
+ }
+
+ op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+}
\ No newline at end of file
diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3975b06f615b7a88dfc11e71b6451fdf4ce42d60
Binary files /dev/null and b/doc/fluid/images/operator1.png differ
diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c
Binary files /dev/null and b/doc/fluid/images/operator2.png differ
diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png
new file mode 100644
index 0000000000000000000000000000000000000000..14e77511d639af155e5a3725cde05323e0cc94f2
Binary files /dev/null and b/doc/fluid/images/place.png differ
diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png
new file mode 100644
index 0000000000000000000000000000000000000000..e8e459e1b3d5c8706b3caa05dc371db8d46df4a5
Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ
diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png
new file mode 100644
index 0000000000000000000000000000000000000000..0656336914ece957f2e5bb4d70ad337a63e31d88
Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ
diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png
new file mode 100644
index 0000000000000000000000000000000000000000..db5bfa1231345add8661b4f8ef0fc9d861f40d24
Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ
diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..0725f92d2b169c2b59ec7c68b402859c2a2dd1d8
Binary files /dev/null and b/doc/fluid/images/raw_input.png differ
diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..59b0de6fb36f9f6b469227c05760a7612bb30b4d
Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ
diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff601128368ee179e3fd33e5e295a9ddd3dcbaeb
Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ
diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..422973c0dc7aa2b544d2fc86a97ace706388cb9e
Binary files /dev/null and b/doc/fluid/images/transpiler.png differ
diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffc94e3d8945ec6291460afd90e8fcc600828390
Binary files /dev/null and b/doc/fluid/images/user_interface.png differ
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
index d878d192cae7ee9e8b8fdb4f615839c186fdf334..6b1ef3ceed4f7ed5073d42c13ce103e2ab467e58 100644
--- a/doc/fluid/index_cn.rst
+++ b/doc/fluid/index_cn.rst
@@ -1,12 +1,16 @@
- PaddlePaddle Fluid
-==========================
+.. PaddlePaddle Fluid documentation master file, created by
+ sphinx-quickstart on Thu Jun 7 17:04:53 2018.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+##############
+欢迎使用 Fluid
+##############
.. toctree::
- :maxdepth: 1
+ :maxdepth: 1
- getstarted/index_cn.rst
- build_and_install/index_cn.rst
- design/index_cn.rst
- howto/index_cn.rst
- dev/index_cn.rst
- faq/index_cn.rst
+ new_docs/beginners_guide/index.rst
+ new_docs/user_guides/index.rst
+ new_docs/advanced_usage/index.rst
+ new_docs/faq/index_cn.rst
diff --git a/doc/fluid/new_docs/advanced_usage/benchmark.rst b/doc/fluid/new_docs/advanced_usage/benchmark.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7854263bf8f64c840492550fb22152582c7d2361
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/benchmark.rst
@@ -0,0 +1,120 @@
+#################
+如何进行基准测试
+#################
+
+本文介绍如何给深度学习框架做基准测试。基准测试主要包含验证模型的精度和性能两方面,下文包含搭建测试环境,选择基准测试模型,验证测试结果等几方面内容。
+
+验证深度学习框架,可分为训练和测试两个阶段, 验证指标略有不同,本文只介绍训练阶段的指标验证。训练阶段关注的是模型训练集上的精度,训练集是完备的,因此关注大batch\_size下的训练速度,关注吞吐量,例如图像模型常用的batch\_size=128, 多卡情况下会加大;预测阶段关注的是在测试集上的精度,线上服务测试数据不能提前收集,因此关注小batch\_size下的预测速度,关注延迟,例如预测服务常用的batch\_size=1, 4等。
+
+`Fluid `__ 是PaddlePaddle从0.11.0版本开始引入的设计,本文的基准测试在该版本上完成。
+
+
+环境搭建
+""""""""""""
+
+基准测试中模型精度和硬件、框架无关,由模型结构和数据共同决定;性能方面由测试硬件和框架性能决定。框架基准测试为了对比框架之间的差异,控制硬件环境,系统库等版本一致。下文中的对比实验都在相同的硬件条件和系统环境条件下进行.
+
+
+不同架构的GPU卡性能差异巨大,在验证模型在GPU上训练性能时,可使用NVIDIA提供的工具:code `nvidia-smi` 检验当前使用的GPU型号,如果测试多卡训练性能,需确认硬件连接是 `nvlink `__ 或 `PCIe `__ 。 同样地,CPU型号会极大影响模型在CPU上的训练性能。可读取`/proc/cpuinfo`中的参数,确认当前正在使用的CPU型号。
+
+下载GPU对应的Cuda Tool Kit和 Cudnn,或者使用NVIDIA官方发布的nvidia-docker镜像 `nvidia-docker `__, 镜像内包含了Cuda和Cudnn,本文采用这种方式。 Cuda Tool Kit包含了GPU代码使用到的基础库,影响在此基础上编译出的Fluid二进制运行性能。
+
+准备好Cuda环境后,从github上的下载Paddle并源码编译,会生成对应的最适合当前GPU的sm\_arch二进制\ `sm\_arch `__\ 。另外,cudnn对卷积类任务影响巨大,在基准测试中需要小版本一致,例如Cudnn7.0.2与Cudnn7.1.4在Resnet上有5%以上差异。
+
+
+选择基准模型
+""""""""""""
+
+对框架做基准测试,需要覆盖不同训练任务和不同大小的模型,本文中选取了图像和NLP的最为常用的5个模型。
+
+============ ============ ================= ============
+任务种类 模型名称 网络结构 数据集
+============ ============ ================= ============
+图像分类 mnist Lenet mnist
+图像分类 VGG VGG-16 Flowers102
+图像分类 Resnet Resnet-50 Flowers102
+文本分类 Stacked-LSTM Stacked-LSTM IMDB
+机器翻译 seq-seq Stacked-LSTM wmt14
+============ ============ ================= ============
+
+其中mnist, VGG, Resnet属于CNN模型, stacked-lstm, seq2seq代表RNN模型。
+`benchmark `__
+基准模型测试脚本中,均跳过了前几个batch的训练过程,原因是加载数据和分配显存受系统当前运行情况影响,会导致统计性能不准确。运行完若干个轮次后,统计对应指标。
+
+
+基准模型的数据的选择方面,数据量大且验证效果多的公开数据集为首选。图像模型VGG和resnet, 本文选择了 `flowers102 `__ ,图像大小预处理为和Imagenet相同大小,因此性能可直接对比
+NLP模型的公开且影响力大数据集较少,seq2seq模型选择了wmt14数据,stacked-lstm模型中选择了 `imdb `__ 数据。
+
+
+注意,图像模型每条样本大小相同,图像经过变换后大小一致,因此经过的计算路径基本相同,计算速度和显存占用波动较小,可以从若干个batch的数据中采样得到当前的训练性能数据。而NLP模型由于样本长度不定,计算路径和显存占用也不相同,因此只能完整运行若干个轮次后,统计速度和显存消耗。
+显存分配是特别耗时的操作,因此Fluid默认会占用所有可用显存空间形成显存池,用以加速计算过程中的显存分配。如果需要统计模型真实显存消耗,可设置环境变量`FLAGS_fraction_of_gpu_memory_to_use=0.0`,观察最大显存开销。
+
+
+测试过程
+""""""""""""
+
+- CPU 单机单线程测试
+
+测试CPU上单线程的性能,先设置CUDA的环境变量为空,``CUDA_VISIBLE_DEVICES=``,并通过环境变量关闭OpenMP和MKL的多线程 ``OMP_NUM_THREADS=1``, ``MKL_NUM_THREADS=1;``。
+然后代码中设置为使用CPUPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=False即可。
+
+.. code-block:: python
+
+ >>> import paddle.fluid as fluid
+ >>> place = fluid.CPUPlace()
+
+.. code:: bash
+
+ docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark paddlepaddle/paddle:latest-dev /bin/bash
+
+
+- GPU 单机单卡测试
+
+本教程使用了Cuda8, Cudnn7.0.1。来源为:code `nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
+
+.. code:: bash
+
+ nvidia-docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu paddlepaddle/paddle:latest-dev /bin/bash
+在单卡上测试,设置CUDA的环境变量使用一块GPU,``CUDA_VISIBLE_DEVICES=0``
+然后代码中设置为使用CUDAPlace,如果使用Paddle代码库中的脚本,只需要命令行参数传入 use_gpu=True即可。
+
+.. code-block:: python
+
+ >>> import paddle.fluid as fluid
+ >>> place = fluid.CUDAPlace(0) // 0 指第0块GPU
+
+
+测试结果
+""""""""""""
+
+本教程对比相同环境下的Fluid0.12.0和TensorFlow1.4.0的性能表现。
+硬件环境为 CPU: Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz, GPU: TITAN X(Pascal) 12G x 1, Nvidia-Driver 384.90。
+系统环境为Ubuntu 16.04.3 LTS, 本文中采用了docker环境,系统版本为nvidia-docker17.05.0-ce。
+测试的Fluid版本为\ `v.0.12.0 `__ 。
+TensorFlow版本为\ `v.1.4.0-rc1 `__ 。
+使用的脚本和配置见\ `benchmark `__ 。
+图表中统计单位为samples/秒。
+
+- CPU 单机单线程测试结果
+
+ ================ ==================== ===================
+ Speed Fluid CPU TensorFlow CPU
+ ================ ==================== ===================
+ mnist 1298.75 samples/s 637.57 samples/s
+ VGG-16 0.4147 images/s 0.1229 images/s
+ Resnet-50 1.6935 images/s 0.3657 images/s
+ Stacked-LSTM 472.3225 words/s 48.2293words/s
+ Seq2Seq 217.1655 words/s 28.6164 words/s
+ ================ ==================== ===================
+
+- GPU 单机单卡测试结果
+
+ =============== ===================== =================
+ Speed Fluid GPU TensorFlow GPU
+ =============== ===================== =================
+ mnist 19710.90 samples/s 15576.3 samples/s
+ VGG-16 59.83327 images/s 40.9967 images/s
+ Resnet-50 105.84412 97.8923 images/s
+ Stacked-LSTM 1319.99315 1608.2526 words/s
+ Seq2Seq 7147.89081 6845.1161 words/s
+ =============== ===================== =================
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..08ea379f81d16407ed5f82770b55a34bcf138da8
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_arm_benchmark.md
@@ -0,0 +1,56 @@
+# Anakin ARM 性能测试
+
+## 测试环境和参数:
++ 测试模型Mobilenetv1, mobilenetv2, mobilenet-ssd
++ 采用android ndk交叉编译,gcc 4.9,enable neon, ABI: armveabi-v7a with neon -mfloat-abi=softfp
++ 测试平台
+ - 荣耀v9(root): 处理器:麒麟960, 4 big cores in 2.36GHz, 4 little cores in 1.8GHz
+ - nubia z17:处理器:高通835, 4 big cores in 2.36GHz, 4 little cores in 1.9GHz
+ - 360 N5:处理器:高通653, 4 big cores in 1.8GHz, 4 little cores in 1.4GHz
++ 多线程:openmp
++ 时间:warmup10次,运行10次取均值
++ ncnn版本:来源于github的master branch中commits ID:307a77f04be29875f40d337cfff6df747df09de6(msg:convert LogisticRegressionOutput)版本
++ TFlite版本:来源于github的master branch中commits ID:65c05bc2ac19f51f7027e66350bc71652662125c(msg:Removed unneeded file copy that was causing failure in Pi builds)版本
+
+在BenchMark中本文将使用**`ncnn`**、**`TFlite`**和**`Anakin`**进行性能对比分析
+
+## BenchMark model
+
+> 注意在性能测试之前,请先将测试model通过[External Converter](#10003)转换为Anakin model
+> 对这些model,本文在ARM上进行多线程的单batch size测试。
+
+- [Mobilenet v1](#11) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [Mobilenet v2](#22) *caffe model 可以在[这儿](https://github.com/shicai/MobileNet-Caffe)下载*
+- [mobilenet-ssd](#33) *caffe model 可以在[这儿](https://github.com/chuanqi305/MobileNet-SSD)下载*
+
+### mobilenetv1
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan|
+ |高通835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan|
+ |高通653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan|
+
+### mobilenetv2
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan|
+ |高通835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan|
+ |高通653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan|
+
+### mobilenet-ssd
+
+ |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
+ |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+ |麒麟960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan|
+ |高通835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan|
+ |高通653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan
+
+## How to run those Benchmark models?
+
+1. 首先, 使用[External Converter](../docs/Manual/Converter_en.md)对caffe model 进行转换
+2. 然后将转换后的Anakin model和编译好的benchmark_arm 二进制文件通过'adb push'命令上传至测试机
+3. 接着在测试机含有Anakin model的目录中运行'./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' 命令
+4. 最后,终端显示器上将会打印该模型的运行时间
+5. 其中运行命令的参数个数和含义可以通过运行'./benchmark_arm'看到
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6b9e18fe2d64b3fda6382bb23a6a818a3e17fbe
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_example.md
@@ -0,0 +1,28 @@
+# Example
+Anakin目前只支持NCHW的格式
+示例文件在test/framework/net下
+
+## 在NV的GPU上运行CNN模型
+示例文件为打开example_nv_cnn_net.cpp,整体流程如下:
+- 将模型的的path设置为anakin模型的路径,初始化NV平台的图对象。 anakin模型可以通过转换器转化caffe或fluid的模型得到
+- 根据模型设置网络图的输入尺寸,进行图优化
+- 根据优化后的网络图初始化网络执行器
+- 取出网络的输入tensor,将数据拷贝到输入tensor
+- 运行推导
+- 取出网络的输出tensor
+
+以NV平台为例演示Anakin框架的使用方法,注意编译时需要打开GPU编译开关
+
+## 在X86上运行RNN模型
+示例文件为example_x86_rnn_net.cpp
+整体流程与在NV的GPU上运行CNN模型相似,不同之处如下:
+- 使用X86标识初始化图对象和网络执行器对象
+- rnn模型的输入尺寸是可变的,初始化图时的输入维度是维度的最大值,输入维度N代表总的词的个数。还需要设置输入tensor的seq_offset来标示这些词是如何划分为句子的,如{0,5,12}表示共有12个词,其中第0到第4个词是第一句话,第5到第11个词是第二句话
+
+以X86平台为例演示Anakin框架的使用方法,注意编译时需要打开X86编译开关
+
+## 在NV的GPU上使用Anakin的线程池运行CNN模型
+示例文件为example_nv_cnn_net_multi_thread.cpp ,示例使用worker的同步预测接口
+整体流程与在NV的GPU上运行CNN模型相似,不同之处如下:
+- 用模型地址和线程池大小初始化worker对象
+- 将输入tensor注入任务队列,获得输出tensor
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..667f9396f1169a0d891b9e6b0e912aa5527ab0b8
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_gpu_benchmark.md
@@ -0,0 +1,170 @@
+# Anakin GPU Benchmark
+
+## Machine:
+
+> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz`
+> GPU: `Tesla P4`
+> cuDNN: `v7`
+
+
+## Counterpart of anakin :
+
+The counterpart of **`Anakin`** is the acknowledged high performance inference engine **`NVIDIA TensorRT 3`** , The models which TensorRT 3 doesn't support we use the custom plugins to support.
+
+## Benchmark Model
+
+The following convolutional neural networks are tested with both `Anakin` and `TenorRT3`.
+ You can use pretrained caffe model or the model trained by youself.
+
+> Please note that you should transform caffe model or others into anakin model with the help of [`external converter ->`](../docs/Manual/Converter_en.md)
+
+
+- [Vgg16](#1) *caffe model can be found [here->](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
+- [Yolo](#2) *caffe model can be found [here->](https://github.com/hojel/caffe-yolo-model)*
+- [Resnet50](#3) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Resnet101](#4) *caffe model can be found [here->](https://github.com/KaimingHe/deep-residual-networks#models)*
+- [Mobilenet v1](#5) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [Mobilenet v2](#6) *caffe model can be found [here->](https://github.com/shicai/MobileNet-Caffe)*
+- [RNN](#7) *not support yet*
+
+We tested them on single-GPU with single-thread.
+
+### VGG16
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 8.8690 | 8.2815 |
+| 2 | 15.5344 | 13.9116 |
+| 4 | 26.6000 | 21.8747 |
+| 8 | 49.8279 | 40.4076 |
+| 32 | 188.6270 | 163.7660 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 963 | 997 |
+| 2 | 965 | 1039 |
+| 4 | 991 | 1115 |
+| 8 | 1067 | 1269 |
+| 32 | 1715 | 2193 |
+
+
+### Yolo
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 16.4596| 15.2124 |
+| 2 | 26.6347| 25.0442 |
+| 4 | 43.3695| 43.5017 |
+| 8 | 80.9139 | 80.9880 |
+| 32 | 293.8080| 310.8810 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 1569 | 1775 |
+| 2 | 1649 | 1815 |
+| 4 | 1709 | 1887 |
+| 8 | 1731 | 2031 |
+| 32 | 2253 | 2907 |
+
+### Resnet50
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 4.2459 | 4.1061 |
+| 2 | 6.2627 | 6.5159 |
+| 4 | 10.1277 | 11.3327 |
+| 8 | 17.8209 | 20.6680 |
+| 32 | 65.8582 | 77.8858 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 531 | 503 |
+| 2 | 543 | 517 |
+| 4 | 583 | 541 |
+| 8 | 611 | 589 |
+| 32 | 809 | 879 |
+
+### Resnet101
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 7.5562 | 7.0837 |
+| 2 | 11.6023 | 11.4079 |
+| 4 | 18.3650 | 20.0493 |
+| 8 | 32.7632 | 36.0648 |
+| 32 | 123.2550 | 135.4880 |
+
+- GPU Memory Used (`MB)`
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 701 | 683 |
+| 2 | 713 | 697 |
+| 4 | 793 | 721 |
+| 8 | 819 | 769 |
+| 32 | 1043 | 1059 |
+
+### MobileNet V1
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 45.5156 | 1.3947 |
+| 2 | 46.5585 | 2.5483 |
+| 4 | 48.4242 | 4.3404 |
+| 8 | 52.7957 | 8.1513 |
+| 32 | 83.2519 | 31.3178 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 329 | 283 |
+| 2 | 345 | 289 |
+| 4 | 371 | 299 |
+| 8 | 393 | 319 |
+| 32 | 531 | 433 |
+
+### MobileNet V2
+
+- Latency (`ms`) of different batch
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 65.6861 | 2.9842 |
+| 2 | 66.6814 | 4.7472 |
+| 4 | 69.7114 | 7.4163 |
+| 8 | 76.1092 | 12.8779 |
+| 32 | 124.9810 | 47.2142 |
+
+- GPU Memory Used (`MB`)
+
+| BatchSize | TensorRT | Anakin |
+| --- | --- | --- |
+| 1 | 341 | 293 |
+| 2 | 353 | 301 |
+| 4 | 385 | 319 |
+| 8 | 421 | 351 |
+| 32 | 637 | 551 |
+
+## How to run those Benchmark models?
+
+> 1. At first, you should parse the caffe model with [`external converter`](https://github.com/PaddlePaddle/Anakin/blob/b95f31e19993a192e7428b4fcf852b9fe9860e5f/docs/Manual/Converter_en.md).
+> 2. Switch to *source_root/benchmark/CNN* directory. Use 'mkdir ./models' to create ./models and put anakin models into this file.
+> 3. Use command 'sh run.sh', we will create files in logs to save model log with different batch size. Finally, model latency summary will be displayed on the screen.
+> 4. If you want to get more detailed information with op time, you can modify CMakeLists.txt with setting `ENABLE_OP_TIMER` to `YES`, then recompile and run. You will find detailed information in model log file.
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..5efbc89abd469871b318c306e8cb03dd95f0c85b
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/anakin_tutorial.md
@@ -0,0 +1,639 @@
+# Anakin 使用教程 ##
+
+本教程将会简略的介绍Anakin的工作原理,一些基本的Anakin API,以及如何调用这些API。
+
+## 内容 ###
+
+- [Anakin的工作原理](#principle)
+- [Anakin APIs](#api)
+- [示例代码](#example)
+
+## Anakin的工作原理 ###
+
+
+
+用Anakin来进行前向计算主要分为三个步骤:
+
+- 将外部模型通过[Anakin Parser](Converter_ch.md)解析为Anakin模型
+ 在使用Anakin之前,用户必须将所有其他模型转换成Anakin模型,我们提供了转换脚本,用户可通过[Anakin Parser](Converter_ch.md)进行模型转换。
+- 生成Anakin计算图
+ 加载Anakin模型生成原始计算图,然后需要对原始计算图进行优化。你只需要调用相应的API优化即可。
+- 执行计算图
+ Anakin会选择不同硬件平台执行计算图。
+
+
+## Anakin APIs ###
+### Tensor ####
+
+`Tensor`提供基础的数据操作和管理,为ops提供统一的数据接口。`Tensor`包含以下几个属性:
+
+- Buffer
+ 数据存储区
+- Shape
+ 数据的维度信息
+- Event
+ 用于异步计算的同步
+
+ `Tensor` 类包含三个`Shape`对象, 分别是`_shape`, `_valid_shape`和 `offset`。 `_shape`为`tensor`真正空间信息,`_valid_shape`表示当前`tensor`使用的空间信息, `_offset`表示当前`tensor`数据指针相对于真正数据空间的信息。 `Tensor`不同维度与分别与数学中的向量、矩阵等相对应如下表所示。
+
+
+Dimentions | Math entity |
+ :----: | :----:
+1 | vector
+2 | matrix
+3 | 3-tensor
+n | n-tensor
+
+#### 声明tensor对象
+
+`Tensor`接受三个模板参数:
+
+
+```c++
+ template
+ class Tensor .../* Inherit other class */{
+ //some implements
+ ...
+ };
+```
+
+TargetType是平台类型,如X86,GPU等等,在Anakin内部有相应的标识与之对应;datatype是普通的数据类型,在Anakin内部也有相应的标志与之对应;[LayOutType](#layout)是数据分布类型,如batch x channel x height x width [NxCxHxW], 在Anakin内部用一个struct来标识。 Anakin中数据类型与基本数据类型的对应如下:
+
+1. TargetType
+
+ Anakin TargetType | platform
+ :----: | :----:|
+ NV | NVIDIA GPU
+ ARM | ARM
+ AMD | AMD GPU
+ X86 | X86
+ NVHX86 | NVIDIA GPU with Pinned Memory
+
+2. DataType
+
+Anakin DataType | C++ | Description
+:---: | :---: | :---: |
+AK_HALF | short | fp16
+AK_FLOAT | float | fp32
+AK_DOUBLE | double | fp64
+AK_INT8 | char | int8
+AK_INT16 | short | int16
+AK_INT32 | int | int32
+AK_INT64 | long | int64
+AK_UINT8 | unsigned char | uint8
+AK_UINT16 | unsigned short | uint8
+AK_UINT32 | unsigned int | uint32
+AK_STRING | std::string | /
+AK_BOOL | bool | /
+AK_SHAPE | / | Anakin Shape
+AK_TENSOR | / | Anakin Tensor
+
+
+3. LayOutType
+
+Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support
+:---: | :---: | :---: | :---: |
+W | 1-D | YES | NO
+HW | 2-D | YES | NO
+WH | 2-D | YES | NO
+NW | 2-D | YES | YES
+NHW | 3-D | YES |YES
+NCHW ( default ) | 4-D | YES | YES
+NHWC | 4-D | YES | NO
+NCHW_C4 | 5-D | YES | YES
+
+
+理论上,Anakin支持申明1维以上的tensor,但是对于Anakin中的Op来说,只支持NW、NHW、NCHW、NCHW_C4这四种LayOut,其中NCHW是默认的LayOutType,NCHW_C4是专门针对于int8这种数据类型的。
+
+
+例子
+
+> 下面的代码将展示如何使用tensor, 我们建议先看看这些示例。
+
+> 要想获得更多关于tensor的信息, 请参考 *soure_path/core/tensor.h*
+
+> 1. 使用shape对象初始化tensor
+``` c++
+ //create a null tensor. A null tensor holds for nothing.
+ //tensor's buffer is resident at CPU and its datatype is AK_FLOAT.
+ //tensor's Layout is NCHW(default)
+ Tensor mytensor;
+
+ //1. using shape object to create a tensor.
+ Shape shape1(NUM); //1-D shape. NUM is the number of dimention.
+ Tensor mytensor1(shape1); //1-D tensor.
+
+ // A 4-D shape
+ Shape shape2(N, C, H, W); // batch x channel x height x width
+```
+
+>`注意:Shape的维度必须和tensor的`[LayoutType](#layout)`相同,比如Shape(N,C,H,W), 那么Tensor的 LayoutType必须是NCHW,否则会出错。如下列代码所示`
+
+
+```c++
+ // A 4-D tensor.
+ Tensor mytensor2(shape2); //right
+
+ //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
+ Tensor mytensor3(shape2); //right
+
+ Tensor mytensor4(shape2); //wrong!! shape's dimetion must be equal to tensor's Layout.
+ Tensor mytensor5(shape2); //wrong!!!!
+
+```
+
+> 2. 使用现有的数据和shape初始化tensor
+
+```c++
+
+ /**
+ * A construtor of Tensor.
+ * data_ptr is a pointer to any data type of data
+ * TargetType is type of a platform [Anakin TargetType]
+ * id : device id
+ * shape: a Anakin shape
+ */
+ Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
+
+ //using existing data feed to a tensor
+ Tensor mytensor(data_ptr, TargetType, device_id, shape); //shape must has dimention (N, C, H, W).
+
+```
+
+> 3. 使用tensor初始化tensor
+
+```c++
+ Tensor tensor(exist_tensor);
+```
+
+
+> 提示: 你可以用` typedef Tensor Tensor4d_X86 `方便定义tensor
+
+
+#### 填充tensor数据区
+
+
+填充数据区得看你申明tensor的方式, 下面展示了如何填充tensor的数据区。
+
+```c++
+首先来看看tensor的四种声明方式:
+
+1. Tensor mytensor;
+2. Tensor mytensor1(shape1);
+3. Tensor mytensor(data_ptr, TargetType, device_id, shape);
+4. Tensor tensor(exist_tensor);
+
+
+相关的声明方式的数据填充方法如下:
+
+1:声明一个空的tensor,此时没有为其分配内存,所以,我们需要手动的为其分配内存。
+
+ //parama shape
+ mytensor.re_alloc(Shape shape);
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+2: 这种声明方式会自动分配内存
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor1.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+
+3:在该种声明方式中,我们仍不需要手动为其分配内存。但在构造函数内部是否为其分配内存,得依情况而定。如果data_ptr和申明的
+tensor都在都一个目标平台上,那么该tensor就会与data_ptr共享内存空间,相反,如果他们不在同一个平台上(如data_ptr在X86上,而
+tensor在GPU上),那么此时tensor就会开辟一个新的内存空间,并将data_ptr所指向的数据拷贝到tensor的buffer中。
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+4:该种方式仍不需要手动分配内存
+
+ //Get writable pointer to mytensor.
+ //parama index (int): where you start to write.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.mutable_data(index/*=0*/);
+ //write data to mytensor
+ for(int i = 0; i < mytensor.size(); i++){
+ p[i] = 1.0f;
+ }
+ //do something ...
+
+
+另外,你还可以获取一个tensor的可读指针,示例如下:
+ //Get read-only pointer to mytensor.
+ //parama index (int): where you start to read.
+ //Dtype is your data type such int, float or double.
+ Dtype *p = mytensor.data(index/*=0*/);
+ //do something ...
+```
+
+如果想更详细的了解tensor,请查阅*soure_path/saber/core/tensor.h*
+
+#### 获取tensor的shape
+
+```c++
+//some declarations
+// ...
+Shape shape = mytensor.shape();
+
+//Get a first dimetion size of tesor, if it has.
+int d1 = shape[0];
+
+//Get a second dimention size of tensor, if it has.
+int d2 = shape[1];
+
+...
+
+//Get a n-th dimention size of tensor, if it has.
+int dn = shape[n-1];
+
+
+//Get a tensor's dimention
+int dims = mytensor.dims();
+
+//Get the size of tensor.
+//size = d1 x d2 x ... x dn.
+int size = mytensor.size();
+
+//Get the size of tensor at interval [Di, Dj)
+// form i-th dimention to j-th dimention, but not including the j-th dimention.
+// which means di x (di+1) x ... x (dj -1)
+int size = mytensor.count(start, end);
+```
+
+#### 设置tensor的shape
+
+我们可以用tensor的成员函数set_shape来设置tensor的shape。 下面是set_shape的定义
+
+
+```c++
+/**
+ * \brief set a tensor's shape
+ * \param valid_shape [a Shape object]
+ * \param shape [a Shape object]
+ * \param offset [a Shape object]
+ * \return the status of this operation, that means whether it success * or not.
+ */
+SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value));
+```
+
+这个成员函数只设置tensor的shape。这些shape对象(valid_shape, shape, offset)的[LayOutType](#layout)必须和当前的tensor的相应三个shape对象的LayOutType相同,如果不同就会出错,返回SaberInvalidValue。 如果相同,那么将成功设置tensor的shape。
+
+```c++
+
+// some declarations
+// ...
+//valid_shape, shape , offset are Shape object;
+//All these Shape object's LayOutType must be equal to mytensor's.
+mytensor.set_shape(valid_shape, shape, offset);
+
+```
+
+#### 重置 tensor的shape
+
+```c++
+//some declarations
+Shape shape, valid_shape, offset;
+
+//do some initializations
+...
+mytensor.reshape(valid_shape, shape, offset);
+```
+
+注意: Reshape操作仍然需要shape的[LayOutType](#layout) 与tensor的相同
+
+
+### Graph ###
+
+`Graph`类负责加载Anakin模型生成计算图、对图进行优化、存储模型等操作。
+
+#### 图的声明
+
+与`Tensor`一样,graph也接受三个模板参数。
+
+```c++
+
+template
+class Graph ... /* inherit other class*/{
+
+ //some implements
+ ...
+
+};
+```
+
+前面已经介绍过[TargetType](#target)和[DataType](#datatype)是Anakin内部自定义数据类型。[TargetType](#target)表示平台类型 (如NV、X86), [DataType](#datatype)是Anakin基本数据类型与C++/C中的基本数据类型相对应。 [Precision](#precision)为op所支持的精度类型, 稍后我们在介绍它。
+
+
+```c++
+
+//Create a empty graph object.
+Graph graph = Graph tmp();
+
+//Create a pointer to a empty graph.
+Graph *graph = new Graph();
+
+//Create a pointer to a empty graph.
+auto graph = new Graph();
+
+```
+
+#### 加载 Anakin 模型
+
+```c++
+//some declarations
+...
+auto graph = new Graph();
+std::string model_path = "the/path/to/where/your/models/are";
+const char *model_path1 = "the/path/to/where/your/models/are";
+
+//Loading Anakin model to generate a compute graph.
+auto status = graph->load(model_path);
+
+//Or this way.
+auto status = graph->load(model_path1);
+//Check whether load operation success.
+if(!status){
+ std::cout << "error" << endl;
+ //do something...
+}
+
+```
+
+#### 优化计算图
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//According to the ops of loaded graph, optimize compute graph.
+graph->Optimize();
+
+```
+
+> 注意: 第一次加载原始图,必须要优化。
+
+#### 保存模型
+
+你可以在任何时候保存模型, 特别的, 你可以保存一个优化的模型,这样,下次再加载模型时,就不必进行优化操作。
+
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+// save a model
+//save_model_path: the path to where your model is.
+auto status = graph->save(save_model_path);
+
+//Checking
+if(!status){
+ cout << "error" << endl;
+ //do somethin...
+}
+```
+
+#### 重新设置计算图里的tensor的shape
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+vector shape{10, 256, 256, 10};
+//input_name : std::string.
+//Reshape a tensor named input_name.
+graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object.
+```
+
+#### 设置 batch size
+
+`Graph` 支持重新设置batch size的大小。
+
+```c++
+//some declarations
+...
+//Load graph.
+...
+//input_name : std::string.
+//Reset a tensor named input_name.
+int new_batch_size = 4;
+graph->ResetBatchSize(input_name, new_batch_size);
+```
+
+### Net ###
+
+
+`Net` 是计算图的执行器。你可以通过Net对象获得输入和输出
+#### Creating a graph executor
+
+`Net`接受四个模板参数。
+
+
+```c++
+template
+class Net{
+ //some implements
+ ...
+
+};
+```
+由于有些Op可能支持多种精度,我们可以通过Precision来指定。OpRunType表示同步或异步类型,异步是默认类型。OpRunType::SYNC表示同步,在GPU上只有单个流;OpRunType::ASYNC表示异步,在GPU上有多个流并以异步方式执行。实际上,Precision和OpRunType都是enum class, 详细设计请参考*source_root/framework/core/types.h*.
+
+
+1. Precision
+
+Precision | Op support
+:---: | :---:
+Precision::INT4 | NO
+Precision::INT8 | NO
+Precision::FP16 | NO
+Precision::FP32 | YES
+Precision::FP64 | NO
+
+现在Op的精度只支持FP32, 但在将来我们会支持剩下的Precision.
+
+
+
+2. OpRunType
+
+OpRunType | Sync/Aync |Description
+:---: | :---: | :---:
+OpRunType::SYNC | Synchronization | single-stream on GPU
+OpRunType::ASYNC | Asynchronization | multi-stream on GPU
+
+用graph对象创建一个执行器。
+```c++
+//some declarations
+...
+//Create a pointer to a graph.
+auto graph = new Graph();
+//do something...
+...
+
+//create a executor
+Net executor(*graph);
+
+```
+
+#### 获取输入输出tensor
+
+
+获取输入输出tensor,并填充输入tensor的buffer。如果想要获取输入和输出tensor,那么必须指定输入的名字,如"input_0", "input_1", "input_2", ..., 必须传入如上字符串才能够获得输入tensor。另外,如果想知道input_i对应哪个输入,你需要去dash board查看,如何使用dash board请看[Anakin Parser](Converter_ch.md)。请看如下示例代码
+
+```c++
+//some declaratinos
+...
+
+//create a executor
+//TargetType is NV [NVIDIA GPU]
+Net executor(*graph);
+
+//Get the first input tensor.
+//The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU.
+//Note: Member function get_in returns an pointer to tensor.
+Tensor* tensor_in0 = executor.get_in("input_0");
+
+//If you have multiple input tensors
+//You just type this code below.
+Tensor* tensor_in1 = executor.get_in("input_1");
+...
+auto tensor_inn = executor.get_in("input_n");
+```
+
+当得到输入tensor之后,就可以填充它的数据区了。
+
+```c++
+//This tensor is resident at GPU.
+auto tensor_d_in = executor.get_in("input_0");
+
+//If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one.
+
+//using Tensor4d = Tensor;
+Tensor4d tensor_h_in; //host tensor;
+//Tensor tensor_h_in;
+
+//Allocate memory for host tensor.
+tensor_h_in.re_alloc(tensor_d_in->valid_shape());
+//Get a writable pointer to tensor.
+float *h_data = tensor_h_in.mutable_data();
+
+//Feed your tensor.
+/** example
+for(int i = 0; i < tensor_h_in.size(); i++){
+ h_data[i] = 1.0f;
+}
+*/
+//Copy host tensor's data to device tensor.
+tensor_d_in->copy_from(tensor_h_in);
+
+// And then
+```
+
+
+类似的,我们可以利用成员函数get_out来获得输出tensor。但与获得输入tensor不同的是, 我们需要指定输入tensor结点的名字,这个可以从dash board中看到,请从[Anakin Parser](Converter_ch.md)中查看dash board的使用方法。假如有个输出结点叫pred_out, 那么我们可以通过如下代码获得相应的输出tensor:
+```c++
+//Note: this tensor are resident at GPU.
+Tensor* tensor_out_d = executor.get_out("pred_out");
+
+```
+
+
+#### Executing graph
+
+
+当一切准备就绪后,我们就可以执行真正的计算了!
+```c++
+executor.prediction();
+```
+
+## 示例代码 ##
+
+下面的例子展示了如何调用Anakin。
+
+在这儿之前, 请确保你已经有了Anakin模型。如果还没有,那么请使用[Anakin Parser](Converter_ch.md)转换你的模型。
+
+### Single-thread
+
+单线程例子在 *source_root/test/framework/net/net_exec_test.cpp`*
+
+```c++
+
+std::string model_path = "your_Anakin_models/xxxxx.anakin.bin";
+// Create an empty graph object.
+auto graph = new Graph();
+// Load Anakin model.
+auto status = graph->load(model_path);
+if(!status ) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+}
+// Reshape
+graph->Reshape("input_0", {10, 384, 960, 10});
+// You must optimize graph for the first time.
+graph->Optimize();
+// Create a executer.
+Net net_executer(*graph);
+
+//Get your input tensors through some specific string such as "input_0", "input_1", and
+//so on.
+//And then, feed the input tensor.
+//If you don't know Which input do these specific string ("input_0", "input_1") correspond with, you can launch dash board to find out.
+auto d_tensor_in_p = net_executer.get_in("input_0");
+Tensor4d h_tensor_in;
+auto valid_shape_in = d_tensor_in_p->valid_shape();
+for (int i=0; icopy_from(h_tensor_in);
+
+//Do inference.
+net_executer.prediction();
+
+//Get result tensor through the name of output node.
+//And also, you need to see the dash board again to find out how many output nodes are and remember their name.
+
+//For example, you've got a output node named obj_pre_out
+//Then, you can get an output tensor.
+auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor.
+auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor.
+//......
+// do something else ...
+//...
+//save model.
+//You might not optimize the graph when you load the saved model again.
+std::string save_model_path = model_path + std::string(".saved");
+auto status = graph->save(save_model_path);
+if (!status ) {
+ LOG(FATAL) << " [ERROR] " << status.info();
+}
+
+```
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..56ca582b2b47f404ede777712830731ea7f4e9b5
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/convert_paddle_to_anakin.md
@@ -0,0 +1,73 @@
+# 模型转换指南
+
+Anakin 支持不同框架的模型预测。但由于格式的差别,Anakin 需要您预先转换模型。本文档介绍如何转换模型。
+
+## 简介
+
+Anakin 模型转换器输入支持 Caffe 和 Fluid 两种格式的预测模型,模型包含网络结构(model 或 prototxt)和权重参数(param 或 caffemodel)。
+
+模型转换的输出是一个 bin 文件,它作为 Anakin 框架的 graph 参数导入。
+
+您还可以使用模型转换器的 launch board 功能生成网络结构的 HTML 预览。
+
+
+## 系统要求
+
+- python 2.7+
+- pyyaml
+- flask
+- protobuf 3.5+
+
+
+## 用法
+
+### 1、环境
+转换器所需的依赖标注于 *系统要求* 一节。
+
+### 2、配置
+您需要对 *config.yaml* 文件进行修改以告知您的需求。工程中给出了 *config.yaml* 示例,下面作进一步说明。
+
+#### config.yaml
+```bash
+OPTIONS:
+ Framework: CAFFE # 依框架类型填写 CAFFE 或 FLUID
+ SavePath: ./output # 转换结束后模型的保存位置
+ ResultName: googlenet # 输出模型的名字
+ Config:
+ LaunchBoard: ON # 是否生成网络结构预览页面
+ Server:
+ ip: 0.0.0.0
+ port: 8888 # 从一个可用端口访问预览页面
+ OptimizedGraph: # 当您使用了 Anakin 框架的 Optimized 功能时,才应该打开此项
+ enable: OFF
+ path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
+ LOGGER:
+ LogToPath: ./log/ # 生成日志的路径
+ WithColor: ON
+
+TARGET:
+ CAFFE:
+ # 当 Framework 为 CAFFE 时需填写
+ ProtoPaths:
+ - /path/to/caffe/src/caffe/proto/caffe.proto
+ PrototxtPath: /path/to/your/googlenet.prototxt
+ ModelPath: /path/to/your/googlenet.caffemodel
+
+ FLUID:
+ # 当 Framework 为 FLUID 时需填写
+ Debug: NULL
+ ProtoPaths:
+ - /
+ PrototxtPath: /path/to/fluid/inference_model
+ ModelPath: /path/to/fluid/inference_model
+ # ...
+```
+
+### 3、转换
+在完成配置文件的修改后,您只需执行 ```python converter.py``` 就可以进行模型转换了。
+
+
+### 4、预览
+最后一步,就是在浏览器中查看令人振奋的转换结果!网址是在 *config.yaml* 中配置的,例如 http://0.0.0.0:8888 。
+
+> 注意:若您使用了默认的 IP 地址 0.0.0.0,请在预览时使用真实的服务器地址 real_ip:port 替代它。
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2783eb9f591a31443f2a692ce0eb1bcc9b1063a
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_add_anakin_op.md
@@ -0,0 +1,405 @@
+# 如何增加新的Operator
+
+## 基本概念
+
+简单介绍下几个同Operator相关的基本概念,详情请参考设计文档。
+
+```framework```: 上层的逻辑代码,负责从parser中获取参数及weights,添加op时主要修改framework/operator目录下的内容。
+
+```saber```: 底层的实现代码,Anakin通过saber封装了不同的backends,不同的实现(impl)分别特化出自己的实现,外层framework通过不同的template进入各自的impl完成调用。各个op的parameter放在saber/saber_funcs_param.h文件中,增加op主要修改saber/funcs下的内容。
+
+saber的文件结构:
+* saber/funcs下的是各个funcs的外部接口,这一层的op与具体的设备实现无关,只与各op完成的功能有关。由于跟实现(impl)无关,本层文件明均不带impl。
+* saber/funcs/impl下是各个op的impl声明,特定设备需要完成该层声明的特化版本,如saber/funcs/impl/x86实现了上一层impl声明的x86特化版本,saber/funcs/impl/cuda实现了上一层impl声明的NV特化版本。当增加新的backends时需要特化出新的实现。本层代码同实现相关,均带有```impl_```前缀。
+* saber/funcs/impl/cuda/base/cuda_c内有cuda```.cu```扩展名的文件,添加cuda的kernel需要在该文件目录下添加。
+* saber/funcs/impl/cuda/base/sass 内有不同架构的汇编代码编译的静态库。
+
+### 涉及到的基类及各个类之前的关系
+
+简单介绍相关的基类
+
+* ```anakin::Operator```: framework的operator基类,位于framework/core/operator/operator.h
+
+* ```anakin::saber::BaseFunc```: saber对外的op接口基类,提供统一的对外接口,位于saber/funcs/base.h。BaseFunc的```compute_output_shape```接口只根据input的shape和param的参数计算输出的shape,并通过```tensor```的```set_shape```接口(只设置shape,不分配空间)设置到output中。```operator()```接口为各个op的计算接口。
+
+* ```ankain::saber::ImplBase```: saber设备实现的op的接口,所有设备相关实现的基类。位于saber/funcs/impl/impl_base.h。实现版本中这里分为两类,一类以```vender_```为前缀,带有```vender_```代码意为使用第三方库来实现该op,如cudnn的conv,或mkl的conv等等,这类op的性能我们难以调优,因此单独列为一类。另一类是带有源码的saber实现,这些实现都带有```saber_```为前缀,此类实现带有源码,能够通过后续优化不断提升性能,实现起名时需要注意这一点。
+
+## 添加operator
+
+添加一个新的op需要以下几步:
+
+1. 添加saber的param
+2. 定义saber的Operator类
+3. 定义新的impl声明
+3. 完成新的impl实现
+4. 增加framework的实现或特化
+
+接下来就针对这几步,以一个简单例子为例介绍实现。
+
+例如我们要添加新的Mul op。给出计算公式如下:$$Out = alpha \dot X * Y$$
+
+### 为operator增加param
+
+涉及到的文件:```saber/saber_funcs_param.h```。如果之前已经存在需要添加的op的param,这一步可以跳过。
+这里```XXXParam```是一个```struct```。包含一个无参数的构造函数,含参数的构造函数,复制构造函数,```operator=()```及```operator==()```。
+```
+template // 能够获得target, datatype, layout
+struct MulParam{
+ MulParam()
+ : alpha(0)
+ {}
+ MulParam(float alpha_in)
+ : alpha(alpha_in)
+ {}
+ MulParam(const MulParam& right)
+ : alpha(right.alpha)
+ {}
+ MulParam &operator=(const MulParam &right) {
+ alpha = right.alpha;
+ }
+ bool operator==(const MulParam &right) {
+ return alpha == right.alpha;
+ }
+ float alpha;
+};
+```
+
+### 定义Operator类
+涉及到的文件:```saber/funcs/mul.h```。如果之前定义过该op的类,这里需要修改输入的impl定义头文件。
+下面给出一个相对完整的定义结构供参考。
+```
+//不同的设备需要包含对应的operator实现.[详见](#impl)
+#ifdef NVIDIA_GPU
+#include "saber/funcs/impl/cuda/saber_mul.h"
+#include "saber/funcs/impl/cuda/vender_mul.h"
+#endif
+//如果一个设备现在还没有对应的operator实现,需要包含声明。[详见](#declare)
+#ifdef USE_X86_PLACE
+#include "saber/funcs/impl/impl_mul.h"
+#endif
+namespace anakin {
+namespace saber {
+template
+class Mul : public BaseFunc<
+ Tensor,
+ Tensor,
+ Tensor,
+ ImplBase, MulParam> {
+public:
+ using BaseFunc<
+ Tensor,
+ Tensor,
+ Tensor,
+ ImplBase, MulParam>::BaseFunc;
+ Mul() = default;
+ typedef Tensor InDataTensor;
+ typedef Tensor OutDataTensor;
+ typedef Tensor OpTensor;
+ typedef MulParam Param_t;
+ typedef std::vector Input_v;
+ typedef std::vector Output_v;
+ typedef std::vector Shape_v;
+
+ virtual SaberStatus compute_output_shape(const Input_v &input,
+ Output_v &output, Param_t ¶m) override {
+ //计算输出的shape,
+ Shape output_shape = (input[0]->valid_shape());
+ /* code */
+ return output[0]->set_shape(output_shape);
+ }
+ virtual SaberStatus init_impl(ImplEnum implenum) override {
+ // 不同设备均使用此init_impl, 此接口创建对应impl的实现。
+ switch (implenum) {
+ case VENDER_IMPL:
+ this->_impl.push_back(new VenderMul );
+ return SaberSuccess;
+ case SABER_IMPL:
+ this->_impl.push_back(new SaberMul );
+ return SaberSuccess;
+ default:
+ return SaberUnImplError;
+ }
+ }
+private:
+ virtual void pick_best_static() override {
+ if (true) // some condition?
+ this->_best_impl = this->_impl[0];
+ }
+ virtual void pick_best_specify(ImplEnum implenum) override {
+ this->_best_impl = this->_impl[0];
+ }
+};
+} // namespace saber
+} // namespace anakin
+```
+
+### 为operator增加新的impl声明
+
+涉及的文件:```saber/funcs/impl/impl_mul.h```。不同的设备都特化同一个声明,特化版本放在对应的文件夹下,这里的声明就是给出所有设备的统一声明。下面给出一个参考。
+```
+#include "saber/funcs/impl/impl_macro.h"
+namespace anakin{
+namespace saber{
+DEFINE_OP_CLASS(Mul, MulParam); // 第一个参数是op的名字,第二个是对应param的名字
+}
+}
+```
+
+### 完成新的operator特定后端实现
+
+涉及的文件:```saber/funcs/impl/xxx/vender_mul.h```或```saber/funcs/impl/xxx/saber_mul.h```
+这里```xxx```指代特定的一种设备。```vender```是指的使用第三方库实现的op,```saber```指的源码实现的op。这里以cuda的vender实现为例,简单介绍一下特化出的函数的几个基本接口。
+
+```
+// include 对应的声明
+#include "saber/funcs/impl/impl_mul.h"
+
+namespace anakin{
+namespace saber{
+template
+class VenderMul :
+ public ImplBase<
+ Tensor,
+ Tensor,
+ Tensor,
+ MulParam > >
+{
+public:
+ typedef Tensor DataTensor_in;
+ typedef Tensor DataTensor_out;
+ typedef Tensor OpTensor;
+ typedef typename DataTensor_in::Dtype InDataType;
+ typedef typename DataTensor_out::Dtype OutDataType;
+ typedef typename OpTensor::Dtype OpDataType;
+ VenderMul(){}
+ ~VenderMul() {}
+
+ virtual SaberStatus init(const std::vector& inputs,
+ std::vector& outputs,
+ MulParam& param, Context& ctx) {
+ this->_ctx = ctx;
+ create(inputs, outputs, param, ctx);
+ }
+
+ virtual SaberStatus create(const std::vector& inputs,
+ std::vector& outputs,
+ MulParam& param, Context& ctx) {
+ // set内部参数
+ }
+
+ virtual SaberStatus dispatch(const std::vector& inputs,
+ std::vector& outputs,
+ MulParam& param) {
+ // dispatch kernel.
+ }
+
+private:
+};
+}
+}
+```
+```init```和```create```的区别:```init```接口是第一次初始化op的时候进入的接口,此函数只在第一次初始化op时调用,这个接口一般放一些只需要执行一次的代码,如malloc或者create之类的函数。```create```函数除了第一次init执行外,在输入发生变化或者param发生变化时会再次触发,create一般放置set函数,设置内部变量,当input发生变化时这里执行一些同input或weights直接相关的代码。但create因为触发位置在网络内,如果```create```函数执行了一些严重耗时的操作,这里会拖慢整个op的执行时间,需要慎重选择操作放置的位置。
+### 添加framework的特化
+
+涉及的文件:```framework/operators/mul.h```和```framework/operators/mul.cpp```。
+这里简单介绍下如果添加或修改framework内的operator
+
+```
+#include "framework/core/base.h"
+#include "framework/core/data_types.h"
+#include "framework/core/operator/operator.h"
+#include "utils/logger/logger.h"
+#include "saber/funcs/mul.h" // 需要包对应的saber头文件
+namespace anakin {
+namespace ops {
+template
+class MulHelper;
+
+template
+class Mul : public Operator {
+public:
+ Mul() {}
+ /// forward impl
+ virtual void operator() (OpContext &ctx,
+ const std::vector >& ins,
+ std::vector >& outs) {
+ LOG(ERROR) << "Not Impl Yet Operator power::type>().type_info()<<">";
+ }
+ friend class MulHelper;
+};
+template
+class MulHelper : public OperatorHelper {
+public:
+ MulHelper() = default;
+ ~MulHelper();
+ Status InitParam() override;
+
+ Status Init(OpContext &ctx,
+ const std::vector >& ins,
+ std::vector >& outs) override;
+ Status InferShape(const std::vector >& ins,
+ std::vector >& outs) override;
+
+public:
+ saber::MulParam> _param_mul;
+ saber::Mul _funcs_mul;
+};
+}
+} /* namespace anakin */
+```
+对应的```.cpp```文件如下:
+```
+#include "framework/operators/mul.h"
+
+namespace anakin {
+namespace ops {
+
+#ifdef USE_CUDA
+template<>
+void Mul::operator()(
+ OpContext& ctx,
+ const std::vector >& ins,
+ std::vector >& outs) {
+ auto* impl =
+ static_cast*>(this->_helper);
+ auto& param =
+ static_cast*>(this->_helper)->_param_mul;
+ impl->_funcs_mul(ins, outs, param, ctx);
+}
+#endif
+
+template
+Status MulHelper::InitParam() {
+ auto alpha = GET_PARAMETER(float, alpha);
+ MulParam> param_mul(alpha);
+ _param_mul = param_mul;
+ return Status::OK();
+}
+
+template
+Status MulHelper::Init(OpContext& ctx,
+ const std::vector >& ins,
+ std::vector >& outs) {
+
+ SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx));
+ return Status::OK();
+}
+
+template
+Status MulHelper::InferShape(const
+ std::vector >& ins,
+ std::vector >& outs) {
+ SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul));
+ return Status::OK();
+}
+
+#ifdef USE_CUDA
+template class MulHelper;
+#endif
+#ifdef USE_ARM_PLACE
+template class MulHelper;
+#endif
+// register helper
+#ifdef USE_CUDA
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32);
+#endif
+#ifdef USE_ARM_PLACE
+ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);
+#endif
+//! register op
+ANAKIN_REGISTER_OP(Mul)
+.Doc("Mul operator")
+#ifdef USE_CUDA
+.__alias__("mul")
+#endif
+#ifdef USE_ARM_PLACE
+.__alias__("mul")
+#endif
+.num_in(1)
+.num_out(1)
+.Args("alpha", " alpha of Mul "); //注册
+
+} /* namespace ops */
+
+} /* namespace anakin */
+```
+
+## 实现单元测试
+涉及的文件:```test/saber/xxx/test_saber_funcs_mul_xxx.cpp```
+在对应的test下需要添加新的单元测试
+
+```
+TEST(TestSaberFuncNV, test_depthwise_conv) {
+
+ // init tensors and some param.
+
+ // start Reshape & doInfer
+ Context ctx1(0, 1, 1);
+
+ // create param
+ MulParam > param(alpha);
+
+ std::vector*> input;
+ std::vector*> output;
+
+ // create saber op
+ Mul mul;
+
+ // compute output shape
+ mul.compute_output_shape(input, output, param);
+
+ // re_alloc output tensors memory based on output shape
+ output[0]->re_alloc(output[0]->shape());
+
+ // init saber op(calling init and create)
+ mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
+
+ // call operator()
+ mul(input, output, param, ctx1);
+
+ // cuda specified, record events
+ cudaStream_t cuda_stream = ctx1.get_compute_stream();
+ output[0]->record_event(cuda_stream);
+ output_dev.sync();
+
+ // param changed
+ param.alpha = 2.0;
+ // auto calling saber op(create and dispatch)
+ mul(input, output, param, ctx1);
+
+ cudaDeviceSynchronize();
+ CUDA_CHECK(cudaPeekAtLastError());
+}
+
+int main(int argc, const char** argv){
+ anakin::saber::Env::env_init();
+
+ // initial logger
+ //logger::init(argv[0]);
+ InitTest();
+ RUN_ALL_TESTS(argv[0]);
+ return 0;
+}
+
+```
+## 调试及注意事项
+
+一个op需要有对外的op接口和内部实现,由于存在saber/funcs/impl的非特化版本声明,当有op在某种设备下没有对应实现时,也能够编译,但此时是没有任何实现的空实现,
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1f75f5e95cfb90f26d3782ba30a6d1887a70424
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/deploy/how_to_support_new_device_in_anakin.md
@@ -0,0 +1,459 @@
+# 如何支持一个新的设备
+
+## 概览
+
+添加一个新的设备需要以下3个步骤:
+
+* [在`CMakeList`中添加设备的支持](#0001)
+* [在`saber`中添加设备的实现](#0002)
+* [在`framework`中添加设备的具体化或实例化](#0003)
+
+假设新设备的名称为`TNEW`, 以下将以这个设备名称进行演示。
+
+## 在`CMakeList`中添加设备的支持 ##
+
+* 修改根目录`CMakeList.txt`
+```cmake
+#select the plantform to build
+anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO)
+anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO)
+anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
+anakin_option(USE_TNEW_PLACE "Select the build mode for ARM place." YES)
+```
+
+* 修改`saber/CMakeList.txt`
+
+根据新增设备的目录完善`saber`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+ anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
+endif()
+```
+
+* 修改`test/CMakeList.txt`
+
+新增设备的单测文件放在`test/saber/tnew`目录下,修改`test`目录下的`CMakeList.txt`。
+```cmake
+if(USE_TNEW_PLACE)
+ anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC)
+endif()
+```
+
+* 修改`cmake/anakin_config.h.in`
+```c++
+// plantform to use
+#cmakedefine USE_GPU_PLACE
+
+#cmakedefine USE_X86_PLACE
+
+#cmakedefine USE_ARM_PLACE
+
+#cmakedefine USE_TNEW_PLACE
+```
+
+* 其他依赖和编译选项
+修改`cmake`目录下的`compiler_options.cmake`和`find_modules.cmake`
+
+
+## 在`saber`中添加设备的实现 ##
+`saber`是`Anakin`的基础计算库,对外提供设备无关的统一的API,设备相关的实现都会封装到`TargetWrapper`中。
+
+### 在`saber/saber_types.h`中添加设备
+
+```c++
+enum TargetTypeEnum {
+ eINVALID = -1,
+ eNV = 1,
+ eAMD = 2,
+ eARM = 3,
+ eX86 = 4,
+ eNVHX86 = 5,
+ eTNEW = 6
+};
+
+typedef TargetType NV;
+typedef TargetType ARM;
+typedef TargetType AMD;
+typedef TargetType X86;
+typedef TargetType TNEW;
+
+```
+
+### 在`saber/core`中添加设备的实现
+
+1. 在`target_traits.h`中添加新设备
+
+* 增加设备类型
+```c++
+struct __cuda_device{};
+struct __arm_device{};
+struct __amd_device{};
+struct __x86_device{};
+struct __tnew_device{};
+```
+
+* `TargetTypeTraits`模板具体化
+```c++
+template <>
+struct TargetTypeTraits {
+ typedef __xxx_target target_category;//根据实际设备是host端还是device端进行选择
+ typedef __tnew_device target_type;
+};
+```
+
+2. 在`data_traits.h`中特化`DataTrait`模板类
+
+如果设备需要特殊的数据类型,则特化出设备的`DataTrait`类的实现,例如opencl数据类型的实现如下:
+```c++
+#ifdef USE_OPENCL
+struct ClMem{
+ ClMem(){
+ dmem = nullptr;
+ offset = 0;
+ }
+
+ ClMem(cl_mem* mem_in, int offset_in = 0) {
+ dmem = mem_in;
+ offset = offset_in;
+ }
+
+ ClMem(ClMem& right) {
+ dmem = right.dmem;
+ offset = right.offset;
+ }
+
+ ClMem& operator=(ClMem& right) {
+ this->dmem = right.dmem;
+ this->offset = right.offset;
+ return *this;
+ }
+
+ ClMem& operator+(int offset_in) {
+ this->offset += offset_in;
+ return *this;
+ }
+
+ int offset{0};
+ cl_mem* dmem;
+};
+
+template <>
+struct DataTrait {
+ typedef ClMem Dtype;
+ typedef float dtype;
+};
+
+template <>
+struct DataTrait {
+ typedef ClMem Dtype;
+ typedef double dtype;
+};
+
+template <>
+struct DataTrait {
+ typedef ClMem Dtype;
+ typedef char dtype;
+};
+#endif //use_opencl
+```
+
+3. 在`target_wrapper.h`中特化`TargetWrapper`模板类
+
+特化`TargetWrapper`模板类,在`target_wrapper.h`中声明函数,具体如下:
+```c++
+template <>
+struct TargetWrapper { //根据TNEW的具体类型修改__xxx_target,__host_target或者__device_target
+
+ typedef xxx_event event_t; //根据设备实现xxx_event
+ typedef xxx_stream stream_t; //根据设备实现xxx_stream
+
+ static void get_device_count(int& count);
+
+ static void set_device(int id);
+
+ //We should add strategy to avoid malloc directly
+ static void mem_alloc(void** ptr, size_t n);
+
+ static void mem_free(void* ptr);
+
+ static void mem_set(void* ptr, int value, size_t n);
+
+ static void create_event(event_t& event, bool flag = false);
+
+ static void create_stream(stream_t& stream);
+
+ static void create_stream_with_flag(stream_t& stream, unsigned int flag);
+
+ static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority);
+
+ static void destroy_stream(stream_t& stream);
+
+ static void destroy_event(event_t& event);
+
+ static void record_event(event_t& event, stream_t stream);
+
+ static void query_event(event_t& event);
+
+ static void sync_event(event_t& event);
+
+ static void sync_stream(event_t& event, stream_t& stream);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __DtoD);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __DtoD);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __HtoD);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __HtoD);
+
+ static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, __DtoH);
+
+ static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
+ size_t count, stream_t& stream, __DtoH);
+
+ static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+ int src_dev, size_t count);
+
+ static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \
+ int src_dev, size_t count, stream_t& stream);
+
+ static int get_device_id();
+};
+
+```
+
+4. 在`impl/`目录下添加设备目录和实现
+
+在`saber/core/impl`目录下添加设备目录`tnew`。
+* 实现`TargetWrapper`结构体中各函数的定义。
+如果`TargetWrapper`的实现与默认的模板类一致,则不用特化出该类。
+
+```c++
+typedef TargetWrapper TNEW_API;
+void TNEW_API::get_device_count(int &count) {
+ // add implementation
+}
+
+void TNEW_API::set_device(int id){
+ // add implementation
+}
+
+void TNEW_API::mem_alloc(void** ptr, size_t n){
+ // add implementation
+}
+
+void TNEW_API::mem_free(void* ptr){
+ if(ptr != nullptr){
+ // add implementation
+ }
+}
+...
+
+```
+
+* 特化实现`device.h`中的`Device`
+
+```c++
+template <>
+void Device::create_stream() {
+ // add implementation
+}
+
+template <>
+void Device::get_info() {
+
+ // add implementation
+}
+
+```
+
+### 在`saber/funcs`中实现设备相关的op
+
+参考[如何增加新的Operator](addCustomOp.md)
+
+
+## 在`framework`中添加设备的具体化或实例化 ##
+
+### `framework/core`
+
+* `net.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Net;
+template class Net;
+#endif
+```
+
+* `operator_func.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class OperatorFunc;
+#endif
+```
+
+* `worker.cpp`中添加实例化
+
+```c++
+#ifdef USE_TNEW_PLACE
+template class Worker;
+template class Worker;
+#endif
+```
+
+* `operator_attr.cpp`中添加实例化
+
+```c++
+template
+OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name);
+template
+OpAttrWarpper& OpAttrWarpper::__alias__(const std::string& op_name);
+```
+
+* `parameter.h`中添加设备的实现
+
+```c++
+#ifdef USE_TNEW_PLACE
+template
+class PBlock {
+public:
+ typedef Tensor4d::type> type;
+
+ PBlock() {
+ _inner_tensor = std::make_shared();
+ }
+ ...
+}
+#endif //TNEW
+```
+
+* `type_traits_extend.h`中添加设备的实现
+
+```c++
+template<>
+struct target_host {
+ typedef saber::X86 type; //根据TNEW选择正确的host type
+};
+```
+
+### `framework/graph`
+
+* `graph.cpp`中添加实例化
+
+```c++
+ #ifdef USE_TNEW_PLACE
+ template class Graph;
+ template class Graph;
+ template class Graph;
+ #endif
+```
+
+### `framework/model_parser`
+
+* `parser.cpp`中添加实例化
+
+```c++
+ #ifdef USE_TNEW_PLACE
+ template
+ Status load(graph::Graph* graph,
+ const char* model_path);
+ template
+ Status load(graph::Graph* graph,
+ const char* model_path);
+ template
+ Status load(graph::Graph* graph,
+ const char* model_path);
+
+ template
+ Status save(graph::Graph* graph,
+ std::string& model_path);
+ template
+ Status save(graph::Graph* graph,
+ std::string& model_path);
+ template
+ Status save(graph::Graph* graph,
+ std::string& model_path);
+
+ template
+ Status load(graph::Graph* graph,
+ std::string& model_path);
+ template
+ Status load(graph::Graph* graph,
+ std::string& model_path);
+ template
+ Status load(graph::Graph* graph,
+ std::string& model_path);
+
+ template
+ Status save(graph::Graph* graph,
+ const char* model_path);
+ template
+ Status save(graph::Graph