diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
deleted file mode 100644
index a777a4974cc377db103a470698f817612a4e9a32..0000000000000000000000000000000000000000
--- a/doc/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-add_custom_target(paddle_apis ALL
-    DEPENDS paddle_v2_apis)
-
-add_custom_target(paddle_docs ALL
-    DEPENDS paddle_v2_docs paddle_v2_docs_cn
-            paddle_mobile_docs paddle_mobile_docs_cn)
-
-add_subdirectory(v2)
-add_subdirectory(mobile)
diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst
deleted file mode 100644
index f67d8b8130030db8d7e7d10b30271a913bd6272a..0000000000000000000000000000000000000000
--- a/doc/about/about_us.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-=========
-About Us
-=========
-
-What is PaddlePaddle
---------------------
-
-- PaddlePaddle is a deep learning framework developed and open-sourced by Baidu. It lets developers and enterprises turn their AI ideas into reality safely and quickly.
-
-- The project team brings together top deep learning scientists from around the world and is committed to providing developers and enterprises with the best possible deep learning R&D experience.
-
-- The framework is easy to learn, easy to use, safe, and efficient, making it the deep learning tool best suited to Chinese developers and enterprises.
-
-PaddlePaddle's technical features
----------------------------------
-
-- A new generation of deep learning framework: PaddlePaddle is built on the idea of a "deep learning programming language". While preserving performance, it greatly improves the framework's expressiveness and can describe any model that may potentially arise.
-
-- Friendly to large-scale computation: hardened by a wide range of large-scale computing workloads inside Baidu, PaddlePaddle performs very well in distributed computation. Its EDL technology saves a large amount of computing resources, and it also supports training large-scale sparse models.
-
-- Visualized deep learning: Visual DL helps developers conveniently observe the overall training trend, the quality of data samples and intermediate results, the distribution and evolution of parameters, and the structure of the model, which makes the whole programming process easier.
-
-An education system built around PaddlePaddle
----------------------------------------------
-
-- Deep learning courses: Baidu has worked with leading education and training institutions in the Chinese market to develop high-quality deep learning courses and textbooks that help developers master deep learning from scratch.
-
-- Hands-on deep learning practice: for users focused on research and learning, PaddlePaddle provides an online development environment that requires no installation, together with algorithm, computing, and data support.
-
-- Offline training: rich, high-quality offline activities, such as training programs for young faculty, hands-on camps, salons, and other forms of training and exchange.
-
-
-AI services built on PaddlePaddle
----------------------------------
-
-- EasyDL: helps enterprises with no algorithm background quickly complete a deep learning task; a small amount of data is enough to obtain a high-quality model.
-
-- AI marketplace: provides standardized mechanisms for trading AI capabilities and products, helping enterprises quickly find what they need and run their AI business effectively.
-
-- Deep learning competitions: PaddlePaddle gathers top deep learning developers; enterprises can publish their business problems and quickly find the best solution through a competition.
-
-You can reach us with any question about PaddlePaddle through the following channels
-------------------------------------------------------------------------------------
-
-- Learning/usage questions: you can give us feedback in the `PaddlePaddle open-source community `_ and the `PaddlePaddle Chinese community `_
-
-- Suggestions about the development of the PaddlePaddle framework: send an email to Paddle-better@baidu.com
-
-We look forward to building a world-class deep learning framework with you and to advancing AI technology together
-
-
-
-The PaddlePaddle Team
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
deleted file mode 100644
index 7b34ba8d0768427802b11614c6962f3c3f6ef4e3..0000000000000000000000000000000000000000
--- a/doc/mobile/CMakeLists.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-if(NOT DEFINED SPHINX_THEME)
-    set(SPHINX_THEME default)
-endif()
-
-if(NOT DEFINED SPHINX_THEME_DIR)
-    set(SPHINX_THEME_DIR)
-endif()
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
-
-set(IMPORT_PADDLE_STRING "")
-set(IMPORT_PADDLEV2_STRING "")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
-    "${BINARY_BUILD_DIR_EN}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_mobile_docs
-    html
-    ${BINARY_BUILD_DIR_EN}
-    ${SPHINX_CACHE_DIR_EN}
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${SPHINX_HTML_DIR_EN})
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
-    "${BINARY_BUILD_DIR_CN}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_mobile_docs_cn
-    html
-    ${BINARY_BUILD_DIR_CN}
-    ${SPHINX_CACHE_DIR_CN}
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${SPHINX_HTML_DIR_CN})
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
deleted file mode 100644
index 0607748b751e9f2d606236d9e98868335379b05c..0000000000000000000000000000000000000000
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ /dev/null
@@ -1,187 +0,0 @@
-# Build PaddlePaddle for Android
-
-You can cross-compile the PaddlePaddle library for Android in either of the following two ways:
-
-- [Docker-based cross-compiling](#docker-based-cross-compiling)
-- [Cross-compiling in a Linux environment](#cross-compiling-in-a-linux-environment)
-
-## Docker-Based Cross-Compiling
-Docker runs on all major operating systems (including Linux, Mac OS X, and Windows), so with the Docker-based approach you can build the Android version of the PaddlePaddle library on whatever development platform you are familiar with.
-
-### Build PaddlePaddle's Android Development Image
-We pack PaddlePaddle's cross-compiling environment into a Docker image, called the development image, which contains all the build tools needed to cross-compile the Android version of the PaddlePaddle library.
-
-```bash
-$ git clone https://github.com/PaddlePaddle/Paddle.git
-$ cd Paddle
-$ docker build -t username/paddle-android:dev . -f Dockerfile.android
-```
-
-You can also use the official development image provided by PaddlePaddle:
-
-```bash
-$ docker pull paddlepaddle/paddle:latest-dev-android
-```
-
-For users in China, we provide a mirror for faster access:
-
-```bash
-$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
-```
-
-### Build the PaddlePaddle C-API Library
-Once the development image is built, you can use it to build the Android version of the PaddlePaddle C-API library.
-The Android Docker development image exposes two configurable arguments:
-
-| Argument | Optional Values | Default |
-|----------|-----------------|---------|
-| ANDROID_ABI | armeabi-v7a, arm64-v8a | armeabi-v7a |
-| ANDROID_API | >= 16 | 21 |
-
-- Build the PaddlePaddle library for `armeabi-v7a` and `Android API 21`:
-
-```bash
-$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
-```
-
-- Build the PaddlePaddle library for `arm64-v8a` and `Android API 21`:
-
-```bash
-$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
-```
-
-When the `docker run` command above is executed, the container runs the [paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) script. The script records the CMake configuration commonly used to cross-compile the Android version of the PaddlePaddle library and, according to `ANDROID_ABI` and `ANDROID_API`, automatically builds a standalone toolchain, compiles, and installs. Because the arm64 architecture requires an Android API level of at least 21, when `ANDROID_ABI=arm64-v8a` and `ANDROID_API<21`, the Docker container falls back to the `Android API 21` toolchain. You can consult the [Cross-Compiling Arguments](#cross-compiling-arguments) section below and customize the script the Docker container executes to your own needs. After compilation and installation finish, the PaddlePaddle C-API library is installed to `$PWD/install_android`, and the third-party libraries it depends on are installed to `$PWD/install_android/third_party`.
-
-## Cross-Compiling in a Linux Environment
-This section takes the Linux x86-64 platform as an example and introduces the method and steps to cross-compile the PaddlePaddle library for Android.
-
-### Prepare the Cross-Compiling Environment
-
-To cross-compile PaddlePaddle from source, you need to prepare the cross-compiling environment in advance. The C/C++ cross-compiling toolchain for the Android platform is the [Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn); you can download a pre-built release yourself, or fetch it with the following commands:
-
-```bash
-wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
-unzip -q android-ndk-r14b-linux-x86_64.zip
-```
-
-The Android NDK contains the build tools and system libraries for every Android API level and every architecture (arm/arm64/x86/mips). Based on your target architecture and the minimum Android API level you need to support, you can build a [standalone toolchain](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn).
-
-- Build a standalone toolchain for `armeabi-v7a` and `Android API 21`:
-
-```bash
-your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
-```
-
-This command generates a standalone toolchain in the `your/path/to/arm_standalone_toolchain` directory, targeting the 32-bit ARM architecture, supporting a minimum Android API level of 21, and providing the compilers `arm-linux-androideabi-gcc (GCC) 4.9` and `clang 3.8`.
-
-- Build a standalone toolchain for `arm64-v8a` and `Android API 21`:
-
-```bash
-your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
-```
-
-This command generates a standalone toolchain in the `your/path/to/arm64_standalone_toolchain` directory, targeting the 64-bit ARM64 architecture, supporting a minimum Android API level of 21, and providing the compilers `aarch64-linux-android-gcc (GCC) 4.9` and `clang 3.8`.
-
-### Cross-Compiling Arguments
-
-CMake supports cross-compiling via [cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). To simplify the cmake configuration, PaddlePaddle provides the toolchain configuration file [cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which supplies default compiler and flag settings. Note that starting from version 3.7, CMake itself provides general support for cross-compiling to Android. When PaddlePaddle detects that the CMake version in use is not lower than 3.7, it passes the user's configuration through to CMake and lets CMake handle it. See [cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling) for a detailed description of those arguments.
-
-When cross-compiling the Android version of the PaddlePaddle library, some arguments must be configured:
-- `CMAKE_SYSTEM_NAME`, the target platform of the CMake build; it must be set to `Android`. Only after `CMAKE_SYSTEM_NAME=Android` is set does PaddlePaddle's CMake system treat the build as a cross-compilation for Android and automatically build all third-party libraries PaddlePaddle needs. It also forces the values of some PaddlePaddle options (`WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF`, `WITH_GOLANG=OFF`).
-- `WITH_C_API`, must be set to `ON`, since the C-API is the only supported inference interface on Android.
-- `WITH_SWIG_PY`, must be set to `OFF`, since training or inference through SWIG calls is not supported on Android.
-
-Optional configuration arguments for the Android platform:
-
-- `ANDROID_STANDALONE_TOOLCHAIN`, the absolute path of the standalone toolchain, or its path relative to the build directory. PaddlePaddle's CMake system derives the cross-compiler, sysroot, and Android API level to use from this value; otherwise you have to set them manually when running cmake. No default value.
-- `ANDROID_TOOLCHAIN`, the target toolchain. Can be set to `gcc` or `clang`; the default is `clang`.
-  - With CMake 3.7 and above, the `clang` toolchain is always used; below CMake 3.7, you can set `ANDROID_TOOLCHAIN=gcc` to use the `gcc` toolchain.
-  - The `clang` compiler officially shipped with Android requires the system to provide `GLIBC 2.15` or newer.
-- `ANDROID_ABI`, the target architecture ABI. Currently `armeabi-v7a` and `arm64-v8a` are supported; the default is `armeabi-v7a`.
-- `ANDROID_NATIVE_API_LEVEL`, the Android API level of the toolchain. If not set explicitly, PaddlePaddle derives it automatically from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
-- `ANDROID_ARM_MODE`, whether to use ARM mode.
-  - When `ANDROID_ABI=armeabi-v7a`, it can be set to `ON/OFF`; the default is `ON`;
-  - When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
-- `ANDROID_ARM_NEON`, whether to use NEON instructions.
-  - When `ANDROID_ABI=armeabi-v7a`, it can be set to `ON/OFF`; the default is `ON`;
-  - When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
-
-Other configuration arguments:
-
-- `USE_EIGEN_FOR_BLAS`, whether to use the Eigen library for matrix computation. Can be set to `ON/OFF`; the default is `OFF`.
-- `HOST_C/CXX_COMPILER`, the C/C++ compiler of the host machine, needed to build the host-side protoc executable and the target-side OpenBLAS library. It defaults to the value of the environment variables `CC/CXX`; if `CC/CXX` are not set, it defaults to the `cc/c++` compilers.
-
-Commonly used cmake configurations are as follows:
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
-      -DANDROID_ABI=armeabi-v7a \
-      -DANDROID_ARM_NEON=ON \
-      -DANDROID_ARM_MODE=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
-      -DANDROID_ABI=arm64-v8a \
-      -DUSE_EIGEN_FOR_BLAS=OFF \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-You can also set other build arguments according to your own needs.
-
-- Set `CMAKE_BUILD_TYPE` to `MinSizeRel` to minimize the size of the generated library.
-- Set `CMAKE_BUILD_TYPE` to `Release` to get the fastest execution speed.
-- You can also influence PaddlePaddle's build process by setting `CMAKE_C/CXX_FLAGS` manually.
-
-**Performance TIPS.** To achieve the fastest computation speed, the following CMake settings are recommended:
-
-- Set `CMAKE_BUILD_TYPE` to `Release`
-- Use the `clang` toolchain
-- For `armeabi-v7a`, set `USE_EIGEN_FOR_BLAS=ON` to use Eigen for matrix computation; for `arm64-v8a`, set `USE_EIGEN_FOR_BLAS=OFF` to use OpenBLAS for matrix computation
-
-### Build and Install
-
-After the CMake configuration is done, run the following commands; PaddlePaddle will automatically download and build all third-party dependencies, then build and install the PaddlePaddle inference library.
-
-```bash
-make
-make install
-```
-
-Note: if you have previously built PaddlePaddle for another platform in the same source directory, please remove the `third_party` and `build` directories with `rm -rf` first, to make sure all third-party dependencies and PaddlePaddle code are rebuilt against the new CMake configuration.
-
-After the install command finishes, the `your/path/to/install` directory contains the `include`, `lib`, and `third_party` directories. `include` holds the C-API header files, `lib` holds the PaddlePaddle libraries for the different Android ABIs, and `third_party` holds all the third-party libraries depended upon. At this point, PaddlePaddle is installed; you can use the generated files under `your/path/to/install` in deep-learning-related Android apps. See the C-API documentation for how to call the library.
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
deleted file mode 100644
index 572063e8012efee2d2e142eb57e459e0e8c6382c..0000000000000000000000000000000000000000
--- a/doc/mobile/cross_compiling_for_android_en.md
+++ /dev/null
@@ -1,189 +0,0 @@
-# Build PaddlePaddle for Android
-
-There are two approaches to build PaddlePaddle for Android:
-
-- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
-- [Cross-Compiling on Linux](#cross-compiling-on-linux)
-
-## Cross-Compiling Using Docker
-
-Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows.
-
-### Build the Docker Image
-
-The following steps pack all the tools that we need to build PaddlePaddle into a Docker image.
-
-```bash
-$ git clone https://github.com/PaddlePaddle/Paddle.git
-$ cd Paddle
-$ docker build -t paddle:dev-android . -f Dockerfile.android
-```
-
-Users can also directly use the published Docker image.
-
-```bash
-$ docker pull paddlepaddle/paddle:latest-dev-android
-```
-
-For users in China, we provide a faster mirror.
-
-```bash
-$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
-```
-
-### Build the Inference Library
-
-We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
-
-```bash
-$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
-```
-
-The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
-
-| Argument | Optional Values | Default |
-|----------|-----------------|---------|
-| ANDROID_ABI | armeabi-v7a, arm64-v8a | armeabi-v7a |
-| ANDROID_API | >= 16 | 21 |
-
-The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of the Android API.
-
-The build command [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the arguments `ANDROID_ABI` and `ANDROID_API`. For information about other configuration arguments, please continue reading.
-
-The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
-
-## Cross-Compiling on Linux
-
-The Linux-based approach to cross-compiling is to run the steps in `Dockerfile.android` manually on a Linux x64 computer.
-
-### Setup the Environment
-
-To build for Android, we need the [Android NDK](https://developer.android.com/ndk/downloads/index.html):
-
-```bash
-wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
-unzip -q android-ndk-r14b-linux-x86_64.zip
-```
-
-The Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which is then used to build PaddlePaddle for Android. (We plan to remove the intermediate stage of building the standalone toolchain in the near future.)
-
-- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
-
-```bash
-your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
-```
-
-  The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
-
-- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
-
-```bash
-your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
-```
-
-  The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
-
-### Cross-Compiling Arguments
-
-CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling natively. PaddlePaddle detects the CMake version and, for versions no older than 3.7, uses [the official support](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
-
-Some other CMake arguments you need to know:
-
-- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
-- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
-- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based APIs.
-
-Some Android-specific arguments:
-
-- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory. PaddlePaddle's CMake extensions derive the cross-compiler, sysroot and Android API level from this argument.
-- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`.
-  - For CMake >= 3.7, it should always be `clang`. For older versions, it could be `gcc`.
-  - Android's official `clang` requires `glibc` >= 2.15.
-- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`.
-- `ANDROID_NATIVE_API_LEVEL`: if not set explicitly, it is derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
-- `ANDROID_ARM_MODE`:
-  - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
-  - no need to specify when `ANDROID_ABI=arm64-v8a`.
-- `ANDROID_ARM_NEON`: indicates whether to use NEON instructions.
-  - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
-  - no need to specify when `ANDROID_ABI=arm64-v8a`.
-
-Other useful arguments:
-
-- `USE_EIGEN_FOR_BLAS`: indicates whether to use Eigen for matrix computation. Could be `ON` or `OFF`; defaults to `OFF`.
-- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and the target-specific OpenBLAS. It defaults to the value of the environment variables `CC/CXX`, or `cc/c++`.
-
-Some frequent configurations for your reference:
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
-      -DANDROID_ABI=armeabi-v7a \
-      -DANDROID_ARM_NEON=ON \
-      -DANDROID_ARM_MODE=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
-      -DANDROID_ABI=arm64-v8a \
-      -DUSE_EIGEN_FOR_BLAS=OFF \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-
-There are some other arguments you might want to configure.
-
-- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of the library.
-- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
-
-Our own tips for performance optimization: use clang, and Eigen or OpenBLAS:
-
-- `CMAKE_BUILD_TYPE=Release`
-- `ANDROID_TOOLCHAIN=clang`
-- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
-
-### Build and Install
-
-After running `cmake`, we can run `make; make install` to build and install.
-
-Before building, you might want to remove the `third_party` and `build` directories, which may contain pre-built libraries for other architectures.
-
-After building, in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories:
-
-- `include`: the header files of the inference library,
-- `lib`: the inference library built for various Android ABIs,
-- `third_party`: dependent third-party libraries built for Android.
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
deleted file mode 100644
index d5196d9a4c93c7692d2a624ec7d0650e32806338..0000000000000000000000000000000000000000
--- a/doc/mobile/cross_compiling_for_ios_cn.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Build PaddlePaddle for iOS
-Cross-compiling the PaddlePaddle library for iOS has to be done on MacOS. This document describes how to cross-compile the iOS version of the PaddlePaddle library from source on MacOS.
-
-## Prepare the Cross-Compiling Environment
-Apple provides a complete cross-compiling toolchain and IDE for iOS development; just download and install Xcode from the App Store, or download it from the official site: [Xcode](https://developer.apple.com/cn/xcode/). After the installation, run `xcodebuild -version` on the command line to check that it succeeded.
-
-```bash
-$ xcodebuild -version
-Xcode 9.0
-Build version 9A235
-```
-
-## Cross-Compiling Arguments
-
-PaddlePaddle provides the toolchain configuration file [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake) for cross-compiling, which supplies default compiler and flag settings.
-
-When cross-compiling the iOS version of the PaddlePaddle library, some arguments must be configured:
-
-- `CMAKE_SYSTEM_NAME`, the target platform of the CMake build; it must be set to `iOS`. After `CMAKE_SYSTEM_NAME=iOS` is set, PaddlePaddle's CMake system automatically builds all third-party dependencies and forces the values of some PaddlePaddle options (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`).
-- `WITH_C_API`, whether to build the C-API inference library; it must be set to `ON`, since the C-API is the only supported inference interface on iOS.
-- `WITH_SWIG_PY`, must be set to `OFF`, since training or inference through SWIG calls is not supported on iOS.
-
-Optional configuration arguments for the iOS platform:
-
-- `IOS_PLATFORM`, can be set to `OS` (the default) or `SIMULATOR`.
-  - `OS`, targets physical `arm`-architecture devices such as iPhone and iPad.
-  - `SIMULATOR`, targets the `x86`-architecture simulator platform.
-- `IOS_ARCH`, the target architecture. The architectures that can be set for each `IOS_PLATFORM` are listed in the table below; by default, all architectures are compiled:
-
-| IOS_PLATFORM | IOS_ARCH |
-|--------------|----------|
-| OS | armv7, armv7s, arm64 |
-| SIMULATOR | i386, x86_64 |
-
-- `IOS_DEPLOYMENT_TARGET`, the minimum iOS deployment version; the default is `7.0`.
-- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3); can be set to `ON/OFF`, the default is `ON`.
-- `IOS_USE_VECLIB_FOR_BLAS`, whether to use the [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS matrix computation; can be set to `ON/OFF`, the default is `OFF`.
-- `IOS_DEVELOPMENT_ROOT`, the `Developer` directory; can be set explicitly to `/path/to/platform/Developer`. If not set explicitly, PaddlePaddle selects the `Developer` directory of the Xcode `platform` corresponding to `IOS_PLATFORM`.
-- `IOS_SDK_ROOT`, the root directory of the `SDK` to use; can be set explicitly to `/path/to/platform/Developer/SDKs/SDK`. If not set explicitly, PaddlePaddle selects the latest `SDK` version under the `IOS_DEVELOPMENT_ROOT` directory.
-
-Other configuration arguments:
-
-- `USE_EIGEN_FOR_BLAS`, whether to use the Eigen library for matrix computation; only effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Can be set to `ON/OFF`, the default is `OFF`.
-- `HOST_C/CXX_COMPILER`, the C/C++ compiler of the host machine. Defaults to the value of the environment variables `CC/CXX`; if `CC/CXX` are not set, the `cc/c++` compilers are used.
-
-Commonly used cmake configurations are as follows:
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=iOS \
-      -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="armv7;arm64" \
-      -DIOS_ENABLE_BITCODE=ON \
-      -DIOS_USE_VECLIB_FOR_BLAS=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_TESTING=OFF \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=iOS \
-      -DIOS_PLATFORM=SIMULATOR \
-      -DIOS_ARCH="x86_64" \
-      -DIOS_USE_VECLIB_FOR_BLAS=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_TESTING=OFF \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-You can also set other build arguments according to your own needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`. You can also influence PaddlePaddle's build process by setting `CMAKE_C/CXX_FLAGS` manually.
-
-**Performance TIPS.** To achieve the fastest computation speed, the following CMake settings are recommended:
-
-- Set `CMAKE_BUILD_TYPE` to `Release`
-- Set `IOS_USE_VECLIB_FOR_BLAS=ON` to use the BLAS functions provided by the `vecLib` framework for matrix computation
-
-## Build and Install
-
-After the CMake configuration is done, run the following commands; PaddlePaddle will automatically download and build all third-party dependencies, then build and install the PaddlePaddle inference library.
-
-```
-$ make
-$ make install
-```
-
-Note: if you have previously built PaddlePaddle for another platform in the same source directory, please remove the `third_party` and `build` directories with `rm -rf` first, to make sure all third-party dependencies and PaddlePaddle code are rebuilt against the new CMake configuration.
-
-After the install command finishes, the `your/path/to/install` directory contains the following:
-
-- the `include` directory, with all the C-API header files
-- the `lib` directory, with the PaddlePaddle C-API static library
-- the `third_party` directory, with all the third-party libraries depended upon
-
-Note that if the PaddlePaddle library needs to support both physical devices and the simulator, you have to build the device and simulator versions separately and then merge them into a fat library with the `lipo` tool.
-
-At this point, the PaddlePaddle library is installed; you can use the merged fat library in deep-learning-related iOS apps. See the C-API documentation for how to call the library.
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
deleted file mode 100644
index 19bfe86c511c7e43b462f94c8cabba420b3007f1..0000000000000000000000000000000000000000
--- a/doc/mobile/cross_compiling_for_ios_en.md
+++ /dev/null
@@ -1,120 +0,0 @@
-# Build PaddlePaddle for iOS
-
-This tutorial will walk you through cross-compiling the PaddlePaddle library for iOS from source on MacOS.
-
-## Preparation
-
-Apple provides Xcode as the cross-compiling toolchain and IDE for iOS development. Download it from the App Store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run the following command:
-
-```bash
-$ xcodebuild -version
-Xcode 9.0
-Build version 9A235
-```
-
-## Cross-compiling configurations
-
-PaddlePaddle provides the cross-compiling toolchain configuration file [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
-
-There are some mandatory variables that need to be set before cross-compiling PaddlePaddle for iOS:
-
-- `CMAKE_SYSTEM_NAME`, the CMake target platform name; it has to be `iOS`.
-PaddlePaddle's CMake system will compile all the third-party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`) when this variable is set to `iOS`.
-
-- `WITH_C_API`, whether to build the C-API inference library; it has to be `ON`, since the C-API is the only supported interface for inference on iOS.
-- `WITH_SWIG_PY`, has to be `OFF`. Training or inference via SWIG is not supported on iOS.
-
-Optional variables for iOS are:
-
-- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
-  - `OS`, build targets are ARM-based physical devices like iPhone or iPad.
-  - `SIMULATOR`, build targets are x86-architecture simulators.
-- `IOS_ARCH`, the target architecture. By default, all architecture types will be compiled. If you need to specify the architecture to compile for, please find the valid values for different `IOS_PLATFORM` settings in the table below:
-
-| IOS_PLATFORM | IOS_ARCH |
-|--------------|----------|
-| OS | armv7, armv7s, arm64 |
-| SIMULATOR | i386, x86_64 |
-
-- `IOS_DEPLOYMENT_TARGET`, the minimum iOS version for deployment, `7.0` by default.
-- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3). Values can be `ON/OFF`, `ON` by default.
-- `IOS_USE_VECLIB_FOR_BLAS`, whether to use the [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computing. Values can be `ON/OFF`, `OFF` by default.
-- `IOS_DEVELOPMENT_ROOT`, the path to the `Developer` directory; can be explicitly set to your `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the corresponding Xcode `platform`'s `Developer` directory based on your `IOS_PLATFORM` value.
-- `IOS_SDK_ROOT`, the path to the `SDK` root; can be explicitly set to your `/path/to/platform/Developer/SDKs/SDK`. If left blank, PaddlePaddle will pick the latest SDK in the `IOS_DEVELOPMENT_ROOT` directory.
-
-Other settings:
-
-- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computing. Only effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
-- `HOST_C/CXX_COMPILER`, the host C/C++ compiler. Uses the value of the environment variables `CC/CXX` by default, or `cc/c++` if `CC/CXX` don't exist.
-
-Some typical cmake configurations:
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=iOS \
-      -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="armv7;arm64" \
-      -DIOS_ENABLE_BITCODE=ON \
-      -DIOS_USE_VECLIB_FOR_BLAS=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_TESTING=OFF \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=iOS \
-      -DIOS_PLATFORM=SIMULATOR \
-      -DIOS_ARCH="x86_64" \
-      -DIOS_USE_VECLIB_FOR_BLAS=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_TESTING=OFF \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-You can set other compiling parameters for your own needs. E.g., if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; or if performance is your concern, set `CMAKE_BUILD_TYPE` to `Release`. You can even influence the PaddlePaddle compiling procedure by manually setting `CMAKE_C/CXX_FLAGS` values.
-
-**TIPS for better performance**:
-
-- set `CMAKE_BUILD_TYPE` to `Release`
-- set `IOS_USE_VECLIB_FOR_BLAS` to `ON`
-
-## Build and install
-
-After CMake, run the following commands; PaddlePaddle will download and compile the third-party dependencies, then compile and install the PaddlePaddle inference library.
-
-```
-$ make
-$ make install
-```
-
-Please note: if you have compiled PaddlePaddle in the source directory for other platforms, remove the `third_party` and `build` directories within the source with `rm -rf` to ensure that all the third-party dependencies and PaddlePaddle are freshly compiled with the current CMake configuration.
-
-The `your/path/to/install` directory will have the following sub-directories after `make install`:
-
-- `include`, contains all the C-API header files.
-- `lib`, contains the PaddlePaddle C-API static library.
-- `third_party`, contains all the third-party libraries.
-
-Please note: if the PaddlePaddle library needs to support both physical devices and simulators, you will need to compile for each correspondingly, then merge the results into a fat library with `lipo`.
-
-Now you have the PaddlePaddle library compiled and installed; the fat library can be used in deep-learning-related iOS apps. Please refer to the C-API documentation for usage guides.
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
deleted file mode 100644
index f8ef9dc8031613831437745995268f3abc392f5b..0000000000000000000000000000000000000000
--- a/doc/mobile/cross_compiling_for_raspberry_cn.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# Build PaddlePaddle for Raspberry Pi
-
-There are usually two ways to build a Raspberry Pi version:
-
-1. Log in to the Raspberry Pi system, e.g. over ssh, and build there. The required development tools and third-party libraries can be found in [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).
-
-1. The other way is cross-compiling. This document describes how to cross-compile the PaddlePaddle library for the Raspberry Pi platform on Linux/x64.
-
-## Install the Cross-Compiler
-
-Clone the following Github repo:
-
-```bash
-git clone https://github.com/raspberrypi/tools.git
-```
-
-You will find the cross-compiler arm-linux-gnueabihf-gcc 4.8.3 in the `./tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` directory. Running this toolchain requires a Linux x64 machine and glibc version 2.14 or newer.
-
-## Cross-Compiling Arguments
-
-CMake [supports cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). The configuration for PaddlePaddle for Raspberry Pi lives in [cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake).
-
-When cross-compiling the Raspberry Pi version of the PaddlePaddle library, some arguments must be configured:
-
-- `CMAKE_SYSTEM_NAME`: the target platform of the CMake build; it must be set to `RPi`. Only after `CMAKE_SYSTEM_NAME=RPi` is set does PaddlePaddle's CMake system treat the build as a cross-compilation for the Raspberry Pi and automatically build the host-side protoc executable, the target-side protobuf library, and the target-side OpenBLAS library.
-
-- `RPI_TOOLCHAIN`: the absolute path of the toolchain, or its path relative to the build directory. PaddlePaddle's CMake system derives the cross-compiler to use from this value; otherwise you have to set those values manually when running cmake. No default value.
-
-- `RPI_ARM_NEON`: whether to use NEON instructions. It currently must be set to `ON`; the default is `ON`.
-
-- `HOST_C/CXX_COMPILER`, the C/C++ compiler of the host machine, needed to build the host-side protoc executable and the target-side OpenBLAS library. It defaults to the value of the environment variable `CC`; if `CC` is not set, it defaults to the `cc` compiler.
-
-A commonly used CMake configuration is as follows:
-
-```
-cmake -DCMAKE_SYSTEM_NAME=RPi \
-      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
-      -DRPI_ARM_NEON=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_GPU=OFF \
-      -DWITH_C_API=ON \
-      -DWITH_PYTHON=OFF \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-Here `WITH_C_API=ON` means the inference library should be built.
-
-You can also set other build arguments according to your own needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`.
-
-## Build and Install
-
-After the CMake configuration is done, run the following commands; PaddlePaddle will automatically download and build all third-party dependencies, then build and install PaddlePaddle.
-
-```bash
-make
-make install
-```
-
-Note: if you have previously built PaddlePaddle for another platform in the same source directory, please remove the `third_party` and `build` directories with `rm -rf` first, to make sure all third-party dependencies and PaddlePaddle code are rebuilt against the new CMake configuration.
-
-After the install command finishes, the `your/path/to/install` directory contains the `include` and `lib` directories; `include` holds the C-API header files and `lib` holds the Raspberry Pi version of the library.
diff --git a/doc/mobile/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md
deleted file mode 100644
index 3c1a5950ff9553bb725d5a96e3fdf2e5e9f6f95c..0000000000000000000000000000000000000000
--- a/doc/mobile/cross_compiling_for_raspberry_en.md
+++ /dev/null
@@ -1,62 +0,0 @@
-# Build PaddlePaddle for Raspberry Pi
-
-You may use either of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi:
-
-1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).
-
-1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail, in this article.
-
-## The Cross-Compiling Toolchain
-
-Step 1. Clone the Github repo by running the following command.
-
-```bash
-git clone https://github.com/raspberrypi/tools.git
-```
-
-Step 2. Use the pre-built cross-compiler found in `./tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`. To run it on a Linux computer, glibc version >= 2.14 is needed.
-
-## CMake Arguments
-
-CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). All CMake configuration arguments required for cross-compiling for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake).
-
-Some important arguments that need to be set:
-
-- `CMAKE_SYSTEM_NAME`: The target platform. Must be `RPi`.
-
-- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain.
-
-- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument and defaults to `ON`.
-
-- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host. It is used to build build-time tools that run on the host, for example, protoc.
-
-A commonly-used CMake configuration is as follows:
-
-```
-cmake -DCMAKE_SYSTEM_NAME=RPi \
-      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
-      -DRPI_ARM_NEON=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_GPU=OFF \
-      -DWITH_C_API=ON \
-      -DWITH_PYTHON=OFF \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`.
-
-You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`.
-
-## Build and Install
-
-The following commands build the inference library of PaddlePaddle for Raspberry Pi and its third-party dependencies.
-
-```bash
-make
-make install
-```
-
-The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built for other platforms like Android or iOS, you may want to clear these directories by running the command: `rm -rf build`.
-
-The inference library will be in `your/path/to/install/lib`, with the related header files in `your/path/to/install/include`.
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
deleted file mode 100644
index 56d1515005f6e40b084c6b2184c6a0b3e3a00496..0000000000000000000000000000000000000000
--- a/doc/mobile/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Mobile
-======
-
-.. toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_cn.md
-  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
deleted file mode 100644
index e0acdff0284e3bc84b2cc4a34a142ee01754f940..0000000000000000000000000000000000000000
--- a/doc/mobile/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Mobile
-======
-
-.. toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_en.md
-  cross_compiling_for_ios_en.md
-  cross_compiling_for_raspberry_en.md
diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md
deleted file mode 100644
index 1cd9962700bb49866f1ed6987abc28b27888a23f..0000000000000000000000000000000000000000
--- a/doc/survey/cluster_bootstrapping_tools.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# Cluster bootstrapping tool survey
-## Abstract
-In order to bring up a cluster from bare metal machines to a fully functional Kubernetes cluster for PaddlePaddle to run on, we need to utilize some tools.
Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer).
-
-## Basic assumptions
-Here are some basic assumptions before we move on to the details:
-1. You are an administrator of a bare metal machine cluster, which means:
-  * you have full control over each of the machines.
-  * you have full control over the network the machines are connected to.
-2. Machines can be booted from the network with PXE or iPXE.
-3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster).
-
-If your cluster can check all of the items above, keep reading.
-
-## Comparing Sextant and Tectonic installer
-### Sextant
-Sextant is an end-to-end solution for bringing a bare metal cluster up to a fully functional k8s cluster. It integrates DHCP, a name service, PXE, a cloud-config service, and a docker registry service altogether.
-
-#### Pros
-1. End-to-end: basically all the admin needs to do is configure cluster.yaml and power on the cluster.
-2. Offline cluster configuration: working with Sextant has two phases, config time and deploy time. While configuring, the admin's machine needs internet connectivity to download some images, etc. But at deploy time it is completely fine to go offline, since all dependencies were fetched at config time.
-3. A docker registry is integrated.
-4. GPU machines are taken care of.
-
-#### Cons
-1. The k8s API server is not deployed with high availability by default.
-2. No grouping support.
-3. No API interface; it is a one-off service.
-
-
-### Tectonic installer
-First of all, Tectonic is not free. It requires a coreos.com account as a step of the installation, and a free user can only create fewer than 10 nodes.
-
-Tectonic is a suite of software that wraps around k8s and provides more dev-ops utilities. Tectonic installer, as its name suggests, installs Tectonic onto a bare metal cluster, which means it is not exactly an equivalent of Sextant. For the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.
-
-Matchbox's approach is similar to Sextant's.
-
-#### Pros
-1. Supports grouping machines.
-2. Supports running the provisioning service in rkt (not a big deal, though).
-3. Supports an http/gRPC API interface.
-4. Supports multiple templates.
-
-#### Cons
-1. Not an end-to-end solution for bringing up a cluster; it needs a lot of extra work and other software.
-2. [Does not fully support](https://github.com/coreos/matchbox/issues/550) CentOS deployment yet.
-
-## Conclusion
-Sextant is the better solution overall for deploying Paddle Cloud onto a bare metal cluster. It would be even better if Sextant could 1) deploy the k8s API server with high availability by default, and 2) be designed as more than a one-off service.
-
-
-
-## Appendix: General procedure to bring up a cluster
-It is physically impossible for a cluster admin to manually install the OS and applications on cluster nodes one by one, so here is what an admin would do in the cloud industry:
-1. Set up a bootstrap machine with a static IP in the cluster, which runs the following services:
-  * DHCP: assigns IP addresses to the rest of the nodes.
-  * name service: maps node names to IPs.
-  * PXE-related services: boot-related info is delivered to newly booted machines as their IPs are assigned via the DHCP service; the PXE service then provides further boot and installation info and images over the TFTP and HTTP protocols.
-  * cluster config service: provides cluster nodes with their OS configuration via HTTP.
-  * optional docker registry: a built-in docker registry makes the whole cluster independent of an internet connection and speeds up software distribution.
-2. When a new node powers on, it will
-  * broadcast a request for an IP address;
-  * the DHCP server assigns the IP address and delivers the PXE boot info to the node;
-  * the cluster node requests config files, using the boot info delivered with DHCP, via the TFTP service; in most cases the config file points to an http service for the boot image;
-  * since PXE is configured with an initrd, the node utilizes the cloud-config service and performs further installations, such as CoreOS or k8s;
-  * then the node restarts.
-
-For further understanding, the following two links from Matchbox are good reading:
-* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
-* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md
deleted file mode 100644
index 7f62eeadff43af1f0a3c81e284a6508bf063b21e..0000000000000000000000000000000000000000
--- a/doc/survey/dynamic_graph.md
+++ /dev/null
@@ -1,379 +0,0 @@
-# Automatic Differentiation with the Tape
-
-## Automatic Differentiation
-
-A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program. This problem had long been studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the rise of deep learning.
-
-## Program Transformation v.s. Backtracking
-
-Given the forward pass program, there are two strategies to derive the backward pass:
-
-1. by transforming the forward pass program without executing it, or
-1. by backtracking the execution process of the forward pass program.
-
-This article is about the latter strategy.
-
-## The Tape and Dynamic Networks
-
-We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf). When we train a deep learning model, the tape changes with every iteration as the input data change, so we have to re-derive the backward pass each time, which is time-consuming; on the other hand, this approach easily handles the case where the forward program includes control flow like if-else and for/while, since the execution trace may change from iteration to iteration. Such changes are known as *dynamic networks* in the field of deep learning.
-
-## Typical Systems
-
-Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years. This article surveys the following typical systems:
-
-- [DyNet](https://dynet.readthedocs.io/en/latest/)
-- [PyTorch](https://pytorch.org/)
-- Chainer
-- Autograd from HIPS
-
-Before diving into these systems, let us pose an example forward pass program:
-
-```python
-x = Variable(randn(20, 1))
-label = Variable(randint(1))
-W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
-h = matmul(W_1, x)
-pred = matmul(W_2, h)
-loss = softmax(pred, label)
-loss.backward()
-```
-
-## The Representation of Tapes
-
-### DyNet: the Tape as a List
-
-DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, it is a list of operators: `matmul`, `matmul`, and `softmax`.
The list also includes information needed to do the backward pass, such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
-
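To make the list representation concrete, here is a minimal sketch of a tape in plain NumPy. It is a hypothetical illustration of the idea, not DyNet's actual data structure; the `matmul` and `backward` helpers are defined only for this example. The graph that follows shows the same list.

```python
import numpy as np

tape = []  # the tape: a list of recorded operations

def matmul(W, x):
    # run the forward op and record it, with pointers to inputs and output
    out = {"value": W["value"] @ x["value"], "grad": None}
    tape.append(("matmul", (W, x), out))
    return out

def backward(loss):
    # seed the output gradient, then play the tape in reverse order
    loss["grad"] = np.ones_like(loss["value"])
    for op, (W, x), out in reversed(tape):
        if op == "matmul":  # out = W @ x
            W["grad"] = out["grad"] @ x["value"].T  # dL/dW
            x["grad"] = W["value"].T @ out["grad"]  # dL/dx
            # (a real tape would *accumulate* into existing gradients)

W = {"value": np.random.randn(10, 20), "grad": None}
x = {"value": np.random.randn(20, 1), "grad": None}
backward(matmul(W, x))
```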
- -digraph g { - graph [ - rankdir = "LR" - ]; - node [ - fontsize = "16" - shape = "ellipse" - ]; - edge []; - "node0" [ - label = " type: matmul | input: W_1, x | output: h" - shape = "record" - ]; - "node1" [ - label = " type: matmul | input: W_2, h | output: pred" - shape = "record" - ]; - "node2" [ - label = " type: softmax | input: pred, label | output: loss" - shape = "record" - ]; - "node0":f0 -> "node1":f0 []; - "node1":f0 -> "node2":f0 []; -} -
-
-![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
-
-### PyTorch: the Tape as a Graph
-
-The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a `Function` records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul1.prev_func = matmul0`. At `loss.backward()`, a topological sort is performed on all the `prev_func`s. Then the grad ops are performed in the sorted order. Please be aware that a `Function` might have more than one `prev_func`.
-
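For comparison with the list above, here is an equally minimal, hypothetical sketch of the graph representation; `Var`, `Function`, and `topo_sort` are toy stand-ins, not PyTorch's real autograd classes:

```python
class Var:
    def __init__(self, creator=None):
        self.creator = creator  # the Function that produced this Var, if any

class Function:
    def __init__(self, name, inputs):
        self.name = name
        # backward edges: the creators of our inputs, skipping leaf Vars
        self.prev_func = [v.creator for v in inputs if v.creator is not None]

def matmul(a, b):
    return Var(creator=Function("matmul", [a, b]))

def topo_sort(fn, seen=None):
    """Depth-first topological sort over prev_func edges."""
    seen = set() if seen is None else seen
    order = []
    for prev in fn.prev_func:
        if prev not in seen:
            seen.add(prev)
            order += topo_sort(prev, seen)
    return order + [fn]

x, W_1, W_2 = Var(), Var(), Var()
h = matmul(W_1, x)
pred = matmul(W_2, h)
# backward would run grad ops in reverse topological order:
print([f.name for f in reversed(topo_sort(pred.creator))])
```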
- -digraph g { - graph [ - rankdir = "LR" - ]; - - subgraph function { - node [ - fontsize = "16" - style = filled - shape = "record" - ]; - "matmul0" [ label = " type: matmul | prev_func: None" ]; - "matmul1" [ label = " type: matmul | prev_func: matmul" ]; - "softmax" [ label = " type: softmax | prev_func: matmul" ]; - } - - subgraph variable { - node [ - fontsize = "16" - shape = "Mrecord" - style = filled - fillcolor = white - ]; - "x" [ label = " x | creator: None" ]; - "label" [ label = " label | creator: None" ]; - "W_1" [ label = " W_1 | creator: None" ]; - "W_2" [ label = " W_2 | creator: None" ]; - "h" [ label = " h | creator: None" ]; - "pred" [ label = " pred | creator: matmul" ]; - "loss" [ label = " loss | creator: softmax" ]; - } - - subgraph data_flow { - "x":f0 -> "matmul0":f0; - "W_1":f0 -> "matmul0":f0; - "matmul0":f0 -> "h":f0; - - "h":f0 -> "matmul1":f0; - "W_2":f0 -> "matmul1":f0; - "matmul1":f0 -> "pred":f0; - - "pred":f0 -> "softmax":f0; - "label":f0 -> "softmax":f0; - "softmax":f0 -> "loss":f0; - } - - subgraph prev_func { - edge [color="red", arrowsize="0.6", penwidth="1", constraint=false]; - "matmul1":f1 -> "matmul0":f0; - "softmax":f1 -> "matmul1":f0; - label = "prev_func"; - } -} -
-
-![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
-
-Chainer and Autograd use similar techniques to record the forward pass. For details, please refer to the appendix.
-
-## Comparison: List v.s. Graph
-
-The list of DyNet could be considered the result of a topological sort of the graph of PyTorch. Put differently, the graph is the raw representation of the tape, which gives us the chance to *prune* the parts of the graph that are irrelevant to the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). In the following example, PyTorch only does backward on `SmallNet`, while DyNet does both `SmallNet` and `BigNet`:
-
-```python
-result = BigNet(data)
-loss = SmallNet(data)
-loss.backward()
-```
-
-## Lazy v.s. Immediate Evaluation
-
-Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example:
-
-```python
-for epoch in range(num_epochs):
-    for in_words, out_label in training_data:
-        dy.renew_cg()
-        W = dy.parameter(W_p)
-        b = dy.parameter(b_p)
-        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
-        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
-        loss_val = loss_sym.value()
-        loss_sym.backward()
-```
-
-The computation of `lookup`, `concat`, `matmul` and `softmax` doesn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes graph-level optimizations possible, e.g. kernel fusion.
-
-PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (there is no need to explicitly call `dy.renew_cg()` to reset a list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
-
-
-## Fluid: Learning the Lessons
-
-Please refer to `paddle/contrib/dynamic/`.
-
-## Appendix
-
-### Overview
-
-| Framework | Has Tape | Core in C++ | First Release Date |
-|-----------|----------|-------------|--------------------|
-| Autograd | No | No | Mar 5, 2015 |
-| Chainer | No | No | Jun 5, 2015 |
-| Pytorch | No | Yes | Aug 31, 2016 |
-| Dynet | Yes | Yes | Oct 12, 2016 |
-
-### Source Code
-#### Autograd
-[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNodes is constructed.
-```python
-# User API
-def make_grad(fun, x):
-    start_node = VJPNode.new_root()
-    end_value, end_node = trace(start_node, fun, x)
-    # g is the seed gradient of the output, e.g. 1.0 for a scalar loss
-    return backward_pass(g, end_node), end_value
-
-# trace the forward pass by creating VJPNodes
-def trace(start_node, fun, x):
-    with trace_stack.new_trace() as t:
-        start_box = new_box(x, t, start_node)
-        end_box = fun(start_box)
-        return end_box._value, end_box._node
-
-def backward_pass(g, end_node):
-    outgrads = {end_node : (g, False)}
-    for node in toposort(end_node):
-        outgrad = outgrads.pop(node)
-        ingrads = node.vjp(outgrad[0])
-        for parent, ingrad in zip(node.parents, ingrads):
-            outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
-    return outgrad[0]
-
-# Every VJPNode corresponds to an op_grad
-class VJPNode(Node):
-    __slots__ = ['parents', 'vjp']
-    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
-        self.parents = parents
-        vjpmaker = primitive_vjps[fun]
-        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)
-```
-#### Chainer
-Example Code
-```python
-# (1) Function Set definition, creates FunctionNode
-model = FunctionSet(
-    l1=F.Linear(784, 100),
-    l2=F.Linear(100, 100),
-    l3=F.Linear(100, 10)).to_gpu()
-
-# (2) Optimizer Setup
-opt = optimizers.SGD()
-opt.setup(model)
-
-# (3) Forward computation
-def forward(x, t):
-    h1 = F.relu(model.l1(x))
-    h2 = F.relu(model.l2(h1))
-    y = model.l3(h2)
-    return F.softmax_cross_entropy(y, t)
-
-# (4) Training loop
-for epoch in xrange(n_epoch):
-    for i in xrange(0, N, b_size):
-        x = Variable(to_gpu(...))
-        t = Variable(to_gpu(...))
-        opt.zero_grads()
-        loss = forward(x, t)
-        loss.backward()
-        opt.update()
-```
-In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110)s and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19)s is constructed. Every output's `VariableNode.creator` points to the `FunctionNode`.
-```python
-class FunctionNode(object):
-    ...
-    def apply(self, inputs):
-        outputs = self.forward(inputs)
-        ret = tuple([variable.Variable(y, requires_grad=requires_grad)
-                     for y in outputs])
-        # Topological ordering
-        self.rank = max([x.rank for x in inputs]) if inputs else 0
-        # Add backward edges
-        for y in ret:
-            y.creator_node = self
-        self.inputs = tuple([x.node for x in inputs])
-        self.outputs = tuple([y.node for y in ret])
-
-        return ret
-```
-`loss.backward()` will calculate the accumulated gradients of all variables. The backward of each `FunctionNode` is called in topological order.
-```python
-class VariableNode(object):
-    ...
-    def backward(self, retain_grad, loss_scale):
-        if self.creator_node is None:
-            return
-
-        cand_funcs = []
-        seen_set = set()
-        grads = {}
-
-        # Initialize error by 1, if this is a loss variable
-        if self.data.size == 1 and self._grad_var is None:
-            self.grad = numpy.ones_like(self.data)
-        grads[self._node] = self._grad_var
-
-        def add_cand(cand):
-            if cand not in seen_set:
-                # Negate since heapq is a min-heap
-                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
-                seen_set.add(cand)
-
-        add_cand(self.creator_node)
-
-        while cand_funcs:
-            _, _, func = heapq.heappop(cand_funcs)
-            gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad)
-
-            for x, gx in zip(func.inputs, gxs):
-                if x in grads:
-                    grads[x] += gx
-                else:
-                    grads[x] = gx
-
-                if x.creator_node is not None:
-                    add_cand(x.creator_node)
-```
-
-#### PyTorch
-Example Code
-```python
-x = Variable(torch.ones(5, 5))
-y = Variable(torch.ones(5, 5) * 4)
-z = x ** 2 + x * 2 + x * y + y
-z.backward(torch.ones(5, 5))
-```
-The trace is done by `Variable.creator` and `Function.previous_functions`.
-```python
-class Variable(object):
-    def __init__(self, tensor, creator=None, requires_grad=True):
-        if creator is None:
-            creator = Leaf(self, requires_grad)
-        self.data = tensor
-        self.creator = creator
-        self._grad = None
-
-    def backward(self, gradient=None):
-        if gradient is None:
-            if self.data.numel() != 1:
-                raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
-            gradient = self.data.new(1).fill_(1)
-        self._execution_engine.run_backward(self, gradient)
-
-class Function(object):
-    # ...
-    def _do_forward(self, *input):
-        unpacked_input = tuple(arg.data for arg in input)
-        raw_output = self.forward(*unpacked_input)
-
-        # mark output.creator = self for backward trace
-        output = tuple(Variable(tensor, self) for tensor in raw_output)
-
-        self.previous_functions = [(arg.creator, id(arg)) for arg in input]
-        self.output_ids = {id(var): i for i, var in enumerate(output)}
-        return output
-
-    def _do_backward(self, grad_output):
-        return self.backward(grad_output)
-```
-The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd's.
-
-#### DyNet
-Example code
-```python
-model = dy.model()
-W_p = model.add_parameters((20, 100))
-b_p = model.add_parameters(20)
-E = model.add_lookup_parameters((20000, 50))
-for epoch in range(num_epochs):
-    for in_words, out_label in training_data:
-        dy.renew_cg()  # init tape
-        W = dy.parameter(W_p)
-        b = dy.parameter(b_p)
-        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
-        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
-        loss_val = loss_sym.value()
-        loss_sym.backward()
-```
-[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traversing the tape in reverse order.
-```c++
-void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
-  ...
-  for (int i = num_nodes - 1; i >= 0; --i) {
-    // each node corresponds to an op
-    node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai);
-  }
-  ...
-}
-```
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
deleted file mode 100644
index d6e48f4f58269b67450cb012f6dcc59e1083abba..0000000000000000000000000000000000000000
--- a/doc/survey/op_fusion_design.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Operator fusion
-Fusing multiple operators together is an important method to optimize program execution, particularly for GPUs and other specialized accelerators. An obvious benefit is avoiding the overhead of writing intermediate results back to global memory.
-
-There are generally two ways to fuse operators: fusing directly connected operators and fusing operators that are not directly connected. The first method is mainly used by the [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by DyNet and TensorFlow Fold to do auto-batching. The principle of operator fusion is to combine multiple operations into one according to some rules; for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W` (see the sketch at the end of this document). In order to get a short-term profit, we decided to try to specify these rules manually.
-
-## Challenge
-The challenges of fusing operators are:
-  - how to make the rules;
-  - how to implement these rules efficiently.
-
-### How to make the rules?
-
-The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analyzing the operators of DL models, we found there are two groups of operators that can be fused explicitly. One group is simple, adjacent operations, for example, `tmp = x + y` followed by `z = Relu(tmp)`; the other is operators that have the same function, for example, a series of `SGD` or `Momentum` operators. They usually appear in models in large numbers, so we should first think about how to fuse each group separately.
-
-### How to implement these rules efficiently?
-#### How to fuse adjacent operations efficiently?
-Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient; the cons are that it is not easy to extend and can only express some simple operations. Taking our current needs into account, the template function is the more appropriate choice.
-
-#### How to fuse operators that have the same function efficiently?
-Take the SGD operator as an example: the training model may have hundreds of parameters and, correspondingly, the same number of SGD operators. The expression of those operators is the same (`w = w - lr*w_g`), so during training the executor executes this expression hundreds of times on the CPU or another specialized accelerator. If we can fuse them and make the addresses of all `w` and all `w_g` contiguous respectively, we only need to execute once. For some accelerators, the time to launch a kernel is not negligible, so the cost of launching and executing a kernel hundreds of times may be much larger than launching and executing only once. There are usually many operators similar to `SGD` in DL models, such as `AllReduce` and `FC`.
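To make these rules concrete, here is a minimal NumPy sketch of the two fusion patterns named above. It is a hypothetical illustration of the arithmetic the rules rely on, not Fluid's actual fusion pass:

```python
import numpy as np

X, W, B = np.random.randn(4, 8), np.random.randn(8, 8), np.random.randn(8)

# Unfused: two ops, Y is written out as an intermediate result.
Y = X @ W
Z_unfused = Y + B

# Fused: one op, no intermediate saved back to (global) memory.
Z_fused = X @ W + B
assert np.allclose(Z_unfused, Z_fused)

# Fusing ops with the same function: Y1 = X1 * W and Y2 = X2 * W become
# [Y1;Y2] = [X1;X2] * W, i.e. one matmul instead of two.
X1, X2 = np.random.randn(4, 8), np.random.randn(4, 8)
Y12 = np.concatenate([X1, X2]) @ W
assert np.allclose(Y12[:4], X1 @ W) and np.allclose(Y12[4:], X2 @ W)
```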
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in deleted file mode 100644 index 890f70615538af23cd05b9ffd685e870a5644cdb..0000000000000000000000000000000000000000 --- a/doc/templates/conf.py.cn.in +++ /dev/null @@ -1,151 +0,0 @@ -# -*- coding: utf-8 -*- -# -# documentation build configuration file, created by -# sphinx-quickstart on Thu Jul 23 19:40:08 2015. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. -import sys -import os, subprocess -sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python')) -import shlex -from recommonmark import parser, transform -@IMPORT_PADDLE_STRING@ -@IMPORT_PADDLEV2_STRING@ - -MarkdownParser = parser.CommonMarkParser -AutoStructify = transform.AutoStructify - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"] - -# -- General configuration ------------------------------------------------ - -# General information about the project. -project = u'PaddlePaddle' -author = u'%s developers' % project -copyright = u'2016, %s' % author -github_doc_root = '' - -# add markdown parser -MarkdownParser.github_doc_root = github_doc_root -source_parsers = { - '.md': MarkdownParser, - '.Rmd': MarkdownParser, -} -os.environ['PADDLE_BUILD_DOC'] = '1' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.mathjax', - 'sphinx.ext.napoleon', - 'sphinx.ext.graphviz' -] -mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js" -table_styling_embed_css = True - -autodoc_member_order = 'bysource' - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# source_suffix = ['.rst', '.md'] -source_suffix = ['.rst', '.md', '.Rmd'] - -# The encoding of source files. -source_encoding = 'utf-8' - -# The master toctree document. -master_doc = 'index_cn' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = 'zh_CN' - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build', '**/*_en*', '*_en*', 'api/*'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. 
They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'sphinx_rtd_theme' - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = [] - -# Output file base name for HTML help builder. -htmlhelp_basename = project + 'doc' - -# -- Options for LaTeX output --------------------------------------------- -latex_elements = { -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, '%s.tex' % project, project, - author, 'manual'), -] - -# Use the .. admonition:: directive for Notes sections. -# False to use the .. rubric:: directive instead. -napoleon_use_admonition_for_notes = True - -def setup(app): - # Add hook for building doxygen xml when needed - # no c++ API for now - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - }, True) - app.add_transform(AutoStructify) diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in deleted file mode 100644 index 5b09464cb991f96127edec40f7dbbc97a8d82582..0000000000000000000000000000000000000000 --- a/doc/templates/conf.py.en.in +++ /dev/null @@ -1,152 +0,0 @@ -# -*- coding: utf-8 -*- -# -# documentation build configuration file, created by -# sphinx-quickstart on Thu Jul 23 19:40:08 2015. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. -import sys -import os, subprocess -sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python')) -import shlex -from recommonmark import parser, transform -@IMPORT_PADDLE_STRING@ -@IMPORT_PADDLEV2_STRING@ - - -MarkdownParser = parser.CommonMarkParser -AutoStructify = transform.AutoStructify - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PADDLE_SOURCE_DIR@/doc/templates"] - -# -- General configuration ------------------------------------------------ - -# General information about the project. -project = u'PaddlePaddle' -author = u'%s developers' % project -copyright = u'2016, %s' % author -github_doc_root = '' - -# add markdown parser -MarkdownParser.github_doc_root = github_doc_root -source_parsers = { - '.md': MarkdownParser, - '.Rmd': MarkdownParser, -} -os.environ['PADDLE_BUILD_DOC'] = '1' - -# Add any Sphinx extension module names here, as strings. 
They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.mathjax', - 'sphinx.ext.napoleon', -] - - -autodoc_member_order = 'bysource' - - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# source_suffix = ['.rst', '.md'] -source_suffix = ['.rst', '.md', '.Rmd'] - -# The encoding of source files. -source_encoding = 'utf-8' - -# The master toctree document. -master_doc = 'index_en' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build', '**/*_cn*', '*_cn*', 'api/*'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'sphinx_rtd_theme' - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = [] - -# Output file base name for HTML help builder. -htmlhelp_basename = project + 'doc' - -# -- Options for LaTeX output --------------------------------------------- -latex_elements = { -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, '%s.tex' % project, project, - author, 'manual'), -] - -# Use the .. admonition:: directive for Notes sections. -# False to use the .. rubric:: directive instead. 
-napoleon_use_admonition_for_notes = True - -def setup(app): - # Add hook for building doxygen xml when needed - # no c++ API for now - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - 'enable_eval_rst': True, - }, True) - app.add_transform(AutoStructify) diff --git a/doc/templates/layout.html b/doc/templates/layout.html deleted file mode 100644 index 5091eb32eaeff77bd40f5d348e887b99b6eff4ea..0000000000000000000000000000000000000000 --- a/doc/templates/layout.html +++ /dev/null @@ -1,23 +0,0 @@ -{# layout.html #} -{# Import the theme's layout. #} -{% extends "!layout.html" %} - -{# SIDE NAV, TOGGLES ON MOBILE #} -{% block menu %} - -{% endblock %} - -{%- block extrahead %} - -{% endblock %} diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt deleted file mode 100644 index d230a1b9217eea6740419822f350096e361a4435..0000000000000000000000000000000000000000 --- a/doc/v2/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -if(NOT DEFINED SPHINX_THEME) - set(SPHINX_THEME default) -endif() - -if(NOT DEFINED SPHINX_THEME_DIR) - set(SPHINX_THEME_DIR) -endif() - -# configured documentation tools and intermediate build results -set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") - -# Sphinx cache with pickled ReST documents -set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") - -# HTML output director -set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") - -set(IMPORT_PADDLE_STRING "") -set(IMPORT_PADDLEV2_STRING "") - -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" - "${BINARY_BUILD_DIR_EN}/conf.py" - @ONLY) - -sphinx_add_target(paddle_v2_docs - html - ${BINARY_BUILD_DIR_EN} - ${SPHINX_CACHE_DIR_EN} - ${CMAKE_CURRENT_SOURCE_DIR} - ${SPHINX_HTML_DIR_EN}) - -# configured documentation tools and intermediate build results -set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") - -# Sphinx cache with pickled ReST documents -set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees") - -# HTML output directory -set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html") - -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in" - "${BINARY_BUILD_DIR_CN}/conf.py" - @ONLY) - -sphinx_add_target(paddle_v2_docs_cn - html - ${BINARY_BUILD_DIR_CN} - ${SPHINX_CACHE_DIR_CN} - ${CMAKE_CURRENT_SOURCE_DIR} - ${SPHINX_HTML_DIR_CN}) - -add_subdirectory(api) diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt deleted file mode 100644 index 0c74522cb089b17c8419e9058f76631b0fe0df93..0000000000000000000000000000000000000000 --- a/doc/v2/api/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# configured documentation tools and intermediate build results -set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") - -# Sphinx cache with pickled ReST documents -set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") - -# HTML output director -set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") - -set(IMPORT_PADDLE_STRING "import paddle") -set(IMPORT_PADDLEV2_STRING "import paddle.v2") - -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in" - "${BINARY_BUILD_DIR_EN}/conf.py" - @ONLY) - -sphinx_add_target(paddle_v2_apis - html - ${BINARY_BUILD_DIR_EN} - ${SPHINX_CACHE_DIR_EN} - ${CMAKE_CURRENT_SOURCE_DIR} - ${SPHINX_HTML_DIR_EN}) - -add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python) diff --git a/doc/v2/api/config/activation.rst b/doc/v2/api/config/activation.rst 
deleted file mode 100644 index 5317e66b64bbd85c61f19700a9d2c1d239dee573..0000000000000000000000000000000000000000 --- a/doc/v2/api/config/activation.rst +++ /dev/null @@ -1,108 +0,0 @@ -=========== -Activation -=========== - -Abs -=== - -.. automodule:: paddle.v2.activation - :members: Abs - :noindex: - -Exp -=== - -.. automodule:: paddle.v2.activation - :members: Exp - :noindex: - -Identity -======== - -.. automodule:: paddle.v2.activation - :members: Identity - :noindex: - -Linear -====== - -.. automodule:: paddle.v2.activation - :members: Linear - :noindex: - -Log -=== - -.. automodule:: paddle.v2.activation - :members: Log - :noindex: - -Square -====== - -.. automodule:: paddle.v2.activation - :members: Square - :noindex: - -Sigmoid -======= - -.. automodule:: paddle.v2.activation - :members: Sigmoid - :noindex: - -Softmax -======= - -.. automodule:: paddle.v2.activation - :members: Softmax - :noindex: - -SequenceSoftmax -=============== - -.. automodule:: paddle.v2.activation - :members: SequenceSoftmax - :noindex: - -Relu -==== - -.. automodule:: paddle.v2.activation - :members: Relu - :noindex: - -BRelu -===== - -.. automodule:: paddle.v2.activation - :members: BRelu - :noindex: - -SoftRelu -======== - -.. automodule:: paddle.v2.activation - :members: SoftRelu - :noindex: - -Tanh -==== - -.. automodule:: paddle.v2.activation - :members: Tanh - :noindex: - -STanh -===== - -.. automodule:: paddle.v2.activation - :members: STanh - :noindex: - -SoftSign -======== - -.. automodule:: paddle.v2.activation - :members: SoftSign - :noindex: diff --git a/doc/v2/api/config/attr.rst b/doc/v2/api/config/attr.rst deleted file mode 100644 index a93f41b86779200d8bac651614f4d61f4895875f..0000000000000000000000000000000000000000 --- a/doc/v2/api/config/attr.rst +++ /dev/null @@ -1,6 +0,0 @@ -Parameter Attribute -=================== - -.. automodule:: paddle.v2.attr - :members: - :noindex: diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst deleted file mode 100644 index 458d892e825a7a9bbe7843ad5c508bd5a31f5f0f..0000000000000000000000000000000000000000 --- a/doc/v2/api/config/evaluators.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. _api_v2: - -========== -Evaluators -========== - -Classification -============== - -classification_error --------------------- -.. automodule:: paddle.v2.evaluator - :members: classification_error - :noindex: - -auc ---- -.. automodule:: paddle.v2.evaluator - :members: auc - :noindex: - -ctc_error ---------- -.. automodule:: paddle.v2.evaluator - :members: ctc_error - :noindex: - -chunk ------ -.. automodule:: paddle.v2.evaluator - :members: chunk - :noindex: - -precision_recall ----------------- -.. automodule:: paddle.v2.evaluator - :members: precision_recall - :noindex: - -Rank -==== - -pnpair ------- -.. automodule:: paddle.v2.evaluator - :members: pnpair - :noindex: - -Utils -===== - -sum ---- -.. automodule:: paddle.v2.evaluator - :members: sum - :noindex: - -column_sum ----------- -.. automodule:: paddle.v2.evaluator - :members: column_sum - :noindex: - -Print -===== - -classification_error_printer ----------------------------- -.. automodule:: paddle.v2.evaluator - :members: classification_error_printer - :noindex: - -gradient_printer ----------------- -.. automodule:: paddle.v2.evaluator - :members: gradient_printer - :noindex: - -maxid_printer -------------- -.. automodule:: paddle.v2.evaluator - :members: maxid_printer - :noindex: - -maxframe_printer ----------------- -.. 
automodule:: paddle.v2.evaluator - :members: maxframe_printer - :noindex: - -seqtext_printer ---------------- -.. automodule:: paddle.v2.evaluator - :members: seqtext_printer - :noindex: - -value_printer -------------- -.. automodule:: paddle.v2.evaluator - :members: value_printer - :noindex: - -Detection -========== - -detection_map -------------- -.. automodule:: paddle.v2.evaluator - :members: detection_map - :noindex: diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst deleted file mode 100644 index 5a0cfadfce84df41defdf518b7c3a6222d5b30a1..0000000000000000000000000000000000000000 --- a/doc/v2/api/config/layer.rst +++ /dev/null @@ -1,552 +0,0 @@ -.. _api_v2.layer: - -====== -Layers -====== - -Data layer -=========== - -.. _api_v2.layer_data: - -data ----- -.. autofunction:: paddle.v2.layer.data - :noindex: - -Fully Connected Layers -====================== - -.. _api_v2.layer_fc: - -fc --- -.. autofunction:: paddle.v2.layer.fc - :noindex: - -selective_fc ------------- -.. autofunction:: paddle.v2.layer.selective_fc - :noindex: - -Conv Layers -=========== - -conv_operator -------------- -.. autofunction:: paddle.v2.layer.conv_operator - :noindex: - -conv_projection ---------------- -.. autofunction:: paddle.v2.layer.conv_projection - :noindex: - -conv_shift ----------- -.. autofunction:: paddle.v2.layer.conv_shift - :noindex: - -img_conv --------- -.. autofunction:: paddle.v2.layer.img_conv - :noindex: - -.. _api_v2.layer_context_projection: - -context_projection ------------------- -.. autofunction:: paddle.v2.layer.context_projection - :noindex: - -row_conv --------- -.. autofunction:: paddle.v2.layer.row_conv - :noindex: - -Image Pooling Layer -=================== - -img_pool --------- -.. autofunction:: paddle.v2.layer.img_pool - :noindex: - -spp ---- -.. autofunction:: paddle.v2.layer.spp - :noindex: - -maxout ------- -.. autofunction:: paddle.v2.layer.maxout - :noindex: - -roi_pool --------- -.. autofunction:: paddle.v2.layer.roi_pool - :noindex: - -pad ----- -.. autofunction:: paddle.v2.layer.pad - :noindex: - -Norm Layer -========== - -img_cmrnorm ------------ -.. autofunction:: paddle.v2.layer.img_cmrnorm - :noindex: - -batch_norm ----------- -.. autofunction:: paddle.v2.layer.batch_norm - :noindex: - -sum_to_one_norm ---------------- -.. autofunction:: paddle.v2.layer.sum_to_one_norm - :noindex: - -cross_channel_norm ------------------- -.. autofunction:: paddle.v2.layer.cross_channel_norm - :noindex: - -row_l2_norm ------------ -.. autofunction:: paddle.v2.layer.row_l2_norm - :noindex: - -Recurrent Layers -================ - -recurrent ---------- -.. autofunction:: paddle.v2.layer.recurrent - :noindex: - -lstmemory ---------- -.. autofunction:: paddle.v2.layer.lstmemory - :noindex: - -grumemory ---------- -.. autofunction:: paddle.v2.layer.grumemory - :noindex: - -gated_unit ------------ -.. autofunction:: paddle.v2.layer.gated_unit - :noindex: - -Recurrent Layer Group -===================== - -memory ------- -.. autofunction:: paddle.v2.layer.memory - :noindex: - -recurrent_group ---------------- -.. autofunction:: paddle.v2.layer.recurrent_group - :noindex: - -lstm_step ---------- -.. autofunction:: paddle.v2.layer.lstm_step - :noindex: - -gru_step --------- -.. autofunction:: paddle.v2.layer.gru_step - :noindex: - -beam_search ------------- -.. autofunction:: paddle.v2.layer.beam_search - :noindex: - -get_output ----------- -.. autofunction:: paddle.v2.layer.get_output - :noindex: - -Mixed Layer -=========== - -.. _api_v2.layer_mixed: - -mixed ------ -.. 
autofunction:: paddle.v2.layer.mixed - :noindex: - -.. _api_v2.layer_embedding: - -embedding ---------- -.. autofunction:: paddle.v2.layer.embedding - :noindex: - -scaling_projection ------------------- -.. autofunction:: paddle.v2.layer.scaling_projection - :noindex: - -dotmul_projection ------------------ -.. autofunction:: paddle.v2.layer.dotmul_projection - :noindex: - -dotmul_operator ---------------- -.. autofunction:: paddle.v2.layer.dotmul_operator - :noindex: - -full_matrix_projection ----------------------- -.. autofunction:: paddle.v2.layer.full_matrix_projection - :noindex: - -identity_projection -------------------- -.. autofunction:: paddle.v2.layer.identity_projection - :noindex: - -slice_projection -------------------- -.. autofunction:: paddle.v2.layer.slice_projection - :noindex: - -table_projection ----------------- -.. autofunction:: paddle.v2.layer.table_projection - :noindex: - -trans_full_matrix_projection ----------------------------- -.. autofunction:: paddle.v2.layer.trans_full_matrix_projection - :noindex: - -Aggregate Layers -================ - -AggregateLevel --------------- -.. autoclass:: paddle.v2.layer.AggregateLevel - :noindex: - -.. _api_v2.layer_pooling: - -pooling -------- -.. autofunction:: paddle.v2.layer.pooling - :noindex: - -.. _api_v2.layer_last_seq: - -last_seq --------- -.. autofunction:: paddle.v2.layer.last_seq - :noindex: - -.. _api_v2.layer_first_seq: - -first_seq ---------- -.. autofunction:: paddle.v2.layer.first_seq - :noindex: - -sub_seq ---------- -.. autofunction:: paddle.v2.layer.sub_seq - :noindex: - -concat ------- -.. autofunction:: paddle.v2.layer.concat - :noindex: - -seq_concat ----------- -.. autofunction:: paddle.v2.layer.seq_concat - :noindex: - -seq_slice ---------- -.. autofunction:: paddle.v2.layer.seq_slice - :noindex: - -sub_nested_seq --------------- -.. autofunction:: paddle.v2.layer.sub_nested_seq - :noindex: - -Reshaping Layers -================ - -block_expand ------------- -.. autofunction:: paddle.v2.layer.block_expand - :noindex: - -.. _api_v2.layer_expand: - -ExpandLevel ------------ -.. autoclass:: paddle.v2.layer.ExpandLevel - :noindex: - -expand ------- -.. autofunction:: paddle.v2.layer.expand - :noindex: - -repeat ------- -.. autofunction:: paddle.v2.layer.repeat - :noindex: - -rotate ------- -.. autofunction:: paddle.v2.layer.rotate - :noindex: - -seq_reshape ------------ -.. autofunction:: paddle.v2.layer.seq_reshape - :noindex: - -Math Layers -=========== - -addto ------ -.. autofunction:: paddle.v2.layer.addto - :noindex: - -linear_comb ------------ -.. autofunction:: paddle.v2.layer.linear_comb - :noindex: - -interpolation -------------- -.. autofunction:: paddle.v2.layer.interpolation - :noindex: - -bilinear_interp ---------------- -.. autofunction:: paddle.v2.layer.bilinear_interp - :noindex: - -dropout --------- -.. autofunction:: paddle.v2.layer.dropout - :noindex: - -dot_prod ---------- -.. autofunction:: paddle.v2.layer.dot_prod - :noindex: - -out_prod --------- -.. autofunction:: paddle.v2.layer.out_prod - :noindex: - -power ------ -.. autofunction:: paddle.v2.layer.power - :noindex: - -scaling -------- -.. autofunction:: paddle.v2.layer.scaling - :noindex: - -clip ----- -.. autofunction:: paddle.v2.layer.clip - :noindex: - -resize ------- -.. autofunction:: paddle.v2.layer.resize - :noindex: - -slope_intercept ---------------- -.. autofunction:: paddle.v2.layer.slope_intercept - :noindex: - -tensor ------- -.. autofunction:: paddle.v2.layer.tensor - :noindex: - -.. 
_api_v2.layer_cos_sim: - -cos_sim -------- -.. autofunction:: paddle.v2.layer.cos_sim - :noindex: - -l2_distance ------------ -.. autofunction:: paddle.v2.layer.l2_distance - :noindex: - -trans ------ -.. autofunction:: paddle.v2.layer.trans - :noindex: - -scale_shift ------------ -.. autofunction:: paddle.v2.layer.scale_shift - :noindex: - -factorization_machine ---------------------- -.. autofunction:: paddle.v2.layer.factorization_machine - :noindex: - -Sampling Layers -=============== - -maxid ------ -.. autofunction:: paddle.v2.layer.max_id - :noindex: - -sampling_id ------------ -.. autofunction:: paddle.v2.layer.sampling_id - :noindex: - -multiplex ---------- -.. autofunction:: paddle.v2.layer.multiplex - :noindex: - -.. _api_v2.layer_costs: - -Cost Layers -=========== - -cross_entropy_cost ------------------- -.. autofunction:: paddle.v2.layer.cross_entropy_cost - :noindex: - -cross_entropy_with_selfnorm_cost --------------------------------- -.. autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost - :noindex: - -multi_binary_label_cross_entropy_cost -------------------------------------- -.. autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost - :noindex: - -classification_cost -------------------- -.. autofunction:: paddle.v2.layer.classification_cost - :noindex: - -huber_regression_cost -------------------------- -.. autofunction:: paddle.v2.layer.huber_regression_cost - :noindex: - -huber_classification_cost -------------------------- -.. autofunction:: paddle.v2.layer.huber_classification_cost - :noindex: - -lambda_cost ------------ -.. autofunction:: paddle.v2.layer.lambda_cost - :noindex: - -square_error_cost ------------------ -.. autofunction:: paddle.v2.layer.square_error_cost - :noindex: - -rank_cost ---------- -.. autofunction:: paddle.v2.layer.rank_cost - :noindex: - -sum_cost ---------- -.. autofunction:: paddle.v2.layer.sum_cost - :noindex: - -crf ---- -.. autofunction:: paddle.v2.layer.crf - :noindex: - -crf_decoding ------------- -.. autofunction:: paddle.v2.layer.crf_decoding - :noindex: - -ctc ---- -.. autofunction:: paddle.v2.layer.ctc - :noindex: - -warp_ctc --------- -.. autofunction:: paddle.v2.layer.warp_ctc - :noindex: - -nce ---- -.. autofunction:: paddle.v2.layer.nce - :noindex: - -hsigmoid ---------- -.. autofunction:: paddle.v2.layer.hsigmoid - :noindex: - -smooth_l1_cost --------------- -.. autofunction:: paddle.v2.layer.smooth_l1_cost - :noindex: - -multibox_loss --------------- -.. autofunction:: paddle.v2.layer.multibox_loss - :noindex: - -detection_output ----------------- -.. autofunction:: paddle.v2.layer.detection_output - :noindex: - -Check Layer -============ - -eos ---- -.. autofunction:: paddle.v2.layer.eos - :noindex: - -Activation -========== - -prelu --------- -.. autofunction:: paddle.v2.layer.prelu - :noindex: diff --git a/doc/v2/api/config/networks.rst b/doc/v2/api/config/networks.rst deleted file mode 100644 index 048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1..0000000000000000000000000000000000000000 --- a/doc/v2/api/config/networks.rst +++ /dev/null @@ -1,132 +0,0 @@ -======== -Networks -======== - -The v2.networks module contains pieces of neural network that combine multiple layers. - -NLP -=== - -sequence_conv_pool ------------------- -.. automodule:: paddle.v2.networks - :members: sequence_conv_pool - :noindex: - -.. _api_trainer_config_helpers_network_text_conv_pool: - -text_conv_pool --------------- -.. 
automodule:: paddle.v2.networks - :members: text_conv_pool - :noindex: - -Images -====== - -img_conv_bn_pool ----------------- -.. automodule:: paddle.v2.networks - :members: img_conv_bn_pool - :noindex: - -img_conv_group --------------- -.. automodule:: paddle.v2.networks - :members: img_conv_group - :noindex: - -.. _api_trainer_config_helpers_network_simple_img_conv_pool: - -simple_img_conv_pool --------------------- -.. automodule:: paddle.v2.networks - :members: simple_img_conv_pool - :noindex: - -small_vgg ---------- -.. automodule:: paddle.v2.networks - :members: small_vgg - :noindex: - -vgg_16_network ---------------- -.. automodule:: paddle.v2.networks - :members: vgg_16_network - :noindex: - -Recurrent -========= - -LSTM ----- - -lstmemory_unit -`````````````` -.. automodule:: paddle.v2.networks - :members: lstmemory_unit - :noindex: - -lstmemory_group -``````````````` -.. automodule:: paddle.v2.networks - :members: lstmemory_group - :noindex: - -simple_lstm -``````````` -.. automodule:: paddle.v2.networks - :members: simple_lstm - :noindex: - -bidirectional_lstm -`````````````````` -.. automodule:: paddle.v2.networks - :members: bidirectional_lstm - :noindex: - -GRU ---- - -gru_unit -```````` -.. automodule:: paddle.v2.networks - :members: gru_unit - :noindex: - -gru_group -````````` -.. automodule:: paddle.v2.networks - :members: gru_group - :noindex: - -simple_gru -`````````` -.. automodule:: paddle.v2.networks - :members: simple_gru - :noindex: - -simple_gru2 -``````````` -.. automodule:: paddle.v2.networks - :members: simple_gru2 - :noindex: - -bidirectional_gru -`````````````````` -.. automodule:: paddle.v2.networks - :members: bidirectional_gru - :noindex: - -simple_attention ----------------- -.. automodule:: paddle.v2.networks - :members: simple_attention - :noindex: - -dot_product_attention ---------------------- -.. automodule:: paddle.v2.networks - :members: dot_product_attention - :noindex: diff --git a/doc/v2/api/config/optimizer.rst b/doc/v2/api/config/optimizer.rst deleted file mode 100644 index b32373fdef52a7aa9d64b12cda3f76cb2abf351b..0000000000000000000000000000000000000000 --- a/doc/v2/api/config/optimizer.rst +++ /dev/null @@ -1,45 +0,0 @@ -========== -Optimizer -========== - -Momentum -======== -.. automodule:: paddle.v2.optimizer - :members: Momentum - :noindex: - -Adam -==== -.. automodule:: paddle.v2.optimizer - :members: Adam - :noindex: - -Adamax -====== -.. automodule:: paddle.v2.optimizer - :members: Adamax - :noindex: - -AdaGrad -======= -.. automodule:: paddle.v2.optimizer - :members: AdaGrad - :noindex: - -DecayedAdaGrad -============== -.. automodule:: paddle.v2.optimizer - :members: DecayedAdaGrad - :noindex: - -AdaDelta -======== -.. automodule:: paddle.v2.optimizer - :members: AdaDelta - :noindex: - -RMSProp -======= -.. automodule:: paddle.v2.optimizer - :members: RMSProp - :noindex: diff --git a/doc/v2/api/config/pooling.rst b/doc/v2/api/config/pooling.rst deleted file mode 100644 index d26b365c9284632210a1532853e39feedc70758b..0000000000000000000000000000000000000000 --- a/doc/v2/api/config/pooling.rst +++ /dev/null @@ -1,46 +0,0 @@ -======= -Pooling -======= - -BasePool -======== -.. automodule:: paddle.v2.pooling - :members: BasePool - :noindex: - -Avg -=== -.. automodule:: paddle.v2.pooling - :members: Avg - :noindex: - -Max -=== -.. automodule:: paddle.v2.pooling - :members: Max - :noindex: - -Sum -=== -.. automodule:: paddle.v2.pooling - :members: Sum - :noindex: - -SquareRootN -=========== -.. 
automodule:: paddle.v2.pooling - :members: SquareRootN - :noindex: - -CudnnAvg -======== -.. automodule:: paddle.v2.pooling - :members: CudnnAvg - :noindex: - -CudnnMax -======== -.. automodule:: paddle.v2.pooling - :members: CudnnMax - :noindex: - diff --git a/doc/v2/api/data.rst b/doc/v2/api/data.rst deleted file mode 100644 index b56c7332cc284649c7e04328e51a7faa78593a39..0000000000000000000000000000000000000000 --- a/doc/v2/api/data.rst +++ /dev/null @@ -1,10 +0,0 @@ -================================== -Data Reader Interface and DataSets -================================== - -.. toctree:: - :maxdepth: 1 - - data/data_reader.rst - data/image.rst - data/dataset.rst diff --git a/doc/v2/api/data/data_reader.rst b/doc/v2/api/data/data_reader.rst deleted file mode 100644 index 1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0..0000000000000000000000000000000000000000 --- a/doc/v2/api/data/data_reader.rst +++ /dev/null @@ -1,72 +0,0 @@ -===================== -Data Reader Interface -===================== - - -DataTypes -========= - -.. autofunction:: paddle.v2.data_type.dense_array - :noindex: - -.. autofunction:: paddle.v2.data_type.integer_value - :noindex: - -.. autofunction:: paddle.v2.data_type.integer_value_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.integer_value_sub_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_binary_vector - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_float_vector - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_float_vector_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_non_value_slot - :noindex: - -.. autofunction:: paddle.v2.data_type.sparse_value_slot - :noindex: - -.. autoclass:: paddle.v2.data_type.InputType - :members: - :noindex: - -DataFeeder -========== - -.. automodule:: paddle.v2.data_feeder - :members: - :noindex: - -Reader -====== - -.. automodule:: paddle.reader - :members: - :noindex: - -.. automodule:: paddle.reader.creator - :members: - :noindex: - -minibatch -========= - -.. automodule:: paddle.v2.minibatch - :members: - :noindex: diff --git a/doc/v2/api/data/dataset.rst b/doc/v2/api/data/dataset.rst deleted file mode 100644 index e7c8be4452bf55e0967d750c2e624e8e316e9330..0000000000000000000000000000000000000000 --- a/doc/v2/api/data/dataset.rst +++ /dev/null @@ -1,82 +0,0 @@ -Dataset -======= - -.. automodule:: paddle.dataset - :members: - :noindex: - -mnist -+++++ - -.. automodule:: paddle.dataset.mnist - :members: - :noindex: - -cifar -+++++ - -.. automodule:: paddle.dataset.cifar - :members: - :noindex: - -conll05 -+++++++ - -.. automodule:: paddle.dataset.conll05 - :members: get_dict,get_embedding,test - :noindex: - -imdb -++++ - -.. automodule:: paddle.dataset.imdb - :members: - :noindex: - -imikolov -++++++++ - -.. automodule:: paddle.dataset.imikolov - :members: - :noindex: - -movielens -+++++++++ - -.. automodule:: paddle.dataset.movielens - :members: - :noindex: - -.. autoclass:: paddle.dataset.movielens.MovieInfo - :noindex: - -.. autoclass:: paddle.dataset.movielens.UserInfo - :noindex: - -sentiment -+++++++++ - -.. automodule:: paddle.dataset.sentiment - :members: - :noindex: - -uci_housing -+++++++++++ - -.. automodule:: paddle.dataset.uci_housing - :members: - :noindex: - -wmt14 -+++++ - -.. 
automodule:: paddle.dataset.wmt14
-   :members:
-   :noindex:
-
-wmt16
-+++++
-
-.. automodule:: paddle.dataset.wmt16
-   :members:
-   :noindex:
diff --git a/doc/v2/api/data/image.rst b/doc/v2/api/data/image.rst deleted file mode 100644 index 97651ffa6be56cf3ecaca2caca38a353fa5c1f49..0000000000000000000000000000000000000000 --- a/doc/v2/api/data/image.rst +++ /dev/null @@ -1,5 +0,0 @@
-Image Interface
-===============
-
-.. automodule:: paddle.v2.image
-   :members:
diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst deleted file mode 100644 index 5813509dce46677444f0234db8e0eaa4f113e3a0..0000000000000000000000000000000000000000 --- a/doc/v2/api/index_en.rst +++ /dev/null @@ -1,9 +0,0 @@
-API
-===
-
-.. toctree::
-  :maxdepth: 1
-
-  model_configs.rst
-  data.rst
-  run_logic.rst
diff --git a/doc/v2/api/model_configs.rst b/doc/v2/api/model_configs.rst deleted file mode 100644 index 992b559cbd87244612521d4c96f84f997d6c4196..0000000000000000000000000000000000000000 --- a/doc/v2/api/model_configs.rst +++ /dev/null @@ -1,13 +0,0 @@
-Model Configuration
-===================
-
-.. toctree::
-  :maxdepth: 1
-
-  config/activation.rst
-  config/layer.rst
-  config/evaluators.rst
-  config/optimizer.rst
-  config/pooling.rst
-  config/networks.rst
-  config/attr.rst
diff --git a/doc/v2/api/overview.rst b/doc/v2/api/overview.rst deleted file mode 100644 index a6f21428de1e4906e4af9433bc1c994f2b2c8b8e..0000000000000000000000000000000000000000 --- a/doc/v2/api/overview.rst +++ /dev/null @@ -1,12 +0,0 @@
-V2 API Overview
-================
-
-The PaddlePaddle V2 API is designed to provide a modern user interface for PaddlePaddle V1 (the original layer-based platform of PaddlePaddle).
-It introduces high-level concepts such as `Layers `_ , `Optimizer `_ , `Evaluator `_ and `Data Reader `_ to make model configuration more familiar to users.
-
-A model is composed of the computation described by a group of `Layers`, with an `Evaluator` to define the error, an `Optimizer` to update the parameters and a `Data Reader` to feed in the data.
-
-We also provide an `interface for Training and Inference `_ to help control the training and inference phases.
-It offers several easy-to-use methods that expose the internal running details, and different `events `_ are made available to users through callbacks.
-
-All in all, the V2 API gives a higher level of abstraction and makes PaddlePaddle programs require fewer lines of code.
diff --git a/doc/v2/api/run_logic.rst b/doc/v2/api/run_logic.rst deleted file mode 100644 index 5c97651f6536d89d2b5926d4b2907a547aa86b55..0000000000000000000000000000000000000000 --- a/doc/v2/api/run_logic.rst +++ /dev/null @@ -1,31 +0,0 @@
-======================
-Training and Inference
-======================
-
-Parameters
-==========
-
-.. automodule:: paddle.v2.parameters
-   :members: Parameters
-   :noindex:
-
-Trainer
-=======
-
-.. automodule:: paddle.v2.trainer
-   :members: SGD
-   :noindex:
-
-Event
-=====
-
-.. automodule:: paddle.v2.event
-   :members:
-   :noindex:
-
-Inference
-=========
-
-.. autofunction:: paddle.v2.infer
-   :noindex:
- \ No newline at end of file
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst deleted file mode 100644 index d0dacb104f148c2aeb323365cbd6f014ae00ed5a..0000000000000000000000000000000000000000 --- a/doc/v2/build_and_install/build_from_source_cn.rst +++ /dev/null @@ -1,225 +0,0 @@
-从源码编译
-======================
-
-.. _requirements:
-
-需要的软硬件
-----------------
-
-为了编译PaddlePaddle,我们需要
-
-1.
一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统 -2. Docker - -不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。 - -.. _build_step: - -编译方法 ----------------- - -PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 -可以在 `这里 `__ 找到,您也可以 -在 `这里 `__ 找到 paddle_manylinux_devel -镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。 - -如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。 - -编译PaddlePaddle,需要执行: - -.. code-block:: bash - - # 1. 获取源码 - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - # 2. 可选步骤:源码中构建用于编译PaddlePaddle的Docker镜像 - docker build -t paddle:dev . - # 3. 执行下面的命令编译CPU-Only的二进制 - docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build - # 4. 或者也可以使用为上述可选步骤构建的镜像(必须先执行第2步) - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build - -注: - -- 上述命令把当前目录(源码树根目录)映射为 container 里的 :code:`/paddle` 目录。 - -- 如果您使用的是 manylinux 的镜像进行编译, 那么您需要通过环境变量 :code:`PYTHON_ABI` 来指定一个 `Python ABI `__. -PaddlePaddle目前支持的 Python ABI 有 :code:`cp27-cp27m` 和 :code:`cp27-cp27mu`. - -编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: - -.. code-block:: bash - - pip install build/python/dist/*.whl - -如果机器中已经安装过PaddlePaddle,有两种方法: - -.. code-block:: bash - - 1. 先卸载之前的版本,再重新安装 - pip uninstall paddlepaddle - pip install build/python/dist/*.whl - - 2. 直接升级到更新的版本 - pip install build/python/dist/*.whl -U - -.. _run_test: - -执行单元测试 ----------------- - -如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: - -设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 -开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 - -.. code-block:: bash - - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test - -如果期望执行其中一个单元测试,(比如 :code:`test_sum_op` ): - -.. code-block:: bash - - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash - ./paddle/scripts/paddle_build.sh build - cd build - ctest -R test_sum_op -V - -.. _faq_docker: - -常见问题 ----------------- - -- 什么是 Docker? - - 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 - -- Docker 还是虚拟机? - - 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 - -- 为什么用 Docker? - - 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 - - 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 - -- 我可以选择不用Docker吗? - - 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 - -- 学习 Docker 有多难? - - 理解 Docker 并不难,大概花十分钟看一下 `如何使用Docker `_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 - -- 我可以用 IDE 吗? - - 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 - - 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 - - .. code-block:: emacs - - (global-set-key "\C-cc" 'compile) - (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") - - 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 - -- 可以并行编译吗? 
- - 是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 `_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 - -- Docker 需要 sudo - - 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 - -- 在 Windows/MacOS 上编译很慢 - - Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 `_ 。 - -- 磁盘不够 - - 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `如何删除Docker Container `_ 来清理这些内容。 - - -.. _compile_deps: - -附录:编译依赖 ----------------- - -PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 - -.. csv-table:: PaddlePaddle编译依赖 - :header: "依赖", "版本", "说明" - :widths: 10, 15, 30 - - "CMake", ">=3.2", "" - "GCC", "4.8.2", "推荐使用CentOS的devtools2" - "Python", "2.7.x", "依赖libpython2.7.so" - "pip", ">=9.0", "" - "numpy", "", "" - "SWIG", ">=2.0", "" - "Go", ">=1.8", "可选" - - -.. _build_options: - -附录:编译选项 ----------------- - -PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 -用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 -`官方文档 `_ 。 - -在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: - -.. code-block:: bash - - cmake .. -DWITH_GPU=OFF - -.. csv-table:: 编译选项说明 - :header: "选项", "说明", "默认值" - :widths: 1, 7, 2 - - "WITH_GPU", "是否支持GPU", "ON" - "WITH_C_API", "是否仅编译CAPI", "OFF" - "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" - "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" - "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" - "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" - "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" - "WITH_TESTING", "是否开启单元测试", "OFF" - "WITH_DOC", "是否编译中英文文档", "OFF" - "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" - "WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF" - "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" - -BLAS -+++++ - -PaddlePaddle支持 `MKL `_ 和 -`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, -还会下载MKL-DNN数学库,详细参考 `mkldnn设计文档 `_ 。 - -如果关闭MKL,则会使用OpenBLAS作为BLAS库。 - -CUDA/cuDNN -+++++++++++ - -PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 -使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 - -PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 -我们推荐使用最新版本的cuDNN。 - -编译选项的设置 -++++++++++++++ - -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 - -.. code-block:: bash - - cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 - -**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst deleted file mode 100644 index 664b68da8b7dd3e005ebf3ec34de77729e5ab355..0000000000000000000000000000000000000000 --- a/doc/v2/build_and_install/build_from_source_en.rst +++ /dev/null @@ -1,237 +0,0 @@ -Build from Sources -========================== - -.. _requirements: - -Requirements ----------------- - -To build PaddlePaddle, you need - -1. A computer -- Linux, Windows, MacOS. -2. Docker. - -Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. -We run all the tools by running this image. - -.. _build_step: - -How To Build ----------------- - -You need to use Docker to build PaddlePaddle -to avoid installing dependencies by yourself. 
We have several pre-built -Docker images `here `_ , -you can also find how to build and use paddle_manylinux_devel Docker image from -`here `__ -Or you can build your own image from source as the optional step below: - -If you don't wish to use docker,you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation. - -.. code-block:: bash - - # 1. clone the source code - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - # 2. Optional: build development docker image from source - docker build -t paddle:dev . - # 3. Run the following command to build a CPU-Only binaries - docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build - # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2) - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build - -NOTE: - -- The above command try to mount the current working directory (root directory of source code) -into :code:`/paddle` directory inside docker container. - -- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI `__. -Currently PaddlePaddle supported Python ABIs include :code:`cp27-cp27m` and :code:`cp27-cp27mu` . - -When the compile finishes, you can get the output whl package under -build/python/dist, then you can choose to install the whl on local -machine or copy it to the target machine. - -.. code-block:: bash - - pip install build/python/dist/*.whl - -If the machine has installed PaddlePaddle before, there are two methods: - -.. code-block:: bash - - 1. uninstall and reinstall - pip uninstall paddlepaddle - pip install build/python/dist/*.whl - - 2. upgrade directly - pip install build/python/dist/*.whl -U - -.. _run_test: - -Run Tests ----------------- - -If you wish to run the tests, you may follow the below steps: - -When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build. -Set :code:`WITH_GPU=ON` Can also run tests on GPU. - -.. code-block:: bash - - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test - -If you wish to run only one unit test, like :code:`test_sum_op`: - -.. code-block:: bash - - docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash - ./paddle/scripts/paddle_build.sh build - cd build - ctest -R test_sum_op -V - -.. _faq_docker: - -Frequently Asked Questions ---------------------------- - -- What is Docker? - - If you haven't heard of it, consider it something like Python's virtualenv. - -- Docker or virtual machine? - - Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance. - -- Why Docker? - - Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help. - - Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want. - -- Can I choose not to use Docker? 
-
-  Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer. This document exists because Docker makes development much easier.
-
-- How difficult is it to learn Docker?
-
-  It takes about ten minutes to read `an introductory article `_ , and it saves you more than an hour of installing and configuring the required build tools, especially when new versions of PaddlePaddle require new tools. Not to mention the time saved when other people try to reproduce an issue you hit.
-
-- Can I use my favorite IDE?
-
-  Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like.
-
-  Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
-
-  .. code-block:: emacs
-
-     (global-set-key "\C-cc" 'compile)
-     (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-
-  so they can type `Ctrl-C` and `c` to build PaddlePaddle from source.
-
-- Does Docker do parallel building?
-
-  Our build Docker image runs a `Bash script `_ , which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
-
-- Docker requires sudo
-
-  The owner of a computer has administrative privileges, a.k.a. sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo.
-
-- Docker on Windows/MacOS builds slowly
-
-  On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM more memory and CPUs to make the build efficient. Please refer to `this issue `_ for details.
-
-- Not enough disk space
-
-  Examples in this article use the `--rm` option with the `docker run` command. This option ensures that stopped containers are not left on disk. We can use `docker ps -a` to list all containers, including stopped ones. Sometimes `docker build` generates intermediate dangling images, which also take disk space. To clean them, please refer to `this article `_ .
-
-.. _compile_deps:
-
-Appendix: Compile Dependencies
--------------------------------
-
-PaddlePaddle needs the following dependencies when compiling; other dependencies
-will be downloaded automatically.
-
-.. csv-table:: PaddlePaddle Compile Dependencies
-   :header: "Dependency", "Version", "Description"
-   :widths: 10, 15, 30
-
-   "CMake", ">=3.2", ""
-   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
-   "Python", "2.7.x", "Need libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
-   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "Optional"
-
-
-.. _build_options:
-
-Appendix: Build Options
--------------------------
-
-Build options include whether to build binaries for CPU or GPU, which BLAS
-library to use, etc. You may pass these settings when running cmake.
-For a detailed cmake tutorial please refer to `here `__ .
-
-
-You can add the :code:`-D` argument to pass such options, like:
-
-.. code-block:: bash
-
-   cmake .. -DWITH_GPU=OFF
-
-..
csv-table:: Bool Type Options - :header: "Option", "Description", "Default" - :widths: 1, 7, 2 - - "WITH_GPU", "Build with GPU support", "ON" - "WITH_C_API", "Build only CAPI", "OFF" - "WITH_DOUBLE", "Build with double precision", "OFF" - "WITH_DSO", "Dynamically load CUDA libraries", "ON" - "WITH_AVX", "Build with AVX support", "ON" - "WITH_PYTHON", "Build with integrated Python interpreter", "ON" - "WITH_STYLE_CHECK", "Check code style when building", "ON" - "WITH_TESTING", "Build unit tests", "OFF" - "WITH_DOC", "Build documentations", "OFF" - "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" - "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF" - "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" - - -BLAS -+++++ - -PaddlePaddle supports `MKL `_ and -`OpenBlAS `_ as BLAS library。By default it uses MKL. -If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded -and used, for more `details `_ . - -If you choose not to use MKL, then OpenBlAS will be used. - -CUDA/cuDNN -+++++++++++ - -PaddlePaddle will automatically find CUDA and cuDNN when compiling and running. -parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture -automatically in order to speed up the build. - -PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to -keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN -you built. - -Pass Compile Options -++++++++++++++++++++++ - -You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. -When running cmake command, it will search system paths like -:code:`/usr/lib:/usr/local/lib` and then search paths that you -passed to cmake, i.e. - -.. code-block:: bash - - cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 - -**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst deleted file mode 100644 index 106c86bace075764c84bc2a7f7cb09d466fa8794..0000000000000000000000000000000000000000 --- a/doc/v2/build_and_install/docker_install_cn.rst +++ /dev/null @@ -1,146 +0,0 @@ -使用Docker安装运行 -================================ - -使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 -您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 - -如果您在使用Windows,可以参考 -`这篇 `_ -教程,完成在Windows上安装和使用Docker。 - -在了解Docker的基本使用方法之后,即可开始下面的步骤: - -.. _docker_pull: - -获取PaddlePaddle的Docker镜像 ------------------------------- - -执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl: - - .. code-block:: bash - - docker pull paddlepaddle/paddle - -对于国内用户,我们提供了加速访问的镜像源: - - .. code-block:: bash - - docker pull docker.paddlepaddlehub.com/paddle - -下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像: - - .. code-block:: bash - - docker pull paddlepaddle/paddle:latest-gpu - docker pull docker.paddlepaddlehub.com/paddle:latest-gpu - -选择下载使用不同的BLAS库的Docker镜像: - - .. code-block:: bash - - # 默认是使用MKL的镜像 - docker pull paddlepaddle/paddle - # 使用OpenBLAS的镜像 - docker pull paddlepaddle/paddle:latest-openblas - -下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: - - .. code-block:: bash - - docker pull paddlepaddle/paddle:[tag] - # 比如: - docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu - -.. 
_docker_run: - -在Docker中执行PaddlePaddle训练程序 ----------------------------------- - -假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 -`PaddlePaddleBook `_ -编写),就可以使用下面的命令开始执行训练: - - .. code-block:: bash - - cd /home/work - docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py - -上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` -指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` -目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` -为容器内执行的命令,即运行训练程序。 - -当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: - - .. code-block:: bash - - docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash - cd /work - python train.py - -**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** - -.. _docker_run_book: - -使用Docker启动PaddlePaddle Book教程 ------------------------------------ - -使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 -PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 -如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 -大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 - -我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: - - .. code-block:: bash - - docker run -p 8888:8888 paddlepaddle/book - -国内用户可以使用下面的镜像源来加速访问: - - .. code-block:: bash - - docker run -p 8888:8888 docker.paddlepaddlehub.com/book - -然后在浏览器中输入以下网址: - - .. code-block:: text - - http://localhost:8888/ - -就这么简单,享受您的旅程! - -.. _docker_run_gpu: - -使用Docker执行GPU训练 ------------------------------- - -为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 -`nvidia-docker `_ 来运行镜像。 -请不要忘记提前在物理机上安装GPU最新驱动。 - - .. code-block:: bash - - nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash - -**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** - - .. code-block:: bash - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu - -**关于AVX:** - -AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 -是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 -`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 - -以下指令能检查Linux电脑是否支持AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - -如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst deleted file mode 100644 index 25aecb8d0da9feb00006da6259b529b7011d91cb..0000000000000000000000000000000000000000 --- a/doc/v2/build_and_install/docker_install_en.rst +++ /dev/null @@ -1,153 +0,0 @@ -Run in Docker Containers -================================= - -Run PaddlePaddle in Docker container so that you don't need to care about -runtime dependencies, also you can run under Windows system. You can get -tutorials at `here `_ . - -If you are using Windows, please refer to -`this `_ -tutorial to start running docker under windows. - -After you've read above tutorials you may proceed the following steps. - -.. _docker_pull: - -Pull PaddlePaddle Docker Image ------------------------------- - -Run the following command to download the latest Docker images, the version is cpu_avx_mkl: - - .. code-block:: bash - - docker pull paddlepaddle/paddle - -For users in China, we provide a faster mirror: - - .. code-block:: bash - - docker pull docker.paddlepaddlehub.com/paddle - -Download GPU version (cuda8.0_cudnn5_avx_mkl) images: - - .. 
code-block:: bash
-
-     docker pull paddlepaddle/paddle:latest-gpu
-     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
-
-Choose between images with different BLAS versions:
-
-  .. code-block:: bash
-
-     # image using MKL by default
-     docker pull paddlepaddle/paddle
-     # image using OpenBLAS
-     docker pull paddlepaddle/paddle:latest-openblas
-
-
-If you want to use legacy versions, choose a tag from
-`DockerHub `_
-and run:
-
-  .. code-block:: bash
-
-     docker pull paddlepaddle/paddle:[tag]
-     # e.g.
-     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
-
-.. _docker_run:
-
-Launch your training program in Docker
---------------------------------------
-
-Assume that you have already written a PaddlePaddle program
-named :code:`train.py` under the directory :code:`/home/work` (refer to
-`PaddlePaddleBook `_
-for more samples), then run the following command:
-
-  .. code-block:: bash
-
-     cd /home/work
-     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
-
-In the above command, :code:`-it` runs the container interactively;
-:code:`-v $PWD:/work` mounts the current directory ($PWD expands to the
-current absolute path in Linux) under :code:`/work` in the container;
-:code:`paddlepaddle/paddle` specifies the image to use; finally,
-:code:`/work/train.py` is the command to run inside the container.
-
-Also, you can go into the container shell and run or debug your code
-interactively:
-
-  .. code-block:: bash
-
-     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
-     cd /work
-     python train.py
-
-**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
-
-.. _docker_run_book:
-
-PaddlePaddle Book
-------------------
-
-You can create a container serving the PaddlePaddle Book as a Jupyter Notebook
-in one minute using Docker. The PaddlePaddle Book is an interactive Jupyter
-Notebook for users and developers. If you want to dig deeper into deep
-learning, the PaddlePaddle Book is definitely your best choice.
-
-We provide a packaged book image; simply issue the command:
-
-  .. code-block:: bash
-
-     docker run -p 8888:8888 paddlepaddle/book
-
-For users in China, we provide a faster mirror:
-
-  .. code-block:: bash
-
-     docker run -p 8888:8888 docker.paddlepaddlehub.com/book
-
-Then, copy and paste the following address into your local browser:
-
-  .. code-block:: text
-
-     http://localhost:8888/
-
-That's all. Enjoy your journey!
-
-.. _docker_run_gpu:
-
-Train with Docker with GPU
-------------------------------
-
-We recommend using
-`nvidia-docker `_
-to run GPU training jobs. Please ensure you have the latest
-GPU driver installed before moving on.
-
-  .. code-block:: bash
-
-     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
-
-**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
-
-  .. code-block:: bash
-
-     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
-
-**About AVX:**
-
-AVX is a CPU instruction set that can accelerate PaddlePaddle's calculations.
-The latest PaddlePaddle Docker image turns AVX on by default, so, if your
-computer doesn't support AVX, you'll probably need to
-`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
-
-The following command will tell you whether your computer supports AVX:
-
-  .. code-block:: bash
-
-     if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/v2/build_and_install/index_cn.rst b/doc/v2/build_and_install/index_cn.rst
deleted file mode 100644
index 1a9305ac4b6578c14a962f223c647a71e3b8a72b..0000000000000000000000000000000000000000
--- a/doc/v2/build_and_install/index_cn.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-安装与编译
-==========
-
-.. _install_steps:
-
-PaddlePaddle针对不同的用户群体提供了多种安装方式。
-
-专注深度学习模型开发
---------------------
-
-PaddlePaddle提供了多种python wheel包,可通过pip一键安装:
-
-.. toctree::
-   :maxdepth: 1
-
-   pip_install_cn.rst
-
-这是最便捷的安装方式,请根据机器配置和系统选择对应的安装包。
-
-关注底层框架
--------------
-
-PaddlePaddle提供了基于Docker的安装方式,请参照以下教程:
-
-.. toctree::
-   :maxdepth: 1
-
-   docker_install_cn.rst
-
-我们推荐在Docker中运行PaddlePaddle,该方式具有以下优势:
-
-- 无需单独安装第三方依赖
-- 方便分享运行时环境,易于问题的复现
-
-对于有定制化二进制文件需求的用户,我们同样提供了从源码编译安装PaddlePaddle的方法:
-
-.. toctree::
-   :maxdepth: 1
-
-   build_from_source_cn.rst
-
-.. warning::
-
-   需要提醒的是,这种安装方式会涉及到一些第三方库的下载、编译及安装,整个安装过程耗时较长。
-
-
-常见问题汇总
---------------
-
-如果在安装过程中遇到了问题,请先尝试在下面的页面寻找答案:
-
-:ref:`常见问题解答 `
-
-如果问题没有得到解决,欢迎向PaddlePaddle社区反馈问题:
-
-`创建issue `_
diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
deleted file mode 100644
index 7990bacbd6966e88e8763e9c5709e410f7e9fed4..0000000000000000000000000000000000000000
--- a/doc/v2/build_and_install/index_en.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-Install and Compile
-======================
-
-.. _install_steps:
-
-PaddlePaddle provides various installation methods for different users.
-
-Focus on Deep Learning Model Development
-----------------------------------------
-
-PaddlePaddle provides several Python wheel packages that can be installed with pip:
-
-.. toctree::
-   :maxdepth: 1
-
-   pip_install_en.rst
-
-This is the most convenient way of installation. Please choose the right package for your machine configuration and operating system.
-
-Focus on the Underlying Framework
-----------------------------------
-
-PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
-
-.. toctree::
-   :maxdepth: 1
-
-   docker_install_en.rst
-
-We recommend running PaddlePaddle in Docker. This method has the following advantages:
-
-- It does not require installation of third-party dependencies.
-- It makes the runtime environment easy to share, so problems are easy to reproduce.
-
-Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
-
-.. toctree::
-   :maxdepth: 1
-
-   build_from_source_en.rst
-
-.. warning::
-
-   One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus, this installation process is more time-consuming.
- - -FAQ ------------ - -For any problems during installation, please refer to the page below for answers: - -:ref:`常见问题解答 ` - -If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community: - -`创建issue `_ diff --git a/doc/v2/build_and_install/paddleci.png b/doc/v2/build_and_install/paddleci.png deleted file mode 100644 index 16087ce059aa3c07ce8c927d983eb86351915825..0000000000000000000000000000000000000000 Binary files a/doc/v2/build_and_install/paddleci.png and /dev/null differ diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst deleted file mode 100644 index 095da19cd41d29bfa72ab23abd24bec45f925a86..0000000000000000000000000000000000000000 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ /dev/null @@ -1,105 +0,0 @@ -使用pip安装 -================================ - -PaddlePaddle可以使用常用的Python包管理工具 -`pip `_ -完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 - -.. _pip_install: - -使用pip安装 ------------------------------- - -执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 - - .. code-block:: bash - - pip install paddlepaddle - -当前的默认版本为0.12.0,cpu_avx_openblas,您可以通过指定版本号来安装其它版本,例如: - - .. code-block:: bash - - pip install paddlepaddle==0.11.0 - - -如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行: - - .. code-block:: bash - - pip install paddlepaddle-gpu - -当前的默认版本也是0.12.0,PaddlePaddle针对不同需求提供了更多版本的安装包,部分列表如下: - -================================= ======================================== -版本号 版本说明 -================================= ======================================== -paddlepaddle-gpu==0.12.0 使用CUDA 8.0和cuDNN 5编译的0.12.0版本 -paddlepaddle-gpu==0.11.0.post87 使用CUDA 8.0和cuDNN 7编译的0.11.0版本 -paddlepaddle-gpu==0.11.0.post8 使用CUDA 8.0和cuDNN 5编译的0.11.0版本 -paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版本 -================================= ======================================== - -您可以在 `Release History `_ 中找到paddlepaddle-gpu的各个发行版本。 - -如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, -您可以从下面的表格中找到需要的版本: - -如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: - -.. image:: paddleci.png - :scale: 50 % - :align: center - -.. csv-table:: 各个版本最新的whl包 - :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" - :widths: 1, 3, 3 - - "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" - "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" - "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" - "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" - -.. _pip_dependency: - -运行环境依赖 ------------------------------- - -PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。 - -PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。 - -.. 
csv-table:: PaddlePaddle环境依赖
-   :header: "依赖", "版本", "说明"
-   :widths: 10, 15, 30
-
-   "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上"
-   "Python", "2.7.x", "暂时不支持Python3"
-   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
-   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
-   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
-
-.. _pip_faq:
-
-安装常见问题和解决方法
-------------------------------
-
-- paddlepaddle*.whl is not a supported wheel on this platform.
-
-  出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip:
-
-  .. code-block:: bash
-
-     pip install --upgrade pip
-
-  如果仍然存在问题,可以执行:
-
-  .. code-block:: bash
-
-     python -c "import pip; print(pip.pep425tags.get_supported())"
-
-  获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。
-
-  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。
diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst
deleted file mode 100644
index 8406e4aa1fbb953c3b615b10d1bcb2c45974dde0..0000000000000000000000000000000000000000
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ /dev/null
@@ -1,123 +0,0 @@
-Install using pip
-================================
-
-You can use `pip `_, the widely used Python package
-manager, to install PaddlePaddle. This method works on most current Linux
-systems and on MacOS.
-
-.. _pip_install:
-
-Install using pip
-------------------------------
-
-Run the following command to install PaddlePaddle on the current
-machine; it will also download the required dependencies.
-
-  .. code-block:: bash
-
-     pip install paddlepaddle
-
-The default version is 0.12.0 (cpu_avx_openblas); you can specify another version to suit your needs, e.g.:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle==0.11.0
-
-If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run:
-
-  .. code-block:: bash
-
-     pip install paddlepaddle-gpu
-
-The default version is also 0.12.0. PaddlePaddle provides several versions of packages for different needs, as shown in the table:
-
-================================= ========================================
-Version                           Description
-================================= ========================================
-paddlepaddle-gpu==0.12.0          0.12.0 built with CUDA 8.0 and cuDNN 5
-paddlepaddle-gpu==0.11.0.post87   0.11.0 built with CUDA 8.0 and cuDNN 7
-paddlepaddle-gpu==0.11.0.post8    0.11.0 built with CUDA 8.0 and cuDNN 5
-paddlepaddle-gpu==0.11.0          0.11.0 built with CUDA 7.5 and cuDNN 5
-================================= ========================================
-
-You can find all released versions of paddlepaddle-gpu in `Release History `_ .
-
-If you wish to install the latest develop-branch PaddlePaddle,
-you can download the latest whl package from our CI system. Access
-the links below, log in as guest, then click the "Artifact"
-tab to find the download links of the whl packages.
-
-If the link below shows a login form, just click "Log in as guest" to start the download:
-
-.. image:: paddleci.png
-   :scale: 50 %
-   :align: center
-.. csv-table:: whl packages of each version
-   :header: "version", "cp27-cp27mu", "cp27-cp27m"
-   :widths: 1, 3, 3
-
-   "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
-   "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
-   "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__"
-   "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
-   "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
-   "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__"
-
-.. _pip_dependency:
-
-Runtime Dependency
-------------------------------
-
-PaddlePaddle installation packages (whl) contain not only .py files,
-but also binaries built from C++ code. We ensure that PaddlePaddle can
-run on current mainline Linux distributions, like CentOS 6 and
-Ubuntu 14.04, and on MacOS 10.12.
-
-PaddlePaddle whl packages try to conform to the
-`manylinux1 `_
-standard, which uses CentOS 5 as the default build environment. However,
-CUDA libraries require at least CentOS 6, and CentOS 5 is about to reach
-its end of life, so we use CentOS 6 as the default build environment.
-
-.. csv-table:: PaddlePaddle Runtime Deps
-   :header: "Dependency", "version", "description"
-   :widths: 10, 15, 30
-
-   "OS", "Linux, MacOS", "CentOS 6 or later; Ubuntu 14.04 or later; MacOS 10.12 or later"
-   "Python", "2.7.x", "Currently Python3 is not supported"
-   "libc.so", "GLIBC_2.7", "glibc must include at least the GLIBC_2.7 symbols"
-   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "must include at least the GLIBCXX_3.4.11 and CXXABI_1.3.3 symbols"
-   "libgcc_s.so", "GCC_3.3", "must include at least the GCC_3.3 symbols"
-
-.. _pip_faq:
-
-FAQ
-------------------------------
-
-- paddlepaddle*.whl is not a supported wheel on this platform.
-
-  The main cause of this issue is that your current platform is
-  not supported. Please check that you are using the Python 2.7 series.
-  Besides, PyPI only supports the manylinux1 standard, so you will need
-  pip >9.0.0. Upgrade it with the command below:
-
-  .. code-block:: bash
-
-     pip install --upgrade pip
-
-  If the problem still exists, run the following command:
-
-  .. code-block:: bash
-
-     python -c "import pip; print(pip.pep425tags.get_supported())"
-
-  This prints the package tags your system supports; check whether they
-  match the file name of the whl package. You can find the default whl
-  packages `here `_ .
-
-  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
-  you'll need to update pip to the latest version; if your system supports
-  manylinux1_x86_64 but the whl package is linux_x86_64, you can rename the
-  file with the manylinux1_x86_64 suffix and then install it.
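-
-  For the rename workaround, a minimal Python sketch (the file name below is
-  hypothetical; substitute the whl you actually downloaded):
-
-  .. code-block:: python
-
-     # Retag a linux_x86_64 whl as manylinux1_x86_64 so that pip accepts it.
-     import os
-
-     src = "paddlepaddle-0.12.0-cp27-cp27mu-linux_x86_64.whl"
-     os.rename(src, src.replace("linux_x86_64", "manylinux1_x86_64"))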
diff --git a/doc/v2/design/cluster_train/README.md b/doc/v2/design/cluster_train/README.md
deleted file mode 100644
index 177a5f5d54bd924fab34795219ce1f7b270c8e25..0000000000000000000000000000000000000000
--- a/doc/v2/design/cluster_train/README.md
+++ /dev/null
@@ -1,182 +0,0 @@
-# Design Doc: Distributed Training
-
-## Objective
-
-In [these slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we would like PaddlePaddle to run on general-purpose clusters like those managed by Kubernetes, so as to address demands for AI from both Internet and non-Internet industries.
-
-This poses technical challenges to PaddlePaddle:
-
-1. Support fault-recovery.
-1. Support both offline and online training.
-1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training.
-
-
-## Training Job
-
-A training job will be created once a user asks Paddle Cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes:
-
-1. the *master server process*, which dispatches tasks to
-1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
-1. one or more *parameter server processes*, each of which holds a shard of the global model and receives the uploaded gradients from every *trainer process*, so that it can run the optimization functions to update its parameters.
-
-Their relation is illustrated in the following graph:
-
-By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) for training user-defined neural network topologies.
-
-When training with sync SGD, parameter servers wait for all trainers to finish uploading their gradients and then send the updated parameters to the trainers; training cannot proceed until the trainers have received the updated parameters. This creates a synchronization point among trainers. When training with async SGD, each trainer uploads its gradients and downloads new parameters individually, without synchronizing with other trainers. Async SGD is faster in terms of time per pass, but the gradients are noisier, since trainers are likely to have a stale model.
-
-### Master Server Process
-
-The master server process will:
-
-- Partition a dataset into [tasks](#task) and dispatch tasks to trainers.
-- Keep track of training progress on the dataset with a [task queue](#task-queue). A training job iterates over the dataset for a full pass before it goes into the next pass.
-
-
-#### Task
-
-A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
-
-#### Task Queue
-
-The master server has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master server. Each master server process has three task queues.
-
-- The todo queue holds tasks to be dispatched. When a job starts, the master server fills the todo queue with all tasks.
-- The pending queue holds tasks that are currently being trained by trainers.
-- The done queue holds tasks that have already been trained.
-
-The life cycle of a single task is illustrated below:
-
-1. When a new pass of training starts, all tasks will be placed in the todo queue.
-1. When a trainer requests a new task, the master server dispatches a task from the todo queue to it, puts the task in the pending queue, and waits for completion.
-1. The trainer will work on its task, tell the master server once the task is completed, and ask for a new task. The master server will dispatch a new task to that trainer.
-1. If a task fails in a trainer for any reason, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is deemed likely to cause trainer crashes and will be discarded.
-1. The master server will move completed tasks to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counters of all tasks to zero.
-
-### Trainer Process
-
-The trainer process will:
-
-- Request tasks from the master.
-- Work on the tasks.
-- Upload gradients to the parameter servers, and update the local model by downloading new parameters from the parameter servers.
-
-### Parameter Server Process
-
-Parameter server processes hold the parameters collaboratively. The parameters are partitioned across different parameter servers.
-
-The parameter server will:
-
-- Receive gradients from the trainers, update its parameters, and give the trainers the latest parameters.
-- Periodically save its parameters to a distributed file system, overriding the previous save.
-
-### Optimization Algorithms
-
-The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm:
-
-- Synchronous Stochastic Gradient Descent (sync-SGD)
-
-  The parameter servers wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting the new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch.
-
-- Asynchronous Stochastic Gradient Descent (async-SGD)
-
-  There is no synchronization between different trainers, and the parameter server updates its parameters as soon as it receives a new gradient:
-
-  - Each trainer uploads its accumulated gradient every n mini-batches.
-  - Every m mini-batches, the trainer downloads new parameters from the parameter server.
-  - n and m do not have to be equal.
-
-## Fault Tolerance
-
-The training job will pause if the master server process is dead, or if any of the parameter server processes is dead. They will be restarted by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery).
-
-The training job will continue to make progress as long as there is at least one training process running. The strategy depends on the type of optimization algorithm:
-
-- sync-SGD
-
-  TODO
-
-- async-SGD
-
-  Since async-SGD does not require synchronization between mini-batches, the system will by definition make progress if at least one trainer is running.
-
-## Fault Recovery
-
-PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed, reliable key-value store, a restarted process can recover its state from etcd. The model parameters are periodically saved into a distributed file system, so a restarted parameter server can recover its parameters from the saved file.
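-
-A minimal sketch of this "save periodically, recover from the latest snapshot"
-rule (the snapshot path, the pickle format, and the interval are illustrative
-assumptions, not part of this design):
-
-```python
-# Periodically snapshot parameters to a file on a distributed file system,
-# and reload the latest snapshot after a restart.
-import os
-import pickle
-import time
-
-SNAPSHOT = "/mnt/dfs/pserver-0/params.pkl"  # hypothetical DFS mount
-
-def save_forever(get_params, interval=600):
-    while True:
-        time.sleep(interval)
-        with open(SNAPSHOT + ".tmp", "wb") as f:
-            pickle.dump(get_params(), f)
-        os.rename(SNAPSHOT + ".tmp", SNAPSHOT)  # atomic replace on POSIX
-
-def recover(init_params):
-    if os.path.exists(SNAPSHOT):
-        with open(SNAPSHOT, "rb") as f:
-            return pickle.load(f)  # restarted: reload saved parameters
-    return init_params()  # first start: fall back to fresh initialization
-```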
-
-Now we will introduce how each process recovers from a failure; the graph below shows how etcd is used:
-
-
-
-### Master Server Process
-
-When the master is started by Kubernetes, it executes the following steps at startup:
-
-1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
-1. Recovers the task queues from etcd if they already exist; otherwise, the master will create them.
-1. Writes its IP address to */master/addr* so that trainers can discover it.
-1. Listens for trainers' task requests, dispatches a task upon request, and updates the task queues using an etcd transaction to ensure the lock is held during the update.
-
-When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in a few minutes.
-
-### Trainer Process
-
-When the trainer is started by Kubernetes, it executes the following steps at startup:
-
-1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
-1. Finds and watches */master/addr* to get the master's address.
-1. Requests tasks from the master to start training.
-
-When a trainer fails, Kubernetes would try to restart it. The recovered trainer would fetch tasks from the master and go on training.
-
-### Parameter Server Process
-
-When the parameter server is started by Kubernetes, it executes the following steps at startup:
-
-1. Reads the desired total number of parameter servers from etcd `/ps_desired`
-1. Searches through etcd keys `/ps/` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Sets the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
-
-   The desired number of parameter servers is 3:
-
-   The third parameter server joined:
-
-1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
-1. Now the parameter server is ready for the trainers' requests.
-
-If the parameter server's etcd lease expires, the parameter server will kill itself.
-
-
-## Parameter Server Checkpointing
-See [here](./checkpointing.md)
-
-## Storing and dispatching training data
-See [here](./data_dispatch.md)
-
-
-## Dynamic Scaling
-
-### Trainer Scaling
-
-TODO
-
-### Parameter Server Scaling
-
-Not planned for v1.
-
-## Training Dataset Format
-
-TODO
-
-## User Interface
-
-TODO
diff --git a/doc/v2/design/cluster_train/checkpointing.md b/doc/v2/design/cluster_train/checkpointing.md
deleted file mode 100644
index c87ef2c7d2636208866d05456d5d44316d0bb200..0000000000000000000000000000000000000000
--- a/doc/v2/design/cluster_train/checkpointing.md
+++ /dev/null
@@ -1,44 +0,0 @@
-## 模型参数检查点(Checkpointing)
-模型数据检查点的实现,可以有效地避免parameter server的单点或多点同时故障。模型参数检查点通过定期向磁盘上保存一份存储在parameter server内存中的模型数据的完整镜像,来保证训练过程可以从中间状态重新启动。在一个不可中断并缺少备份的训练任务中,可以通过阶段性地保存每个parameter server的数据快照(snapshot)到 ***分布式存储服务*** 达到容灾的目的,比如每隔10分钟保存一份最新的快照,并删除更早的快照。在出现单点故障时,只需要恢复这台节点,或者将这台节点迁移到另一个节点并启动即可恢复训练任务。
-
-
-
-### 快照保存的设计如下:
-
-说明:
-
-* parameter server在集群中启动后,自动挂载分布式存储目录,并把快照保存到这个目录下。
-* ***注:每个parameter server的检查点各自独立保存,暂时不考虑多个parameter server同步地保存一个特定时间点的全局检查点,因为这样做也没法保证消除随机性。***
-
-检查点保存程序流程:
-
-1. 当满足"每隔10分钟"的条件时,parameter server会获取parameters内存的`read_lock`,启动一个新的线程开始保存检查点。如果已经正在执行保存检查点的线程,则忽略。由于对parameters的更新需要获取parameters内存的`write_lock`,所以在写入快照的过程中,parameter server会暂停参数更新并等待。
-2. 
parameter server生成一个UUID,向指定的目录中一个新的文件(文件名为此UUID)写入快照数据。在快照写入完成后,计算这个文件的MD5 sum。然后在etcd的`/checkpoints/[pserver_id]`中写入json内容:`{"uuid": [UUID], "md5", "MD5 sum", "timestamp": xxxx}`。 -3. 删除磁盘目录中不是当前uuid的快照文件。 -4. 释放对paramters内存的锁定,停止保存检查点的线程。 - -这里需要用户额外注意,在您的实际环境中,训练任务的运行可能会占满trainer和parameter server之间的网络带宽,如果parameter server此时还需要通过网络访问分布式存储以保存快照,可能会造成网络拥塞,而出现阶段性的运行停滞。 - -### 从快照恢复 - -在parameter server第一次启动或任意时间parameter server故障后被Kubernetes重新启动,则需要回滚到上一个检查点: - - 1. 从etcd中读取节点:`/checkpoints/[pserver_id]`获取最新的检查点的文件uuid - 1. 从磁盘文件中加载uuid文件名的检查点快照文件,并加载其中的参数 - 1. 如果上面两步出现错误,则使用启动参数定义的初始化方法初始化参数 - 1. 开始提供服务 - -## TODO List -### 推测执行/加速执行(TODO) -在异构集群中,如果存在某些trainer执行速度过慢会影响整体集群的速度(如图中Trainer 1),此时master将负责启动一个新的Trainer(Accelerate Trainer 2),使用同样的训练数据block。哪个trainer先完成block的训练,则把另一个慢速的kill掉。 - -### 动态扩容/缩容 -目前只考虑动态扩容trainer数量,可以减小系统复杂性。 - -## 术语 -* model: 指深度学习训练之后得到的所有参数,使用这个神经网络可以完成对新数据的预测 -* parameters: 神经网络中的参数,包括权重w和偏置b。一个神经网络的模型由大量的参数组成 -* shard: 分片,通常指将一个整体拆分成多份的其中的一份。 -* model shard: 将一个神经网络参数拆分成多份,每个shard分别存储在其中一台parameter server之上 -* parameter block: 多个parameter block构成一个model shard -* 单点故障: 任意时刻只可能同时有一台服务器故障。由于集群中同时存在两台机器故障的概率极低((平均故障率*平均故障修复时间)^2)只对特殊在线系统考虑两台以上同时故障的容灾。 diff --git a/doc/v2/design/cluster_train/data_dispatch.md b/doc/v2/design/cluster_train/data_dispatch.md deleted file mode 100644 index 1f5d22ff5e6abcb576d16cbe7391da1967a1ab8e..0000000000000000000000000000000000000000 --- a/doc/v2/design/cluster_train/data_dispatch.md +++ /dev/null @@ -1,160 +0,0 @@ -## 训练数据的存储和分发 - -### 概念解释 - -### 流程介绍 -生产环境中的训练数据集通常体积很大,并被存储在诸如Hadoop HDFS,Ceph,AWS S3之类的分布式存储之上。这些分布式存储服务通常会把数据切割成多个分片分布式的存储在多个节点之上。这样就可以在云端执行多种数据类计算任务,包括: - -* 数据预处理任务 -* Paddle训练任务 -* 在线模型预测服务 -
- -
- -在上图中显示了在一个实际生产环境中的应用(人脸识别)的数据流图。生产环境的日志数据会通过实时流的方式(Kafka)和离线数据的方式(HDFS)存储,并在集群中运行多个分布式数据处理任务,比如流式数据处理(online data process),离线批处理(offline data process)完成数据的预处理,提供给paddle作为训练数据。用户也可以上传labeled data到分布式存储补充训练数据。在paddle之上运行的深度学习训练输出的模型会提供给在线人脸识别的应用使用。 - -### 训练数据存储 -我们选择[CephFS](http://docs.ceph.com/docs/master/cephfs/)作为存储系统。 - -- 无论是从[PFSClient](../file_manager/README.md)的角度,还是从[Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/)中运行任务的角度,统一用`/pfs/$DATACENTER/home/$USER`来访问用户自己的数据。 -- `/pfs/$DATACENTER/common`下存放公共数据集合 - - 做只读挂载 - -
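-
-下面用一小段Python示意上述统一的路径约定(函数名与示例取值仅为示意):
-
-```python
-# 统一路径约定的示意;数据中心名与用户名仅为示例。
-def pfs_home(datacenter, user):
-    return "/pfs/%s/home/%s" % (datacenter, user)
-
-print(pfs_home("datacenter1", "alice"))  # /pfs/datacenter1/home/alice
-```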
- -
-
-### 文件预处理
-
-在开始训练之前,数据集需要预先被转换成PaddlePaddle分布式训练使用的存储格式 [RecordIO](https://github.com/PaddlePaddle/Paddle/issues/1947)。我们提供两种转换方式:
-
-1. 用户在本地转换好再上传
-1. 用户上传数据后,在机群上运行转换程序
-
-转换生成的文件名会是以下格式:
-
-```text
-name_prefix-aaaaa-of-bbbbb
-```
-
-"aaaaa"和"bbbbb"都是五位的数字,每一个文件是数据集的一个shard,"aaaaa"代表shard的index,"bbbbb"代表这个shard的最大index。
-
-比如ImageNet这个数据集可能被分成1000个shard,它们的文件名是:
-```text
-imagenet-00000-of-00999
-imagenet-00001-of-00999
-...
-imagenet-00999-of-00999
-```
-
-#### 转换库
-
-无论是在本地或是云端转换,我们都提供Python的转换库,接口是:
-```python
-def convert(output_path, reader, num_shards, name_prefix)
-```
-
-- `output_path`: directory in which output files will be saved.
-- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances.
-- `num_shards`: the number of shards that the dataset will be partitioned into.
-- `name_prefix`: the name prefix of generated files.
-
-`reader`每次输出一个data instance,这个instance可以是单个值,或者用tuple表示的多个值:
-
-```python
-yield 1 # 单个值
-yield numpy.random.uniform(-1, 1, size=28*28) # 单个值
-yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
-```
-
-每个值的类型可以是整型、浮点型数据、字符串,或者由它们组成的list,以及numpy.ndarray。如果是其它类型,会被Pickle序列化成字符串。
-
-### 示例程序
-
-#### 使用转换库
-
-以下`reader_creator`生成的`reader`每次输出一个data instance,每个data instance包含两个值:numpy.ndarray类型的值和整型的值:
-```python
-def reader_creator():
-    def reader():
-        for i in range(1000):
-            yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
-    return reader
-```
-
-把`reader_creator`生成的`reader`传入`convert`函数即可完成转换:
-```python
-convert("./", reader_creator(), 100, "random_images")
-```
-
-以上命令会在当前目录下生成100个文件:
-```text
-random_images-00000-of-00099
-random_images-00001-of-00099
-...
-random_images-00099-of-00099
-```
-
-#### 进行训练
-
-PaddlePaddle提供专用的[data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc),生成给定`RecordIO`文件对应的data reader。**无论在本地还是在云端,reader的使用方式都是一致的**:
-
-```python
-# 用 RecordIO reader 组成 batch,供 trainer 训练使用
-reader = paddle.reader.creator.RecordIO("/pfs/datacenter_name/home/user_name/random_images-*-of-*")
-batch_reader = paddle.batch(reader, 128)
-trainer.train(batch_reader, ...)
-```
-
-以上代码的reader输出的data instance与生成数据集时reader输出的data instance是一模一样的。
-
-### 上传训练文件
-
-使用下面的命令,可以把本地的数据上传到存储集群中。
-
-```bash
-paddle pfs cp filename /pfs/$DATACENTER/home/$USER/folder/
-```
-
-比如,把之前示例中转换完毕的random_images数据集上传到云端的`/home/`可以用以下指令:
-
-```bash
-paddle pfs cp random_images-*-of-* /pfs/$DATACENTER/home/$USER/folder/
-```
-
-需要把`$DATACENTER`的配置写到配置文件中,例如:
-
-```
-# config file
-[datacenter_1]
-username=user
-usercert=user.pem
-userkey=user-key.pem
-endpoint=datacenter1.paddlepaddle.org
-
-[datacenter_2]
-username=user
-usercert=user.pem
-userkey=user-key.pem
-endpoint=datacenter2.paddlepaddle.org
-```
-
-## TODO
-
-### 文件访问的权限
-控制用户权限
-
-- 用户可以把自己的数据分享给别人
-
-### 文件访问方式
-不用mount的方式来访问数据,而是直接用API的接口远程访问
-
-例如:
-
-```
-f = open('/pfs/datacenter_name/home/user_name/test1.dat')
-```
-
-### 支持用户自定义的数据预处理job
diff --git a/doc/v2/design/cluster_train/large_model_dist_train.md b/doc/v2/design/cluster_train/large_model_dist_train.md
deleted file mode 100644
index edb0245ea083e791b7f32ac57a330698299fceda..0000000000000000000000000000000000000000
--- a/doc/v2/design/cluster_train/large_model_dist_train.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# Analysis of large model distributed training in Paddle
-
-***NOTE: These are only notes on how we implemented this scheme in V1, not a new design.***
-
-## What is it
-
-We often encounter cases where the (sparse) embedding layer parameters are so large that we cannot store them in the trainer's memory during training. So we need to place them on several parameter servers, and fetch them row by row instead of fetching all of the parameters.
-
-## How to use
-
-Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer. Also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes.
-
-Accordingly, configure your embedding layers like:
-
-```python
-SPARSE_REMOTE=True
-
-w1 = data_layer(name="w1", size=dict_size)
-emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
-w2 = data_layer(name="w2", size=dict_size)
-emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
-...
-```
-
-## Implementation details
-
-```c++
-enum MatType {
-  MAT_NORMAL,
-  MAT_NORMAL_SHARED,
-  MAT_VALUE_SHARED,
-  MAT_SPARSE_ROW_IDS,
-  MAT_SPARSE_ROW_AUTO_GROW,
-  MAT_CACHE_ROW,
-  MAT_SPARSE_ROW,
-  MAT_SPARSE_ROW_PREFETCH,
-  MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
-};
-```
-
-`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only rows of the matrix during training.
-
-In `trainer_internal.cpp:L93 trainOneBatch`:
-
-```c++
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote();
-  }
-```
-
-When doing the actual network forward and backward passes, at the beginning of each batch, the trainer will try to download one row of data from the pserver.
-
-In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
-
-```c++
-if (fullSize) {
-  ...
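-  // (fullSize path, elided in this note: it downloads the complete
-  // parameter vector instead of prefetching individual rows.)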
-} else {
-  getParams = [&] {
-    parameterClient_->getParameterSparse(
-        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-  };
-  applyL1 = [](Parameter& para, real decayRate) {
-    para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-  };
-}
-```
-
-Calling `parameterClient_->getParameterSparse` does a remote call to the pserver's `getParameterSparse`:
-
-```c++
-void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
-                                          std::vector<Buffer>& inputBuffers,
-                                          SendParameterResponse* response,
-                                          std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  auto& buffer = *readWriteBuffer_;
-  size_t numReals = 0;
-  for (const auto& block : request.blocks()) {
-    numReals += getParameterConfig(block).dims(1);
-  }
-  buffer.resize(numReals);
-
-  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
-
-  ReadLockGuard guard(parameterMutex_);
-  size_t offset = 0;
-  for (const auto& block : request.blocks()) {
-    size_t width = getParameterConfig(block).dims(1);
-    Buffer buf = {buffer.data() + offset, width};
-    int type = request.send_back_parameter_type();
-    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
-    offset += width;
-  }
-}
-```
-
-`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of the parameter object); the `getParameterSparse` remote call then returns only one row of data to the client.
diff --git a/doc/v2/design/cluster_train/master_server.md b/doc/v2/design/cluster_train/master_server.md
deleted file mode 100644
index 4bf3c506f101361875043f8bfd97972b8c981a22..0000000000000000000000000000000000000000
--- a/doc/v2/design/cluster_train/master_server.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Design Doc: Master Server
-
-For an overview of the master server's role, please refer to the [distributed training design doc](./README.md). In this design doc we will discuss the master server in more detail. The master will be implemented in [Go](https://golang.org/).
-
-## Dataset
-
-A dataset is a list of files in *RecordIO* format. A RecordIO file consists of chunks, and each chunk consists of some records.
-
-## Task Queue
-
-As mentioned in the [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress.
-
-### Task Queue Creation
-
-1. Each trainer will make an RPC call (using Go's [rpc](https://golang.org/pkg/net/rpc/) package) to the master server, telling it the RecordIO files representing the dataset specified by the user. Since every trainer will tell the master server the same dataset, only the first RPC call will be honored.
-
-   The RPC interface is:
-   ```go
-   func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error {
-   }
-   ```
-1. The master server will scan through each RecordIO file to generate the *chunk index* and determine how many chunks each file has. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is an in-memory data structure that enables fast access to each chunk, and the index of the chunk within the file is an integer starting from 0, representing the n-th chunk within the file.
-
-   The definition of the chunk is:
-   ```go
-   type Chunk struct {
-       Idx   int // index of the chunk within the file
-       Path  string
-       Index recordio.Index // chunk index
-   }
-   ```
-1. Chunks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element.
-
-   The definition of the task is:
-   ```go
-   type Task struct {
-       Index  int
-       Chunks []Chunk
-   }
-   ```
-
-   The elements in the task queues are of type `TaskEntry`, containing a timeout counter (described in [task retry logic](#task-retry-logic)) and a task:
-   ```go
-   type TaskEntry struct {
-       NumTimeout int
-       Task       Task
-   }
-   ```
-
-   The definition of the task queues is:
-   ```go
-   type TaskQueues struct {
-       Todo    []TaskEntry
-       Pending map[int]TaskEntry // map from task index to task entry
-       Done    []TaskEntry
-   }
-   ```
-
-### Task Queue Persistence
-
-The task queues need to be persisted on [etcd](https://github.com/coreos/etcd) for fault recovery. Since the task queues only change once a task is completed or timed out, which is not very frequent, we can afford to synchronize with etcd every time the task queues change.
-
-We will serialize the task queues data structure with [gob encoding](https://golang.org/pkg/encoding/gob/), compress it with gzip, and save it into etcd synchronously under the key `/task_queues`.
-
-### Task Dispatch
-
-The trainer will make an RPC call to the master to get a new task when:
-
-- the trainer first starts, or
-- the trainer finishes a task.
-
-The RPC interface is:
-```go
-func (m *RPCServer) GetTask(finished *Task, result *Task) error {
-}
-```
-Argument `finished` will be `nil` when the trainer has just started.
-
-During the RPC call the master will do the following:
-
-- Make a copy of the task queues, and update the copy to reflect the finished tasks and the new pending tasks.
-- Synchronize the copy of the task queues with etcd using a transaction conditioned on holding the master lock.
-- Replace the task queues with the copy and report the new tasks to the trainer if it succeeded, or discard the copy and report the error to the trainer if it failed.
-
-### Task Retry Logic
-
-When a task is dispatched to the trainer, the master will schedule a function for execution after the timeout duration (based on the moving average of task completion time). If the task entry is still in the pending queue, its timeout counter will increase by one, and the task will be moved back to the todo queue. If the timeout counter is above the threshold, the master will log the error and discard the task.
-
-Please note that since a timed-out task could be completed after it has been dispatched for retry, it is possible for a task to be processed multiple times. We do not try to prevent this from happening, since it is fine to train on the same task multiple times due to the stochastic nature of the stochastic gradient descent algorithm.
diff --git a/doc/v2/design/cluster_train/pserver_client.md b/doc/v2/design/cluster_train/pserver_client.md
deleted file mode 100644
index 474b8c572cd92fc87e9f7f3f2b19d12cccd158de..0000000000000000000000000000000000000000
--- a/doc/v2/design/cluster_train/pserver_client.md
+++ /dev/null
@@ -1,171 +0,0 @@
-# Design Doc: The Client Library of Parameter Server
-
-For an overview of the trainer's role, please refer to the [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file.
-
-## Parameter Partition
-
-Each parameter will be partitioned into parameter blocks to make the parameters evenly distributed on the parameter servers. The partition is done automatically by the client library. *Sparse parameters* require slightly different treatment:
-
-### Sparse Parameter
-
-A sparse parameter is a parameter that is updated sparsely. The name is somewhat misleading: it does not have a sparse representation; it has the same representation as a dense vector.
-
-Because a sparse parameter is updated sparsely, the trainer has to partition it. And because the parameter server merges all shards of a sparse parameter into the same file when saving the parameter, a special naming convention is needed.
-
-If a sparse parameter is partitioned into n shards, they should be named as:
-
-```text
-name:sparse-0
-name:sparse-1
-...
-name:sparse-n-1
-```
-
-The library is unaware of the partition and treats each parameter independently. Only when saving parameters will the parameter servers merge the sparse parameters according to the naming convention.
-
-## Model Optimization Using Gradients
-
-There are two ways to perform model optimization using gradients:
-
-- On Client
-
-  The client does multiple steps of forward and backward update. In each step, the gradients are calculated and a new model is generated. After some steps, the client will calculate the difference between the newest model and the old model at step 0. The difference will be sent to the parameter servers. The parameter servers will just update the parameters using the difference, without any optimization using gradients (such as Adam and L1 regularization).
-
-- On Parameter Server
-
-  The client sends accumulated gradients to the parameter servers, and the parameter servers perform the optimization using those gradients.
-
-## L1 and L2 Regularization
-
-PaddlePaddle allows L1 or L2 regularization to be specified per parameter, so when the trainer initializes a parameter, it needs to include a parameter configuration when L1 or L2 regularization is necessary.
-
-## Parameter Initialization
-
-The parameters on the parameter servers need to be initialized. To provide maximum flexibility, the trainer will initialize the parameters. Only one trainer does the initialization; the other trainers wait for the completion of initialization and get the parameters from the parameter servers.
-
-### Trainer Selection
-
-To select the trainer for initialization, every trainer will try to get a distributed lock; whoever owns the lock does the initialization. As illustrated below:
-
-### Trainer Selection Process
-
-The trainer selection process is encapsulated in the C API function:
-```c
-int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto);
-```
-The selected trainer's call to `paddle_begin_init_params` will return 1, and the other trainers' calls to `paddle_begin_init_params` will return 0. `paddle_get_params` will block until initialization is completed.
As illustrated below: - - - -## C Interface - -```c -typedef enum { - PADDLE_ELEMENT_TYPE_INT32 = 0, - PADDLE_ELEMENT_TYPE_UINT32 = 1, - PADDLE_ELEMENT_TYPE_INT64 = 2, - PADDLE_ELEMENT_TYPE_UINT64 = 3, - PADDLE_ELEMENT_TYPE_FLOAT32 = 4, - PADDLE_ELEMENT_TYPE_FLOAT64 = 5, -} paddle_element_type; - -typedef struct { - char* name; - paddle_element_type element_type; - unsigned char* content; - int content_len; -} paddle_parameter, paddle_gradient; - -typedef int paddle_pserver_client; - -/** - * @brief creates a pserver client that talks to etcd for coordination. - */ -paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr); - -/** - * @brief creates a pserver client given pserver addresses. - * - * @param pserver_addrs comma-separated pserver addresses. - * @param selected if current pserver client is selected to initialize all parameter servers. - */ -paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected); -void paddle_pserver_client_release(paddle_pserver_client c); - -/** - * @brief paddle_begin_init_params begins to initialize parameters on - * parameter servers. - * - * paddle_begin_init_params will be called from multiple trainers, - * only one trainer will be selected to initialize the parameters on - * parameter servers. Other trainers need to get the initialized - * parameters from parameter servers using @paddle_get_params. - * - * @return 1 if the trainer is selected to initialize parameter - * servers, otherwise 0. - */ -int paddle_begin_init_params(paddle_pserver_client client); - -/** - * @brief paddle_init_param initializes the parameter on parameter - * servers. - * - * @param param the parameter to initialize. - * @param param_config_proto the configuration for the parameter. - * @param config_len the length of param_config_proto - * @return 0 if successful, otherwise -1. On failure, the trainer - * needs to restart the entire initialization process (starting from - * @paddle_begin_init_param). Or simply exit the program and wait for - * the cluster management system to restart the trainer. - */ -int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); - -/** - * @brief paddle_finish_init_params tells parameter servers client has - * sent all parameters to parameter servers as initialization. - * - * @return 0 if successful, otherwise -1. On failure, the trainer - * needs to restart the entire initialization process (starting from - * @paddle_begin_init_param). Or simply exit the program and wait for - * the cluster management system to restart the trainer. - */ -int paddle_finish_init_params(paddle_pserver_client client); - -/** - * @brief paddle_send_grads sends gradients to parameter servers for - * updating parameters. - * - * @param grads the array of gradients to send. - * @param len the length of the gradient array. - * @param learning_rate the learning rate for the gradients. - * @return 0 if successful, otherwise -1. - */ -int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len); - -/** - * @brief paddle_get_params gets parameters from parameter servers. - * - * paddle_get_params will block until parameters are initialized on - * the parameter servers. - * - * @param dst the destination array of parameter pointers to save to. - * The parameter pointer must be pre-popullated with required parameter name, - * and the content of parameter must be pre-allocated of the size of required - * parameter on pserver. 
- * @param len the length of the names array and the paddle_parameter
- * array.
- * @return 0 if successful, otherwise -1.
- */
-int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
-
-/**
- * @brief paddle_save_model tells the parameter servers to save the
- * parameters to the given path.
- *
- * @param path the path to save parameters.
- * @return 0 if successful, otherwise -1.
- */
-int paddle_save_model(paddle_pserver_client client, const char* path);
-```
diff --git a/doc/v2/design/cluster_train/remote_parameter_updater.md b/doc/v2/design/cluster_train/remote_parameter_updater.md
deleted file mode 100644
index 6e8e5938455b869e0f3367794c41250340b37f77..0000000000000000000000000000000000000000
--- a/doc/v2/design/cluster_train/remote_parameter_updater.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Design Doc: Remote Parameter Updater for Cluster Train
-
-For an overview of distributed training, please refer to the [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that uses the parameter server cclient ([The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters.
-
-## Parameter Updater
-
-The parameter updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we will only discuss the remote parameter updater here.
-
-### Remote Parameter Updater
-
-The remote parameter updater manages parameters through remote parameter servers, using the client that communicates with the pservers ([The Client Library of Parameter Server Design Doc](pserver_client.md)).
-
-In the PaddlePaddle Python V2 API, the trainer is implemented in Python, and it holds an instance of the parameter updater and calls its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG.
-
-#### Sparse Remote Parameter Updater
-
-Since we will only implement dense parameter management now, the mechanism for sparse parameters will be discussed in the next stage.
-
-### Interface Design
-
-TBD
diff --git a/doc/v2/design/cluster_train/save_model.md b/doc/v2/design/cluster_train/save_model.md
deleted file mode 100644
index b755185c81ad617b9c85c47de0f5f65d2201c658..0000000000000000000000000000000000000000
--- a/doc/v2/design/cluster_train/save_model.md
+++ /dev/null
@@ -1,111 +0,0 @@
-# Design Doc: Save Model
-
-## Overview
-
-The model is the output of the training process. There are two
-ways in which a user can obtain a model:
-
-- Save model triggered by user code: user code asks PaddlePaddle to
-  save a model.
-- Convert model from the checkpoint: the model is converted from the
-  pservers' periodic checkpoint. In this way, the user can cancel a
-  job at any time, and still have a relatively fresh model (we
-  checkpoint around every 5 minutes).
-
-### Trainer Saving Model vs. Pservers Saving Model
-
-Both trainers and pservers have access to the model. So the model can
-be saved from a trainer or from pservers. We need to decide where the model
-is saved from.
-
-#### Dense Update vs. Sparse Update
-
-There are two types of model update methods: dense update and sparse
-update (when the model parameter is configured to be sparse).
-
-- Dense update
-
-  Every trainer has its own full copy of the model. Every model
-  update updates the entire model.
-
-- Sparse update
-
-  The training input is sparse, and the trainer does not have the
-  entire model. It only downloads the sub-model related to the input.
-  When updating the model, only the sub-model related to
-  the training input is updated.
-
-
-#### Pservers Saving Model
-
-The benefit of letting the pservers save the model is that they have the
-entire model all the time. However, since pservers are on different nodes, it
-requires a merging process to merge the model shards into a single
-model. This requires the pservers to write models to a distributed
-filesystem, making the checkpoint shards visible to the merge program.
-
-#### Trainer Saving Model
-
-The benefit of letting one trainer save the model is that it does not
-require a distributed filesystem, and it reuses the same model-saving
-logic used when training locally - except that when doing sparse update, the
-trainer needs to download the entire model during the saving process.
-
-#### Conclusion
-
-Given that trainer-saved models do not require a distributed filesystem,
-and that this is an intuitive extension of saving the model when training
-locally, we decide to let the trainer save the model when doing
-distributed training.
-
-
-### Convert Model from Checkpoint
-
-TODO
-
-
-## Timeline
-
-We will first implement the trainer saving the model. Converting the latest
-snapshot to a model will be a TODO for the future.
-
-
-## Trainer Save Model
-
-### Trainer Election
-
-One trainer will be elected as the one to save the model. When using
-etcd, the trainer ID is a randomly generated UUID; the trainer will
-contact the master server requesting to save the model, and find out
-whether it is elected. When the master server is not used, unique
-trainer IDs will be given by the administrator, and the trainer whose ID
-is "0" is elected to save the model.
-
-### Model Save Path
-
-Each trainer will be given the directory to save the model in. The
-elected trainer will save the model to
-`given-directory/trainerID`. Since the trainer ID is unique, this
-prevents concurrent saves to the same file when multiple trainers
-are elected to save the model because of a split-brain problem.
-
-### What Happens When Model Is Saving
-
-It takes some time to save a model, so we need to define what happens
-while the model is being saved.
-
-When doing dense update, the trainer uses the local model. The pservers
-do not need to pause model updates.
-
-When doing sparse update, the trainer needs to download the entire
-model while saving. To get the most accurate model, the model update
-needs to be paused before the download starts and resumed after the
-download finishes. Otherwise, the trainer gets a model that is
-"polluted": some part of the model is old, and some part of the model is
-new.
-
-It is unclear whether the "polluted" model will be inferior, due to the
-stochastic nature of deep learning, and pausing the model update will
-add more complexity to the system. Since supporting sparse update is a
-TODO item, we defer the evaluation of whether to pause the model update
-while saving to the future.
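-
-A minimal sketch of the election and save-path rules above (function and
-variable names are illustrative assumptions, not an actual API):
-
-```python
-import os
-
-def elected_to_save(trainer_id, ask_master=None):
-    # With etcd, trainer IDs are random UUIDs, so the trainer must ask the
-    # master server whether it was elected.
-    if ask_master is not None:
-        return ask_master(trainer_id)
-    # Without the master server, the administrator-assigned ID "0" is elected.
-    return trainer_id == "0"
-
-def model_save_path(save_dir, trainer_id):
-    # A unique per-trainer path prevents concurrent writes to the same file
-    # when a split-brain problem elects multiple trainers.
-    return os.path.join(save_dir, trainer_id)
-```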
diff --git a/doc/v2/design/cluster_train/src/checkpointing.png b/doc/v2/design/cluster_train/src/checkpointing.png deleted file mode 100644 index c221e8474f90f37e31416cbb19c9452207a0d14c..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/checkpointing.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/data_dispatch.png b/doc/v2/design/cluster_train/src/data_dispatch.png deleted file mode 100644 index 5bdcc24d6a6d193cb014f8c38b362451fded5e54..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/data_dispatch.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/dataset.graffle b/doc/v2/design/cluster_train/src/dataset.graffle deleted file mode 100644 index c10a423ed16a23229a9ee33d11bfc82bb59646c8..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/dataset.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/dataset.png b/doc/v2/design/cluster_train/src/dataset.png deleted file mode 100644 index 2fb7f1cce3b6dd21489392557826e95a9f207c34..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/dataset.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/file_storage.graffle b/doc/v2/design/cluster_train/src/file_storage.graffle deleted file mode 100644 index 50a17e70fa255495337c529a3bf12a5c0024a5be..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/file_storage.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/file_storage.png b/doc/v2/design/cluster_train/src/file_storage.png deleted file mode 100644 index fccb4e3e7e738224c7f1584326bd5f351ce799aa..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/file_storage.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/init_lock.graffle b/doc/v2/design/cluster_train/src/init_lock.graffle deleted file mode 100644 index fa9149f21b1311eed48ef72ec55e556559d0fc94..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/init_lock.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/init_lock.png b/doc/v2/design/cluster_train/src/init_lock.png deleted file mode 100644 index 92404ee6d6c0f9a7727952bae3c869ba338ecd7f..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/init_lock.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png deleted file mode 100644 index da5d1a77562480ad1d886f5f21dbd84001d3d508..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-etcd.graffle b/doc/v2/design/cluster_train/src/paddle-etcd.graffle deleted file mode 100644 index f973dc9b9dbf72e9bc31e2d32822916cd281f8d9..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-etcd.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-etcd.png b/doc/v2/design/cluster_train/src/paddle-etcd.png deleted file mode 100644 index 57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-etcd.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle 
b/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle deleted file mode 100644 index fba30f0ca2b47f0d202a432821d95e55aac37ec8..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-model-sharding.png b/doc/v2/design/cluster_train/src/paddle-model-sharding.png deleted file mode 100644 index 8c3f6724ef46c6527e63a4cd8cb0b50fe0167124..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-model-sharding.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-ps-0.png b/doc/v2/design/cluster_train/src/paddle-ps-0.png deleted file mode 100644 index 47ef32806f182cab003da77f1556823b3f6d1721..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-ps-0.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-ps-1.png b/doc/v2/design/cluster_train/src/paddle-ps-1.png deleted file mode 100644 index f3125db73096c52bac6e7c60e1675552857c0774..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-ps-1.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-ps.graffle b/doc/v2/design/cluster_train/src/paddle-ps.graffle deleted file mode 100644 index 0e536ffdd91cd696008b4c01bad3cb53edebdc16..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-ps.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-queues.graffle b/doc/v2/design/cluster_train/src/paddle-task-queues.graffle deleted file mode 100644 index 4263ed8bfd2ef0e55058828bf23f2fac3595e5fd..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-task-queues.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-queues.png b/doc/v2/design/cluster_train/src/paddle-task-queues.png deleted file mode 100644 index 5f980266795776752cebd0c346b85c4a75a47780..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-task-queues.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-states.graffle b/doc/v2/design/cluster_train/src/paddle-task-states.graffle deleted file mode 100644 index cf1a0b9246d9386a949d2dbb8c32fe84f72eea83..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-task-states.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/paddle-task-states.png b/doc/v2/design/cluster_train/src/paddle-task-states.png deleted file mode 100644 index 4ae43cb66c071aee9eb90d875e2373b29af9c3e0..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/paddle-task-states.png and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/pserver_init.graffle b/doc/v2/design/cluster_train/src/pserver_init.graffle deleted file mode 100644 index 5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/pserver_init.graffle and /dev/null differ diff --git a/doc/v2/design/cluster_train/src/pserver_init.png b/doc/v2/design/cluster_train/src/pserver_init.png deleted file mode 100644 index dfe491ff98dd7db1c336093c80964a260df2cd90..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/pserver_init.png and /dev/null differ diff --git 
a/doc/v2/design/cluster_train/src/submit-job.graffle b/doc/v2/design/cluster_train/src/submit-job.graffle deleted file mode 100644 index 677cdfb6d9a32168bf71729eb841fa1ca0dd31d6..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/submit-job.graffle and /dev/null differ
diff --git a/doc/v2/design/cluster_train/src/submit-job.png b/doc/v2/design/cluster_train/src/submit-job.png deleted file mode 100644 index 3046a460a7ba708079e88a560debaa215a694680..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/submit-job.png and /dev/null differ
diff --git a/doc/v2/design/cluster_train/src/trainer.graffle b/doc/v2/design/cluster_train/src/trainer.graffle deleted file mode 100644 index 43415ed8cf61a5acfa34f8e56b9577f338dbf254..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/trainer.graffle and /dev/null differ
diff --git a/doc/v2/design/cluster_train/src/trainer.png b/doc/v2/design/cluster_train/src/trainer.png deleted file mode 100644 index 6537d3d56589ca9f19a77a50a970e4b5275e6ce0..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/cluster_train/src/trainer.png and /dev/null differ
diff --git a/doc/v2/design/cluster_train/submit-job.md b/doc/v2/design/cluster_train/submit-job.md deleted file mode 100644 index 8377d5489dc64bd2fdc5bb4f7bc737e7b489000d..0000000000000000000000000000000000000000 --- a/doc/v2/design/cluster_train/submit-job.md +++ /dev/null @@ -1,127 +0,0 @@

# Submit a Distributed Training Job

The user can submit a distributed training job with Python code, rather than with a command-line interface.

## Runtime Environment On Kubernetes

For a distributed training job, there are two Docker images: the *runtime Docker image* and the *base Docker image*. The runtime Docker image is the image that Kubernetes schedules and runs during training. The base Docker image is used for building the runtime Docker image.

### Base Docker Image

Usually, the base Docker image is the PaddlePaddle production Docker image, which includes the paddle binary files and the Python package. Users can also specify any image name hosted on any Docker registry to which they have access.

### Runtime Docker Image

The trainer package that the user uploads, together with some Python dependencies, is packaged into a runtime Docker image based on the base Docker image.

- Handle Python Dependencies

  You need to provide a `requirements.txt` file in your `trainer-package` folder. Example:

  ```txt
  pillow
  protobuf==3.1.0
  ```
  See more [details](https://pip.readthedocs.io/en/1.1/requirements.html) about requirements files. An example project looks like:
  ```bash
  paddle_example
    |-quick_start
      |-trainer.py
      |-dataset.py
      |-requirements.txt
  ```

## Submit Distributed Training Job With Python Code

- `paddle.job.dist_train()` will call the Job Server API `/v1/packages` to upload the trainer package and save it on CephFS, and then call `/v1/trainer/job` to submit the PaddlePaddle distributed job (see the sketch after this list).
- `/v1/trainer/job` will start a build job to prepare the runtime Docker image. When the build job finishes, the Job Server will submit the PaddlePaddle distributed job to Kubernetes.
- *NOTE*: For the first version, we will not prepare the runtime Docker image; instead, the package is uploaded to Paddle Cloud, and Paddle Cloud mounts the package in a temporary folder into the base Docker image. We will not support custom Python dependencies in the first version either.
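As a concrete illustration of the two calls just described, the following sketch shows what a thin HTTP client for the Job Server could look like. It is not part of the `paddle.job` API: the server address, the multipart field name, and the JSON payload layout are assumptions for illustration, while the endpoints and parameter names come from this document, and the `requests` library is assumed to be installed.

```python
import requests

JOB_SERVER = "http://job-server.paddle-cloud.example:8080"  # hypothetical address

# 1. Upload the trainer package (e.g. a tarball of /example/word2vec).
with open("word2vec.tar.gz", "rb") as f:
    requests.post(JOB_SERVER + "/v1/packages",
                  files={"package": f}).raise_for_status()

# 2. Submit the distributed job with PaddleJob-style parameters.
job_spec = {
    "job_name": "paddle-cloud",
    "entry_point": "python train.py",
    "trainer_package": "/example/word2vec",
    "image": "yancey1989/paddle-job",
    "trainers": 10,
    "pservers": 3,
}
requests.post(JOB_SERVER + "/v1/trainer/job", json=job_spec).raise_for_status()
```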
You can call `paddle.job.dist_train` and provide the distributed training configuration as parameters:
```python
paddle.job.dist_train(
    trainer=dist_trainer(),
    paddle_job=PaddleJob(
        job_name = "paddle-cloud",
        entry_point = "python %s"%__file__,
        trainer_package = "/example/word2vec",
        image = "yancey1989/paddle-job",
        trainers = 10,
        pservers = 3,
        trainer_cpu = 1,
        trainer_gpu = 1,
        trainer_mem = "10G",
        pserver_cpu = 1,
        pserver_mem = "2G"
    ))
```

The parameter `trainer` of `paddle.job.dist_train` is a function, and you can implement it as follows:
```python
def dist_trainer():
    def trainer_creator():
        trainer = paddle.v2.trainer.SGD(...)
        trainer.train(...)
    return trainer_creator
```

The pseudo code of `paddle.job.dist_train` is as follows:
```python
def dist_train(trainer, paddle_job):
    # RUNNING_ON_CLOUD is set to "YES" when the code runs on Paddle Cloud
    if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO":
        # submit the paddle job
        paddle_job.submit()
    else:
        # start the training
        trainer()
```
### PaddleJob Parameters

parameter | type | explanation
--- | --- | ---
job_name | str | the unique name of the training job
entry_point | str | the entry point for starting the trainer process
trainer_package | str | trainer package file path to which the user has access
image | str | the [base image](#base-docker-image) for building the [runtime image](#runtime-docker-image)
pservers | int | Parameter Server process count
trainers | int | Trainer process count
pserver_cpu | int | CPU count for each Parameter Server process
pserver_mem | str | memory allocated for each Parameter Server process, a plain integer with one of these suffixes: E, P, T, G, M, K
trainer_cpu | int | CPU count for each Trainer process
trainer_mem | str | memory allocated for each Trainer process, a plain integer with one of these suffixes: E, P, T, G, M, K
trainer_gpu | int | GPU count for each Trainer process; do not set this parameter for CPU-only training

### Deploy Parameter Server, Trainer and Master Processes

- Deploy the PaddlePaddle Parameter Server processes as a Kubernetes ReplicaSet.
- Deploy the PaddlePaddle Trainer processes as a Kubernetes Job.
- Deploy the PaddlePaddle Master processes as a Kubernetes ReplicaSet.

## Job Server

- RESTful API

  The Job Server provides a RESTful HTTP API for receiving the trainer package and displaying PaddlePaddle job related information.
  - `POST /v1/package` receives the trainer package and saves it on CephFS
  - `POST /v1/trainer/job` submits a trainer job
  - `GET /v1/jobs/` lists all jobs
  - `GET /v1/jobs/` returns the status of a job
  - `DELETE /v1/jobs/` deletes a job
  - `GET /v1/version` returns the Job Server version

- Build Runtime Docker Image on Kubernetes

  `paddle.job.dist_train` will upload the trainer package to the Job Server, save it on the distributed filesystem, and then start a job for building the runtime Docker image that Kubernetes schedules and runs during training.

  There are several benefits to building the runtime Docker image on the Job Server:
  - On Paddle Cloud, users run the trainer code in a Jupyter Notebook, which is a Kubernetes Pod. If we wanted to execute `docker build` in the Pod, we would have to mount the host's `docker.sock` into the Pod; the user's code would then connect directly to the host's Docker Engine, which is not safe.
  - Users only need to upload the training package files; they do not need to install a Docker engine or a Docker registry as dependencies.
- - If we want to change another image type, such as RKT, users do not need to care about it. - -- Deploy Parameter Server, Trainer and Master Processes - - `POST /v1/trainer/job` receives the distributed training parameters, and deploy the job as follows: - - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet. - - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job. - - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet. diff --git a/doc/v2/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md deleted file mode 100644 index 826ff3141bc2512b525cb44ac0f18b376ce57e92..0000000000000000000000000000000000000000 --- a/doc/v2/design/interface/00.why_plain_c.md +++ /dev/null @@ -1,118 +0,0 @@ -# Paddle多语言接口实现 -## 背景 - -Paddle需要一个多语言接口,这个接口需要做到: - -* 有标准的,良好的文档 - * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档,golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。 -* 不同语言的接口适应不同语言的特性 - * 例如Java与Python的错误处理是直接扔出来Exception,而对于golang错误处理应该使用返回值。 - -## 基本要求 - -Paddle的多语言接口实现包括一下几个方面: - -* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器,也不使用其他动态库。 -* 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号。 -* 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)。 -* 不使用SWIG这种代码生成器,而是手写多语言绑定。 - - -## 原因 - -### 使用动态库来分发Paddle - -* Paddle的链接方式比较复杂 - * 如果用户要把Paddle的静态库(libpaddle.a)链接到自己的程序里,得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数,来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。 -* 编译型语言,例如C/C++使用静态库和动态库难度差不多。但是解释性语言,例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni),只能调用Paddle的动态库,否则得把Paddle静态库链接到解释器里。 - * 解释性语言实际运行的二进制是解释器本身,如果调用静态库只能将静态库与解释器链接。例如对于Java来说,便是将静态库加入JVM中。这对于通常的Java的开发者来说,是不常见的做法。 - -### 动态库中不嵌入任何其他语言的解释器 - -* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取 -* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析,数据读取均交由其他语言完成 - -现阶段Paddle有一个问题是,Paddle内嵌的Python解释器和外部使用的Python如果版本不同,会直接报错退出。 - -### Paddle动态库中,不引用其他动态库 - -* 即这个动态库是不依赖于其他任何文件的,可以在任何机器上执行的。 - -### 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号 - -* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范,不同版本的编译器之间,对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库),需要有稳定的导出符号。 -* C语言是有导出符号的标准的,并且在常见的平台上,都是ABI调用标准的。 -* 大多数语言都支持使用C语言API -* 使用C99而不使用C89,是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。 -* 使用C99而不使用C11的原因是,[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性,且C99相对于C11使用更加广泛。 - -### 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler) - -* Paddle内部的类为C++书写,直接导出到C的接口比较困难。 -* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。 - -在C的头文件 `paddle_matrix.h` 中: - -```C -typedef void* paddle_matrix; -typedef int paddle_error; - -extern "C" -paddle_error paddle_matrix_get_shape(paddle_matrix matrix, - uint64_t* width, - uint64_t* height); -``` -而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` - -```cpp -#include "paddle/legacy/math/matrix.h" -extern "C" -paddle_error paddle_matrix_shape(paddle_matrix matrix, - uint64_t *width, - uint64_t *height) { - auto m = (paddle::capi::CMatrix*)(matrix); - *width = m->width(); - *height = m->height(); -} -``` - -其中`paddle/capi/CMatrix.hpp`文件内容为: - -```cpp -namespace paddle { -namespace math { - -class CMatrix { - std::shared_ptr mat; -}; - -} // 
namespace math -} // namespace paddle -``` - -### 不使用SWIG这种代码生成器,而是手写多语言绑定 - -* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码,SWIG直接读取C/C++的头文件,生成各种语言的绑定代码。 - * 对于多语言接口,SWIG需要写一个interface文件。这个文件具有独特的语法,学习成本高。且增加一个第三方语言,就需要对这个第三方语言增加一些定义。有的时候,interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。 - * SWIG暴露的接口保留了C++的接口样式,很难保证多语言代码风格的一致性。(函数命名,错误处理) - * 因为SWIG在第三方语言中暴露的函数名,类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里,将大量的`SomeCppClass`重命名成`some_python_class`,或者`SomeGoTypes`。 - * 对于不同语言,错误处理的方式也不尽相同。例如对于Java或者Python,最常见的错误处理方式是Exception,而对于Golang,错误处理方式是返回值。而SWIG只能简单的暴露C++接口,无法做到对于各种语言错误处理方式的适配。 - * 对于大多数语言,直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。 - * SWIG支持的语言或者解释器有局限。例如对于Python,使用SWIG只支持CPython解释器,而不支持PyPy解释器。 - - -## 原因列表 - -| 结论 | 对比 | 原因 | -|---| --- | --- | -| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库,Paddle静态库链接复杂 | -| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器,会导致不同版本Python在一个进程里的bug | -| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 | -| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI,C99是目前C最广泛的使用标准,且C99支持bool类型和定长整数(uint64_t等)类型 | -| 使用void*作为类句柄 | 不显示的写每个类具体包含什么| 实现简单,并且让接口脱离实现细节 | -| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 | - - -## 实现 - -参考[Inference implementation](01.inference_implementation.md) diff --git a/doc/v2/design/interface/01.inference_implementation.md b/doc/v2/design/interface/01.inference_implementation.md deleted file mode 100644 index 9820284523246a062581f322616d196f575c9d29..0000000000000000000000000000000000000000 --- a/doc/v2/design/interface/01.inference_implementation.md +++ /dev/null @@ -1,131 +0,0 @@ -# C-API 模型推断实现文档 - -本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API,通过模型推断API的实现作为一个样例,来进行讨论。至于为什么需要C-API,请参考[Why Plain C](./00.why_plain_c.md)。 - -## Table of Contents - * [C-API 模型推断实现文档](#c-api-模型推断实现文档) - * [暴露接口原则](#暴露接口原则) - * [目录结构](#目录结构) - * [实现方式](#实现方式) - * [capi.h](#capih) - * [具体某种类型的头文件](#具体某种类型的头文件) - * [capi_private.h](#capi_privateh) - * [具体某种类型的实现文件](#具体某种类型的实现文件) - * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib) - * [libpaddle_capi_whole.a](#libpaddle_capi_wholea) - * [examples](#examples) - * [编译选项](#编译选项) - - -## 暴露接口原则 - -1. 所有的接口均为C接口。即使用`extern "C"` -2. 除构造某种类型的函数(`paddle_matrix_create`等),其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。 -3. 所有类型名为`paddle_类型名`,所有与类型相关的函数,函数名为`paddle_类型名_函数名` -4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言,那么 - * 为了暴露的接口尽量简单。只暴露概念的接口,而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。 - * 暴露这个概念必要函数。`必要`是指,即完成某一个任务的最少函数。 -5. 不在`capi`接口层做过多封装。 - * 如果某一个Paddle概念必须要暴露,但是又过于琐碎。不在`capi`这一层进行封装,而是直接修改Paddle Core。让Paddle核心中,这一概念不再琐碎。 - - -## 目录结构 - -```text -Paddle - `-- paddle - `-- capi - `-- examples # The example project for C-API. - `-- tests # unittests for C-API - `-- capi.h # C-API header file. - `-- capi_private.h # The shared header file between implementation sources. - `-- matrix.{h, cpp} - `-- gradient_machine.{h, cpp} - `-- ... -``` - - -Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件,均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即,安装后的目录结构为 - -```text -`-- include - `-- paddle - `-- capi.h - `-- matrix.h - `-- gradient_machine.h - `-- ... 
-`-- lib - `-- libpaddle_capi_shared.{so, dylib} # In mac, dynamic libary's file name extention is `dylib` - `-- libpaddle_capi_whole.a # static library for all symbols of Paddle. -``` - -## 实现方式 - -下面分别介绍某一类文件的实现方式。 - -### capi.h - -`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中,引入了类型的头文件,`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时,使用相对路径的引用方式。即`#include "matrix.h"` - -### 具体某种类型的头文件 - -具体某种类型的头文件,即例如`matrix.h`,`gradient_machine.h`等。在这些头文件中,包含了某种类型的类型定义和暴露的全部函数。 - -这个头文件不假设其他文件的引用顺序,即使用户直接引用某种类型的头文件,也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型,例如`gradient_machine`需要引用`matrix`,则直接引入另一种类型的头文件,即`#include "matrix.h"`。 - -### capi_private.h - -`capi_prviate.h`是各个实现中共享的头文件,他主要包含了实际暴露的类型结构。在用户使用C-API时,Paddle的类型全部退化成`void *`,即`typedef paddle_matrix void*`。但,对于每种C-API暴露的类型,均是在`capi_private.h`中实现的结构体。 - -```cpp -struct CMatrix { - int type = MatrixType; - std::shared_ptr mat; -}; -``` - -通常,这个结构体包含两个项目。 - -* `type`是一个类型的标志。对于每种类型,type字段均不尽相同。这样,即使C-API接受的类型全是`void *`,我们也可以确定每一个参数的类型。 - - ```cpp - void some_c_api_function(void* some_instance) { - int* type = (int *) some_instance; - switch (*type) { - case MatrixType: - CMatrix* mat = (CMatrix *) some_instance; - ... - ... - } - } - ``` -* 这个结构体中的另一个项目是,Paddle Core中这一类型接口的智能指针(shared_ptr)。 - * 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例,而不必在意Paddle Core是否还在使用这个实例。 - * 例如,用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后,直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数,这个参数也不会一并删除。 - -### 具体某种类型的实现文件 - -具体某种类型的实现文件,即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中,使用C++ 11实现了C-API的接口,并且使用`extern "C"`导出这些接口。在实现过程中,对输入参数的安全性进行了必要的判断,并将C-API接口的参数转发给`Paddle Core`。 - -### libpaddle\_capi_shared.{so, dylib} - -`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。 - -### libpaddle\_capi_whole.a - -`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。 - - -### examples - -在样例中,使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。 - -## 编译选项 - -C-API的编译选项默认关闭,打开这个编译选项,需要在cmake的时候,设置 - -```bash -cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF -``` - -编译C-API的时候推荐Paddle不嵌入Python解释器,也不生成`SWIG`接口,具体原因参考[Why Plain C](./00.why_plain_c.md)。 diff --git a/doc/v2/design/interface/index_cn.rst b/doc/v2/design/interface/index_cn.rst deleted file mode 100644 index 2509a5c5f4182d8ce3a16a3b7bd92c0d7bf5b056..0000000000000000000000000000000000000000 --- a/doc/v2/design/interface/index_cn.rst +++ /dev/null @@ -1,7 +0,0 @@ -多语言接口 ------------- - -.. toctree:: - :maxdepth: 1 - - 00.why_plain_c.md diff --git a/doc/v2/design/interface/index_en.rst b/doc/v2/design/interface/index_en.rst deleted file mode 100644 index 356e58c39c5ef6ee5ee50ab999b85f88628bfb85..0000000000000000000000000000000000000000 --- a/doc/v2/design/interface/index_en.rst +++ /dev/null @@ -1,7 +0,0 @@ -Multilingual Interface ------------------------ - -.. 
toctree:: - :maxdepth: 1 - - 00.why_plain_c.md diff --git a/doc/v2/design/mkl/image/engine.png b/doc/v2/design/mkl/image/engine.png deleted file mode 100644 index 1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/mkl/image/engine.png and /dev/null differ diff --git a/doc/v2/design/mkl/image/gradients.png b/doc/v2/design/mkl/image/gradients.png deleted file mode 100644 index f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/mkl/image/gradients.png and /dev/null differ diff --git a/doc/v2/design/mkl/image/layers.png b/doc/v2/design/mkl/image/layers.png deleted file mode 100644 index 306f79b7a844610915eb8944128f57d2b7a3065a..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/mkl/image/layers.png and /dev/null differ diff --git a/doc/v2/design/mkl/image/matrix.png b/doc/v2/design/mkl/image/matrix.png deleted file mode 100644 index c33ce9cf0335e47cc8c1253304d0fe179186e6f2..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/mkl/image/matrix.png and /dev/null differ diff --git a/doc/v2/design/mkl/image/overview.png b/doc/v2/design/mkl/image/overview.png deleted file mode 100644 index 8fb7bbb9dd654bf363d701d0c8cd4a557043d188..0000000000000000000000000000000000000000 Binary files a/doc/v2/design/mkl/image/overview.png and /dev/null differ diff --git a/doc/v2/design/mkl/mkl_packed.md b/doc/v2/design/mkl/mkl_packed.md deleted file mode 100644 index 0123315ad4368e68b377f66119949bfd6c1c7860..0000000000000000000000000000000000000000 --- a/doc/v2/design/mkl/mkl_packed.md +++ /dev/null @@ -1,108 +0,0 @@ -# Intel® MKL Packed on PaddlePaddle: Design Doc - - -## Contents - -- [Overview](#overview) -- [Key Points](#key-points) - - [Background](#background) - - [Solution](#solution) -- [Actions](#actions) - - [CMake](#cmake) - - [Layers](#layers) - - [Unit Tests](#unit-tests) - - [Python API](#python-api) - - [Benchmarking](#benchmarking) - - -## Overview -我们计划将 Intel® MKL 中引入的 GEMM Packed APIs\[[1](#references)\] 集成到 PaddlePaddle 中,充分发挥英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 -现阶段的优化主要针对 Recurrent Neural Network(以下简称RNN)相关层(包括`RecurrentLayer`, `GatedRecurrentLayer`和`LstmLayer`), 以及 PaddlePaddle V1 API。 - -## Key Points - -### Background -目前PaddlePaddle采用了 Intel® MKL库的[cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm)函数,这个函数本身会在计算前将原数据转换为更适合英特尔平台的内部格式。 - -1. 转换耗时 \ -这一数据格式的转换操作(Packing),在问题本身的计算量比较小的时候,显得相对来说较为耗时。例如在DeepSpeech2 \[[2](#references)\] 的Vanilla RNN部分中,矩阵大小是`batch_size * 2048`。 -2. 
转换冗余 \ -由于在现有的某些情况下(例如RNN),多次调用 cblas_?gemm 会使用相同的原数据,因此,每次调用时对原数据的重复Packing便成为了冗余。 - -为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时,Intel® MKL 引入了以下四个API: - * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc) - * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack) - * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute) - * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free) - -通过使用这些API,我们可以先完成对原数据的Packing操作,再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数,从而避免了Packing冗余。 - -### Solution -在RNN的情况下,同一次前向、后向(forward/backward)过程中所有时间步(time step)共享同一个权重(weight)。当只做推断(inference)时,各次前向之间也都使用了相同的权重,没有必要在每次前向中每个时间步的计算时对权重进行重复的Packing操作。 - -我们通过使用新引入的GEMM Packed APIs,在层初始化的时候,先完成对权重的Packing操作,然后在前向,后向时复用已经转换过的权重,并在每次权重更新后,对新的权重进行转换用于下次迭代。 - -* 优化前,对于序列长度(sequence length)为`T`的网络模型(model), `N`次迭代执行的转换次数为: - - `inference`: `N * T` - - `training`: `2 * N * T` -* 优化后,对于同样设置的网络模型,其转换次数减少至: - - `inference`: `1` - - `training`: `2 * N` - -## Actions - -添加的相关文件和目录结构如下: - -```txt -PaddlePaddle/Paddle -├── ... -└── paddle/ - ├── ... - └── gserver/ - ├── ... - ├── layers/ - │ ├── ... - │ ├── MKLPackedRecurrentLayer.* - | ├── MKLPackedGatedRecurrentLayer.* - | ├── MKLPackedLstmLayer.* - | └── MKLPackedGemm.h - └── tests/ - ├── ... - └── test_MKLPacked.cpp -``` - -### CMake -在对应的`CMakeLists.txt`中根据`WITH_MKL`是否打开,来决定是否开启MKL Packed相关功能。 - -### Layers -所有的`MKLPacked*Layer`都继承于PaddlePaddle的基类`Layer`, 并添加头文件 `MKLPackedGemm.h`,该文件对相关GEMM Packed APIs做了封装。 - -### Unit Tests -我们会添加`test_MKLPacked.cpp`用于MKL Packed优化后layer的测试。 -对于每一个新加的RNN layer,我们会对比如下2个方面: -1. 对比优化后layer自身,sequence mode(`rnn_use_batch=false`)与batch mode(`rnn_use_batch=true`)的结果。 -2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。 - -### Python API -计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag,用于选择是否使用相关功能,并且当编译时`WITH_MKL=ON`的情况下,默认设置为`true`。 - -同时,在`python/paddle/trainer/config_parser.py`中对应的layer处,添加`use_mkl_packed`这个选择,方便用户在Python端选择是否启用这个功能。 - -具体实现方式比如: - -```python -use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0))) -if use_mkl_packed: - self.layer_type = mkl_packed_* -``` - -所有相关的`layer_type`会以*mkl_packed_*开头,这些会在`MKLPacked*Layer`注册layer的时候保证,以示区分。 - - -### Benchmarking -会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。 - -## References -1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm) -2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle) - diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md deleted file mode 100644 index 4876de0045979be20fa45bdc84d2594516f71c03..0000000000000000000000000000000000000000 --- a/doc/v2/design/mkl/mkldnn.md +++ /dev/null @@ -1,237 +0,0 @@ -# Intel® MKL-DNN on PaddlePaddle: Design Doc - -我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn) -(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle, -充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 - -
Figure 1. PaddlePaddle on IA
近期目标

- 完成常用Layer的MKL-DNN实现。
- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。

目前的优化,主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。

## Contents

- [Overview](#overview)
- [Actions](#actions)
    - [CMake](#cmake)
    - [Matrix](#matrix)
    - [Layers](#layers)
    - [Activations](#activations)
    - [Parameters](#parameters)
    - [Gradients](#gradients)
    - [Unit Tests](#unit-tests)
    - [Python API](#python-api)
    - [Benchmarking](#benchmarking)
    - [Others](#others)
- [Design Concerns](#design-concerns)

## Overview

我们会把MKL-DNN作为第三方库集成进PaddlePaddle,与其他第三方库一样,会在编译PaddlePaddle的时候下载并编译MKL-DNN。

同时,为了进一步提升PaddlePaddle在基本数学运算上的计算速度,我们也将MKLML(即MKL small library\[[1](#references)\])作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。

MKL,MKLML以及MKL-DNN三者关系如下表:
| Name | Open Source | License | Descriptions |
| --- | --- | --- | --- |
| MKL | No | Proprietary | Accelerate math processing routines |
| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning |
| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines, especially for Deep Neural Networks |

MKLML可以与MKL-DNN共同使用,以此达到最好的性能。
Figure 2. PaddlePaddle with MKL Engines
- -## Actions - -添加的相关文件和目录结构如下: - -```txt -PaddlePaddle/Paddle -├── ... -├── cmake/ -│ ├── external/ -│ │ ├── ... -│ │ ├── mkldnn.cmake -│ │ └── mklml.cmake -└── paddle/ - ├── ... - ├── math/ - │ ├── ... - │ └── MKLDNNMatrix.* - └── gserver/ - ├── ... - ├── layers/ - │ ├── ... - │ └── MKLDNN*Layer.* - ├── activations/ - │ ├── ... - │ └── MKLDNNActivations.* - └── tests/ - ├── ... - ├── MKLDNNTester.* - └── test_MKLDNN.cpp -``` - -### CMake -在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN - -- `WITH_MKLML` 控制是否使用MKLML库。 -当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 -编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。 -MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。 -- `WITH_MKLDNN` 控制是否使用MKL-DNN。 -当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。 -编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。 -MKL-DNN的库目前只有动态库`libmkldnn.so`。 - -### Matrix -目前在PaddlePaddle中数据都是以`NCHW`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 -所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。 - -
Figure 3. MKLDNNMatrix
- -### Layers -所有MKL-DNN的Layers都会继承于`MKLDNNLayer`,该类继承于PaddlePaddle的基类`Layer`。 -在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑, -子类只需要使用定义好的接口,实现具体的函数功能即可。 - -
Figure 4. MKLDNNLayer
- -每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix: - -- 内部存储(internel memory):`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表输入数据,输入梯度,输出数据和输出梯度。 -- 外部存储(external memory):都是以ext开头,比如`extInVal_`和`extInGrad_`,它们主要是用于, -当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时,转换内存的工作。 -需要注意的是,PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, -所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, -如果不需要外部存储用于转换,那么对应的内部存储也会与它们共享内存。 -- 转换函数(resetXXX): 包括`resetInValue`,`resetInGrad`,`resetOutValue`和`resetOutGrad`, -表示对输入数据,输入梯度,输出数据和输出梯度的转换。 -这些函数会根据输入参数重新设置内部和外部存储,当然这两者也可以相等,即表示不需要转换。 - -注意:每个`MKLDNNlayer`的子类只需要使用内部存储就可以了,所有外部的转换工作都会在reset系列函数中都准备好。 - -### Activations -在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是共用一块内存, -所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`。 - -### Parameters -对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。 -如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式, -在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。 -这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。 - -### Gradients -由于MKL-DNN的操作都是直接覆盖的形式,也就是说输出的结果不会在原来的数据上累加, -这样带来的好处就是不需要一直清空memory,节省了不必要的操作。 -但是注意的是,当网络出现分支且在`backward`的时候,需要累加不同Layer传过来的梯度。 -所以在`MKLDNNlayer`中实现了一个merge的方法,此时每个小分支的`Input Gradient` -会先临时保存在`MKLDNNMatrix`中,由分支处的Layer负责求和,并把结果放到当前层的`output_.grad`中。 -所以整体上,在实现每个子类的时候就不需要关心分支的事情了。 - -
Figure 5. Merge Gradients
- -### Unit Tests -我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 -测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 -每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 - -### Python API -目前只考虑**v1 API**。 - -计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 - -具体实现方式比如: - -```python -use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) -if use_mkldnn - self.layer_type = mkldnn_* -``` - -所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 - -同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 - -### Benchmarking -会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image),用于测试和对比在使用MKL-DNN前后的CNN网络性能。 -测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) - -### Others -1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。 -2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 - -## Design Concerns - -为了更好的符合PaddlePaddle的代码风格\[[3](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。 - -我们总结出一些特别需要注意的点: - -1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数, -我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 -2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 -3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 -包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 -4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, -同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。 -在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。 - -## References -1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 -主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 -2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。 -目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。 -3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。 -但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 -4. 
MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 -所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md deleted file mode 100644 index 3244eedf918b93f9351258f1218dfb2d507c1a9c..0000000000000000000000000000000000000000 --- a/doc/v2/dev/contribute_to_paddle_cn.md +++ /dev/null @@ -1,243 +0,0 @@ -# 如何贡献代码 - -我们真诚地感谢您的贡献,欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。 - -## 代码要求 -- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。 -- 确保编译器选项 `WITH_STYLE_CHECK` 已打开,并且编译能通过代码样式检查。 -- 所有代码必须具有单元测试。 -- 通过所有单元测试。 -- 请遵守[提交代码的一些约定](#提交代码的一些约定)。 - -以下教程将指导您提交代码。 -## [Fork](https://help.github.com/articles/fork-a-repo/) - -跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页,然后单击 `Fork` 按钮,生成自己目录下的仓库,比如 。 - -## 克隆(Clone) - -将远程仓库 clone 到本地: - -```bash -➜ git clone https://github.com/USERNAME/Paddle -➜ cd Paddle -``` - - -## 创建本地分支 - -Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发,测试,发行和维护,具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。 - -所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成,一般从 `develop` 分支上创建新分支。 - -使用 `git checkout -b` 创建并切换到新分支。 - -```bash -➜ git checkout -b my-cool-stuff -``` - -值得注意的是,在 checkout 之前,需要保持当前分支目录 clean,否则会把 untracked 的文件也带到新分支上,这可以通过 `git status` 查看。 - -## 使用 `pre-commit` 钩子 - -Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。 - -`pre-commit`测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 Paddle,首先安装并在当前目录运行它: - -```bash -➜ pip install pre-commit -➜ pre-commit install -``` - -Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。 - -注:通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的,Paddle 开发人员使用的是`pip install pre-commit`。 - -## 开始开发 - -在本例中,我删除了 README.md 中的一行,并创建了一个新文件。 - -通过 `git status` 查看当前状态,这会提示当前目录的一些变化,同时也可以通过 `git diff` 查看文件具体被修改的内容。 - -```bash -➜ git status -On branch test -Changes not staged for commit: - (use "git add ..." to update what will be committed) - (use "git checkout -- ..." to discard changes in working directory) - - modified: README.md - -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -no changes added to commit (use "git add" and/or "git commit -a") -``` - -## 构建和测试 - -编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。 - -如要build这个开发镜像,在源码目录树的根目录中运行: - -```bash -➜ docker build -t paddle:latest-dev . -``` - -随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: - -```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev -``` - -这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): - -```bash -➜ docker build -t paddle:prod -f build/Dockerfile . 
-``` - -如果要运行所有的单元测试,可以用如下命令: - -```bash -➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" -``` - -关于构建和测试的更多信息,请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。 - -## 提交(commit) - -接下来我们取消对 README.md 文件的改变,然后提交新添加的 test 文件。 - -```bash -➜ git checkout -- README.md -➜ git status -On branch test -Untracked files: - (use "git add ..." to include in what will be committed) - - test - -nothing added to commit but untracked files present (use "git add" to track) -➜ git add test -``` - -Git 每次提交代码,都需要写提交说明,这可以让其他人知道这次提交做了哪些改变,这可以通过`git commit` 完成。 - -```bash -➜ git commit -CRLF end-lines remover...............................(no files to check)Skipped -yapf.................................................(no files to check)Skipped -Check for added large files..............................................Passed -Check for merge conflicts................................................Passed -Check for broken symlinks................................................Passed -Detect Private Key...................................(no files to check)Skipped -Fix End of Files.....................................(no files to check)Skipped -clang-formater.......................................(no files to check)Skipped -[my-cool-stuff c703c041] add test file - 1 file changed, 0 insertions(+), 0 deletions(-) - create mode 100644 233 -``` - -## 保持本地仓库最新 - -在准备发起 Pull Request 之前,需要同步原仓库()最新的代码。 - -首先通过 `git remote` 查看当前远程仓库的名字。 - -```bash -➜ git remote -origin -➜ git remote -v -origin https://github.com/USERNAME/Paddle (fetch) -origin https://github.com/USERNAME/Paddle (push) -``` - -这里 origin 是我们 clone 的远程仓库的名字,也就是自己用户名下的 Paddle,接下来我们创建一个原始 Paddle 仓库的远程主机,命名为 upstream。 - -```bash -➜ git remote add upstream https://github.com/PaddlePaddle/Paddle -➜ git remote -origin -upstream -``` - -获取 upstream 的最新代码并更新当前分支。 - -```bash -➜ git fetch upstream -➜ git pull upstream develop -``` - -## Push 到远程仓库 - -将本地的修改推送到 GitHub 上,也就是 https://github.com/USERNAME/Paddle。 - -```bash -# 推送到远程仓库 origin 的 my-cool-stuff 分支上 -➜ git push origin my-cool-stuff -``` - -## 建立 Issue 并完成 Pull Request - -建立一个 Issue 描述问题,并记录它的编号。 - -切换到所建分支,然后点击 `New pull request`。 - -screen shot 2017-04-26 at 9 09 28 pm - -选择目标分支: - -screen shot 2017-04-26 at 9 11 52 pm - -在 PR 的描述说明中,填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后,自动关闭对应的 Issue,具体请见 。 - -接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。 - -## 删除远程分支 - -在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。 - -screen shot 2017-04-26 at 9 18 24 pm - -也可以使用 `git push origin :分支名` 删除远程分支,如: - -```bash -➜ git push origin :my-cool-stuff -``` - -## 删除本地分支 - -最后,删除本地分支。 - -```bash -# 切换到 develop 分支 -➜ git checkout develop - -# 删除 my-cool-stuff 分支 -➜ git branch -D my-cool-stuff -``` - -至此,我们就完成了一次代码贡献的过程。 - -## 提交代码的一些约定 - -为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: - -1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。 -2. 提交PUll Request前: - - 请注意commit的数量: - - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 - - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。 - - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。 -3. 
如果解决了某个Issue的问题,请在该PUll Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 - -此外,在回复评审人意见时,请您遵守以下约定: - -1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢): - - 对评审意见同意且按其修改完的,给个简单的`Done`即可; - - 对评审意见不同意的,请给出您自己的反驳理由。 -2. 如果评审意见比较多: - - 请给出总体的修改情况。 - - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md deleted file mode 120000 index 72723396444c0a6cc0516f6f2379b2d868ba59f7..0000000000000000000000000000000000000000 --- a/doc/v2/dev/contribute_to_paddle_en.md +++ /dev/null @@ -1 +0,0 @@ -../../../CONTRIBUTING.md diff --git a/doc/v2/dev/index_cn.rst b/doc/v2/dev/index_cn.rst deleted file mode 100644 index aee3c68de05de26df3cd79170fa7f4ecad4bf386..0000000000000000000000000000000000000000 --- a/doc/v2/dev/index_cn.rst +++ /dev/null @@ -1,24 +0,0 @@ -开发标准 -======== -PaddlePaddle遵守如下三个部分的代码和文档规范。 - -PaddlePaddle使用git做版本管理,docker作为构建和测试环境。代码中包含了Cuda, C++, Python, Shell等多种编程语言。语言规范遵守Google C++ Style, Pep-8, 代码库中包含自动化检查工具做风格检查。代码注释需要遵守Doxygen规范,不满足风格要求的代码会编译失败。关于如何使用git, 构建测试及代码开发, 我们提供了如下指南。 - -.. toctree:: - :maxdepth: 1 - - contribute_to_paddle_cn.md - -PaddlePaddle面向国内外用户,包含了中文和英文两部分的文档。设计文档和issue问题描述都推荐使用英文。对于设计文档,重在问题描述,背景阐述,然后才是解决方案。文档由Sphinx生成,因此代码注释也需要符合Sphinx文档标准。推荐本地使用paddlepaddle.org工具编译生成和预览文档,请参阅如下文档。 - -.. toctree:: - :maxdepth: 1 - - write_docs_cn.rst - -PaddlePaddle V2 使用新增Layer方式定义新的操作。组合基础API可以实现多种复杂Layer, 满足绝大多数应用。如需要定制Layer,请参阅如下文档,欢迎提交patch。 - -.. toctree:: - :maxdepth: 1 - - new_layer_cn.rst diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst deleted file mode 100644 index cbff313fc5b9468b58159cf2b04e8464f9bebc78..0000000000000000000000000000000000000000 --- a/doc/v2/dev/index_en.rst +++ /dev/null @@ -1,28 +0,0 @@ -Development ------------- - - -PaddlePaddle adheres to the following three sections of code and document specifications. - - -PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages,which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development. - -.. toctree:: - :maxdepth: 1 - - contribute_to_paddle_en.md - - -PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem description. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend to use the paddlepaddle.org tool to compile and generate and preview documents locally. Please refer to: - -.. toctree:: - :maxdepth: 1 - - write_docs_en.rst - -PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize layer, please refer to the following, and welcome to propose patch. - -.. 
toctree:: - :maxdepth: 1 - - new_layer_en.rst diff --git a/doc/v2/dev/new_layer_cn.rst b/doc/v2/dev/new_layer_cn.rst deleted file mode 100644 index e5a14346123d342de0b67757cbbce654bd4180dc..0000000000000000000000000000000000000000 --- a/doc/v2/dev/new_layer_cn.rst +++ /dev/null @@ -1,389 +0,0 @@ -================== -如何实现新的网络层 -================== - -这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。 - -1. 推导该层前向和后向传递的方程。 -2. 实现该层的C++类。 -3. 增加梯度检测的单元测试,以保证梯度的正确计算。 -4. 封装该层的Python接口。 - -推导方程 -================ - -首先我们需要推导该网络层的*前向传播*和*后向传播*的方程。前向传播给定输入,计算输出。后向传播给定输出的梯度,计算输入和参数的梯度。 - -下图是一个全连接层的示意图。在全连接层中,每个输出节点都连接到所有的输入节点上。 - -.. image:: src/FullyConnected.jpg - :align: center - :scale: 60 % - -一个网络层的前向传播部分把输入转化为相应的输出。 -全连接层以一个维度为 :math:`D_i` 的稠密向量作为输入,使用一个尺度为 :math:`D_i \times D_o` 的变换矩阵 :math:`W` 把 :math:`x` 映射到一个维度为 :math:`D_o` 的向量,并在乘积结果上再加上维度为 :math:`D_o` 的偏置向量 :math:`b` 。 - -.. math:: - - y = f(W^T x + b) - -其中 :math:`f(.)` 是一个非线性的*激活方程*,例如sigmoid, tanh,以及Relu。 - -变换矩阵 :math:`W` 和偏置向量 :math:`b` 是该网络层的*参数*。一个网络层的参数是在*反向传播*时被训练的。反向传播根据输出的梯度,分别计算每个参数的梯度,以及输入的梯度。优化器则用链式法则来对每个参数计算损失函数的梯度。 - -假设损失函数是 :math:`c(y)` ,那么 - -.. math:: - - \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} - -假设 :math:`z = W^T x + b` ,那么 - -.. math:: - - \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z} - -PaddlePaddle的base layer类可以自动计算上面的导数。 - -因此,对全连接层来说,我们需要计算: - -.. math:: - - \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1 - -其中 :math:`\mathbf 1` 是一个全1的向量, :math:`W_{ij}` 是矩阵 :math:`W` 第i行第j列的数值, :math:`z_j` 是向量 :math:`z` 的第j个值, :math:`x_i` 是向量 :math:`x` 的第i个值。 - -最后我们使用链式法则计算 :math:`\frac{\partial z}{\partial x}` 以及 :math:`\frac{\partial z}{\partial W}` 。计算的细节将在下面的小节给出。 - -实现C++类 -=================== - -一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。 - -这个类需要继承 :code:`paddle::Layer` 这个基类,并且需要重写基类中的以下几个虚函数: - -- 类的构造函数和析构函数。 -- :code:`init` 函数。用于初始化参数和设置。 -- :code:`forward` 。实现网络层的前向传播。 -- :code:`backward` 。实现网络层的后向传播。 -- :code:`prefetch` 。用来从参数服务器预取参数矩阵相应的行。如果网络层不需要远程稀疏更新,则不需要重写该函数。(大多数网络层不需要支持远程稀疏更新) - - -头文件如下: - -.. code-block:: c++ - - namespace paddle { - /** - * 全连接层的每个输出都连接到上一层的所有的神经元上。 - * 它的输入与经过学习的参数做内积并加上偏置(可选)。 - * - * 配置文件接口是fc_layer。 - */ - - class FullyConnectedLayer : public Layer { - protected: - WeightList weights_; - std::unique_ptr biases_; - - public: - explicit FullyConnectedLayer(const LayerConfig& config) - : Layer(config) {} - ~FullyConnectedLayer() {} - - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - Weight& getWeight(int idx) { return *weights_[idx]; } - - void prefetch(); - void forward(PassType passType); - void backward(const UpdateCallback& callback = nullptr); - }; - } // namespace paddle - -头文件中把参数定义为类的成员变量。我们使用 :code:`Weight` 类作为参数的抽象,它支持多线程更新。该类的实现细节在“实现细节”中详细介绍。 - -- :code:`weights_` 是存有一系列变换矩阵的权重。在当前的实现方式下,网络层可以有多个输入。因此,它可能有不止一个权重。每个权重对应一个输入。 -- :code:`biases_` 是存有偏置向量的权重。 - -全连接层没有网络层配置的超参数。如果一个网络层需要配置的话,通常的做法是将配置存于 :code:`LayerConfig& config` 中,并在类构建函数中把它放入一个类成员变量里。 - -下面的代码片段实现了 :code:`init` 函数。 - -- 首先,所有的 :code:`init` 函数必须先调用基类中的函数 :code:`Layer::init(layerMap, parameterMap);` 。该语句会为每个层初始化其所需要的变量和连接。 -- 之后初始化所有的权重矩阵 :math:`W` 。当前的实现方式下,网络层可以有多个输入。因此,它可能有不止一个权重。 -- 最后,初始化偏置向量。 - - -.. 
code-block:: c++ - - bool FullyConnectedLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* 初始化父类 */ - Layer::init(layerMap, parameterMap); - - /* 初始化权重表 */ - CHECK(inputLayers_.size() == parameters_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - // 获得参数尺寸 - size_t height = inputLayers_[i]->getSize(); - size_t width = getSize(); - - // 新建一个权重 - if (parameters_[i]->isSparse()) { - CHECK_LE(parameters_[i]->getSize(), width * height); - } else { - CHECK_EQ(parameters_[i]->getSize(), width * height); - } - Weight* w = new Weight(height, width, parameters_[i]); - - // 将新建的权重加入权重表 - weights_.emplace_back(w); - } - - /* 初始化biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - return true; - } - -实现前向传播的部分有下面几个步骤。 - -- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。 -- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小,所以这一步是必要的。 :code:`reserveOutput` 会相应地改变输出的尺寸。为了保证效率,如果需要扩大矩阵,我们会重新分配内存;如果需要缩减矩阵,我们会继续使用现有的内存块。 -- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/legacy/math/Matrix.h`和:code:`paddle/legacy/math/BaseMatrix.h` 。 -- 最终,使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。 - - -.. code-block:: c++ - - void FullyConnectedLayer::forward(PassType passType) { - Layer::forward(passType); - - /* 若有必要,为output_申请内存 */ - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - - { - // 设置输出的尺寸 - reserveOutput(batchSize, size); - } - - MatrixPtr outV = getOutputValue(); - - // 对每个输入乘上变换矩阵 - for (size_t i = 0; i != inputLayers_.size(); ++i) { - auto input = getInput(i); - CHECK(input.value) << "The input of 'fc' layer must be matrix"; - i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0) - : outV->mul(input.value, weights_[i]->getW(), 1, 1); - } - - /* 加上偏置向量 */ - if (biases_.get() != NULL) { - outV->addBias(*(biases_->getW()), 1); - } - - /* 激活 */ { - forwardActivation(); - } - } - -实现后向传播的部分有下面几个步骤。 - -- :code:`backwardActivation()` 计算激活函数的梯度。通过 :code:`getOutputGrad()` 来获得输出的梯度,调用该函数后,梯度会就地(不使用额外空间)乘上输出的梯度。 -- 计算偏置的梯度。注意,我们使用 :code:`biases_->getWGrad()` 来得到某个特定参数的梯度矩阵。在一个参数的梯度被更新后,**必须**要调用 :code:`getParameterPtr()->incUpdate(callback);` 。这用于在多线程和多机上更新参数。 -- 最后,计算转换矩阵和输入的梯度,并对相应的参数调用 :code:`incUpdate` 。PaddlePaddle可以通过该机制判断是否已经收集齐所有的梯度,从而可以做一些与计算重叠的工作(例如,网络通信)。 - - -.. 
code-block:: c++ - - void FullyConnectedLayer::backward(const UpdateCallback& callback) { - /* 对激活求导 */ { - backwardActivation(); - } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - biases_->getParameterPtr()->incUpdate(callback); - } - - bool syncFlag = hl_get_sync_flag(); - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - /* 计算当前层权重的梯度 */ - if (weights_[i]->getWGrad()) { - MatrixPtr input_T = getInputValue(i)->getTranspose(); - MatrixPtr oGrad = getOutputGrad(); - { - weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1); - } - } - - - /* 计算输入层的偏差 */ - MatrixPtr preGrad = getInputGrad(i); - if (NULL != preGrad) { - MatrixPtr weights_T = weights_[i]->getW()->getTranspose(); - preGrad->mul(getOutputGrad(), weights_T, 1, 1); - } - - { - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } - } - - :code:`prefetch` 函数指出了在训练时需要从参数服务器取出的行。仅在远程稀疏训练时有效。使用远程稀疏方式训练时,完整的参数矩阵被分布在不同的参数服务器上。当网络层用一个批次做训练时,该批次的输入中仅有一个子集是非零的。因此,该层仅需要这些非零样本位置所对应的变换矩阵的那些行。 :code:`prefetch` 表明了这些行的标号。 - -大多数层不需要远程稀疏训练函数。这种情况下不需要重写该函数。 - -.. code-block:: c++ - - void FullyConnectedLayer::prefetch() { - for (size_t i = 0; i != inputLayers_.size(); ++i) { - auto* sparseParam = - dynamic_cast(weights_[i]->getW().get()); - if (sparseParam) { - MatrixPtr input = getInputValue(i); - sparseParam->addRows(input); - } - } - } - -最后,使用 :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` 来注册该层。 :code:`fc` 是该层的标识符, :code:`FullyConnectedLayer` 是该层的类名。 - -.. code-block:: c++ - - namespace paddle { - REGISTER_LAYER(fc, FullyConnectedLayer); - } - -若 :code:`cpp` 被放在 :code:`paddle/legacy/gserver/layers` 目录下,其会自动被加入编译列表。 - - -写梯度检查单元测试 -=============================== - -写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ,然后观察到输出的变化为 :math:`\Delta y` ,那么,梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后,再用这个梯度去和 :code:`backward` 函数得到的梯度去对比,以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算,并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。 - -所有网络层的梯度检查单测都位于 :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步: - -+ 生成网络层配置。网络层配置包含以下几项: - - 偏置参数的大小。(例子中是4096) - - 层的类型。(例子中是fc) - - 层的大小。(例子中是4096) - - 激活的类型。(例子中是softmax) - - dropout的比例。(例子中是0.1) -+ 配置网络层的输入。在这个例子里,我们仅有一个输入。 - - 输入的类型( :code:`INPUT_DATA` ),可以是以下几种: - - :code:`INPUT_DATA` :稠密向量。 - - :code:`INPUT_LABEL` :整数。 - - :code:`INPUT_DATA_TARGET` :稠密向量,但不用于计算梯度。 - - :code:`INPUT_SEQUENCE_DATA` :含有序列信息的稠密向量。 - - :code:`INPUT_HASSUB_SEQUENCE_DATA` :含有序列信息和子序列信息的稠密向量。 - - :code:`INPUT_SEQUENCE_LABEL` :含有序列信息的整数。 - - :code:`INPUT_SPARSE_NON_VALUE_DATA` :0-1稀疏数据。 - - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA` :浮点稀疏数据。 - - 输入的名字。(例子中是 :code:`layer_0` ) - - 输入的大小。(例子中是8192) - - 非零数字的个数,仅对稀疏数据有效。 - - 稀疏数据的格式,仅对稀疏数据有效。 -+ 对每个输入,都需要调用一次 :code:`config.layerConfig.add_inputs();` 。 -+ 调用 :code:`testLayerGrad` 来做梯度检查。它包含以下参数。 - - 层和输入的配置。(例子中是 :code:`config` ) - - 网络层的类型。(例子中是 :code:`fc` ) - - 梯度检查的输入数据的批次大小。(例子中是100) - - 输入是否是转置的。大多数层需要设置为 :code:`false` 。(例子中是 :code:`false` ) - - 是否使用权重。有些层或者激活需要做归一化以保证它们的输出的和是一个常数。例如,softmax激活的输出的和总是1。在这种情况下,我们不能通过常规的梯度检查的方式来计算梯度。因此我们采用输出的加权和(非常数)来计算梯度。(例子中是 :code:`true` ,因为全连接层的激活可以是softmax) - -.. code-block:: c++ - - void testFcLayer(string format, size_t nnz) { - // Create layer configuration. 
- TestConfig config; - config.biasSize = 4096; - config.layerConfig.set_type("fc"); - config.layerConfig.set_size(4096); - config.layerConfig.set_active_type("softmax"); - config.layerConfig.set_drop_rate(0.1); - // Setup inputs. - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)}); - config.layerConfig.add_inputs(); - LOG(INFO) << config.inputDefs[0].sparse.sparse << " " - << config.inputDefs[0].sparse.format; - for (auto useGpu : {false, true}) { - testLayerGrad(config, "fc", 100, /* trans */ false, useGpu, - /* weight */ true); - } - } - -如果你要为了测试而增加新的文件,例如 :code:`paddle/legacy/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/legacy/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。 - -.. code-block:: bash - - add_unittest_without_exec(test_FCGrad - test_FCGrad.cpp - LayerGradUtil.cpp - TestUtil.cpp) - - add_test(NAME test_FCGrad - COMMAND test_FCGrad) - - -实现python封装 -======================== - -python封装的实现使得我们可以在配置文件中使用新实现的网络层。所有的python封装都在 :code:`python/paddle/trainer/config_parser.py` 中。全连接层python封装的例子中包含下面几步: - -- 所有的Python封装都使用 :code:`@config_layer('fc')` 这样的装饰器。网络层的标识符为 :code:`fc` 。 -- 实现构造函数 :code:`__init__` 。 - - 它首先调用基构造函数 :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)` 。 :code:`FCLayer` 是Python封装的类名。 :code:`fc` 是网络层的标识符。为了封装能够正确工作,这些名字必须要写对。 - - 之后,计算变换矩阵的大小和格式(是否稀疏)。 - -.. code-block:: python - - @config_layer('fc') - class FCLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - bias=True, - **xargs): - super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs) - for input_index in xrange(len(self.inputs)): - input_layer = self.get_input_layer(input_index) - psize = self.config.size * input_layer.size - dims = [input_layer.size, self.config.size] - format = self.inputs[input_index].format - sparse = format == "csr" or format == "csc" - if sparse: - psize = self.inputs[input_index].nnz - self.create_input_parameter(input_index, psize, dims, sparse, format) - self.create_bias_parameter(bias, self.config.size) - -在网络配置中,网络层的细节可以通过下面这些代码片段来指定。这个类的参数包括: - -- :code:`name` 是网络层实例的名字标识符。 -- :code:`type` 是网络层的类型,通过网络层的标识符来指定。 -- :code:`size` 是网络层输出的大小。 -- :code:`bias` 表明这个层的一个实例是否需要偏置。 -- :code:`inputs` 说明这个层的输入,输入是由一个list中的网络层实例的名字组成的。 - -.. code-block:: python - - Layer( - name = "fc1", - type = "fc", - size = 64, - bias = True, - inputs = [Input("pool3")] - ) - -我们建议你为你的Python封装实现一个“助手”,使得搭模型时更方便。具体可以参考 :code:`python/paddle/trainer_config_helpers/layers.py` 。 diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst deleted file mode 100644 index ad723738801908a5f48343574c204bdbfc97ee08..0000000000000000000000000000000000000000 --- a/doc/v2/dev/new_layer_en.rst +++ /dev/null @@ -1,390 +0,0 @@ -================ -Write New Layers -================ - -This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer. - -- Derive equations for the forward and backward part of the layer. -- Implement C++ class for the layer. -- Write gradient check unit test to make sure the gradients are correctly computed. -- Implement Python wrapper for the layer. - -Derive Equations -================ - -First we need to derive equations of the *forward* and *backward* part of the layer. The forward part computes the output given an input. 
The backward part computes the gradients of the input and the parameters given the gradients of the output.

The illustration of a fully connected layer is shown in the following figure. In a fully connected layer, all output nodes are connected to all the input nodes.

.. image:: src/FullyConnected.jpg
   :align: center
   :scale: 60 %

The *forward part* of a layer transforms an input into the corresponding output.
A fully connected layer takes a dense input vector with dimension :math:`D_i`. It uses a transformation matrix :math:`W` with size :math:`D_i \times D_o` to project :math:`x` into a :math:`D_o` dimensional vector, and adds a bias vector :math:`b` with dimension :math:`D_o` to the result.

.. math::

   y = f(W^T x + b)

where :math:`f(.)` is a nonlinear *activation* function, such as sigmoid, tanh, or ReLU.

The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can then use the chain rule to compute the gradients of the loss function with respect to each parameter.

Suppose our loss function is :math:`c(y)`, then

.. math::

   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}

Suppose :math:`z = W^T x + b`, then

.. math::

   \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z}

This derivative can be automatically computed by our base layer class.

Then, for the fully connected layer, we need to compute:

.. math::

   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1

where :math:`\mathbf 1` is an all-one vector, :math:`W_{ij}` is the entry at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.

Finally, we can use the chain rule to calculate :math:`\frac{\partial z}{\partial x}` and :math:`\frac{\partial z}{\partial W}`. The details of the computation will be given in the next section.

Implement C++ Class
===================

The C++ class of the layer implements the initialization, forward, and backward parts of the layer. The fully connected layer is at :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`. We list a simplified version of the code below.

It needs to derive from the base class :code:`paddle::Layer`, and it needs to override the following functions:

- constructor and destructor.
- :code:`init` function. It is used to initialize the parameters and settings.
- :code:`forward`. It implements the forward part of the layer.
- :code:`backward`. It implements the backward part of the layer.
- :code:`prefetch`. It is utilized to determine the rows of the corresponding parameter matrix to prefetch from the parameter server. You do not need to override this function if your layer does not need remote sparse update. (Most layers do not need to support remote sparse update.)


The header file is listed below:

.. code-block:: c++

   namespace paddle {
   /**
    * A layer has full connections to all neurons in the previous layer.
    * It computes an inner product with a set of learned weights, and
    * (optionally) adds biases.
    *
    * The config file api is fc_layer.
-    */
-
-   class FullyConnectedLayer : public Layer {
-   protected:
-     WeightList weights_;
-     std::unique_ptr<Weight> biases_;
-
-   public:
-     explicit FullyConnectedLayer(const LayerConfig& config)
-         : Layer(config) {}
-     ~FullyConnectedLayer() {}
-
-     bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-     Weight& getWeight(int idx) { return *weights_[idx]; }
-
-     void prefetch();
-     void forward(PassType passType);
-     void backward(const UpdateCallback& callback = nullptr);
-   };
-   }  // namespace paddle
-
-It defines the parameters as class variables. We use the :code:`Weight` class as an abstraction of a parameter; it supports multi-threaded updates. The details of this class will be described in the implementation below.
-
-- :code:`weights_` is a list of weights for the transformation matrices. The current implementation can have more than one input; thus, it has a list of weights, one weight per input.
-- :code:`biases_` is the weight for the bias vector.
-
-The fully connected layer does not have layer configuration hyper-parameters. If a layer does have hyper-parameters, a common practice is to store them in :code:`LayerConfig& config` and copy them into class variables in the constructor.
-
-The following code snippet implements the :code:`init` function.
-
-- First, every :code:`init` function must call the :code:`init` function of the base class, :code:`Layer::init(layerMap, parameterMap);`. This statement initializes the required variables and connections for each layer.
-- Then it initializes all the weight matrices :math:`W`. The current implementation can have more than one input; thus, it has a list of weights.
-- Finally, it initializes the bias.
-
-
-.. code-block:: c++
-
-   bool FullyConnectedLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-     /* Initialize the basic parent class */
-     Layer::init(layerMap, parameterMap);
-
-     /* initialize the weightList */
-     CHECK(inputLayers_.size() == parameters_.size());
-     for (size_t i = 0; i < inputLayers_.size(); i++) {
-       // Obtain the parameters
-       size_t height = inputLayers_[i]->getSize();
-       size_t width = getSize();
-
-       // create a new weight
-       if (parameters_[i]->isSparse()) {
-         CHECK_LE(parameters_[i]->getSize(), width * height);
-       } else {
-         CHECK_EQ(parameters_[i]->getSize(), width * height);
-       }
-       Weight* w = new Weight(height, width, parameters_[i]);
-
-       // append the new weight to the list
-       weights_.emplace_back(w);
-     }
-
-     /* initialize biases_ */
-     if (biasParameter_.get() != NULL) {
-       biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-     }
-
-     return true;
-   }
-
-The implementation of the forward part has the following steps.
-
-- Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function.
-- Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we allow batches to have different sizes; :code:`reserveOutput` changes the size of the output accordingly. For the sake of efficiency, new memory is allocated when the matrix needs to grow, but the existing memory block is reused when it shrinks.
-- Then it computes :math:`\sum_i W_i^T x_i + b` using matrix operations. :code:`getInput(i).value` retrieves the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents a single input in the batch.
For a complete list of supported matrix operations, please refer to :code:`paddle/legacy/math/Matrix.h` and :code:`paddle/legacy/math/BaseMatrix.h`.
-- Finally it applies the activation function using :code:`forwardActivation();`, which automatically applies the activation function specified in the network configuration.
-
-
-.. code-block:: c++
-
-   void FullyConnectedLayer::forward(PassType passType) {
-     Layer::forward(passType);
-
-     /* malloc memory for the output_ if necessary */
-     int batchSize = getInput(0).getBatchSize();
-     int size = getSize();
-
-     {
-       // Set up the size of the output.
-       reserveOutput(batchSize, size);
-     }
-
-     MatrixPtr outV = getOutputValue();
-
-     // Apply the transformation matrix to each input.
-     for (size_t i = 0; i != inputLayers_.size(); ++i) {
-       auto input = getInput(i);
-       CHECK(input.value) << "The input of 'fc' layer must be matrix";
-       i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
-              : outV->mul(input.value, weights_[i]->getW(), 1, 1);
-     }
-
-     /* add the bias-vector */
-     if (biases_.get() != NULL) {
-       outV->addBias(*(biases_->getW()), 1);
-     }
-
-     /* activation */ {
-       forwardActivation();
-     }
-   }
-
-The implementation of the backward part has the following steps.
-
-- :code:`backwardActivation()` computes the gradients of the activation. The gradients are multiplied in place into the gradients of the output, which can be retrieved using :code:`getOutputGrad()`.
-- Compute the gradients of the bias. Notice that we can use :code:`biases_->getWGrad()` to get the gradient matrix of the corresponding parameter. After the gradient of one parameter is updated, it **MUST** call :code:`getParameterPtr()->incUpdate(callback);`. This is used for parameter updates across multiple threads or multiple machines.
-- Then it computes the gradients of the transformation matrices and the inputs, and calls :code:`incUpdate` for the corresponding parameter. This gives the framework a chance to know whether it has gathered all the gradients for one parameter, so that it can overlap other work (e.g., network communication).
-
-
-.. code-block:: c++
-
-   void FullyConnectedLayer::backward(const UpdateCallback& callback) {
-     /* Do derivation for activations. */ {
-       backwardActivation();
-     }
-
-     if (biases_ && biases_->getWGrad()) {
-       biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-       biases_->getParameterPtr()->incUpdate(callback);
-     }
-
-     bool syncFlag = hl_get_sync_flag();
-
-     for (size_t i = 0; i != inputLayers_.size(); ++i) {
-       /* Calculate the W-gradient for the current layer */
-       if (weights_[i]->getWGrad()) {
-         MatrixPtr input_T = getInputValue(i)->getTranspose();
-         MatrixPtr oGrad = getOutputGrad();
-         {
-           weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
-         }
-       }
-
-       /* Calculate the input layers error */
-       MatrixPtr preGrad = getInputGrad(i);
-       if (NULL != preGrad) {
-         MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
-         preGrad->mul(getOutputGrad(), weights_T, 1, 1);
-       }
-
-       {
-         weights_[i]->getParameterPtr()->incUpdate(callback);
-       }
-     }
-   }
-
-The :code:`prefetch` function specifies the rows that need to be fetched from the parameter server during training. It is only useful for remote sparse training. In remote sparse training, the full parameter matrix is stored in a distributed manner at the parameter server. When the layer uses a batch for training, only a subset of the input positions are non-zero in this batch.
Thus, this layer only needs the rows of the transformation matrix corresponding to these non-zero positions. The :code:`prefetch` function specifies the ids of these rows.
-
-Most layers do not need the remote sparse training function, in which case you do not need to override :code:`prefetch`.
-
-.. code-block:: c++
-
-   void FullyConnectedLayer::prefetch() {
-     for (size_t i = 0; i != inputLayers_.size(); ++i) {
-       auto* sparseParam =
-           dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-       if (sparseParam) {
-         MatrixPtr input = getInputValue(i);
-         sparseParam->addRows(input);
-       }
-     }
-   }
-
-Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to register the layer. :code:`fc` is the identifier of the layer, and :code:`FullyConnectedLayer` is the class name of the layer.
-
-.. code-block:: c++
-
-   namespace paddle {
-   REGISTER_LAYER(fc, FullyConnectedLayer);
-   }
-
-If the :code:`cpp` file is put into :code:`paddle/legacy/gserver/layers`, it will be automatically added to the compilation list.
-
-
-Write Gradient Check Unit Test
-===============================
-
-An easy way to verify the correctness of a new layer's implementation is to write a gradient check unit test. A gradient check unit test uses the finite difference method to verify the gradients of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the change of the output :math:`\Delta y`; the gradient can then be approximated as :math:`\frac{\Delta y}{\Delta x}`. This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation; it does not necessarily guarantee that the :code:`forward` and :code:`backward` functions are implemented correctly. You need to write more sophisticated unit tests to make sure your layer is implemented correctly.
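-
-For concreteness, here is a sketch of the kind of quantity such a test compares (an assumed formulation for illustration; the actual procedure used by :code:`testLayerGrad` is implemented in the test utilities, :code:`LayerGradUtil.cpp`). Perturbing one input component :math:`x_i` with a symmetric step gives the numerical estimate
-
-.. math::
-
-   \frac{\partial c}{\partial x_i} \approx \frac{c(x + \epsilon e_i) - c(x - \epsilon e_i)}{2 \epsilon}
-
-where :math:`e_i` is the i-th unit vector and :math:`\epsilon` is a small step size. The test then checks that the relative difference between this estimate and the analytic gradient returned by :code:`backward` stays below a tolerance.
-
-All the gradient check unit tests are located in :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient check unit test of the fully connected layer is listed below. It has the following steps.
-
-+ Create the layer configuration. A layer configuration can include the following attributes:
-  - size of the bias parameter. (4096 in our example)
-  - type of the layer. (fc in our example)
-  - size of the layer. (4096 in our example)
-  - activation type. (softmax in our example)
-  - dropout rate. (0.1 in our example)
-+ Configure the input of the layer. In our example, we have only one input.
-  - type of the input (:code:`INPUT_DATA` in our example). It can be one of the following types:
-    - :code:`INPUT_DATA`: dense vector.
-    - :code:`INPUT_LABEL`: integer.
-    - :code:`INPUT_DATA_TARGET`: dense vector, but it is not used to compute gradients.
-    - :code:`INPUT_SEQUENCE_DATA`: dense vector with sequence information.
-    - :code:`INPUT_HASSUB_SEQUENCE_DATA`: dense vector with both sequence and sub-sequence information.
-    - :code:`INPUT_SEQUENCE_LABEL`: integer with sequence information.
-    - :code:`INPUT_SPARSE_NON_VALUE_DATA`: 0-1 sparse data.
-    - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA`: float sparse data.
-  - name of the input. (:code:`layer_0` in our example)
-  - size of the input. (8192 in our example)
-  - number of non-zeros, only useful for sparse inputs.
-  - format of sparse data, only useful for sparse inputs.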
-+ Each input needs to call :code:`config.layerConfig.add_inputs();` once.
-+ Call :code:`testLayerGrad` to perform gradient checks. It has the following arguments:
-  - layer and input configurations. (:code:`config` in our example)
-  - type of the layer. (:code:`fc` in our example)
-  - batch size of the gradient check. (100 in our example)
-  - whether the input is transposed. Most layers need to set it to :code:`false`. (:code:`false` in our example)
-  - whether to use weights. Some layers or activations perform normalization so that the sum of their output is a constant. For example, the sum of the output of a softmax activation is one. In this case, we cannot correctly compute the gradients using regular gradient check techniques. A weighted sum of the output, which is not a constant, is used to compute the gradients instead. (:code:`true` in our example, because the activation of a fully connected layer can be softmax)
-
-.. code-block:: c++
-
-   void testFcLayer(string format, size_t nnz) {
-     // Create layer configuration.
-     TestConfig config;
-     config.biasSize = 4096;
-     config.layerConfig.set_type("fc");
-     config.layerConfig.set_size(4096);
-     config.layerConfig.set_active_type("softmax");
-     config.layerConfig.set_drop_rate(0.1);
-     // Setup inputs.
-     config.inputDefs.push_back(
-         {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
-     config.layerConfig.add_inputs();
-     LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
-               << config.inputDefs[0].sparse.format;
-     for (auto useGpu : {false, true}) {
-       testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
-                     /* weight */ true);
-     }
-   }
-
-If you are creating a new file for the test, such as :code:`paddle/legacy/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/legacy/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high numerical precision for the gradient check to work well; for those you need to set :code:`WITH_DOUBLE` to ``ON`` when configuring CMake.
-
-.. code-block:: bash
-
-   add_unittest_without_exec(test_FCGrad
-       test_FCGrad.cpp
-       LayerGradUtil.cpp
-       TestUtil.cpp)
-
-   add_test(NAME test_FCGrad
-       COMMAND test_FCGrad)
-
-
-Implement Python Wrapper
-========================
-
-Implementing a Python wrapper allows us to use the new layer in configuration files. All the Python wrappers are in the file :code:`python/paddle/legacy/trainer/config_parser.py`. The Python wrapper for the fully connected layer is listed below. It has the following steps:
-
-- Decorate the Python wrapper class with :code:`@config_layer('fc')`. :code:`fc` is the identifier of the layer.
-- Implement the :code:`__init__` constructor.
-  - It first calls the base constructor, :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)`. :code:`FCLayer` is the name of the Python wrapper class, and :code:`fc` is the layer identifier. These names must be correct in order for the wrapper to work.
-  - Then it computes the size and format (dense or sparse) of each transformation matrix.
-
-.. code-block:: python
-
-   @config_layer('fc')
-   class FCLayer(LayerBase):
-       def __init__(
-               self,
-               name,
-               size,
-               inputs,
-               bias=True,
-               **xargs):
-           super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
-           for input_index in xrange(len(self.inputs)):
-               input_layer = self.get_input_layer(input_index)
-               psize = self.config.size * input_layer.size
-               dims = [input_layer.size, self.config.size]
-               format = self.inputs[input_index].format
-               sparse = format == "csr" or format == "csc"
-               if sparse:
-                   psize = self.inputs[input_index].nnz
-               self.create_input_parameter(input_index, psize, dims, sparse, format)
-           self.create_bias_parameter(bias, self.config.size)
-
-In a network configuration, the layer can be specified using the following code snippet. The arguments of this class are:
-
-- :code:`name` is the name identifier of the layer instance.
-- :code:`type` is the type of the layer, specified using the layer identifier.
-- :code:`size` is the output size of the layer.
-- :code:`bias` specifies whether this layer instance has a bias.
-- :code:`inputs` specifies a list of layer instance names as inputs.
-
-.. code-block:: python
-
-   Layer(
-       name = "fc1",
-       type = "fc",
-       size = 64,
-       bias = True,
-       inputs = [Input("pool3")]
-   )
-
-You are also recommended to implement a helper for the Python wrapper, which makes it easier to write models; a sketch of such a helper follows. You can refer to :code:`python/paddle/trainer_config_helpers/layers.py` for real examples.
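-
-Below is a minimal sketch of what such a helper might look like. The function name, signature, and defaults here are illustrative only and are not the actual :code:`layers.py` API; the helper simply wraps the raw :code:`Layer()` call shown above:
-
-.. code-block:: python
-
-   # Hypothetical helper; see trainer_config_helpers/layers.py for the real fc_layer.
-   def simple_fc_layer(name, input_name, size, bias=True):
-       """Wrap the raw Layer() call so model configs stay concise."""
-       return Layer(
-           name=name,
-           type="fc",
-           size=size,
-           bias=bias,
-           inputs=[Input(input_name)])
-
diff --git a/doc/v2/dev/src/FullyConnected.jpg b/doc/v2/dev/src/FullyConnected.jpg deleted file mode 100644 index b2241f401434e527f95ee4e0e541a3f2ff78fd1e..0000000000000000000000000000000000000000
Binary files a/doc/v2/dev/src/FullyConnected.jpg and /dev/null differ
diff --git a/doc/v2/dev/src/doc_en.png b/doc/v2/dev/src/doc_en.png deleted file mode 100644 index ed6b9178fba91a3bdf45ae797a9924f84146fbc8..0000000000000000000000000000000000000000
Binary files a/doc/v2/dev/src/doc_en.png and /dev/null differ
diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst deleted file mode 100644 index 4231f2bb5cd800c0cd86835b5d07e491fcde4989..0000000000000000000000000000000000000000 --- a/doc/v2/dev/write_docs_cn.rst +++ /dev/null @@ -1,136 +0,0 @@
-#############
-如何贡献文档
-#############
-
-PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的,PaddlePaddle.org工具可以帮助我们实现这一编译过程,并提供更好的预览效果。
-
-如何构建文档
-============
-
-PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
-
-我们建议使用PaddlePaddle.org工具来构建文档。
-
-使用PaddlePaddle.org工具
-------------------------
-这个是目前推荐的使用方法。除了可以自动编译文档,还可以直接在网页中预览文档,需要注意的是,采用后续说明的其它方式虽然也可以预览文档,但是文档的样式与官网文档是不一致的,使用PaddlePaddle.org工具进行编译才能产生与官网文档样式一致的预览效果。
-
-PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后即可用以下命令启动工具
-
-.. code-block:: bash
-
-   mkdir paddlepaddle # Create paddlepaddle working directory
-   cd paddlepaddle
-
-   # Clone the content repositories
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   git clone https://github.com/PaddlePaddle/book.git
-   git clone https://github.com/PaddlePaddle/models.git
-   git clone https://github.com/PaddlePaddle/Mobile.git
-
-   # Please specify the working directory through -v
-   docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
-
-注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
-之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档
-编译后的文件将被存储在工作目录 /.ppo_workspace/content。
-
-如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
-
-..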
code-block:: bash - - mkdir paddlepaddle # Create paddlepaddle working directory - cd paddlepaddle - - # Clone the content repositories and PaddlePaddle.org - git clone https://github.com/PaddlePaddle/Paddle.git - git clone https://github.com/PaddlePaddle/book.git - git clone https://github.com/PaddlePaddle/models.git - git clone https://github.com/PaddlePaddle/Mobile.git - git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git - - # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd - export CONTENT_DIR= - export ENV='' - cd PaddlePaddle.org/portal/ - pip install -r requirements.txt - python manage.py runserver - -工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 -之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。 -编译后的文件将被存储在工作目录 /.ppo_workspace/content。 - -想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 - -不使用PaddlePaddle.org工具 --------------------------- - -使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。该方法与 `从源码编译PaddlePaddle `_ 相似,通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行,在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档,具体步骤如下: - -.. code-block:: bash - - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - - # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像 - docker build -t paddle:dev . - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash - - # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档 - bash -x /paddle/paddle/scripts/docker/build.sh - -注:上述命令把当前目录(源码根目录)映射为 container 里的 :code:`/paddle` 目录。 - -编译完成后,会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录,分别进入这些目录下,执行以下命令: - -.. code-block:: bash - - python -m SimpleHTTPServer 8088 - -在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。 - -如果不想使用Docker,也可以使用以下命令直接构建PaddlePaddle文档,即 - -.. code-block:: bash - - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - mkdir -p build - cd build - cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON - - # 如果只需要构建使用文档,则执行以下命令 - make -j $processors paddle_docs - - # 如果只需要构建API,则执行以下命令 - make -j $processors paddle_apis - -其中$processors代表启动和CPU核一样多的进程来并行编译,可以根据本机的CPU核数设置相应的值。 - -编译完成后,同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录,如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录,选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录,分别进入这些子目录下,执行以下命令: - -.. code-block:: bash - - python -m SimpleHTTPServer 8088 - -在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 - -.. image:: src/doc_en.png - :align: center - :scale: 60 % - -如何书写文档 -============ - -PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 - -如何更新www.paddlepaddle.org -============================ - -更新的文档以PR的形式提交到github中,提交方式参见 `如何贡献文档 `_ 。 -目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 -`英文文档 `_ 。 - - -.. _cmake: https://cmake.org/ -.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst deleted file mode 100644 index 6105455e202e4704aa25f0fd9916b9b61a569702..0000000000000000000000000000000000000000 --- a/doc/v2/dev/write_docs_en.rst +++ /dev/null @@ -1,139 +0,0 @@ -######################## -Contribute Documentation -######################## - -PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. 
The PaddlePaddle.org tool helps us implement this compilation process and provides better preview results.
-
-How to build Documentation
-===========================
-
-PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages: the former facilitates previewing, while the latter facilitates debugging by the developer. In each of the two ways, we can choose to build the documentation with Docker or without it.
-
-We recommend using the PaddlePaddle.org tool to build the documentation.
-
-Using PaddlePaddle.org tool
------------------------------
-This is the recommended method, because it can automatically compile the documentation and preview it directly in a web page. Note that, although you can preview the documentation in other ways, their style may not be consistent with the official website. Only compiling with the PaddlePaddle.org tool produces a preview that is consistent with the official website documentation style.
-
-The PaddlePaddle.org tool can be used with Docker, so Docker needs to be installed first. Please refer to `Docker's official website `_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool:
-
-.. code-block:: bash
-
-   mkdir paddlepaddle # Create paddlepaddle working directory
-   cd paddlepaddle
-
-   # Clone the content repositories. You may only clone the contents you need
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   git clone https://github.com/PaddlePaddle/book.git
-   git clone https://github.com/PaddlePaddle/models.git
-   git clone https://github.com/PaddlePaddle/Mobile.git
-
-   # Please specify the working directory through -v
-   docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
-
-Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command.
-Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
-The compiled documentation will be stored in /.ppo_workspace/content
-
-
-If you don't wish to use Docker, you can also activate the tool through Django. Use the following commands to set it up:
-
-.. code-block:: bash
-
-   mkdir paddlepaddle # Create paddlepaddle working directory
-   cd paddlepaddle
-
-   # Clone the content repositories and PaddlePaddle.org
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   git clone https://github.com/PaddlePaddle/book.git
-   git clone https://github.com/PaddlePaddle/models.git
-   git clone https://github.com/PaddlePaddle/Mobile.git
-   git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
-
-   # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
-   export CONTENT_DIR=
-   export ENV=''
-   cd PaddlePaddle.org/portal/
-   pip install -r requirements.txt
-   python manage.py runserver
-
-Specify the PaddlePaddle working directory in the environment variable CONTENT_DIR so that the tool can find where the working directory is.
-
-Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
-The compiled documentation will be stored in /.ppo_workspace/content
-
-Please `click here `_ for more information about the PaddlePaddle.org tool.
-
-
-Manually Building the Documentation
--------------------------------------
-
-To build PaddlePaddle's documentation with Docker, you need to install Docker first.
Please refer to `Docker's official website `_ on how to install Docker. This method is quite similar to `Build From Sources `_: it constructs, from source code, a Docker image that can be used to build the PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the documentation. The specific steps are as follows:
-
-.. code-block:: bash
-
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-
-   # Construct a docker image from source code
-   docker build -t paddle:dev .
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
-
-   # Use build.sh to build PaddlePaddle documentation
-   bash -x /paddle/paddle/scripts/docker/build.sh
-
-Note: The above commands map the current directory (source root directory) to the :code:`/paddle` directory in the container.
-
-After compiling, there should be two generated directories, ``doc/v2`` and ``doc/fluid``, in which three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following command:
-
-.. code-block:: bash
-
-   python -m SimpleHTTPServer 8088
-
-Use a web browser and navigate to http://localhost:8088; you can see the compiled Chinese/English documentation pages and the English API pages for both ``v2`` and ``fluid``.
-
-If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation:
-
-.. code-block:: bash
-
-
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   mkdir -p build
-   cd build
-   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
-
-   # If you only need to build documents, use the following commands
-   make -j $processors paddle_docs
-
-   # If you only need to build APIs, use the following commands
-   make -j $processors paddle_apis
-
-$processors indicates that as many processes as there are CPU cores are started to compile in parallel; set it according to the number of CPU cores of your machine.
-
-After compiling, there should also be two generated directories, ``doc/v2`` and ``doc/fluid``. If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both directories. If you chose to build APIs, a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following command:
-
-.. code-block:: bash
-
-   python -m SimpleHTTPServer 8088
-
-Use a web browser and navigate to http://localhost:8088; you can see the compiled Chinese/English documentation pages and the English API pages for both ``v2`` and ``fluid``. The following figure is an example of the home page of the built ``v2`` English documentation. Note that, because the example uses sphinx's original theme, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
-
-.. image:: src/doc_en.png
-   :align: center
-   :scale: 60 %
-
-How to write Documentation
-===========================
-
-PaddlePaddle uses `sphinx`_ to compile the documentation; please check the sphinx official website for more details.
-
-How to update www.paddlepaddle.org
-===================================
-
-Please create PRs and submit them to GitHub; see `Contribute Code `_.
-The documentation of the PaddlePaddle develop branch is updated automatically once a PR is merged. Users may check the latest `Chinese Docs `_ and
-`English Docs `_.
-
-..
_cmake: https://cmake.org/ -.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst deleted file mode 100644 index 0d644777287aea0a572adb6fa40f498f9c147af7..0000000000000000000000000000000000000000 --- a/doc/v2/faq/build_and_install/index_cn.rst +++ /dev/null @@ -1,224 +0,0 @@ -.. _install_faq: - -################### -编译安装与单元测试 -################### - -.. contents:: - -1. 运行Docker GPU镜像出现 "CUDA driver version is insufficient" ----------------------------------------------------------------- - -用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。 -具体的解决方法是: - -.. code-block:: bash - - $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu - -更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 - - -2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致 ----------------------------------------------------------------- - -这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是, -用户强制指定特定的Python版本,具体操作如下: - - .. code-block:: bash - - cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= - -用户需要指定本机上Python的路径:````, ````, ```` - -3. CMake源码编译,Paddle版本号为0.0.0 --------------------------------------- - -如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现 - -.. code-block:: bash - - CMake Warning at cmake/version.cmake:20 (message): - Cannot add paddle version from git tag - -那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。 - -4. paddlepaddle\*.whl is not a supported wheel on this platform. ------------------------------------------------------------------------- - -出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统,并安装了python 2.7和pip 9.0.1。 - -更新 :code:`pip` 包的方法是\: - -.. code-block:: bash - - pip install --upgrade pip - -如果还不行,可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀, -并对比是否和正在安装的后缀一致。 - -如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ,需要升级pip版本到最新; -如果系统支持 :code:`manylinux1_x86_64` 而安装包(本地)是 :code:`linux_x86_64` ,可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。 - -5. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 ------------------------------------------------------------------------------------------- -先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载: - -pip uninstall py_paddle paddle - -然后安装paddle的python环境, 在build目录下执行 - -pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl - -6. 遇到“非法指令”或者是“illegal instruction” --------------------------------------------- - -PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。 - -7. python相关的单元测试都过不了 --------------------------------- - -如果出现以下python相关的单元测试都过不了的情况: - -.. 
code-block:: bash - - 24 - test_PyDataProvider (Failed) - 26 - test_RecurrentGradientMachine (Failed) - 27 - test_NetworkCompare (Failed) - 28 - test_PyDataProvider2 (Failed) - 32 - test_Prediction (Failed) - 33 - test_Compare (Failed) - 34 - test_Trainer (Failed) - 35 - test_TrainerOnePass (Failed) - 36 - test_CompareTwoNets (Failed) - 37 - test_CompareTwoOpts (Failed) - 38 - test_CompareSparse (Failed) - 39 - test_recurrent_machine_generation (Failed) - 40 - test_PyDataProviderWrapper (Failed) - 41 - test_config_parser (Failed) - 42 - test_swig_api (Failed) - 43 - layers_test (Failed) - -并且查询PaddlePaddle单元测试的日志,提示: - -.. code-block:: bash - - paddle package is already in your PYTHONPATH. But unittest need a clean environment. - Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. - -解决办法是: - -* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 - -8. 下载MKLML库失败 ------------------- - -.. code-block:: bash - - make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4 - make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2 - make[1]: *** 正在等待未完成的任务.... - -原因:网速或SSL链接原因,导致MKLML库下载不成功。 - -解决办法是:手动下载并安装,具体步骤如下。 - -.. code-block:: bash - - // 1. 进入对应的目录 - cd build/third_party/mklml/src/extern_mklml - - // 2. 查看包的大小, 正常情况下是75M,如果小于75M,即下载失败: - du -sh mklml_lnx_2018.0.1.20171007.tgz - - // 3. 手动下载且解压缩,并手动生成download成功标签: - wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz - tar zxf mklml_lnx_2018.0.1.20171007.tgz - touch ../extern_mklml-stamp/extern_mklml-download - - // 4. 接着编译即可 - -9. 在Mac上无法安装numpy等Python包,权限错误 ------------------- - -Mac上对自带的Python和包有严格的权限保护,最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。 - -virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝,并在这多个拷贝之间自由切换,这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。 - -下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境: - -安装virtualenv: -:::::::::::::::: - -virtualenv本身也是Python的一个包,可以用pip进行安装: - -.. code-block:: bash - - sudo -H pip install virtualenv - -由于virtualenv需要安装给系统自带的Python,因此需要使用sudo权限。 - -创建一个新的Python运行环境: -::::::::::::::::::: - -.. code-block:: bash - - virtualenv --no-site-packages paddle - ---no-site-packages 参数表示不拷贝已有的任何第三方包,创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。 - -执行完这一步后,当前目录下应该会出现一个名为paddle(或者你取的其他名字)的目录。这个目录里保存了运行一个Python环境所需要的各种文件。 - -启动运行环境: -:::::::::::::::: - -.. code-block:: bash - - source paddle/bin/activate - -执行后会发现命令提示符前面增加了(paddle)字样,说明已经成功启动了名为‘paddle’的Python环境。执行which python,可以发现使用的已经是刚刚创建的paddle目录下的Python。 - -在这个环境中,我们可以自由地进行Paddle的安装、使用和开发工作,无需担心对系统自带Python的影响。 - -退出运行环境: -::::::::::::::: - -直接执行: - -.. code-block:: bash - - deactivate - -可以看到命令提示符前面的(paddle)字样消失。 - -自动启动某一Python环境: -:::::::::::::::: - -如果我们经常使用Paddle,我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境,比较繁琐。为了简便,可以修改终端的配置文件,来让终端每次启动后自动启动特定的Python环境。 - -执行: - -.. code-block:: bash - - vi ~/.bash_profile - -打开终端配置文件,并在文件的最后添加一行: - -.. code-block:: bash - - source paddle/bin/activate - -保存并关闭文件。 - -这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。 - -10. 
通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so`
-------------------------------------------------------------------------------------------
-出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`,
-但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`
-拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下,
-即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。
-
-**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。
\ No newline at end of file
diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst deleted file mode 100644 index 7488ed8137d57785f36b9f1e1ed1269f864960bc..0000000000000000000000000000000000000000 --- a/doc/v2/faq/build_and_install/index_en.rst +++ /dev/null @@ -1,143 +0,0 @@
-.. _install_faq:
-
-###############################
-Compile, Install, and Unit Test
-###############################
-
-.. contents::
-
-1. Insufficient CUDA driver version
-----------------------------------------------------------------
-
-Users often face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that the CUDA driver and libraries on the host are not mapped into the container.
-You can solve the issue by running the following commands:
-
-.. code-block:: bash
-
-   $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-   $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-   $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
-
-For more information about Docker's installation and usage, please refer to the `PaddlePaddle Docker documentation `_.
-
-
-2. Version mismatch between PythonLibs and PythonInterpreter
-----------------------------------------------------------------
-
-This is a known flaw in how CMake looks up Python: if multiple versions of Python are installed, CMake may find mismatched versions of PythonLibs and PythonInterpreter, which makes the PaddlePaddle build fail. The fix is to explicitly specify a Python version, as follows:
-
-   .. code-block:: bash
-
-      cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR=
-
-You should set ````, ````, and ```` to the corresponding local paths.
-
-3. PaddlePaddle version is 0.0.0
-------------------------------------------------
-This issue may happen when you run `paddle version` or `cmake ..`:
-
-.. code-block:: bash
-
-   CMake Warning at cmake/version.cmake:20 (message):
-     Cannot add paddle version from git tag
-
-You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then rerun :code:`cmake`.
-
-4. paddlepaddle\*.whl is not a supported wheel on this platform.
-------------------------------------------------------------------------
-
-The primary cause of this issue is that pip cannot find a PaddlePaddle installation package that matches your current system. The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12, with Python 2.7 and pip 9.0.1.
-
-You can upgrade pip with the following command:
-
-.. code-block:: bash
-
-   pip install --upgrade pip
-
-If that does not fix it, you can run :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the package suffixes your system supports, and compare them with the suffix of the package you are installing.
-
-If the system supports :code:`linux_x86_64` and the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest version.
-
-If the system supports :code:`manylinux1_x86_64` while the local installation package is :code:`linux_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again, as sketched below.
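-
-For example (the wheel file name below is hypothetical; substitute the file you actually have):
-
-.. code-block:: bash
-
-   # Rename the platform tag from linux_x86_64 to manylinux1_x86_64,
-   # then install the renamed wheel.
-   mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl \
-      paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
-   pip install paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
-
-
-5. ImportError: No module named v2
-----------------------------------
-Please uninstall Paddle V1 if you have installed it before:
-
-.. code-block:: bash
-
-   pip uninstall py_paddle paddle
-
-Then install the Python packages of PaddlePaddle: enter the build directory and run the following command:
-
-.. code-block:: bash
-
-   pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
-
-6. Illegal instruction
------------------------
-PaddlePaddle uses AVX SIMD instructions to improve CPU performance, so using the wrong binary release may cause this error. Please choose the correct version.
-
-7. Python unittest fails
---------------------------------
-
-If the following Python unittest testcases fail:
-
-.. code-block:: bash
-
-   24 - test_PyDataProvider (Failed)
-   26 - test_RecurrentGradientMachine (Failed)
-   27 - test_NetworkCompare (Failed)
-   28 - test_PyDataProvider2 (Failed)
-   32 - test_Prediction (Failed)
-   33 - test_Compare (Failed)
-   34 - test_Trainer (Failed)
-   35 - test_TrainerOnePass (Failed)
-   36 - test_CompareTwoNets (Failed)
-   37 - test_CompareTwoOpts (Failed)
-   38 - test_CompareSparse (Failed)
-   39 - test_recurrent_machine_generation (Failed)
-   40 - test_PyDataProviderWrapper (Failed)
-   41 - test_config_parser (Failed)
-   42 - test_swig_api (Failed)
-   43 - layers_test (Failed)
-
-please check the PaddlePaddle unittest logs, which may suggest the following:
-
-.. code-block:: bash
-
-   paddle package is already in your PYTHONPATH. But unittest need a clean environment.
-   Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
-
-The solution is:
-
-* Remove the old PaddlePaddle package (:code:`pip uninstall paddle`) to give the unit tests a clean environment. If the PaddlePaddle package is already in Python's site-packages, the unit tests will reference the package in site-packages instead of the one in the :code:`/python` directory of the source tree. Setting :code:`PYTHONPATH` to :code:`/python` does not help either, because Python gives priority to installed packages.
-
-
-8. Failed to download the MKLML library
-----------------------------------------------
-
-.. code-block:: bash
-
-   make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
-   make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
-   make[1]: *** waiting for the unfinished jobs....
-
-Cause: a slow network or SSL issues make the MKLML library download fail.
-
-The solution is to download and install it manually; the specific steps are as follows:
-
-.. code-block:: bash
-
-   # 1. enter the directory
-   cd build/third_party/mklml/src/extern_mklml
-
-   # 2. check the size of the package; it is normally about 75M. If it is smaller, the download failed:
-   du -sh mklml_lnx_2018.0.1.20171007.tgz
-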
-   # 3. manually download and unzip, then create the download-success stamp:
-   wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz
-   tar zxf mklml_lnx_2018.0.1.20171007.tgz
-   touch ../extern_mklml-stamp/extern_mklml-download
-
-   # 4. then compile
-
diff --git a/doc/v2/faq/cluster/index_cn.rst b/doc/v2/faq/cluster/index_cn.rst deleted file mode 100644 index e59c1e1a54a0c876d1e6e89f88030de59fb9fc1a..0000000000000000000000000000000000000000 --- a/doc/v2/faq/cluster/index_cn.rst +++ /dev/null @@ -1,17 +0,0 @@
-###############
-集群训练与预测
-###############
-
-.. contents::
-
-1. 集群多节点训练,日志中保存均为网络通信类错误
-------------------------------------------------
-
-集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。
-此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查:
-
-* 从 :code:`train.log` , :code:`server.log` 找到最早报错的地方,查看是否是其他错误引发的报错(比如FPE,内存不足,磁盘空间不足等)。
-
-* 如果发现最早的报错就是网络通信的问题,很有可能是非独占方式执行导致的端口冲突,可以联系OP,看当前MPI集群是否支持resource=full参数提交,如果支持增加此参数提交,并更换job 端口。
-
-* 如果当前MPI集群并不支持任务独占模式,可以联系OP是否可以更换集群或升级当前集群。
diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst deleted file mode 100644 index fa942a09625bef78b28456beeb735272b686e061..0000000000000000000000000000000000000000 --- a/doc/v2/faq/cluster/index_en.rst +++ /dev/null @@ -1,16 +0,0 @@
-###############################
-Cluster Training and Prediction
-###############################
-
-.. contents::
-
-1. Network connection errors in the log during multi-node cluster training
----------------------------------------------------------------------------
-During multi-node cluster training, the log may contain errors related to network connection problems, for example, :code:`Connection reset by peer`.
-This kind of error is usually caused by the abnormal exit of a training process on some node, after which the other nodes can no longer connect to it. Steps to troubleshoot the problem are as follows:
-
-* Find the first error in :code:`train.log` and :code:`server.log`, and check whether some other fault caused the problem, such as an FPE, running out of memory, or running out of disk space.
-
-* If the first error in server.log says "Address already used", it may be caused by a port conflict from non-exclusive execution. Contact the sys-admin to check whether the current MPI cluster supports jobs submitted with the parameter :code:`resource=full`. If the cluster does not support this parameter, change the server port and try again.
-
-* If the current MPI cluster does not support exclusive mode, which allows a job to occupy a whole node, ask the administrator to replace or upgrade the cluster.
diff --git a/doc/v2/faq/index_cn.rst b/doc/v2/faq/index_cn.rst deleted file mode 100644 index 4537c7a481e2efbcfed5fa7be2c81c36e13cd108..0000000000000000000000000000000000000000 --- a/doc/v2/faq/index_cn.rst +++ /dev/null @@ -1,13 +0,0 @@
-FAQ
-====
-
-本文档对关于PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处,请您到 `PaddlePaddle社区 `_ 查找答案或直接提 `issue `_ ,我们会及时进行回复。
-
-.. toctree::
-   :maxdepth: 1
-
-   build_and_install/index_cn.rst
-   model/index_cn.rst
-   parameter/index_cn.rst
-   local/index_cn.rst
-   cluster/index_cn.rst
diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst deleted file mode 100644 index 3fa220792b252617848a1c76bc2be49928e35f64..0000000000000000000000000000000000000000 --- a/doc/v2/faq/index_en.rst +++ /dev/null @@ -1,13 +0,0 @@
-FAQ
-====
-
-This document provides answers to some of the frequently asked questions about PaddlePaddle.
If you have a question that is not covered here, please go to `PaddlePaddle Community `_ , to find an answer or submit new `issue `_ , we will reply in time. - -.. toctree:: - :maxdepth: 1 - - build_and_install/index_en.rst - model/index_en.rst - parameter/index_en.rst - local/index_en.rst - cluster/index_en.rst diff --git a/doc/v2/faq/local/index_cn.rst b/doc/v2/faq/local/index_cn.rst deleted file mode 100644 index c6d3c5bfac5a276e253c248ffd415c7789b20b29..0000000000000000000000000000000000000000 --- a/doc/v2/faq/local/index_cn.rst +++ /dev/null @@ -1,259 +0,0 @@ -############### -本地训练与预测 -############### - -.. contents:: - -1. 如何减少内存占用 -------------------- - -神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。 -PaddlePaddle的内存占用主要分为如下几个方面\: - -* DataProvider缓冲池内存(只针对内存) -* 神经元激活内存(针对内存和显存) -* 参数内存 (针对内存和显存) -* 其他内存杂项 - -其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。 - -减少DataProvider缓冲池内存 -++++++++++++++++++++++++++ - -PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 - -.. graphviz:: - - digraph { - rankdir=LR; - 数据文件 -> 内存池 -> PaddlePaddle训练 - } - -所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 -个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, -那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 - -.. literalinclude:: src/reduce_min_pool_size.py - -这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。 - -神经元激活内存 -++++++++++++++ - -神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。 -在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, -一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 -的时间步信息成正比。 - -所以做法可以有两种: - -* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 -* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, - 但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。 - -参数内存 -++++++++ - -PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 -例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录 -文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 - -可以考虑使用一些优化算法,例如 :code:`momentum`。 - -2. 如何加速训练速度 -------------------- - -加速PaddlePaddle训练可以考虑从以下几个方面\: - -* 减少数据载入的耗时 -* 加速训练速度 -* 利用分布式训练驾驭更多的计算资源 - -减少数据载入的耗时 -++++++++++++++++++ - -使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 -:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。 - -.. literalinclude:: src/reduce_min_pool_size.py - -同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 - - -加速训练速度 -++++++++++++ - -PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True` - -这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\: - -使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: - -.. literalinclude:: src/word2vec_dataprovider.py - -这个任务的配置为\: - -.. literalinclude:: src/word2vec_config.py - - -利用更多的计算资源 -++++++++++++++++++ - -利用更多的计算资源可以分为以下几个方式来进行\: - -* 单机CPU训练 - - * 使用多线程训练。设置命令行参数 :code:`trainer_count`。 - -* 单机GPU训练 - - * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 - * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。 - -* 多机训练 - - * 请参考 :ref:`cluster_train` 。 - -3. 如何指定GPU设备 ------------------- - -例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU: - -* 方式1:通过 `CUDA_VISIBLE_DEVICES `_ 环境变量来指定特定的GPU。 - -.. code-block:: bash - - env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2 - -* 方式2:通过命令行参数 ``--gpu_id`` 指定。 - -.. 
code-block:: bash - - paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 - - -4. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办? ------------------------------------------------------------------------- - -Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。 -主要原因包括两个方面: - -* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。 -* 模型一直不收敛,发散到了一个数值特别大的地方。 -* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 - -这里有两种有效的解决方法: - -1. 设置 :code:`gradient_clipping_threshold` 参数,示例代码如下: - -.. code-block:: python - - optimizer = paddle.optimizer.RMSProp( - learning_rate=1e-3, - gradient_clipping_threshold=10.0, - regularization=paddle.optimizer.L2Regularization(rate=8e-4)) - -具体可以参考 `nmt_without_attention `_ 示例。 - -2. 设置 :code:`error_clipping_threshold` 参数,示例代码如下: - -.. code-block:: python - - decoder_inputs = paddle.layer.fc( - act=paddle.activation.Linear(), - size=decoder_size * 3, - bias_attr=False, - input=[context, current_word], - layer_attr=paddle.attr.ExtraLayerAttribute( - error_clipping_threshold=100.0)) - -完整代码可以参考示例 `machine translation `_ 。 - -两种方法的区别: - -1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用; -2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度; - -除此之外,还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。 - -5. 如何调用 infer 接口输出多个layer的预测结果 ------------------------------------------------ - -* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入,代码如下: - -.. code-block:: python - - inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters) - -* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例,代码如下: - -.. code-block:: python - - out = inferer.infer(input=data_batch, field=["value"]) - -需要注意的是: - -* 如果指定了2个layer作为输出层,实际上需要的输出结果是两个矩阵; -* 假设第一个layer的输出A是一个 N1 * M1 的矩阵,第二个 Layer 的输出B是一个 N2 * M2 的矩阵; -* paddle.v2 默认会将A和B 横向拼接,当N1 和 N2 大小不一样时,会报如下的错误: - -.. code-block:: python - - ValueError: all the input array dimensions except for the concatenation axis must match exactly - -多个层的输出矩阵的高度不一致导致拼接失败,这种情况常常发生在: - -* 同时输出序列层和非序列层; -* 多个输出层处理多个不同长度的序列; - -此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤,来解决上面的问题。这时,infer接口的返回值是一个python list: - -* list 中元素的个数等于网络中输出层的个数; -* list 中每个元素是一个layer的输出结果矩阵,类型是numpy的ndarray; -* 每一个layer输出矩阵的高度,在非序列输入时:等于样本数;序列输入时等于:输入序列中元素的总数;宽度等于配置中layer的size; - -6. 如何在训练过程中获得某一个layer的output ------------------------------------------------ - -可以在event_handler中,通过 :code:`event.gm.getLayerOutputs("layer_name")` 获得在模型配置中某一层的name :code:`layer_name` 在当前 -mini-batch forward的output的值。获得的值类型均为 :code:`numpy.ndarray` ,可以通过这个输出来完成自定义的评估指标计算等功能。例如下面代码: - -.. code-block:: python - - def score_diff(right_score, left_score): - return np.average(np.abs(right_score - left_score)) - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 25 == 0: - diff = score_diff( - event.gm.getLayerOutputs("right_score")["right_score"][ - "value"], - event.gm.getLayerOutputs("left_score")["left_score"][ - "value"]) - logger.info(("Pass %d Batch %d : Cost %.6f, " - "average absolute diff scores: %.6f") % - (event.pass_id, event.batch_id, event.cost, diff)) - -注意:此方法不能获取 :code:`paddle.layer.recurrent_group` 里step的内容,但可以获取 :code:`paddle.layer.recurrent_group` 的输出。 - -7. 如何在训练过程中获得参数的权重和梯度 ------------------------------------------------ - -在某些情况下,获得当前mini-batch的权重(或称作weights, parameters)有助于在训练时观察具体数值,方便排查以及快速定位问题。 -可以通过在 :code:`event_handler` 中打印其值(注意,需要使用 :code:`paddle.event.EndForwardBackward` 保证使用GPU训练时也可以获得), -示例代码如下: - -.. code-block:: python - - ... 
-    parameters = paddle.parameters.create(cost)
-    ...
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndForwardBackward):
-            if event.batch_id % 25 == 0:
-                for p in parameters.keys():
-                    logger.info("Param %s, Grad %s",
-                                parameters.get(p), parameters.get_grad(p))
-
-注意:“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++拷贝到numpy,会对训练性能造成影响。不要在注重性能的训练场景下使用。
\ No newline at end of file
diff --git a/doc/v2/faq/local/index_en.rst b/doc/v2/faq/local/index_en.rst deleted file mode 100644 index fa95b1753dbe293811d7a8601497ad521fa3ecda..0000000000000000000000000000000000000000 --- a/doc/v2/faq/local/index_en.rst +++ /dev/null @@ -1,248 +0,0 @@
-#############################
-Local Training and Prediction
-#############################
-
-.. contents::
-
-1. Reduce Memory Consumption
-----------------------------
-
-Training a neural network demands dozens of gigabytes of host memory or several gigabytes of device memory; it is rather memory-consuming work. The memory consumed by the PaddlePaddle framework mainly includes:
-
-* cache memory for the DataProvider (host memory only),
-* memory for neuron activations (both host and device memory),
-* memory for parameters (both host and device memory),
-* other memory demands.
-
-Other memory demands cover memory used by the PaddlePaddle framework itself, such as string allocation and temporary variables; they are not considered here.
-
-Reduce DataProvider Cache Memory
-++++++++++++++++++++++++++++++++
-
-PyDataProvider loads data asynchronously and shuffles it directly in a host-memory pool:
-
-.. graphviz::
-
-   digraph {
-     rankdir=LR;
-     "Data Files" -> "Host Memory Pool" -> "PaddlePaddle Training"
-   }
-
-Thus, shrinking the DataProvider cache pool reduces memory occupancy and also speeds up data loading before training starts. However, the size of this memory pool determines the granularity of the shuffle, so if you reduce the pool size and still want random data, it is best to shuffle each data file before reading it:
-
-.. literalinclude:: src/reduce_min_pool_size.py
-
-In this way, memory consumption can be significantly reduced and the training procedure may even be accelerated. More details are given in :ref:`api_pydataprovider2`.
-
-Neuron Activation Memory
-++++++++++++++++++++++++
-
-During training, a neural network keeps some temporary data for each activation, such as neuron output values; this data is used to update parameters during back propagation. The memory consumed by this data is mainly related to two parameters: the batch size and the length of each sequence. It is therefore proportional to the number of time steps contained in each mini-batch.
-
-There are two practical remedies:
-
-* Reduce the batch size, i.e. set a smaller value such as :code:`settings(batch_size=1000)` in the network configuration. Note that the batch size is a hyper-parameter of the network itself, so reducing it may affect the training result.
-* Shorten the sequences, or simply drop excessively long ones. For example, if most sequences in a dataset are 100-200 elements long but one sequence is 10,000 elements long, that single sequence can easily cause OOM (out of memory), especially in RNN models such as LSTM.
-
-Parameter Memory
-++++++++++++++++
-
-PaddlePaddle supports almost all popular optimizers, and different optimizers have different memory requirements. For example, :code:`adadelta` consumes roughly 5 times as much memory as the weight parameters themselves: if the saved model takes :code:`100M` on disk, this optimizer needs at least :code:`500M` of memory.
-
-If memory is tight, consider a lighter optimizer such as :code:`momentum`.
-
-2. Tricks To Speed Up Training
-------------------------------
-
-PaddlePaddle training may be sped up by working on the following aspects:
-
-* reduce the time spent on data loading,
-* speed up the training itself,
-* harness more computing resources through distributed training.
-
-Reduce the Time Spent on Data Loading
-+++++++++++++++++++++++++++++++++++++
-
-When using :code:`pydataprovider`, reducing the size of the cache pool and enabling the in-memory cache can greatly speed up data loading. Shrinking the :code:`DataProvider` cache pool works on the same principle as the memory-reduction technique described above.
-
-.. literalinclude:: src/reduce_min_pool_size.py
-
-Besides, the :code:`@provider` interface has a :code:`cache` parameter that controls the caching strategy. If it is set to :code:`CacheType.CACHE_PASS_IN_MEM`, the data generated during the first :code:`pass` (one pass means all training data has been fed to the network once) is cached in memory; in the following passes, no data is read from the :code:`python` side anymore, it is read from the in-memory cache instead. This also greatly reduces data loading time.
-
-
-Accelerating Training Epochs
-++++++++++++++++++++++++++++
-
-PaddlePaddle supports sparse training. The features to be trained must be of type :code:`sparse_binary_vector`, :code:`sparse_vector`, or :code:`integer_value`. Meanwhile, the layers that consume this training data need their parameters set to sparse updating mode, i.e. :code:`sparse_update=True`.
-Take :code:`word2vec` as an example: to train a language model, one predicts the middle word from the two words before it and the two words after it. The DataProvider of this task is:
-
-.. literalinclude:: src/word2vec_dataprovider.py
-
-The configuration of this task is:
-
-.. literalinclude:: src/word2vec_config.py
-
-Introduce More Computing Resources
-++++++++++++++++++++++++++++++++++
-
-More computing resources can be introduced in the following ways:
-
-* Single-machine CPU training
-
-  * Use multi-threading by setting :code:`trainer_count`.
-
-* Single-machine GPU training
-
-  * Set :code:`use_gpu` to train on a single GPU.
-  * Set :code:`use_gpu` and :code:`trainer_count` to train on multiple GPUs.
-
-* Cluster training
-
-  * Refer to :ref:`cluster_train`.
-
-3. Assign GPU Devices
----------------------
-
-Assume a machine has 4 GPUs numbered from 0 to 3, and you want to use GPUs 2 and 3:
-
-* Method 1: specify the visible GPUs through the `CUDA_VISIBLE_DEVICES `_ environment variable.
-
-.. code-block:: bash
-
-   env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
-
-* Method 2: assign them through the command line argument :code:`--gpu_id`:
-
-.. code-block:: bash
-
-   paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
-
-
-4. How to Fix Training Termination Caused by :code:`Floating point exception`
-
-Accelerating Training Epochs
-++++++++++++++++++++++++++++
-
-PaddlePaddle supports sparse training. The feature to be trained must be one of :code:`sparse_binary_vector`, :code:`sparse_vector` or :code:`integer_value`, and the layer that interacts with the training data must switch its parameter to sparse updating mode by setting :code:`sparse_update=True`.
-Take :code:`word2vec` as an example: to train a language model, one predicts the middle word from the two words before it and the two words after it. The DataProvider of this task is:
-
-.. literalinclude:: src/word2vec_dataprovider.py
-
-The configuration of this task is:
-
-.. literalinclude:: src/word2vec_config.py
-
-Introduce More Computing Resources
-++++++++++++++++++++++++++++++++++
-
-More computing resources can be introduced in the following ways:
-
-* Single-machine CPU training
-
-  * Use multi-threading by setting :code:`trainer_count`.
-
-* Single-machine GPU training
-
-  * Set :code:`use_gpu` to train on a single GPU.
-  * Set :code:`use_gpu` and :code:`trainer_count` to enable multi-GPU training.
-
-* Cluster training
-
-  * Refer to :ref:`cluster_train`.
-
-3. Assign GPU Devices
----------------------
-
-Assume a computing platform with 4 GPUs numbered from 0 to 3:
-
-* Method 1: specify the usable GPUs via the
-  `CUDA_VISIBLE_DEVICES `_ environment variable:
-
-.. code-block:: bash
-
-    env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
-
-* Method 2: assign the starting device via :code:`--gpu_id`:
-
-.. code-block:: bash
-
-    paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
-
-
-4. How to Fix Training Termination Caused By :code:`Floating point exception`
-------------------------------------------------------------------------------
-
-The Paddle binary catches floating-point exceptions at runtime and terminates when a NaN or Inf occurs. Such exceptions are mostly caused by float overflow or division by zero. Three situations commonly raise them:
-
-* Parameters or gradients grow very large during training, leading to float overflow during calculation.
-* The model fails to converge and diverges to large values.
-* Parameters converge to pathological values because of bad training data; if the input scale is too large and there are millions of parameter values, float overflow may occur during matrix multiplication.
-
-Two ways to solve this problem:
-
-1. Set :code:`gradient_clipping_threshold`:
-
-.. code-block:: python
-
-    optimizer = paddle.optimizer.RMSProp(
-        learning_rate=1e-3,
-        gradient_clipping_threshold=10.0,
-        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-
-For details, refer to the `nmt_without_attention `_ example.
-
-2. Set :code:`error_clipping_threshold`:
-
-.. code-block:: python
-
-    decoder_inputs = paddle.layer.fc(
-        act=paddle.activation.Linear(),
-        size=decoder_size * 3,
-        bias_attr=False,
-        input=[context, current_word],
-        layer_attr=paddle.attr.ExtraLayerAttribute(
-            error_clipping_threshold=100.0))
-
-For details, refer to the `machine translation `_ example.
-
-The main differences between the two methods are:
-
-1. Both clip gradients, but at different moments: the former applies when the :code:`optimizer` updates the network parameters, while the latter applies during backpropagation through the activation functions.
-2. They clip different targets: the former clips the gradients of trainable parameters, while the latter clips the gradients propagated to prior layers.
-
-Moreover, such problems can often be fixed with a smaller learning rate or with data normalization.
-
-5. Fetch Multi Layers' Prediction Result With Infer Interface
--------------------------------------------------------------
-
-* Pass the layers to be used as :code:`output_layer` to the :code:`paddle.inference.Inference()` interface:
-
-.. code-block:: python
-
-    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
-
-* Specify which fields to output. Taking :code:`value` as an example, it can be done with the following code:
-
-.. code-block:: python
-
-    out = inferer.infer(input=data_batch, field=["value"])
-
-It is important to note that:
-
-* If 2 layers are assigned as output layers, the output consists of 2 matrices.
-* Assume the output of the first layer A is a matrix of size N1 * M1, and the output of the second layer B is a matrix of size N2 * M2.
-* By default, paddle.v2 concatenates A and B horizontally; when N1 is not equal to N2, the following error is raised:
-
-.. code-block:: python
-
-    ValueError: all the input array dimensions except for the concatenation axis must match exactly
-
-Horizontal concatenation of the outputs of multiple layers typically fails when:
-
-* one output layer is a sequence layer and another is not;
-* multiple output layers process multiple sequences with different lengths.
-
-This issue can be avoided by calling the infer interface with :code:`flatten_result=False`. The infer interface then returns a python list, in which
-
-* the number of elements equals the number of output layers in the network;
-* each element in the list is the result matrix of one layer, of type numpy.ndarray;
-* the height of each matrix equals the number of samples in non-sequential mode, or the number of elements in the input sequence in sequential mode; the width always equals the layer size in the configuration.
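-
-A minimal sketch of this call pattern (the layer names, :code:`parameters` and :code:`data_batch` are illustrative placeholders):
-
-.. code-block:: python
-
-    inferer = paddle.inference.Inference(output_layer=[layer1, layer2],
-                                         parameters=parameters)
-    # one numpy.ndarray per output layer; heights may differ across layers
-    out = inferer.infer(input=data_batch, field=["value"], flatten_result=False)
-    for i, mat in enumerate(out):
-        print "output layer %d: shape %s" % (i, mat.shape)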
-
-6. Fetch the Output of A Certain Layer During Training
-------------------------------------------------------
-
-In the event_handler, the interface :code:`event.gm.getLayerOutputs("layer_name")` returns the forward output of the layer named :code:`layer_name` on the current mini-batch, organized as a :code:`numpy.ndarray`. The output can be used for custom measurements in the following way:
-
-.. code-block:: python
-
-    def score_diff(right_score, left_score):
-        return np.average(np.abs(right_score - left_score))
-
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 25 == 0:
-                diff = score_diff(
-                    event.gm.getLayerOutputs("right_score")["right_score"][
-                        "value"],
-                    event.gm.getLayerOutputs("left_score")["left_score"][
-                        "value"])
-                logger.info(("Pass %d Batch %d : Cost %.6f, "
-                             "average absolute diff scores: %.6f") %
-                            (event.pass_id, event.batch_id, event.cost, diff))
-
-Note: this interface cannot fetch the values computed inside a :code:`paddle.layer.recurrent_group` step, but the output of the :code:`paddle.layer.recurrent_group` itself can be fetched.
-
-7. Fetch Parameters' Weight and Gradient During Training
---------------------------------------------------------
-
-In certain situations, inspecting the weights and gradients of the current mini-batch can give more insight into a problem. They can be acquired by printing their values in the :code:`event_handler` (note that to obtain them when training on GPU, you should handle the :code:`paddle.event.EndForwardBackward` event). Detailed code is as follows:
-
-.. code-block:: python
-
-    ...
-    parameters = paddle.parameters.create(cost)
-    ...
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndForwardBackward):
-            if event.batch_id % 25 == 0:
-                for p in parameters.keys():
-                    logger.info("Param %s, Grad %s",
-                                parameters.get(p), parameters.get_grad(p))
-
-Note that both "fetching the output of a certain layer during training" and "fetching the weights and gradients of parameters during training" copy data from the C++ side to numpy, which affects training performance. Avoid them in performance-sensitive training scenarios.
diff --git a/doc/v2/faq/local/src/reduce_min_pool_size.py b/doc/v2/faq/local/src/reduce_min_pool_size.py
deleted file mode 100644
index cba96652f764d26c724ea22697e04572709bf6a4..0000000000000000000000000000000000000000
--- a/doc/v2/faq/local/src/reduce_min_pool_size.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-@provider(min_pool_size=0, ...)
-def process(settings, filename):
-    # shuffle the file on disk before reading, since a small pool
-    # cannot shuffle across the whole dataset by itself
-    os.system('shuf %s > %s.shuf' % (filename, filename))
-    with open('%s.shuf' % filename, 'r') as f:
-        for line in f:
-            yield get_sample_from_line(line)
diff --git a/doc/v2/faq/local/src/word2vec_config.py b/doc/v2/faq/local/src/word2vec_config.py
deleted file mode 100644
index a5b84e8ed4de5123097026a5c7992b06fd321750..0000000000000000000000000000000000000000
--- a/doc/v2/faq/local/src/word2vec_config.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-...  # the settings and the data provider definition are omitted.
-DICT_DIM = 3000  # dictionary dimension.
-word_ids = data_layer('word_ids', size=DICT_DIM)
-
-emb = embedding_layer(
-    input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
-emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
-predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            'label', size=DICT_DIM)))
diff --git a/doc/v2/faq/local/src/word2vec_dataprovider.py b/doc/v2/faq/local/src/word2vec_dataprovider.py
deleted file mode 100644
index 9fe67b6d6cbbbdc8a98d497f352cf114a882636f..0000000000000000000000000000000000000000
--- a/doc/v2/faq/local/src/word2vec_dataprovider.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-DICT_DIM = 3000
-
-
-@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
-def process(settings, filename):
-    with open(filename) as f:
-        # yield the context word ids and the middle word id to predict,
-        # e.g. [28, 29, 10, 4], 4 means the sentence is 28, 29, 4, 10, 4.
-        yield read_next_from_file(f)
diff --git a/doc/v2/faq/model/index_cn.rst b/doc/v2/faq/model/index_cn.rst
deleted file mode 100644
index 6947948bc79f4dba63954c459afb940e3242c405..0000000000000000000000000000000000000000
--- a/doc/v2/faq/model/index_cn.rst
+++ /dev/null
@@ -1,80 +0,0 @@
-#########
-模型配置
-#########
-
-.. contents::
-
-1. 出现 :code:`Duplicated layer name` 错误怎么办
--------------------------------------------------- 
-
-出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时,先找出参数 :code:`name` 取值相同的layer,然后将这些layer的参数 :code:`name` 设置为不同的值。
-
-2. 
:code:`paddle.layer.memory` 的参数 :code:`name` 如何使用 -------------------------------------------------------------- - -* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出,该layer是通过参数 :code:`name` 指定,即,:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer,并将该layer上一时间步的输出作为自身当前时间步的输出。 - -* PaddlePaddle的所有layer都有唯一的name,用户通过参数 :code:`name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer,其name由参数 :code:`memory_name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer,需要用户显式设定。 - -3. 两种使用 drop_out 的方法有何区别 ------------------------------------- - -* 在PaddlePaddle中使用dropout有两种方式 - - * 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`,以 :code:`paddle.layer.fc` 为例,代码如下: - - .. code-block:: python - - fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5)) - - * 使用 :code:`paddle.layer.dropout`,以 :code:`paddle.layer.fc` 为例,代码如下: - - .. code-block:: python - - fc = paddle.layer.fc(input=input) - drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5) - -* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`,并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。 - -* PaddlePaddle在激活函数里实现dropout,而不是在layer里实现。 - -* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活,所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout,可采用第二种方式,即使用 :code:`paddle.layer.dropout`。 - -4. 不同的 recurrent layer 的区别 ----------------------------------- -以LSTM为例,在PaddlePaddle中包含以下 recurrent layer: - -* :code:`paddle.layer.lstmemory` -* :code:`paddle.networks.simple_lstm` -* :code:`paddle.networks.lstmemory_group` -* :code:`paddle.networks.bidirectional_lstm` - -按照具体实现方式可以归纳为2类: - -1. 由 recurrent_group 实现的 recurrent layer: - - * 用户在使用这一类recurrent layer时,可以访问由recurrent unit在一个时间步内计算得到的中间值(例如:hidden states, memory cells等); - * 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ; - -2. 将recurrent layer作为一个整体来实现: - - * 用户在使用这一类recurrent layer,只能访问它们的输出值; - * 上述的 :code:`paddle.networks.lstmemory_group` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现; - -将recurrent layer作为一个整体来实现, 能够针对CPU和GPU的计算做更多优化, 所以相比于recurrent group的实现方式, 第二类 recurrent layer 计算效率更高。 在实际应用中,如果用户不需要访问LSTM的中间变量,而只需要获得recurrent layer计算的输出,我们建议使用第二类实现。 - -此外,关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元: - - * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程,它并不是一个完整的recurrent layer,也不能接收序列数据作为输入; - * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用; - -5. PaddlePaddle的softmax能否指定计算的维度 ------------------------------------------ - -PaddlePaddle的softmax不能指定计算维度,只能按行计算。 -在图像任务中,对于NCHW,如果需要在C维度计算softmax,可以先使用 :code:`paddle.layer.switch_order` 改变维度顺序,即将NCHW转换成NHWC,再做一定的reshape,最后计算softmax。 - -6. PaddlePaddle是否支持维数可变的数据输入 ------------------------------------------- - -PaddlePaddle提供的 :code:`paddle.data_type.dense_array` 支持维数可变的数据输入。在使用时,将对应数据层的维数设置成一个大于输入数据维数的值用于占位即可。 diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst deleted file mode 100644 index 67a33e08e192e5627ac3b0abd76e979f21ed2079..0000000000000000000000000000000000000000 --- a/doc/v2/faq/model/index_en.rst +++ /dev/null @@ -1,81 +0,0 @@ -################### -Model Configuration -################### - -.. contents:: - -1. 
How to deal with error :code:`Duplicated layer name`
-----------------------------------------------------------
-
-The general reason for this error is that users have set the same value for the :code:`name` attribute of different layers. Find the layers whose :code:`name` attributes share the same value and set them to different values.
-
-2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
----------------------------------------------------------------------
-
-* :code:`paddle.layer.memory` is used to get the output of a layer at the previous timestep; that layer is specified by the attribute :code:`name`. That is, :code:`paddle.layer.memory` associates itself with the layer whose :code:`name` attribute has the same value, and uses that layer's output at the previous timestep as its own output at the current timestep.
-
-* All PaddlePaddle layers have a unique name, set by the attribute :code:`name`; PaddlePaddle sets it automatically when it is not set explicitly. :code:`paddle.layer.memory` is not a real layer; its own name is set by the attribute :code:`memory_name`, which PaddlePaddle also sets automatically when not given. The :code:`name` attribute of :code:`paddle.layer.memory` specifies the layer it is associated with and must be set explicitly by the user, as the sketch below illustrates.
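-
-A minimal sketch of this association pattern (the layer name :code:`"rnn_state"`, the size and :code:`seq_input` are illustrative):
-
-.. code-block:: python
-
-    def rnn_step(input):
-        # reads the previous-timestep output of the layer named "rnn_state"
-        mem = paddle.layer.memory(name="rnn_state", size=128)
-        # this fc layer is named "rnn_state", so `mem` is tied to it
-        return paddle.layer.fc(input=[input, mem],
-                               size=128,
-                               act=paddle.activation.Tanh(),
-                               name="rnn_state")
-
-    rnn = paddle.layer.recurrent_group(step=rnn_step, input=seq_input)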
-
-3. What is the difference between the two ways of using dropout
------------------------------------------------------------------
-
-* There are two ways to use dropout in PaddlePaddle
-
-  * Set the :code:`drop_rate` parameter in the layer's :code:`layer_attr` attribute. Take :code:`paddle.layer.fc` as an example:
-
-  .. code-block:: python
-
-     fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
-
-  * Use the :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:
-
-  .. code-block:: python
-
-     fc = paddle.layer.fc(input=input)
-     drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
-
-* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` inside it in the same way as the first method. This method consumes more memory.
-
-* PaddlePaddle implements dropout in the activation function rather than in the layer.
-
-* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory` and :code:`paddle.layer.recurrent` do not apply the activation to their output in the usual way, so dropout cannot be enabled for them by setting :code:`drop_rate`. To use dropout with these layers, use the second method, i.e. :code:`paddle.layer.dropout`.
-
-4. The differences between different recurrent layers
--------------------------------------------------------
-Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:
-
-* :code:`paddle.layer.lstmemory`
-* :code:`paddle.networks.simple_lstm`
-* :code:`paddle.networks.lstmemory_group`
-* :code:`paddle.networks.bidirectional_lstm`
-
-According to their implementation, recurrent layers can be classified into two types:
-
-1. Recurrent layers implemented by recurrent_group:
-
-  * Using this type of recurrent layer, users can access the intermediate values calculated by the recurrent unit within a timestep (e.g. hidden states, memory cells, etc.)
-  * :code:`paddle.networks.lstmemory_group` belongs to this type.
-
-2. Recurrent layers implemented as a complete operation:
-
-  * Users can only access the output values when using this type of recurrent layer.
-  * :code:`paddle.networks.lstmemory_group`, :code:`paddle.networks.simple_lstm` and :code:`paddle.networks.bidirectional_lstm` belong to this type.
-
-Implementing a recurrent layer as a complete operation allows more optimization of the CPU and GPU computation, so the second type is more efficient than the first. In practice, we recommend the second type when there is no need to access the intermediate variables of the LSTM.
-
-In addition, PaddlePaddle also contains an LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`:
-
-  * Unlike the recurrent layers described above, :code:`paddle.networks.lstmemory_unit` defines the computation of an LSTM unit within a single timestep. It is not a complete recurrent layer and cannot receive sequence data as input.
-  * :code:`paddle.networks.lstmemory_unit` can only be used as a step function inside recurrent_group.
-
-5. Can Softmax's calculation dimension be specified?
--------------------------------------------------------------------- 
-
-The calculation dimension of PaddlePaddle's softmax cannot be specified; it is always calculated by rows.
-In image tasks, for NCHW data, if you need to calculate softmax along the C dimension, you can use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do a reshape operation and calculate softmax.
-
-6. Does PaddlePaddle support variable-dimensional data inputs
----------------------------------------------------------------- 
-
-PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than that of the actual input data, as a placeholder.
diff --git a/doc/v2/faq/parameter/index_cn.rst b/doc/v2/faq/parameter/index_cn.rst
deleted file mode 100644
index 987e8cf088be4ee8daa7c28fdc855506cbfd31c7..0000000000000000000000000000000000000000
--- a/doc/v2/faq/parameter/index_cn.rst
+++ /dev/null
@@ -1,201 +0,0 @@
-#########
-参数设置
-#########
-
-.. contents::
-
-1. 如何选择SGD算法的学习率
--------------------------- 
-
-在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。
-
-通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。
-
-如果训练过程的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。
-
-2. 如何设置学习率退火(learning rate annealing)
------------------------------------------------- 
-
-在相应的优化算法里设置learning_rate_schedule及相关参数,以使用Adam算法为例,代码如下:
-
-.. 
code-block:: python - - optimizer = paddle.optimizer.Adam( - learning_rate=1e-3, - learning_rate_decay_a=0.5, - learning_rate_decay_b=0.75, - learning_rate_schedule="poly",) - -PaddlePaddle目前支持8种learning_rate_schedule,这8种learning_rate_schedule及其对应学习率计算方式如下: - -* "constant" - - lr = learning_rate - -* "poly" - - lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b) - - 其中,num_samples_processed为已训练样本数,下同。 - -* "caffe_poly" - - lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b) - -* "exp" - - lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b) - -* "discexp" - - lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b)) - -* "linear" - - lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b) - -* "manual" - - 这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: - - .. code-block:: python - - optimizer = paddle.optimizer.Adam( - learning_rate=1e-3, - learning_rate_schedule="manual", - learning_rate_args="1000:1.0,2000:0.9,3000:0.8",) - - 在该示例中,当已训练样本数小于等于1000时,学习率为 :code:`1e-3 * 1.0`;当已训练样本数大于1000小于等于2000时,学习率为 :code:`1e-3 * 0.9`;当已训练样本数大于2000时,学习率为 :code:`1e-3 * 0.8`。 - -* "pass_manual" - - 这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: - - .. code-block:: python - - optimizer = paddle.optimizer.Adam( - learning_rate=1e-3, - learning_rate_schedule="pass_manual", - learning_rate_args="1:1.0,2:0.9,3:0.8",) - - 在该示例中,当已训练pass数小于等于1时,学习率为 :code:`1e-3 * 1.0`;当已训练pass数大于1小于等于2时,学习率为 :code:`1e-3 * 0.9`;当已训练pass数大于2时,学习率为 :code:`1e-3 * 0.8`。 - -3. 如何初始化参数 ------------------ - -默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: - -* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` -* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` - -比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 - -.. code-block:: python - - hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), - bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) - -上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。 - -4. 如何共享参数 ---------------- - -PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。 - -简单的全连接网络,参数共享的配置示例为\: - -.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py - -这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 - -5. 如何加载预训练参数 ------------------------- - -* 对加载预训练参数的层,设置其参数属性 :code:`is_static=True`,使该层的参数在训练过程中保持不变。以embedding层为例,代码如下: - -.. code-block:: python - - emb_para = paddle.attr.Param(name='emb', is_static=True) - paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para) - - -* 从模型文件将预训练参数载入 :code:`numpy.array`,在创建parameters后,使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息,用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例,代码如下: - -.. 
code-block:: python - - def load_parameter(file_name, h, w): - with open(file_name, 'rb') as f: - f.read(16) # skip header. - return np.fromfile(f, dtype=np.float32).reshape(h, w) - - parameters = paddle.parameters.create(my_cost) - parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) - -6. 存储的参数格式是什么,如何和明文进行相互转化 --------------------------------------------------- - -PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中,1~4字节表示PaddlePaddle版本信息,请直接填充0;5~8字节表示每个参数占用的字节数,当保存的网络参数为float类型时为4,double类型时为8;9~16字节表示保存的参数总个数。 - -将PaddlePaddle保存的模型参数还原回明文时,可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数,此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时,未指定按照double精度编译,默认情况下按照float精度计算,保存的参数也是float类型。这时在使用 :code:`numpy.array` 时,一般设置 :code:`dtype=float32` 。示例如下: - -.. code-block:: python - - def read_parameter(fname, width): - s = open(fname).read() - # skip header - vec = np.fromstring(s[16:], dtype=np.float32) - # width is the size of the corresponding layer - np.savetxt(fname + ".csv", vec.reshape(width, -1), - fmt="%.6f", delimiter=",") - - -将明文参数转化为PaddlePaddle可加载的模型参数时,首先构造头信息,再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。 - -.. code-block:: python - - def gen_rand_param(param_file, width, height, need_trans): - np.random.seed() - header = struct.pack("iil", 0, 4, height * width) - param = np.float32(np.random.rand(height, width)) - with open(param_file, "w") as fparam: - fparam.write(header + param.tostring()) - -7. A protocol message was rejected because it was too big ------------------------------------------------------------- - -如果在训练NLP相关模型时,出现以下错误: - -.. code-block:: bash - - [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h. - F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) - -可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似: - -.. code-block:: python - - src_dict = dict() - for line_count, line in enumerate(open(src_dict_path, "r")): - src_dict[line.strip()] = line_count - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={"src_dict": src_dict}) - -解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为: - -.. code-block:: python - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={"src_dict_path": src_dict_path}) - -完整源码可参考 `sequence_recurrent `_ 示例。 - - diff --git a/doc/v2/faq/parameter/index_en.rst b/doc/v2/faq/parameter/index_en.rst deleted file mode 100644 index 9edb8dd620f972d019db9c0063cefce616de0ebd..0000000000000000000000000000000000000000 --- a/doc/v2/faq/parameter/index_en.rst +++ /dev/null @@ -1,198 +0,0 @@ -################## -Parameter Settings -################## - -.. contents:: - -1. How to Choose the Learning Rate of SGD Algorithm --------------------------- - -An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time. - -Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. 
We examine convergence by estimating the minimum cost the model could reach if it always produced a constant output.
-
-If the cost during training stays significantly higher than this constant-output cost, we conclude that training is not converging. For example, for a three-class problem with multi-class-cross-entropy as the cost and ratios of classes 0, 1 and 2 in the data equal to :code:`0.2, 0.5, 0.3`, the minimum constant-output cost is :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is still greater than this number after training one pass (or even earlier), the training is probably not converging and the learning rate should be reduced. A short sketch of this estimate follows.
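-
-A minimal sketch of that estimate (the class priors are the ones from the example above):
-
-.. code-block:: python
-
-    import math
-
-    priors = [0.2, 0.5, 0.3]  # ratios of labels 0, 1, 2 in the data
-    # cost of always predicting the prior distribution
-    min_cost = -sum(p * math.log(p) for p in priors)
-    print min_cost  # ~1.03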
-
-2. How to Implement Learning Rate Annealing
--------------------------------------------- 
-
-We use the Adam algorithm as an example. Set :code:`learning_rate_schedule` and its related parameters in the corresponding optimizer as follows:
-
-.. code-block:: python
-
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=1e-3,
-        learning_rate_decay_a=0.5,
-        learning_rate_decay_b=0.75,
-        learning_rate_schedule="poly",)
-
-PaddlePaddle currently supports 8 learning rate schedules. They compute the current learning rate as follows:
-
-* "constant"
-
-  lr = learning_rate
-
-* "poly"
-
-  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
-
-  The variable :code:`num_samples_processed` is the number of trained samples (same below).
-
-* "caffe_poly"
-
-  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
-
-* "exp"
-
-  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
-
-* "discexp"
-
-  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
-
-* "linear"
-
-  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
-
-* "manual"
-
-  This schedule anneals the learning rate piecewise by the number of trained samples. The attenuation-factor piecewise function is set through the parameter :code:`learning_rate_args`; the current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
-
-  .. code-block:: python
-
-      optimizer = paddle.optimizer.Adam(
-          learning_rate=1e-3,
-          learning_rate_schedule="manual",
-          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
-
-  In this example, while the number of trained samples is at most 1000, the learning rate is :code:`1e-3 * 1.0`; above 1000 and at most 2000, it is :code:`1e-3 * 0.9`; above 2000, it is :code:`1e-3 * 0.8`.
-
-* "pass_manual"
-
-  This schedule anneals the learning rate piecewise by the number of trained passes. The attenuation-factor piecewise function is set through the parameter :code:`learning_rate_args`; the current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
-
-  .. code-block:: python
-
-      optimizer = paddle.optimizer.Adam(
-          learning_rate=1e-3,
-          learning_rate_schedule="pass_manual",
-          learning_rate_args="1:1.0,2:0.9,3:0.8",)
-
-  In this example, while the number of trained passes is at most 1, the learning rate is :code:`1e-3 * 1.0`; above 1 and at most 2, it is :code:`1e-3 * 0.9`; above 2, it is :code:`1e-3 * 0.8`.
-
-3. How to Initialize Parameters
------------------- 
-
-By default, PaddlePaddle initializes parameters with mean 0 and standard deviation :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization usually does not produce bad results. To customize initialization, PaddlePaddle provides two ways to initialize parameters:
-
-* Gaussian distribution: set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
-* Uniform distribution: set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
-
-For example, to set the parameter and bias initialization of a fully connected layer, you can use the following code:
-
-.. code-block:: python
-
-    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
-                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
-
-The above code initializes all biases to 1.0 and the parameters to a uniform distribution over :code:`[-1.0, 1.0]`.
-
-4. How to Share Parameters
---------------- 
-
-PaddlePaddle uses the :code:`name` of a parameter as its ID; parameters with the same name are shared. The name can be set with :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, let the parameters to be shared use the same :code:`ParamAttr` object.
-
-A simple fully connected network configured with parameter sharing looks as follows:
-
-.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
-
-Here :code:`hidden_a` and :code:`hidden_b` use the same parameters and bias, and the two inputs of the softmax layer also use the same parameter :code:`softmax_param`. A minimal standalone sketch is given below.
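-
-A sketch of the same idea in the v2 API (the layer sizes and the parameter name are illustrative):
-
-.. code-block:: python
-
-    import paddle.v2 as paddle
-
-    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(8))
-    # both fc layers reference one ParamAttr, hence one shared weight matrix
-    shared_w = paddle.attr.Param(name="shared_fc_w")
-    hidden_a = paddle.layer.fc(input=x, size=16, param_attr=shared_w)
-    hidden_b = paddle.layer.fc(input=x, size=16, param_attr=shared_w)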
-
-5. How to Load Pre-training Parameters
------------------------- 
-
-* For layers that load pre-training parameters, set :code:`is_static=True` so that their parameters stay unchanged during training. Take the embedding layer as an example; the code is as follows:
-
-.. code-block:: python
-
-    emb_para = paddle.attr.Param(name='emb', is_static=True)
-    paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
-
-
-* Load the pre-training parameters from the model file into a :code:`numpy.array`; after creating the parameters, load them with :code:`parameters.set()`. The first 16 bytes of a model parameter file saved by PaddlePaddle are header information, so the :code:`numpy.array` must be loaded starting from the 17th byte. Take the embedding layer as an example:
-
-.. code-block:: python
-
-    def load_parameter(file_name, h, w):
-        with open(file_name, 'rb') as f:
-            f.read(16)  # skip the 16-byte header
-            return np.fromfile(f, dtype=np.float32).reshape(h, w)
-
-    parameters = paddle.parameters.create(my_cost)
-    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
-
-6. Format of the Stored Parameter and How to Convert the File to Plain Text
---------------------------------------------------- 
-
-A model parameter file saved by PaddlePaddle consists of 16 bytes of header information followed by the network parameters. In the header, the first four bytes carry PaddlePaddle's version information and should simply be filled with 0s; the next four bytes give the number of bytes per parameter value (four for float, eight for double); the last eight bytes give the total number of saved parameter values.
-
-To restore the saved parameters to plain text, load them into a :code:`numpy.array` of the matching data type, skipping the header. Unless PaddlePaddle was compiled for double precision, it computes in float precision by default and the parameters are also stored as floats; in that case set :code:`dtype=float32` when loading. An example is as follows:
-
-.. code-block:: python
-
-    def read_parameter(fname, width):
-        s = open(fname).read()
-        # skip the 16-byte header
-        vec = np.fromstring(s[16:], dtype=np.float32)
-        # width is the size of the corresponding layer
-        np.savetxt(fname + ".csv", vec.reshape(width, -1),
-                   fmt="%.6f", delimiter=",")
-
-
-To convert plain-text parameters into model parameters loadable by PaddlePaddle, construct the header first, then write the network parameters. The following code turns a randomly generated matrix into model parameters that PaddlePaddle can load:
-
-.. code-block:: python
-
-    def gen_rand_param(param_file, width, height, need_trans):
-        np.random.seed()
-        header = struct.pack("iil", 0, 4, height * width)
-        param = np.float32(np.random.rand(height, width))
-        with open(param_file, "w") as fparam:
-            fparam.write(header + param.tostring())
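-
-As a complement, a minimal sketch that reads such a header back and checks it (the :code:`param_file` path is an illustrative placeholder; the :code:`"iil"` format string mirrors the :code:`struct.pack` call above):
-
-.. code-block:: python
-
-    import struct
-
-    with open(param_file, "rb") as f:
-        # version, bytes per value, total number of values
-        version, value_size, count = struct.unpack("iil", f.read(16))
-    assert version == 0 and value_size in (4, 8)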
-
-7. A Protocol Message Rejected Because of its Large Size
--------------------------------------------------------- 
-
-If you are training NLP-related models and the following error occurs:
-
-.. code-block:: bash
-
-    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
-    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
-
-the probable cause is that one of the :code:`args` passed to the dataprovider is too large, usually a large dictionary passed directly. A wrong :code:`define_py_data_sources2` call looks like:
-
-.. code-block:: python
-
-    src_dict = dict()
-    for line_count, line in enumerate(open(src_dict_path, "r")):
-        src_dict[line.strip()] = line_count
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={"src_dict": src_dict})
-
-The solution is to pass the path of the dictionary file in :code:`args` and load the dictionary from that path inside the dataprovider. Change :code:`define_py_data_sources2` to:
-
-.. code-block:: python
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={"src_dict_path": src_dict_path})
-
-The full source code can be found in the `sequence_recurrent `_ example.
diff --git a/doc/v2/getstarted/concepts/src/infer.py b/doc/v2/getstarted/concepts/src/infer.py
deleted file mode 100644
index afe256f234a1c7d29c33f3b65b8302646df0c45c..0000000000000000000000000000000000000000
--- a/doc/v2/getstarted/concepts/src/infer.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-import numpy as np
-
-paddle.init(use_gpu=False)
-x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
-y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-
-# load the model parameters generated by training
-with open('params_pass_90.tar', 'r') as f:
-    parameters = paddle.parameters.Parameters.from_tar(f)
-
-# Feed multiple input samples; the inference results are returned in an array.
-i = [[[1, 2]], [[3, 4]], [[5, 6]]]
-print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
-# Will print:
-# [[ -3.24491572]
-#  [ -6.94668722]
-#  [-10.64845848]]
diff --git a/doc/v2/getstarted/concepts/src/train.py b/doc/v2/getstarted/concepts/src/train.py
deleted file mode 100644
index a85d5d8a3acee61d11488e5b842831a79072680a..0000000000000000000000000000000000000000
--- a/doc/v2/getstarted/concepts/src/train.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. 
-
-import paddle.v2 as paddle
-import numpy as np
-
-# init paddle
-paddle.init(use_gpu=False)
-
-# network config
-x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
-y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-cost = paddle.layer.square_error_cost(input=y_predict, label=y)
-
-# create parameters
-parameters = paddle.parameters.create(cost)
-# create optimizer
-optimizer = paddle.optimizer.Momentum(momentum=0)
-# create trainer
-trainer = paddle.trainer.SGD(cost=cost,
-                             parameters=parameters,
-                             update_equation=optimizer)
-
-
-# event_handler to print training info
-def event_handler(event):
-    if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 1 == 0:
-            print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
-                                                  event.cost)
-    # save the model every 10 passes
-    if isinstance(event, paddle.event.EndPass):
-        if event.pass_id % 10 == 0:
-            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
-                trainer.save_parameter_to_tar(f)
-
-
-# define training dataset reader
-def train_reader():
-    train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
-    train_y = np.array([[-2], [-3], [-7], [-7]])
-
-    def reader():
-        for i in xrange(train_y.shape[0]):
-            yield train_x[i], train_y[i]
-
-    return reader
-
-
-# define feeding map
-feeding = {'x': 0, 'y': 1}
-
-# training
-trainer.train(
-    reader=paddle.batch(
-        train_reader(), batch_size=1),
-    feeding=feeding,
-    event_handler=event_handler,
-    num_passes=100)
diff --git a/doc/v2/getstarted/concepts/use_concepts_cn.rst b/doc/v2/getstarted/concepts/use_concepts_cn.rst
deleted file mode 100644
index 608f49f5a969b3291eb43bf2acf582af74e566a1..0000000000000000000000000000000000000000
--- a/doc/v2/getstarted/concepts/use_concepts_cn.rst
+++ /dev/null
@@ -1,155 +0,0 @@
-############
-基本使用概念
-############
-
-PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API,可以轻松地完成神经网络配置,模型训练等任务。
-这里将介绍PaddlePaddle的基本使用概念,并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
-在使用该文档之前,请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
-
-
-配置网络
-============
-
-加载PaddlePaddle
-----------------------
-
-在进行网络配置之前,首先需要加载相应的Python库,并进行初始化操作。
-
-.. code-block:: python
-
-   import paddle.v2 as paddle
-   import numpy as np
-   paddle.init(use_gpu=False)
-
-
-搭建神经网络
------------------------
-
-搭建神经网络就像使用积木搭建宝塔一样。在PaddlePaddle中,layer是我们的积木,而神经网络是我们要搭建的宝塔。我们使用不同的layer进行组合,来搭建神经网络。
-宝塔的底端需要坚实的基座来支撑,同样,神经网络也需要一些特定的layer作为输入接口,来完成网络的训练。
-
-例如,我们可以定义如下layer来描述神经网络的输入:
-
-.. code-block:: python
-
-   x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
-   y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-
-其中x表示输入数据是一个维度为2的稠密向量,y表示输入数据是一个维度为1的稠密向量。
-
-PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和三种序列模式。
-
-四种数据类型:
-
-* dense_vector:稠密的浮点数向量。
-* sparse_binary_vector:稀疏的01向量,即大部分值为0,但有值的地方必须为1。
-* sparse_float_vector:稀疏的向量,即大部分值为0,但有值的部分可以是任何浮点数。
-* integer:整数标签。
-
-三种序列模式:
-
-* SequenceType.NO_SEQUENCE:不是一条序列
-* SequenceType.SEQUENCE:是一条时间序列
-* SequenceType.SUB_SEQUENCE: 是一条时间序列,且序列的每一个元素还是一个时间序列。
-
-不同的数据类型和序列模式返回的格式不同,列表如下:
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-|                      | NO_SEQUENCE         | SEQUENCE                          | SUB_SEQUENCE                                   |
-+======================+=====================+===================================+================================================+
-| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value        | i                   | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-其中,f代表一个浮点数,i代表一个整数。
-
-注意:对sparse_binary_vector和sparse_float_vector,PaddlePaddle存的是有值位置的索引。例如,
-
-- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ,类型是sparse_binary_vector,返回的是 ``[1, 2]`` 。
-- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ,类型是sparse_float_vector,返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
-
-
-在定义输入layer之后,我们可以使用其他layer进行组合。在组合时,需要指定layer的输入来源。
-
-例如,我们可以定义如下的layer组合:
-
-.. code-block:: python
-
-   y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-   cost = paddle.layer.square_error_cost(input=y_predict, label=y)
-
-其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上平方误差层。
-
-最后一层cost中记录了神经网络的所有拓扑结构,通过组合不同的layer,我们即可完成神经网络的搭建。
-
-
-训练模型
-============
-
-在完成神经网络的搭建之后,我们首先需要根据神经网络结构来创建所需要优化的parameters,并创建optimizer。
-之后,我们可以创建trainer来对网络进行训练。
-
-.. code-block:: python
-
-   parameters = paddle.parameters.create(cost)
-   optimizer = paddle.optimizer.Momentum(momentum=0)
-   trainer = paddle.trainer.SGD(cost=cost,
-                                parameters=parameters,
-                                update_equation=optimizer)
-
-其中,trainer接收三个参数,包括神经网络拓扑结构、神经网络参数以及迭代方程。
-
-在搭建神经网络的过程中,我们仅仅对神经网络的输入进行了描述。而trainer需要读取训练数据进行训练,PaddlePaddle中通过reader来加载数据。
-
-.. code-block:: python
-
-   # define training dataset reader
-   def train_reader():
-       train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
-       train_y = np.array([[-2], [-3], [-7], [-7]])
-       def reader():
-           for i in xrange(train_y.shape[0]):
-               yield train_x[i], train_y[i]
-       return reader
-
-最终我们可以调用trainer的train方法启动训练:
-
-.. code-block:: python
-
-   # define feeding map
-   feeding = {'x': 0, 'y': 1}
-
-   # event_handler to print training info
-   def event_handler(event):
-       if isinstance(event, paddle.event.EndIteration):
-           if event.batch_id % 1 == 0:
-               print "Pass %d, Batch %d, Cost %f" % (
-                   event.pass_id, event.batch_id, event.cost)
-   # training
-   trainer.train(
-       reader=paddle.batch(train_reader(), batch_size=1),
-       feeding=feeding,
-       event_handler=event_handler,
-       num_passes=100)
-
-关于PaddlePaddle的更多使用方法请参考 `进阶指南 <../../howto/index_cn.html>`_。
-
-线性回归完整示例
-================
-
-下面给出在三维空间中使用线性回归拟合一条直线的例子:
-
-.. literalinclude:: src/train.py
-   :linenos:
-
-使用以上训练好的模型进行预测,取其中一个模型params_pass_90.tar,输入需要预测的向量组,然后打印输出:
-
-.. 
literalinclude:: src/infer.py - :linenos: - -有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 diff --git a/doc/v2/getstarted/concepts/use_concepts_en.rst b/doc/v2/getstarted/concepts/use_concepts_en.rst deleted file mode 100644 index 406b0cbb913894dc333d8e4561c207793c33e475..0000000000000000000000000000000000000000 --- a/doc/v2/getstarted/concepts/use_concepts_en.rst +++ /dev/null @@ -1,3 +0,0 @@ -Basic Concept -============= -TBD diff --git a/doc/v2/getstarted/index_cn.rst b/doc/v2/getstarted/index_cn.rst deleted file mode 100644 index 75af7354be93a6eeabfa9ccf86903505402a7ca6..0000000000000000000000000000000000000000 --- a/doc/v2/getstarted/index_cn.rst +++ /dev/null @@ -1,19 +0,0 @@ -新手入门 -============ - - -如果需要快速了解PaddlePaddle的使用,可以参考以下指南。 - -.. toctree:: - :maxdepth: 1 - - quickstart_cn.rst - - -在使用PaddlePaddle构建应用时,需要了解一些基本概念。 -这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。 - -.. toctree:: - :maxdepth: 1 - - concepts/use_concepts_cn.rst diff --git a/doc/v2/getstarted/index_en.rst b/doc/v2/getstarted/index_en.rst deleted file mode 100644 index 94b306895c9ddf6140cf600131930a6675a583eb..0000000000000000000000000000000000000000 --- a/doc/v2/getstarted/index_en.rst +++ /dev/null @@ -1,19 +0,0 @@ -GET STARTED -============ - -If you want to quickly know how to use PaddlePaddle, please refer to the following guide: - -.. toctree:: - :maxdepth: 1 - - quickstart_en.rst - - -While using PaddlePaddle to build applications, please understand some basic concepts. - -Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc. - -.. toctree:: - :maxdepth: 1 - - concepts/use_concepts_en.rst diff --git a/doc/v2/getstarted/quickstart_cn.rst b/doc/v2/getstarted/quickstart_cn.rst deleted file mode 100644 index d511cead262dabafd095f68adb5ffc596a7fe596..0000000000000000000000000000000000000000 --- a/doc/v2/getstarted/quickstart_cn.rst +++ /dev/null @@ -1,47 +0,0 @@ -快速开始 -======== - -快速安装 --------- - -PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 -执行下面的命令完成快速安装,版本为cpu_avx_openblas: - - .. code-block:: bash - - pip install paddlepaddle - -如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: - - .. code-block:: bash - - pip install paddlepaddle-gpu - -更详细的安装和编译方法参考::ref:`install_steps` 。 - -快速使用 --------- - -创建一个 housing.py 并粘贴此Python代码: - - .. code-block:: python - - import paddle.v2 as paddle - - # Initialize PaddlePaddle. - paddle.init(use_gpu=False, trainer_count=1) - - # Configure the neural network. - x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) - y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) - - # Infer using provided test data. - probs = paddle.infer( - output_layer=y_predict, - parameters=paddle.dataset.uci_housing.model(), - input=[item for item in paddle.dataset.uci_housing.test()()]) - - for i in xrange(len(probs)): - print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) - -执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 diff --git a/doc/v2/getstarted/quickstart_en.rst b/doc/v2/getstarted/quickstart_en.rst deleted file mode 100644 index 70f7fe0646068aa79cd72955c6848ac0250c2300..0000000000000000000000000000000000000000 --- a/doc/v2/getstarted/quickstart_en.rst +++ /dev/null @@ -1,51 +0,0 @@ -Quick Start -============ - -Quick Install -------------- - -You can use pip to install PaddlePaddle with a single command, supports -CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. 
-Simply run the following command to install, the version is cpu_avx_openblas: - - .. code-block:: bash - - pip install paddlepaddle - -If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: - - .. code-block:: bash - - pip install paddlepaddle-gpu - -For more details about installation and build: :ref:`install_steps` . - -Quick Use ---------- - -Create a new file called housing.py, and paste this Python -code: - - - .. code-block:: python - - import paddle.v2 as paddle - - # Initialize PaddlePaddle. - paddle.init(use_gpu=False, trainer_count=1) - - # Configure the neural network. - x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) - y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) - - # Infer using provided test data. - probs = paddle.infer( - output_layer=y_predict, - parameters=paddle.dataset.uci_housing.model(), - input=[item for item in paddle.dataset.uci_housing.test()()]) - - for i in xrange(len(probs)): - print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) - -Run :code:`python housing.py` and voila! It should print out a list of predictions -for the test housing data. diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md deleted file mode 100644 index 8878ee9d85064ba27708ed92790aa9b83ba316e5..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/compile_paddle_lib_cn.md +++ /dev/null @@ -1,181 +0,0 @@ -## 安装、编译与链接C-API预测库 - -### 直接下载安装 - -从CI系统中下载最新的C-API开发包进行安装,用户可以从下面的表格中找到需要的版本: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| 版本说明 | C-API |
-| --- | --- |
-| cpu_avx_mkl | paddle.tgz |
-| cpu_avx_openblas | paddle.tgz |
-| cpu_noavx_openblas | paddle.tgz |
-| cuda7.5_cudnn5_avx_mkl | paddle.tgz |
-| cuda8.0_cudnn5_avx_mkl | paddle.tgz |
-| cuda8.0_cudnn7_avx_mkl | paddle.tgz |
-| cuda9.0_cudnn7_avx_mkl | paddle.tgz |
- -### 从源码编译 - -用户也可以从 PaddlePaddle 核心代码编译C-API链接库,只需在编译时配制下面这些编译选项: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| 选项 | 值 |
-| --- | --- |
-| WITH_C_API | ON |
-| WITH_PYTHON | OFF(推荐) |
-| WITH_SWIG_PY | OFF(推荐) |
-| WITH_GOLANG | OFF(推荐) |
-| WITH_GPU | ON/OFF |
-| WITH_MKL | ON/OFF |
- -建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 - -下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): - -```shell -PADDLE_ROOT=/path/of/capi -git clone https://github.com/PaddlePaddle/Paddle.git -cd Paddle -mkdir build -cd build -cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_C_API=ON \ - -DWITH_SWIG_PY=OFF \ - -DWITH_GOLANG=OFF \ - -DWITH_PYTHON=OFF \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - .. -``` - -执行上述代码生成Makefile文件后,执行:`make && make install`。成功编译后,使用C-API所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件)均会存放于`PADDLE_ROOT`目录中。 - -编译成功后在 `PADDLE_ROOT` 下会看到如下目录结构(包括了编译出的PaddlePaddle头文件和链接库,以及第三方依赖链接库和头文件(如果需要,由链接方式决定)): - -```text -├── include -│   └── paddle -│   ├── arguments.h -│   ├── capi.h -│   ├── capi_private.h -│   ├── config.h -│   ├── error.h -│   ├── gradient_machine.h -│   ├── main.h -│   ├── matrix.h -│   ├── paddle_capi.map -│   └── vector.h -├── lib -│   ├── libpaddle_capi_engine.a -│   ├── libpaddle_capi_layers.a -│   ├── libpaddle_capi_shared.so -│   └── libpaddle_capi_whole.a -└── third_party - ├── gflags - │   ├── include - │   │   └── gflags - │   │   ├── gflags_completions.h - │   │   ├── gflags_declare.h - │   │   ... - │   └── lib - │   └── libgflags.a - ├── glog - │   ├── include - │   │   └── glog - │   │   ├── config.h - │   │   ... - │   └── lib - │   └── libglog.a - ├── openblas - │   ├── include - │   │   ├── cblas.h - │   │   ... - │   └── lib - │   ... - ├── protobuf - │   ├── include - │   │   └── google - │   │   └── protobuf - │   │   ... - │   └── lib - │   └── libprotobuf-lite.a - └── zlib - ├── include - │   ... - └── lib - ... - -``` - -### 链接说明 - -目前提供三种链接方式: - -1. 链接`libpaddle_capi_shared.so` 动态库(这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**),需注意: - 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 - 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 - 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 - -2. 链接静态库 `libpaddle_capi_whole.a`,需注意: - 1. 需要指定`-Wl,--whole-archive`链接选项。 - 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 - 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 - 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 - -3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`,需注意: - 1. 这种链接方式主要用于移动端预测。 - 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 - 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 - 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md deleted file mode 100644 index 70a6edef27e75af6b38d7d4824c928eba0d29b9a..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/compile_paddle_lib_en.md +++ /dev/null @@ -1,180 +0,0 @@ -## Install and Build - -### Download & Install - - Download the latest C-API development package from CI system and install. You can find the required version in the table below: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Version Tips | C-API |
-| --- | --- |
-| cpu_avx_mkl | paddle.tgz |
-| cpu_avx_openblas | paddle.tgz |
-| cpu_noavx_openblas | paddle.tgz |
-| cuda7.5_cudnn5_avx_mkl | paddle.tgz |
-| cuda8.0_cudnn5_avx_mkl | paddle.tgz |
-| cuda8.0_cudnn7_avx_mkl | paddle.tgz |
-| cuda9.0_cudnn7_avx_mkl | paddle.tgz |
- -### From source - - Users can also compile the C-API library from PaddlePaddle source code by compiling with the following compilation options: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Options | Value |
-| --- | --- |
-| WITH_C_API | ON |
-| WITH_PYTHON | OFF(recommended) |
-| WITH_SWIG_PY | OFF(recommended) |
-| WITH_GOLANG | OFF(recommended) |
-| WITH_GPU | ON/OFF |
-| WITH_MKL | ON/OFF |
- -It is best to set up with recommended values to avoid linking with unnecessary libraries. Set other compilation options as you need. - -Pull the latest following code snippet from github, and configure compilation options(replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library): - -```shell -PADDLE_ROOT=/path/of/capi -git clone https://github.com/PaddlePaddle/Paddle.git -cd Paddle -mkdir build -cd build -cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_C_API=ON \ - -DWITH_SWIG_PY=OFF \ - -DWITH_GOLANG=OFF \ - -DWITH_PYTHON=OFF \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - .. -``` - -After running the above code to generate Makefile , run: `make && make install`. After successful compilation, the dependencies required by C-API(includes: (1)PaddlePaddle inference library and header files; (2) Third-party libraries and header files) will be stored in the `PADDLE_ROOT` directory. - -If the compilation is successful, see the following directory structure under `PADDLE_ROOT`(includes PaddlePaddle header files and libraries, and third-party libraries and header files(determined by the link methods if necessary)): - -```text -├── include -│   └── paddle -│   ├── arguments.h -│   ├── capi.h -│   ├── capi_private.h -│   ├── config.h -│   ├── error.h -│   ├── gradient_machine.h -│   ├── main.h -│   ├── matrix.h -│   ├── paddle_capi.map -│   └── vector.h -├── lib -│   ├── libpaddle_capi_engine.a -│   ├── libpaddle_capi_layers.a -│   ├── libpaddle_capi_shared.so -│   └── libpaddle_capi_whole.a -└── third_party - ├── gflags - │   ├── include - │   │   └── gflags - │   │   ├── gflags_completions.h - │   │   ├── gflags_declare.h - │   │   ... - │   └── lib - │   └── libgflags.a - ├── glog - │   ├── include - │   │   └── glog - │   │   ├── config.h - │   │   ... - │   └── lib - │   └── libglog.a - ├── openblas - │   ├── include - │   │   ├── cblas.h - │   │   ... - │   └── lib - │   ... - ├── protobuf - │   ├── include - │   │   └── google - │   │   └── protobuf - │   │   ... - │   └── lib - │   └── libprotobuf-lite.a - └── zlib - ├── include - │   ... - └── lib - ... - -``` - -### Linking Description: - -There are three kinds of linking methods: - -1. Linking with dynamic library `libpaddle_capi_shared.so`(This way is much more convenient and easier, **Without special requirements, it is recommended**), refer to the following: - 1. Compiling with CPU version and using `OpenBLAS`; only need to link one library named `libpaddle_capi_shared.so` to develop prediction program through C-API. - 1. Compiling with CPU version and using `MKL` lib, you need to link MKL library directly to develop prediction program through PaddlePaddle C-API, due to `MKL` has its own dynamic library. - 1. Compiling with GPU version, CUDA library will be loaded dynamically on prediction program run-time, and also set CUDA library to  `LD_LIBRARY_PATH` environment variable. - -2. Linking with static library `libpaddle_capi_whole.a`,refer to the following: - 1. Specify `-Wl,--whole-archive` linking options. - 1. Explicitly link third-party libraries such as `gflags`、`glog`、`libz`、`protobuf` .etc, you can find them under `PADDLE_ROOT/third_party` directory. - 1. Use OpenBLAS library if compiling C-API,must explicitly link `libopenblas.a`. - 1. Use MKL when compiling C-API, must explicitly link MKL dynamic library. - -3. Linking with static library `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`,refer to the following: - 1. 
This linking methods is mainly used for mobile prediction. - 1. Split `libpaddle_capi_whole.a` into two static linking library at least to reduce the size of linking libraries. - 1. Specify `-Wl,--whole-archive -lpaddle_capi_layers`  and `-Wl,--no-whole-archive -lpaddle_capi_engine` for linking. - 1. The third-party dependencies need explicitly link same as method 2 above. diff --git a/doc/v2/howto/capi/images/csr.png b/doc/v2/howto/capi/images/csr.png deleted file mode 100644 index 3dc10b8de4f6d3f517624956b1694b689405a031..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/capi/images/csr.png and /dev/null differ diff --git a/doc/v2/howto/capi/images/sequence_data.png b/doc/v2/howto/capi/images/sequence_data.png deleted file mode 100644 index 6e47a46b8955dfe977e85898fe3c9f33ed28de7e..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/capi/images/sequence_data.png and /dev/null differ diff --git a/doc/v2/howto/capi/images/workflow_of_CAPI.png b/doc/v2/howto/capi/images/workflow_of_CAPI.png deleted file mode 100644 index a4399ade048b3fe10d2d9c714bc34333ca068edb..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/capi/images/workflow_of_CAPI.png and /dev/null differ diff --git a/doc/v2/howto/capi/index_cn.rst b/doc/v2/howto/capi/index_cn.rst deleted file mode 100644 index 7f100717983f5e950b801e6b05ee48bfff273c62..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/index_cn.rst +++ /dev/null @@ -1,26 +0,0 @@ -C-API预测库 -================== - -当我们训练完一个神经网络模型之后,下一步就是用模型来做预测。预测就是准备输入数据,经过模型处理之后,得到预测结果的过程。 - -相比于模型训练,预测有如下特点: - -#. 预测不需要训练过程中反向传播和参数更新的部分。 -#. 预测不需要标签(label)。 -#. 预测很多时候需要和用户系统整合在一起。 - -因为上述特点,模型预测SDK需要单独设计,并具备以下特点: - -#. 预测SDK不包含反向传播和参数更新部分,以减小SDK的体积。 -#. 预测SDK需要提供一个简洁的用户接口,方便使用。 -#. 因为输入数据可能有多种结构,对输入数据的格式做清晰简洁的封装。 -#. 为了和用户系统兼容,SDK的接口需要是满足C标准的接口。 - -PaddlePaddle提供了C-API,用于解决上述问题。关于C-API的使用,我们提供了如下指南: - -.. toctree:: - :maxdepth: 1 - - compile_paddle_lib_cn.md - organization_of_the_inputs_cn.md - workflow_of_capi_cn.md diff --git a/doc/v2/howto/capi/index_en.rst b/doc/v2/howto/capi/index_en.rst deleted file mode 100644 index 4ec39c9d5223442cf6872edaf7befeb5053b538e..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/index_en.rst +++ /dev/null @@ -1,26 +0,0 @@ -C-API Inference Library -======================== - -After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result. - -Compared with model training, prediction has the following features: - -#. Inference does not require backpropagation and parameter updates, as required during training. -#. Labels are not needed in prediction. -#. Most of the time, predictions need to be integrated with the user system. - -Therefore, the model prediction SDK needs to be designed separately and has the following features: - -#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK. -#. The predictive SDK needs a simple user interface for ease of use. -#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged. -#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface. - -PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API: - -.. 
toctree:: - :maxdepth: 1 - - compile_paddle_lib_en.md - organization_of_the_inputs_en.md - workflow_of_capi_en.md diff --git a/doc/v2/howto/capi/organization_of_the_inputs_cn.md b/doc/v2/howto/capi/organization_of_the_inputs_cn.md deleted file mode 100644 index 343526c213110cb9c6abaf9a12b3d634ad3fabe9..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/organization_of_the_inputs_cn.md +++ /dev/null @@ -1,289 +0,0 @@ -## 输入/输出数据组织 - -这篇文档介绍在使用 PaddlePaddle C-API 时如何组织输入数据,以及如何解析神经网络前向计算的输出结果。 - -### 输入/输出数据类型 -在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为: - -1. 一维整型数组 -1. 二维浮点型矩阵 - - - 稠密矩阵 - - 稀疏矩阵 - -说明: - -1. 一维数组**仅支持整型值**; - - 常用于自然语言处理任务,例如:表示词语在词典中的序号; - - 分类任务中类别标签; -1. 逻辑上高于二维的数据(例如含有多个通道的图片,视频等)在程序实现中都会转化为二维矩阵,转化方法在相应的领域都有通用解决方案,需要使用者自己了解并完成转化; -1. 二维矩阵可以表示行向量和列向量,任何时候如果需要浮点型数组(向量),都应使用C-API中的矩阵来表示,而不是C-API中的一维数组。 -1. 不论是一维整型数组还是二维浮点数矩阵,**为它们附加上序列信息将变成序列输入。PaddlePaddle 会通过判数据是否附带有序列信息来判断一个向量/矩阵是否是一个序列**。当非序列输入时,无需关心和处理序列信息。关于什么是“序列信息”,下文会详细进行介绍。 - -### 基本使用概念 - -- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 -- `Argument` 并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 -- 在`Argument`内部由`IVector`(对应着上文提到的一维整型数组)和`Matrix`(对应着上文提到的二维浮点型矩阵)来实际存储数据;由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。 - -- **注**: - 1. 这篇文档之后部分将会统一使用`argument`来特指PaddlePaddle中神经网络计算层一个输入/输出数据。 - 1. 使用`paddle_ivector`来特指PaddlePaddle中的一维整型数组。 - 1. 使用`paddle_matrix`来特指PaddlePaddle中的二维浮点型矩阵。 - -### 组织输入数据 -- 一维整型数组 - - 概念上可以将`paddle_ivector`理解为一个一维的整型数组,通常用于表示离散的类别标签,或是在自然语言处理任务中表示词语在字典中的序号。下面的代码片段创建了含有三个元素`1`、`2`、`3`的`paddle_ivector`。 - ```c - int ids[] = {1, 2, 3}; - paddle_ivector ids_array = - paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false); - CHECK(paddle_arguments_set_ids(in_args, 0, ids_array)); - ``` - -- **稠密矩阵** - - 一个`m×n`的稠密矩阵是一个由`m`行`n`列元素排列成的矩形阵列,矩阵里的元素是浮点数。对神经网络来说,矩阵的高度`m`是一次预测接受的样本数目,宽度$n$是神经网络定义时,`paddle.layer.data`的`size`。 - - 下面的代码片段创建了一个高度为1,宽度为`layer_size`的稠密矩阵,矩阵中每个元素的值随机生成。 - - ```c - paddle_matrix mat = paddle_matrix_create( - /* height = batch size */ 1, - /* width = dimensionality of the data layer */ layer_size, - /* whether to use GPU */ false); - - paddle_real* array; - // Get the pointer pointing to the start address of the first row of the - // created matrix. - CHECK(paddle_matrix_get_row(mat, 0, &array)); - - // Fill the matrix with a randomly generated test sample. - srand(time(0)); - for (int i = 0; i < layer_size; ++i) { - array[i] = rand() / ((float)RAND_MAX); - } - - // Assign the matrix to the argument. - CHECK(paddle_arguments_set_value(in_args, 0, mat)); - ``` - -- **稀疏矩阵** - - PaddlePaddle C-API 中 稀疏矩阵使用[CSR(Compressed Sparse Row Format)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))格式存储。下图是CSR存储稀疏矩阵的示意图。 -

图1. 稀疏矩阵存储示意图
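原插图已无法显示,这里补充一个等价的数字示例(仅为示意,并非原图数据)。假设有如下 2×4 的稀疏矩阵:

```text
[[0, 1, 0, 2],
 [0, 0, 3, 0]]
```

按 CSR 格式存储时,`values = [1, 2, 3]`,`row offsets = [0, 2, 3]`(共行数 + 1 个元素),`column indices = [1, 3, 2]`。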

CSR存储格式通过:(1)非零元素的值(图1中的`values`);(2)行偏移(图1中的`row offsets`):每一行元素在`values`中的起始偏移,`row offsets`中元素个数总是等于行数 + 1;(3)非零元素的列号(图1中的`column indices`)来确定稀疏矩阵的内容。

在PaddlePaddle C-API中,通过调用以下接口创建稀疏矩阵:

```c
PD_API paddle_matrix paddle_matrix_create_sparse(
    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
```

1. 创建稀疏矩阵时需要显式地指定矩阵的(1)高度(`height`,在神经网络中等于一次预测处理的样本数);(2)宽度(`width`,`paddle.layer.data`的`size`);以及(3)非零元个数(`nnz`)。
1. 当上述接口第4个参数`isBinary`指定为`true`时,**只需要设置行偏移(`row_offset`)和列号(`column indices`),不需要提供元素值(`values`)**,这时行偏移和列号指定的元素默认其值为1。

下面的代码片段创建了一个CPU上的二值稀疏矩阵:

```c
paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false);
int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};

CHECK(paddle_matrix_sparse_copy_from(mat,
                                     rowOffset,
                                     sizeof(rowOffset) / sizeof(int),
                                     colIndices,
                                     sizeof(colIndices) / sizeof(int),
                                     NULL /* values array is NULL. */,
                                     0 /* size of the value array is 0. */));
CHECK(paddle_arguments_set_value(in_args, 0, mat));
```

下面的代码片段创建了一个CPU上的带元素值的稀疏矩阵:

```c
paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false);
int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
float values[] = {0.5, 0.5, 0.5};

CHECK(paddle_matrix_sparse_copy_from(mat,
                                     rowOffset,
                                     sizeof(rowOffset) / sizeof(int),
                                     colIndices,
                                     sizeof(colIndices) / sizeof(int),
                                     values,
                                     sizeof(values) / sizeof(float)));
```

注意事项:
1. 移动端预测**不支持**稀疏矩阵及相关的接口。

### 组织序列信息

多个排成一列的元素(可以是整型、浮点数、浮点数向量等)构成一个序列,元素之间的顺序是序列所携带的重要信息。不同序列可能会含有不同数目的元素。在 PaddlePaddle 中,序列输入/输出数据是在上文介绍的**数据输入(一维整型数组,二维浮点数矩阵)基础上,附加上序列信息**。下面详细解释什么是“序列信息”。

我们将神经网络一次计算接受的所有输入样本称之为一个`batch`(可以含有一条或多条样本),每一个序列在整个`batch`中的偏移,就是PaddlePaddle中所指的**序列信息**,称之为“sequence start positions”。PaddlePaddle 支持两种序列类型:

1. 单层序列
   - 序列中的每一个元素是非序列,是进行计算的基本单位,不可再进行拆分。
   - 例如:自然语言中的句子是一个序列,序列中的元素是词语;
1. 双层序列
   - 序列中的每一个元素又是一个序列。
   - 例如:自然语言中的段落是一个双层序列;段落是由句子构成的序列;句子是由词语构成的序列。
   - 双层序列在处理长序列的任务或是构建层级模型时会发挥作用。

这篇文档之后部分会统一使用`sequence_start_positions`来特指:PaddlePaddle中神经网络计算层输入/输出所携带的序列信息。

对双层序列来讲,不仅要提供每一个外层序列在整个`batch`中的偏移,由于每一个外层序列又含有若干个内层序列,还需要同时提供每一个内层序列在整个`batch`中的偏移。也就是说:**双层序列需要分别为外层序列和内层序列设置`sequence_start_positions`信息**。

**注:**
1. 不论序列中的元素在内存中占用多少实际存储空间,`sequence_start_positions`表示的偏移是以“序列中的一个元素”作为统计的基本单位,而不是相对`batch`起始存储地址以数据的存储大小为单位的偏移。
1. 非序列输入不携带`sequence_start_positions`,非序列输入无需构造`sequence_start_positions`。
1. **不论是单层序列还是双层序列的序列信息,都使用`paddle_ivector`(也就是PaddlePaddle中的一维整型数组)来存储。**

图2 是PaddlePaddle中单层序列和双层序列存储示意图。

图2. 序列输入示意图

- 单层序列

  图2 (a) 展示了一个含有4个序列的`batch`输入:
  1. 4个序列的长度分别为:5、3、2、4;
  1. 这时的`sequence_start_positions`为:`[0, 5, 8, 10, 14]`;
  1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,都可以通过调用下面的接口为原有的数据输入附加上序列信息,使之变为一个单层序列输入,代码片段如下:

  ```c
  int seq_pos_array[] = {0, 5, 8, 10, 14};
  paddle_ivector seq_pos = paddle_ivector_create(
      seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
  // Suppose the network only has one input data layer.
  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
  ```

- 双层序列

  图2 (b) 展示了一个含有4个序列的`batch`输入;
  1. 4个序列的长度分别为:5、3、2、4;这四个序列又分别含有3、2、1、2个子序列;
  1. 这时需要同时提供:
     - 外层序列在`batch`中的起始偏移:`[0, 5, 8, 10, 14]`;
     - 内层序列在`batch`中的起始偏移:`[0, 2, 3, 5, 7, 8, 10, 13, 14]`;
  1. 不论数据域是`paddle_ivector`类型还是`paddle_matrix`类型,这时需要调用创建序列信息和为`argument`设置序列信息的接口**两次**,分别为数据输入添加外层序列和内层序列的序列信息,使之变为一个双层序列输入,代码片段如下:

  ```c
  // Set the sequence start positions for the outer sequences.
  int outer_seq_pos_array[] = {0, 5, 8, 10, 14};
  paddle_ivector outer_seq_pos = paddle_ivector_create(
      outer_seq_pos_array, sizeof(outer_seq_pos_array) / sizeof(int), false, false);
  // The third parameter of this API indicates the sequence level.
  // 0 for the outer sequence, 1 for the inner sequence.
  // If the input is a plain (not nested) sequence, the third parameter is
  // fixed to be 0.
  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, outer_seq_pos));

  // Set the sequence start positions for the inner sequences.
  int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14};
  paddle_ivector inner_seq_pos = paddle_ivector_create(
      inner_seq_pos_array, sizeof(inner_seq_pos_array) / sizeof(int), false, false);
  // The third parameter of this API indicates the sequence level.
  // 0 for the outer sequence, 1 for the inner sequence.
  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, inner_seq_pos));
  ```

注意事项:
1. 当一个`batch`中含有多个序列时,**不支持序列长度为`0`的序列(也就是空输入)**作为输入。不同计算层对空输入的处理策略有可能不同,潜在会引起未定义行为,或者引起运行时错误,请在输入时进行合法性检查。

### Python 端数据类型说明

下表列出了Python端训练接口暴露的数据类型(`paddle.layer.data`函数`type`字段的取值)对应于调用C-API需要创建的数据类型:
| Python 端数据类型 | C-API 输入数据类型 |
| --- | --- |
| paddle.data_type.integer_value | 整型数组,无需附加序列信息 |
| paddle.data_type.dense_vector | 浮点型稠密矩阵,无需附加序列信息 |
| paddle.data_type.sparse_binary_vector | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,无需附加序列信息 |
| paddle.data_type.sparse_vector | 浮点型稀疏矩阵,需提供非零元的值,无需附加序列信息 |
| paddle.data_type.integer_value_sequence | 整型数组,需附加序列信息 |
| paddle.data_type.dense_vector_sequence | 浮点型稠密矩阵,需附加序列信息 |
| paddle.data_type.sparse_binary_vector_sequence | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加序列信息 |
| paddle.data_type.sparse_vector_sequence | 浮点型稀疏矩阵,需提供非零元的值,需附加序列信息 |
| paddle.data_type.integer_value_sub_sequence | 整型数组,需附加双层序列信息 |
| paddle.data_type.dense_vector_sub_sequence | 浮点型稠密矩阵,需附加双层序列信息 |
| paddle.data_type.sparse_binary_vector_sub_sequence | 浮点型稀疏矩阵,无需提供非零元的值,默认为1,需附加双层序列信息 |
| paddle.data_type.sparse_vector_sub_sequence | 浮点型稀疏矩阵,需提供非零元的值,需附加双层序列信息 |
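作为对上表的补充,下面给出一个示意性的代码草图(假设网络只有一个输入层,所用接口均为上文已出现的 C-API 调用),展示 `paddle.data_type.integer_value_sequence` 这一行对应的组织方式:

```c
// 一条长度为 4 的整型序列(仅为示意数据)。
int ids[] = {2, 7, 4, 1};
paddle_ivector ids_array =
    paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false);
CHECK(paddle_arguments_set_ids(in_args, 0, ids_array));

// 附加序列信息:batch 中只有 1 条序列,起始偏移为 [0, 4]。
int seq_pos_array[] = {0, 4};
paddle_ivector seq_pos = paddle_ivector_create(
    seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
```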
- - -### 输出数据 - -PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织方式完全相同。一个输出数据同样被组织为一个`argument`,`argument`通过`paddle_matrix`或`paddle_ivector`存数数据,如果输出是一个序列,那么会携带有`sequence_start_positions`信息。调用C-API相关接口,读取需要的结果即可。 - -### 总结 - -- 在PaddlePaddle内部,神经网络中一个计算层的输入/输出被组织为`argument`。 -- `argument`并不真正“存储”数据,而是将输入/输出信息有机地组织在一起。 -- 在`argument`内部由`paddle_ivector`(一维整型数组)和`paddle_matrix`(二维浮点型矩阵)来实际存储数据。 -如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。 - -于是,在组织神经网络输入时,需要思考完成以下工作: - -1. 为每一个输入/输出创建`argument`。 - - C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。 -1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。 - - C-API 中操作`paddle_ivector`的接口请查看 [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h)。 - - C-API 中操作`paddle_matrix`的接口请查看[matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h)。 -1. 如果输入是序列数据,需要创建并填写`sequence_start_positions`信息。 - - 通过调用 [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) 来为一个`argument`添加序列信息。 - - 通过调用 [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) 来读取一个`argument`添加序列信息。 - - 接口说明请查看 [argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) 文件。 diff --git a/doc/v2/howto/capi/organization_of_the_inputs_en.md b/doc/v2/howto/capi/organization_of_the_inputs_en.md deleted file mode 100644 index 250d3b2f749aed018e63527e817899c843dff996..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/organization_of_the_inputs_en.md +++ /dev/null @@ -1,3 +0,0 @@ -## Input/Output Data Organization - -TBD diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md deleted file mode 100644 index db1568a2afbea3cca0d4e1fe053ba9536a60ab3d..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/workflow_of_capi_cn.md +++ /dev/null @@ -1,124 +0,0 @@ -## C-API使用流程 - -这篇文档介绍 PaddlePaddle C-API 整体使用流程。 - -### 使用流程 - -使用 C-API 的工作流程如图1所示,分为(1)准备预测模型和(2)预测程序开发两大部分。 - -

图1. C-API使用流程示意图

- -- 准备预测模型 - - 1. 只将神经网络结构进行序列化。 - - 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。 - 1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。 - - 神经网络模型结构和训练好的模型将被序列化合并入一个文件。 - - 预测时只需加载一个文件便于发布。 - - **注意**:以上两种方式只需选择其一即可。 -- 调用 C-API 开发预测序 - - 1. 初始化PaddlePaddle运行环境。 - 1. 加载预测模型。 - 1. 创建神经网络输入,组织输入数据。 - 1. 进行前向计算,获得计算结果。 - 1. 清理和结束。 - -### 准备预测模型 - -准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense) 中的相关脚本。 - -调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 - -下面,我们将训练结束后存储下来的模型转换成预测模型。 - -1. 序列化神经网络模型配置 - - PaddlePaddle 使用 protobuf 来传输网络配置文件中定义的网络结构和相关参数,使用 C-API 进行预测时,需要将网络结构使用 protobuf 进行序列化,写入文件中。 - - 调用[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)中的`dump_v2_config`函数能够将使用 PaddlePaddle V2 API 定义的神经网络结构 dump 到指定文件中,示例代码如下: - - ```python - from paddle.utils.dump_v2_config import dump_v2_config - from mnist_v2 import network - - predict = network(is_infer=True) - dump_v2_config(predict, "trainer_config.bin", True) - ``` - - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 - - 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 - -2. 合并模型文件(可选) - - 一些情况为了便于发布,希望能够将序列化后的神经网络结构和训练好的模型参数打包进一个文件。对于这样的需求,可以使用`paddle.utils.merge_model`中的`merge_v2_model`接口对神经网络结构和训练好的参数进行序列化,将序列化结果写入一个文件内。 - - 代码示例如下: - - ```python - from paddle.utils.merge_model import merge_v2_model - from mnist_v2 import network - - net = network(is_infer=True) - param_file = "models/params_pass_4.tar" - output_file = "output.paddle.model" - merge_v2_model(net, param_file, output_file) - ``` - - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 - -#### 注意事项 -1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 -1. **预测使用的网络结构往往不同于训练**,通常需要去掉网络中的:(1)类别标签层;(2)损失函数层;(3)`evaluator`等,只留下核心计算层,请注意是否需要修改网络结构。 -1. 预测时,可以获取网络中定义的任意多个(大于等于一个)层前向计算的结果,需要哪些层的计算结果作为输出,就将这些层加入一个Python list中,作为调用`dump_v2_config`的第一个参数。 - -### 编写预测代码 - -预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 - -#### step 1. 初始化PaddlePaddle运行环境 -第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 - -#### step2. 
加载模型 - -这里介绍C-API使用中的一个重要概念:Gradient Machine。 - -概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: - -1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; -1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/multi_thread/main.c)。 - -- 注意事项 - - 1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。 - 1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。 - 1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。 - -#### step 3. 创建神经网络输入,组织输入数据 - -基本使用概念: -- 在PaddlePaddle内部,神经网络中一个计算层的输入输出被组织为一个 `Argument` 结构体,如果神经网络有多个输入或者多个输出,每一个输入/输出都会对应有自己的`Argument`。 -- `Argument` 并不真正“存储”数据,而是将输入/输出数据有机地组织在一起。 -- 在`Argument`内部由:1. `Matrix`(二维矩阵,存储浮点类型输入/输出);2. `IVector`(一维数组,**仅用于存储整型值**,多用于自然语言处理任务)来实际存储数据。 - -C-API支持的所有输入数据类型和他们的组织方式,请参考“输入/输出数据组织”一节。 - -这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。 - -在组织神经网络输入,获取输出时,需要思考完成以下工作: - -1. 为每一个输入/输出创建`argument`; -1. 为每一个`argument`创建`paddle_matrix`来存储数据; - -与输入不同的是,不需在使用C-API时为输出`argument`的`paddle_matrix`对象分配空间。前向计算之后PaddlePaddle内部已经分配/管理了每个计算层输出的存储空间。 - -#### step 4. 前向计算 - -完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 - -#### step 5. 
清理 - -结束预测之后,对使用的中间变量和资源进行清理和释放。 diff --git a/doc/v2/howto/capi/workflow_of_capi_en.md b/doc/v2/howto/capi/workflow_of_capi_en.md deleted file mode 100644 index 1692ecd56520675f02ad25ef73761330ebd0e740..0000000000000000000000000000000000000000 --- a/doc/v2/howto/capi/workflow_of_capi_en.md +++ /dev/null @@ -1,3 +0,0 @@ -## C-API Workflow - -TBD diff --git a/doc/v2/howto/cluster/cmd_argument_cn.md b/doc/v2/howto/cluster/cmd_argument_cn.md deleted file mode 100644 index c0ba093cbf2eac5c3b60a0b071b31776a11998f3..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/cmd_argument_cn.md +++ /dev/null @@ -1,167 +0,0 @@ -# 启动参数说明 - -下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。 - -## 启动参数服务器 - -执行以下的命令启动一个参数服务器并等待和计算节点的数据交互 - -```bash -$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 -``` - -如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行: - -```bash -$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log -``` - -参数说明 - -- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信 -- ports_num:**必选,默认1**,监听的端口个数 -- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数 -- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数 - -## 启动计算节点 - -执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) - -```bash -$ python train.py -``` - -trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 - -使用环境变量: - -```bash -export PADDLE_INIT_USE_GPU=False -export PADDLE_INIT_TRAINER_COUNT=1 -export PADDLE_INIT_PORT=7164 -export PADDLE_INIT_PORTS_NUM=1 -export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 -export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 -export PADDLE_INIT_TRAINER_ID=0 -export PADDLE_INIT_PSERVERS=127.0.0.1 -``` - -使用参数: - -```python -paddle.init( - use_gpu=False, - trainer_count=1, - port=7164, - ports_num=1, - ports_num_for_sparse=1, - num_gradient_servers=1, - trainer_id=0, - pservers="127.0.0.1") -``` - -参数说明 - -- use_gpu: **可选,默认False**,是否启用GPU训练 -- trainer_count:**必选,默认1**,当前trainer的线程数目 -- port:**必选,默认7164**,连接到pserver的端口 -- ports_num:**必选,默认1**,连接到pserver的端口个数 -- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数 -- num_gradient_servers:**必选,默认1**,当前训练任务trainer总数 -- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数 -- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 - -```python -trainer = paddle.trainer.SGD(..., is_local=False) -``` - -参数说明 - -- is_local: **必选, 默认True**, 是否使用PServer更新参数 - -## 准备数据集 - -参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。 - -在线上系统中,通常会使用MapReduce任务的输出结果作为训练结果,这样训练文件的个数会比较多,而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件: - -```python -import os -train_list = [] -flist = os.listdir("/train_data/") -for f in flist: - suffix = int(f.split("-")[1]) - if suffix % TRAINER_COUNT == TRAINER_ID: - train_list.append(f) -``` - -示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`): - -```bash -train.txt -train.txt-00000 -train.txt-00001 -train.txt-00002 -test.txt -test.txt-00000 -test.txt-00001 -test.txt-00002 -``` - -在进行分布式训练时,每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中,系统会提供一个分布式存储服务,这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储,则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。 - 
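在按上述取模规则为当前 trainer 分配好 `train_list` 之后,一个最小的 `reader()` 草图如下(仅为示意,假设每行是一条以空格分隔的样本,实际解析逻辑取决于具体任务):

```python
def cluster_reader(file_list):
    # file_list 即上文按取模规则分配给当前 trainer 的文件列表(示意)。
    def reader():
        for file_name in file_list:
            with open(file_name) as f:
                for line in f:
                    # 具体的样本解析方式因任务而异,这里仅按空格切分。
                    yield line.strip().split()
    return reader
```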
-对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。 - -## 准备训练程序 - -我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。 - -最后,工作空间应如下所示: - -```bash -. -|-- my_lib.py -|-- word_dict.pickle -|-- train.py -|-- train_data_dir/ -| |-- train.txt-00000 -| |-- train.txt-00001 -| |-- train.txt-00002 -`-- test_data_dir/ - |-- test.txt-00000 - |-- test.txt-00001 - `-- test.txt-00002 -``` - -- `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。 -- `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。 -- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: - - ```python - cluster_train_file = "./train_data_dir/train/train.txt" - cluster_test_file = "./test_data_dir/test/test.txt" - node_id = os.getenv("OMPI_COMM_WORLD_RANK") - if not node_id: - raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") - ``` - -- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。 -- `test_data_dir`:包含测试数据集的目录。 - -## 异步 SGD 更新 - -我们可以通过设置 `optimize` 的参数使之支持异步SGD更新。 -例如,设置 `AdaGrad` optimize 的 `is_async` 和 `async_lagged_grad_discard_ratio` 参数: - -```python -adagrad = paddle.optimizer.AdaGrad( - is_async=True, - async_lagged_grad_discard_ratio=1.6, - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) -``` - -- `is_async`: 是否为异步SGD更新模式。 -- `async_lagged_grad_discard_ratio`: 异步SGD更新的步长控制,接收到足够的gradient( - `async_lagged_grad_discard_ratio * num_gradient_servers`)之后,后面的gradient - 将会被抛弃。 diff --git a/doc/v2/howto/cluster/cmd_argument_en.md b/doc/v2/howto/cluster/cmd_argument_en.md deleted file mode 100644 index df1381a00fa0fa129eecffe002164c489a4183aa..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/cmd_argument_en.md +++ /dev/null @@ -1,169 +0,0 @@ -# Command-line arguments - -We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API. - -## Starting parameter server - -Type the below command to start a parameter server which will wait for trainers to connect: - -```bash -$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 -``` - -If you wish to run parameter servers in background, and save a log file, you can type: - -```bash -$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log & -``` - -Parameter Description - -- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput. -- ports_num: **required, default 1**, total number of ports will listen on. -- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update. -- num_gradient_servers: **required, default 1**, total number of gradient servers. -- nics: **optional, default xgbe0,xgbe1**, network device name which paramter server will listen on. - -## Starting trainer - -Type the command below to start the trainer(name the file whatever you want, like "train.py") - -```bash -$ python train.py -``` - -Trainers' network need to be connected with parameter servers' network to finish the job. Trainers need to know port and IPs to locate parameter servers. 
You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass them to the `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables.

Use environment variables:

```bash
export PADDLE_INIT_USE_GPU=False
export PADDLE_INIT_TRAINER_COUNT=1
export PADDLE_INIT_PORT=7164
export PADDLE_INIT_PORTS_NUM=1
export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
export PADDLE_INIT_TRAINER_ID=0
export PADDLE_INIT_PSERVERS=127.0.0.1
python train.py
```

Pass arguments:

```python
paddle.init(
        use_gpu=False,
        trainer_count=1,
        port=7164,
        ports_num=1,
        ports_num_for_sparse=1,
        num_gradient_servers=1,
        trainer_id=0,
        pservers="127.0.0.1")
```

Parameter Description

- use_gpu: **optional, default False**, set to "True" to enable GPU training.
- trainer_count: **required, default 1**, number of threads in the current trainer.
- port: **required, default 7164**, port to connect to the parameter server.
- ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
- num_gradient_servers: **required, default 1**, number of trainers in the current job.
- trainer_id: **required, default 0**, ID for every trainer, starting from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

```python
trainer = paddle.trainer.SGD(..., is_local=False)
```

Parameter Description

- is_local: **required, default True**, set to `False` when parameters are to be updated by the pserver.

## Prepare Training Dataset

Here's some example code, [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py): it downloads the public `imikolov` dataset and splits it into multiple files according to the job parallelism (trainer count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files.

In the real world, we often use a `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training files to trainers:

```python
import os

# TRAINER_COUNT and TRAINER_ID come from your job configuration.
train_list = []
flist = os.listdir("/train_data/")
for f in flist:
    suffix = int(f.split("-")[1])
    if suffix % TRAINER_COUNT == TRAINER_ID:
        train_list.append(f)
```

The example code `prepare.py` will split training data and testing data into 3 files with numeric suffixes like `-00000`, `-00001` and `-00002`:

```bash
train.txt
train.txt-00000
train.txt-00001
train.txt-00002
test.txt
test.txt-00000
test.txt-00001
test.txt-00002
```

When the job starts, every trainer needs to get its own part of the data. In some distributed systems a storage service is provided, so the data under that path can be accessed by all the trainer nodes. Without a storage service, you must copy the training data to each trainer node.

Different training jobs may have different data formats and `reader()` functions; developers may need to write different data-preparation scripts and `reader()` functions for their jobs.

## Prepare Training Program

We'll create a *workspace* directory on each node, storing your training program, its dependencies, and the mounted or downloaded dataset directory.

Your workspace may look like:

```bash
.
-|-- my_lib.py -|-- word_dict.pickle -|-- train.py -|-- train_data_dir/ -| |-- train.txt-00000 -| |-- train.txt-00001 -| |-- train.txt-00002 -`-- test_data_dir/ - |-- test.txt-00000 - |-- test.txt-00001 - `-- test.txt-00002 -``` - -- `my_lib.py`: user defined libraries, like PIL libs. This is optional. -- `word_dict.pickle`: dict file for training word embeding. -- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables: - - ```python - cluster_train_file = "./train_data_dir/train/train.txt" - cluster_test_file = "./test_data_dir/test/test.txt" - node_id = os.getenv("OMPI_COMM_WORLD_RANK") - if not node_id: - raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") - ``` - -- `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here. -- `test_data_dir`: containing testing data. - -## Async SGD Update - -We can set some parameters of the optimizer to make it support async SGD update. -For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer: - -```python -adagrad = paddle.optimizer.AdaGrad( - is_async=True, - async_lagged_grad_discard_ratio=1.6, - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) -``` - -- `is_async`: Is Async-SGD or not. -- `async_lagged_grad_discard_ratio`: For async SGD gradient commit control. - when `async_lagged_grad_discard_ratio * num_gradient_servers` commit passed, - current async gradient will be discard silently. diff --git a/doc/v2/howto/cluster/index_cn.rst b/doc/v2/howto/cluster/index_cn.rst deleted file mode 100644 index 2583457c54116b7a1d797d4f7b7c2c4789c6d882..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/index_cn.rst +++ /dev/null @@ -1,36 +0,0 @@ -分布式训练 -========== - -深度学习模型的效果好坏与数据量的大小往往有直接的关系:相同的模型,在增大训练数据集后一般都能取得更好的效果。但是当数据量增大到一定程度后,单台计算机已经难以承受。这时,使用多台计算机进行分布式训练就是一个很自然的解决方案。在分布式训练中,训练数据被分割为多份,参与训练的多台机器分别读取自己的数据进行训练,并协同对整体模型的参数进行更新。 - -分布式训练一般有着如下图所示的架构: - -.. image:: src/ps_cn.png - :width: 500 - -- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。 -- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。 -- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。 - -通过计算节点和参数服务器的分布式协作,可以完成神经网络的同步随机梯度下降(SGD)方法的训练。PaddlePaddle同时支持同步随机梯度下降(SGD)和异步随机梯度下降(ASGD)。 - -在开始集群训练之前,需要先进行集群配置、PaddlePaddle安装等准备工作,了解如何通过这些步骤来配置分布式训练所需的基本环境: - -.. toctree:: - :maxdepth: 1 - - preparations_cn.md - -集群训练有大量可配置的参数,例如使用的机器数量、通信端口等。了解如何通过设置启动参数的方式,对分布式训练的过程进行配置: - -.. toctree:: - :maxdepth: 1 - - cmd_argument_cn.md - -PaddlePaddle可以兼容各种不同的集群。每种集群各有优势,使用的具体方式也有区别: - -.. toctree:: - :maxdepth: 1 - - multi_cluster/index_cn.rst diff --git a/doc/v2/howto/cluster/index_en.rst b/doc/v2/howto/cluster/index_en.rst deleted file mode 100644 index 31eda57c4fb3947d92df45ea8dbb9274c9814140..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/index_en.rst +++ /dev/null @@ -1,38 +0,0 @@ -Distributed Training -==================== - -The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. 
However, it cannot fit on a single computer once the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (shards), and the machines participating in the training each read their own data for training and collaboratively update the parameters of the overall model.

Distributed training generally has the architecture shown below:

.. image:: src/ps_en.png
   :width: 500

- Data shard: training data will be split into multiple partitions; trainers use the partitions of the whole dataset to do the training job.
- Trainer: each trainer reads its data shard and trains the neural network. The trainer then uploads the calculated "gradients" to the parameter servers and waits for the parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues its training.
- Parameter server: every parameter server stores part of the whole neural network model data. The parameter servers perform the optimization calculations when gradients are uploaded from trainers, and then send the updated parameters back to the trainers.

The training of a neural network with synchronous stochastic gradient descent can be achieved by the cooperation of trainers and parameter servers.

PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.

Before starting the cluster training, you need to prepare the cluster configuration, PaddlePaddle installation, and other preparations. To understand how to configure the basic environment for distributed training, check the link below:

..
toctree:: - :maxdepth: 1 - - multi_cluster/index_en.rst diff --git a/doc/v2/howto/cluster/multi_cluster/fabric_cn.md b/doc/v2/howto/cluster/multi_cluster/fabric_cn.md deleted file mode 100644 index 0385e401b399a51fad112e604dc56cb2f84c0a4b..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/fabric_cn.md +++ /dev/null @@ -1,42 +0,0 @@ -# 使用fabric启动集群训练 - -## 准备一个Linux集群 -可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。 - -## 启动集群作业 - -`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 - -`paddle.py` 为方便作业启动提供了两个独特的命令选项。 - -- `job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 -- `job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 - -`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: -``` -sh run.sh -``` - -集群作业将会在几秒后启动。 - -## 终止集群作业 -`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 - -## 检查集群训练结果 -详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 - -`paddle_trainer.INFO` -提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。 - -`paddle_pserver2.INFO` -提供 pserver 运行日志,有助于诊断分布式错误。 - -`server.log` -提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 - -`train.log` -提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 - -## 检查模型输出 -运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 -工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 diff --git a/doc/v2/howto/cluster/multi_cluster/fabric_en.md b/doc/v2/howto/cluster/multi_cluster/fabric_en.md deleted file mode 100644 index bac9ffe1526a06a3a23b1d8acf33a5fb74b7e50d..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/fabric_en.md +++ /dev/null @@ -1,43 +0,0 @@ -# Fabric - -## Prepare a Linux cluster - -Run `kubectl -f ssh_servers.yaml` under the directory: `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes. - -## Launching Cluster Job -`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. - -`paddle.py`provides two distinguished command option for easy job launching. - -- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying. -- `job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy -dispatch latency. - -`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: -``` -sh run.sh -``` - -The cluster Job will start in several seconds. - -## Kill Cluster Job -`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed. 
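If the job does crash, one minimal cleanup sketch is to kill any leftover PaddlePaddle processes on every node over SSH (this assumes password-less SSH access and a hypothetical `nodes.txt` host list, one host per line; adapt it to your own cluster layout):

```bash
# Kill leftover paddle processes on each host listed in nodes.txt.
while read -r host; do
    ssh "$host" 'pkill -f paddle || true'
done < nodes.txt
```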
- -## Check Cluster Training Result -Check log in $workspace/log for details, each node owns same log structure. - -`paddle_trainer.INFO` -It provides almost all internal output log for training, same as local training. Check runtime model convergence here. - -`paddle_pserver2.INFO` -It provides parameter server running log, which could help to diagnose distributed error. - -`server.log` -It provides stderr and stdout of parameter server process. Check error log if training crashes. - -`train.log` -It provides stderr and stdout of trainer process. Check error log if training crashes. - -## Check Model Output -After one pass finished, model files will be written in `output` directory in node 0. -`nodefile` in workspace indicates the node id of current cluster job. diff --git a/doc/v2/howto/cluster/multi_cluster/index_cn.rst b/doc/v2/howto/cluster/multi_cluster/index_cn.rst deleted file mode 100644 index eabf95eda0b20f91913201a6b4e5b56fa440597e..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/index_cn.rst +++ /dev/null @@ -1,35 +0,0 @@ -在不同集群中运行 -================ -用户的集群环境不尽相同,为了方便大家的部署,我们提供了多种的集群部署方式,方便提交集群训练任务,以下将一一介绍: - -`Kubernetes `_ 是Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。以下指南展示了PaddlePaddle对Kubernetes的支持: - -.. toctree:: - :maxdepth: 1 - - k8s_cn.md - k8s_distributed_cn.md - -`OpenMPI `_ 是成熟的高性能并行计算框架,在HPC领域使用非常的广泛。以下指南介绍了如何使用OpenMPI来搭建PaddlePaddle的集群训练任务: - -.. toctree:: - :maxdepth: 1 - - openmpi_cn.md - -`Fabric `_ 是一个方便的程序部署和管理工具。我们提供了使用Fabric 进行部署、管理的方法,如果想详细了解,请阅读以下指南: - -.. toctree:: - :maxdepth: 1 - - fabric_cn.md - -我们也支持在AWS上部署PaddlePaddle,详细请了解: - -.. toctree:: - :maxdepth: 1 - - k8s_aws_cn.md - -您可以在 `cluster_train_v2 `_ 找到以上相关的例子。 - diff --git a/doc/v2/howto/cluster/multi_cluster/index_en.rst b/doc/v2/howto/cluster/multi_cluster/index_en.rst deleted file mode 100644 index 9bc1eb2e3796d95dd69b165e916e263ea34b87f6..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/index_en.rst +++ /dev/null @@ -1,35 +0,0 @@ -Use different clusters -====================== - -The user's cluster environment is not the same. To facilitate everyone's deployment, we provide a variety of cluster deployment methods to facilitate the submission of cluster training tasks, which will be introduced as follows: - -`Kubernetes `_ is a scheduling framework of Google open source container cluster, supporting a complete cluster solution for large-scale cluster production environment. The following guidelines show PaddlePaddle's support for Kubernetes: - -.. toctree:: - :maxdepth: 1 - - k8s_en.md - k8s_distributed_en.md - -`OpenMPI `_ is a mature high-performance parallel computing framework, which is widely used in the field of HPC. The following guide describes how to use OpenMPI to build PaddlePaddle's cluster training task: - -.. toctree:: - :maxdepth: 1 - - openmpi_en.md - -`Fabric `_ is a convenient tool for program deployment and management. We provide a way to deploy and manage with Fabric. If you want to know more about it, please read the following guidelines: - -.. toctree:: - :maxdepth: 1 - - fabric_en.md - -We also support the deployment of PaddlePaddle on AWS. Learn more about: - -.. toctree:: - :maxdepth: 1 - - k8s_aws_en.md - -The examples can be found under `cluster_train_v2 `_ . 
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md deleted file mode 100644 index afc753aa42f19631c49a451a797f28365e65ed1d..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md +++ /dev/null @@ -1,672 +0,0 @@ -# Kubernetes on AWS - -我们将向你展示怎么样在AWS的Kubernetes集群上运行分布式PaddlePaddle训练,让我们从核心概念开始 - -## PaddlePaddle分布式训练的核心概念 - -### 分布式训练任务 - -一个分布式训练任务可以看做是一个Kubernetes任务 -每一个Kubernetes任务都有相应的配置文件,此配置文件指定了像任务的pod个数之类的环境变量信息 - -在分布式训练任务中,我们可以如下操作: - -1. 在分布式文件系统中,准备分块数据和配置文件(在此次教学中,我们会用到亚马逊分布式存储服务(EFS)) -2. 创建和提交一个kubernetes任务配置到集群中开始训练 - -### Parameter Server和Trainer - -在paddlepaddle集群中有两个角色:参数服务器(pserver)者和trainer, 每一个参数服务器过程都会保存一部分模型的参数。每一个trainer都保存一份完整的模型参数,并可以利用本地数据更新模型。在这个训练过程中,trainer发送模型更新到参数服务器中,参数服务器职责就是聚合这些更新,以便于trainer可以把全局模型同步到本地。 - -为了能够和pserver通信,trainer需要每一个pserver的IP地址。在Kubernetes中利用服务发现机制(比如:DNS、hostname)要比静态的IP地址要好一些,因为任何一个pod都会被杀掉然后新的pod被重启到另一个不同IP地址的node上。现在我们可以先用静态的IP地址方式,这种方式是可以更改的。 - -参数服务器和trainer一块被打包成一个docker镜像,这个镜像会运行在被Kubernetes集群调度的pod中。 - -### 训练者ID - -每一个训练过程都需要一个训练ID,以0作为基础值,作为命令行参数传递。训练过程因此用这个ID去读取数据分片。 - -### 训练 - -PaddlePaddle容器的入口是一个shell脚本,这个脚本可以读取Kubernetes内预置的环境变量。这里可以定义任务identity,在任务中identity可以用来远程访问包含所有pod的Kubernetes apiserver服务。 - -每一个pod通过ip来排序。每一个pod的序列作为“pod id”。因为我们会在每一个pod中运行训练和参数服务,可以用“pod id”作为训练ID。入口脚本详细工作流程如下: - -1. 查找apiserver得到pod信息,通过ip排序来分配一个trainer_id。 -2. 从EFS持久化卷中复制训练数据到容器中。 -3. 从环境变量中解析paddle pserver和 paddle trainer的启动参数,然后开始启动流程。 -4. 以trainer_id来训练将自动把结果写入到EFS卷中。 - - -## AWS的Kubernetes中的PaddlePaddle - -### 选择AWS服务区域 -这个教程需要多个AWS服务工作在一个区域中。在AWS创建任何东西之前,请检查链接https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ 选择一个可以提供如下服务的区域:EC2, EFS, VPS, CloudFormation, KMS, VPC, S3。在教程中我们使用“Oregon(us-west-2)”作为例子。 - -### 创建aws账户和IAM账户 - -在每一个aws账户下可以创建多个IAM用户。允许为每一个IAM用户赋予权限,作为IAM用户可以创建/操作aws集群 - -注册aws账户,请遵循用户指南。在AWS账户下创建IAM用户和用户组,请遵循用户指南 - -请注意此教程需要如下的IAM用户权限: - -- AmazonEC2FullAccess -- AmazonS3FullAccess -- AmazonRoute53FullAccess -- AmazonRoute53DomainsFullAccess -- AmazonElasticFileSystemFullAccess -- AmazonVPCFullAccess -- IAMUserSSHKeys -- IAMFullAccess -- NetworkAdministrator -- AWSKeyManagementServicePowerUser - - -### 下载kube-aws and kubectl - -#### kube-aws - -在AWS中[kube-aws](https://github.com/coreos/kube-aws)是一个自动部署集群的CLI工具 - -##### kube-aws完整性验证 -提示:如果你用的是非官方版本(e.g RC release)的kube-aws,可以跳过这一步骤。引入coreos的应用程序签名公钥: - -``` -gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E -``` - -指纹验证: - -``` -gpg2 --fingerprint FC8A365E -``` -正确的指纹是: `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E` - -我们可以从发布页面中下载kube-aws,教程使用0.9.1版本 [release page](https://github.com/coreos/kube-aws/releases). 
- -验证tar包的GPG签名: - -``` -PLATFORM=linux-amd64 - # Or -PLATFORM=darwin-amd64 - -gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz -``` -##### 安装kube-aws -解压: - -``` -tar zxvf kube-aws-${PLATFORM}.tar.gz -``` - -添加到环境变量: - -``` -mv ${PLATFORM}/kube-aws /usr/local/bin -``` - - -#### kubectl - -[kubectl](https://Kubernetes.io/docs/user-guide/kubectl-overview/) 是一个操作Kubernetes集群的命令行接口 - -利用`curl`工具从Kubernetes发布页面中下载`kubectl` - -``` -# OS X -curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl - -# Linux -curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl -``` - -为了能是kubectl运行必须将之添加到环境变量中 (e.g. `/usr/local/bin`): - -``` -chmod +x ./kubectl -sudo mv ./kubectl /usr/local/bin/kubectl -``` - -### 配置AWS证书 - -首先检查这里 [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) 安装AWS命令行工具 - -然后配置aws账户信息: - -``` -aws configure -``` - - -添加如下信息: - - -``` -AWS Access Key ID: YOUR_ACCESS_KEY_ID -AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY -Default region name: us-west-2 -Default output format: json -``` - -`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` 是创建aws账户和IAM账户的IAM的key和密码 [Create AWS Account and IAM Account](#create-aws-account-and-iam-account) - -描述任何运行在你账户中的实例来验证凭据是否工作: - -``` -aws ec2 describe-instances -``` - -### 定义集群参数 - -#### EC2秘钥对 - -秘钥对将认证ssh访问你的EC2实例。秘钥对的公钥部分将配置到每一个COREOS节点中。 - -遵循 [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) Keypair用户指南来创建EC2秘钥对 - -你可以使用创建好的秘钥对名称来配置集群. - -在同一工作区中秘钥对为EC2实例唯一码。在教程中使用 us-west-2 ,所以请确认在这个区域(Oregon)中创建秘钥对。 - -在浏览器中下载一个`key-name.pem`文件用来访问EC2实例,我们待会会用到. - - -#### KMS秘钥 - -亚马逊的KMS秘钥在TLS秘钥管理服务中用来加密和解密集群。如果你已经有可用的KMS秘钥,你可以跳过创建新秘钥这一步,提供现存秘钥的ARN字符串。 - -利用aws命令行创建kms秘钥: - -``` -aws kms --region=us-west-2 create-key --description="kube-aws assets" -{ - "KeyMetadata": { - "CreationDate": 1458235139.724, - "KeyState": "Enabled", - "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx", - "AWSAccountId": "xxxxxxxxxxxxx", - "Enabled": true, - "KeyUsage": "ENCRYPT_DECRYPT", - "KeyId": "xxxxxxxxx", - "Description": "kube-aws assets" - } -} -``` - -我们稍后用到`Arn` 的值. - -在IAM用户许可中添加多个内联策略. - -进入[IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home)。点击`Users`按钮,点击刚才创建的用户,然后点击`Add inline policy`按钮,选择`Custom Policy` - -粘贴内联策略: - -``` - (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "Stmt1482205552000", - "Effect": "Allow", - "Action": [ - "kms:Decrypt", - "kms:Encrypt" - ], - "Resource": [ - "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*" - ] - }, - { - "Sid": "Stmt1482205746000", - "Effect": "Allow", - "Action": [ - "cloudformation:CreateStack", - "cloudformation:UpdateStack", - "cloudformation:DeleteStack", - "cloudformation:DescribeStacks", - "cloudformation:DescribeStackResource", - "cloudformation:GetTemplate", - "cloudformation:DescribeStackEvents" - ], - "Resource": [ - "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*" - ] - } - ] -} -``` -`Version` : 值必须是"2012-10-17". 
-`AWS_ACCOUNT_ID`: 你可以从命令行中获取: - -``` -aws sts get-caller-identity --output text --query Account -``` - -`MY_CLUSTER_NAME`: 选择一个你喜欢的MY_CLUSTER_NAME,稍后会用到。 -请注意,堆栈名称必须是正则表达式:[a-zA-Z][-a-zA-Z0-9*]*, 在名称中不能有"_"或者"-",否则kube-aws在下面步骤中会抛出异常 - -#### 外部DNS名称 - -当集群被创建后,基于DNS名称控制器将会暴露安全的TLS API. - -DNS名称含有CNAME指向到集群DNS名称或者记录指向集群的IP地址。 - -我们稍后会用到DNS名称,如果没有DNS名称的话,你可以选择一个(比如:`paddle`)还可以修改`/etc/hosts`用本机的DNS名称和集群IP关联。还可以在AWS上增加一个名称服务来关联paddle集群IP,稍后步骤中会查找集群IP. - -#### S3 bucket - -在启动Kubernetes集群前需要创建一个S3 bucket - -在AWS上创建s3 bucket会有许多的bugs,所以使用[s3 console](https://console.aws.amazon.com/s3/home?region=us-west-2)。 - -链接到 `Create Bucket`,确保在us-west-2 (Oregon)上创建一个唯一的BUCKET_NAME。 - -#### 初始化assets - -在本机创建一个目录用来存放产生的assets: - -``` -$ mkdir my-cluster -$ cd my-cluster -``` - -利用KMS Arn、秘钥对名称和前一步产生的DNS名称来初始化集群的CloudFormation栈: - -``` -kube-aws init \ ---cluster-name=MY_CLUSTER_NAME \ ---external-dns-name=MY_EXTERNAL_DNS_NAME \ ---region=us-west-2 \ ---availability-zone=us-west-2a \ ---key-name=KEY_PAIR_NAME \ ---kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" -``` - -`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key) - -`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name) - -`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair) - -`--kms-key-arn`: the "Arn" in [KMS key](#kms-key) - -这里的`us-west-2a`用于参数`--availability-zone`,但必须在AWS账户的有效可用区中 - -如果不能切换到其他的有效可用区(e.g., `us-west-2a`, or `us-west-2b`),请检查`us-west-2a`是支持`aws ec2 --region us-west-2 describe-availability-zones`。 - -现在在asset目录中就有了集群的主配置文件cluster.yaml。 - -默认情况下kube-aws会创建一个工作节点,修改`cluster.yaml`让`workerCount`从1个节点变成3个节点. - -#### 呈现asset目录内容 - -在这个简单的例子中,你可以使用kuber-aws生成TLS身份和证书 - -``` -kube-aws render credentials --generate-ca -``` - -下一步在asset目录中生成一组集群assets. - -``` -kube-aws render stack -``` -asserts(模板和凭证)用于创建、更新和当前目录被创建的Kubernetes集群相关联 - -### 启动Kubernetes集群 - -#### 创建一个在CloudFormation模板上定义好的实例 - -现在让我们创建集群(在命令行中选择任意的 `PREFIX`) - -``` -kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX -``` - -`BUCKET_NAME`: t在[S3 bucket](#s3-bucket)上使用的bucket名称 - - -#### 配置DNS - -你可以执行命令 `kube-aws status`来查看创建后集群的API. - -``` -$ kube-aws status -Cluster Name: paddle-cluster -Controller DNS Name: paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com -``` -如果你用DNS名称,在ip上设置任何记录或是安装CNAME点到`Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`) - -##### 查询IP地址 - -用命令`dig`去检查负载均衡器的域名来获取ip地址. - -``` -$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com - -;; QUESTION SECTION: -;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A - -;; ANSWER SECTION: -paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52 -paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 
59 IN A 54.67.102.112 -``` - -在上面的例子中,`54.241.164.52`, `54.67.102.112`这两个ip都将是工作状态 - -*如果你有DNS名称*,设置记录到ip上,然后你可以跳过“Access the cluster”这一步 - -*如果没有自己的DNS名称* - -编辑/etc/hosts文件用DNS关联IP - -##### 更新本地的DNS关联 -编辑`/etc/hosts`文件用DNS关联IP -##### 在VPC上添加route53私有名称服务 - - 打开[Route53 Console](https://console.aws.amazon.com/route53/home) - - 根据配置创建域名zone - - domain名称为: "paddle" - - Type: "Private hosted zone for amazon VPC" - - VPC ID: `` - - ![route53 zone setting](src/route53_create_zone.png) - - 添加记录 - - 点击zone中刚创建的“paddle” - - 点击按钮“Create record set” - - Name : leave blank - - type: "A" - - Value: `` - - ![route53 create recordset](src/route53_create_recordset.png) - - 检查名称服务 - - 连接通过kube-aws via ssh创建的任何实例 - - 运行命令"host paddle",看看是否ip为返回的kube-controller的私有IP - -#### 进入集群 - -集群运行后如下命令会看到: - -``` -$ kubectl --kubeconfig=kubeconfig get nodes -NAME STATUS AGE -ip-10-0-0-134.us-west-2.compute.internal Ready 6m -ip-10-0-0-238.us-west-2.compute.internal Ready 6m -ip-10-0-0-50.us-west-2.compute.internal Ready 6m -ip-10-0-0-55.us-west-2.compute.internal Ready 6m -``` - - -### 集群安装弹性文件系统 - -训练数据存放在AWS上的EFS分布式文件系统中. - -1. 在[security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)为EFS创建一个安全组 - 1. 可以看到`paddle-cluster-sg-worker` (在sg-055ee37d镜像中)安全组id -
![](src/worker_security_group.png)
- - 2. 增加安全组`paddle-efs` ,以`paddle-cluster-sg-worker`的group id作为用户源和`ALL TCP`入栈规则。增加vpc `paddle-cluster-vpc`, 确保可用区是在[Initialize Assets](#initialize-assets)的时候用到的那一个. -
![](src/add_security_group.png)
- -2. 利用`paddle-cluster-vpc`私有网络在[EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) 中创建弹性文件系统, 确定子网为`paddle-cluster-Subnet0`和安全区为`paddle-efs`. -
![](src/create_efs.png)
- - -### 开始在AWS上进行paddlepaddle的训练 - -#### 配置Kubernetes卷指向EFS - -首先需要创建一个持久卷[PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) 到EFS上 - -用 `pv.yaml`形式来保存 -``` -apiVersion: v1 -kind: PersistentVolume -metadata: - name: efsvol -spec: - capacity: - storage: 100Gi - accessModes: - - ReadWriteMany - nfs: - server: EFS_DNS_NAME - path: "/" -``` - -`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`,看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com` - -运行下面的命令来创建持久卷: -``` -kubectl --kubeconfig=kubeconfig create -f pv.yaml -``` -下一步创建 [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/)来声明持久卷 - -用`pvc.yaml`来保存. -``` -kind: PersistentVolumeClaim -apiVersion: v1 -metadata: - name: efsvol -spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 50Gi -``` - -行下面命令来创建持久卷声明: -``` -kubectl --kubeconfig=kubeconfig create -f pvc.yaml -``` - -#### 准备训练数据 - -启动Kubernetes job在我们创建的持久层上进行下载、保存并均匀拆分训练数据为3份. - -用`paddle-data-job.yaml`保存 -``` -apiVersion: batch/v1 -kind: Job -metadata: - name: paddle-data -spec: - template: - metadata: - name: pi - spec: - containers: - - name: paddle-data - image: paddlepaddle/paddle-tutorial:k8s_data - imagePullPolicy: Always - volumeMounts: - - mountPath: "/efs" - name: efs - env: - - name: OUT_DIR - value: /efs/paddle-cluster-job - - name: SPLIT_COUNT - value: "3" - volumes: - - name: efs - persistentVolumeClaim: - claimName: efsvol - restartPolicy: Never -``` - -运行下面的命令来启动任务: -``` -kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml -``` -任务运行大概需要7分钟,可以使用下面命令查看任务状态,直到`paddle-data`任务的`SUCCESSFUL`状态为`1`时成功,这里here有怎样创建镜像的源码 -``` -$ kubectl --kubeconfig=kubeconfig get jobs -NAME DESIRED SUCCESSFUL AGE -paddle-data 1 1 6m -``` -数据准备完成后的结果是以镜像`paddlepaddle/paddle-tutorial:k8s_data`存放,可以点击这里[here](src/k8s_data/README.md)查看如何创建docker镜像源码 - -#### 开始训练 - -现在可以开始运行paddle的训练任务,用`paddle-cluster-job.yaml`进行保存 -``` -apiVersion: batch/v1 -kind: Job -metadata: - name: paddle-cluster-job -spec: - parallelism: 3 - completions: 3 - template: - metadata: - name: paddle-cluster-job - spec: - volumes: - - name: efs - persistentVolumeClaim: - claimName: efsvol - containers: - - name: trainer - image: paddlepaddle/paddle-tutorial:k8s_train - command: ["bin/bash", "-c", "/root/start.sh"] - env: - - name: JOB_NAME - value: paddle-cluster-job - - name: JOB_PATH - value: /home/jobpath - - name: JOB_NAMESPACE - value: default - - name: TRAIN_CONFIG_DIR - value: quick_start - - name: CONF_PADDLE_NIC - value: eth0 - - name: CONF_PADDLE_PORT - value: "7164" - - name: CONF_PADDLE_PORTS_NUM - value: "2" - - name: CONF_PADDLE_PORTS_NUM_SPARSE - value: "2" - - name: CONF_PADDLE_GRADIENT_NUM - value: "3" - - name: TRAINER_COUNT - value: "3" - volumeMounts: - - mountPath: "/home/jobpath" - name: efs - ports: - - name: jobport0 - hostPort: 7164 - containerPort: 7164 - - name: jobport1 - hostPort: 7165 - containerPort: 7165 - - name: jobport2 - hostPort: 7166 - containerPort: 7166 - - name: jobport3 - hostPort: 7167 - containerPort: 7167 - restartPolicy: Never -``` - -`parallelism: 3, completions: 3` 意思是这个任务会同时开启3个paddlepaddle的pod,当pod启动后3个任务将被完成。 - -`env` 参数代表容器的环境变量,在这里指定paddlepaddle的参数. - -`ports` 指定TCP端口7164 - 7167和`pserver`进行连接,port从`CONF_PADDLE_PORT`(7164)到`CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1`(7167)。我们使用多个端口密集和稀疏参数的更新来提高延迟 - -运行下面命令来启动任务. 
-```
-kubectl --kubeconfig=kubeconfig create -f paddle-cluster-job.yaml
-```
-
-检查pods信息
-
-```
-$ kubectl --kubeconfig=kubeconfig get pods
-NAME                       READY     STATUS    RESTARTS   AGE
-paddle-cluster-job-cm469   1/1       Running   0          9m
-paddle-cluster-job-fnt03   1/1       Running   0          9m
-paddle-cluster-job-jx4xr   1/1       Running   0          9m
-```
-
-检查指定pod的控制台输出
-```
-kubectl --kubeconfig=kubeconfig log -f POD_NAME
-```
-
-`POD_NAME`: 任何一个pod的名称 (e.g., `paddle-cluster-job-cm469`).
-
-运行`kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job`来检查训练任务的状态,任务将在大约20分钟后完成
-
-`pserver`和`trainer`的启动细节都隐藏在docker镜像`paddlepaddle/paddle-tutorial:k8s_train`中,可以点击[here](src/k8s_train/README.md)查看如何构建该docker镜像及其源码
-
-#### 检查训练输出
-
-训练输出(模型快照和日志)将被保存在EFS上。我们可以用ssh登录到EC2的工作节点上,查看mount过的EFS和训练输出.
-
-1. ssh登录EC2工作节点
-```
-chmod 400 key-name.pem
-ssh -i key-name.pem core@INSTANCE_IP
-```
-
-`INSTANCE_IP`: EC2上Kubernetes工作节点的公共IP地址,可以进入[EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) 中查看任何一个`paddle-cluster-kube-aws-worker`实例的 `public IP`
-
-2. 挂载EFS
-```
-mkdir efs
-sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
-```
-
-`EFS_DNS_NAME`: 我们创建的`paddle-efs`在描述信息中显示的DNS名称,形如`fs-2cbf7385.efs.us-west-2.amazonaws.com`.
-
-文件夹`efs`下将有类似下面的目录结构:
-```
--- paddle-cluster-job
-    |-- ...
-    |-- output
-    |   |-- node_0
-    |   |   |-- server.log
-    |   |   `-- train.log
-    |   |-- node_1
-    |   |   |-- server.log
-    |   |   `-- train.log
-    |   |-- node_2
-    |   |   |-- server.log
-    |   |   `-- train.log
-    |   |-- pass-00000
-    |   |   |-- ___fc_layer_0__.w0
-    |   |   |-- ___fc_layer_0__.wbias
-    |   |   |-- done
-    |   |   |-- path.txt
-    |   |   `-- trainer_config.lr.py
-    |   |-- pass-00001...
-```
-`server.log` 是`pserver`的日志,`train.log`是`trainer`的日志,模型快照和描述存放在`pass-0000*`目录中.
-
-### Kubernetes集群卸载或删除
-
-#### 删除EFS
-
-到[EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) 中删除创建的EFS卷
-
-#### 删除安全组
-
-去[Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) 删除安全组`paddle-efs`.
-
-#### 删除S3 bucket
-
-进入 [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#)删除S3 bucket
-
-#### 销毁集群
-
-```
-kube-aws destroy
-```
-
-命令会立刻返回,但需要大约5分钟来销毁集群
-
-可以进入 [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active)查看销毁的过程。
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md
deleted file mode 100644
index 8e8e87be711bd45177ed77c81c531606e801d1f0..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_en.md
+++ /dev/null
@@ -1,688 +0,0 @@
-# Kubernetes on AWS
-
-We will show you step by step how to run distributed PaddlePaddle training on an AWS cluster with Kubernetes. Let's start from the core concepts.
-
-## Distributed PaddlePaddle Training Core Concepts
-
-### Distributed Training Job
-
-A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
-
-Each Kubernetes job is described by a job config file, which specifies information like the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables.
-
-In a distributed training job, we would:
-
-1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and
-1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
-
-### Parameter Servers and Trainers
-
-There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers; the parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copies with the global model.
-
![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)
-
-In order to communicate with the pserver, a trainer needs to know the IP address of each pserver. In Kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static IP addresses, since any pserver's pod may be killed and a new pod could be scheduled onto another node with a different IP address. However, for now we are using static IPs. This will be improved.
-
-Parameter server and trainer are packaged into the same docker image. They will run once the pod is scheduled by the Kubernetes job.
-
-### Trainer ID
-
-Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID.
-
-### Training
-
-The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job.
-
-We rank each pod by sorting them by their IPs. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows:
-
-1. Query the api server to get pod information, and assign the `trainer_id` by sorting the IPs.
-1. Copy the training data from the EFS persistent volume into the container.
-1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
-1. The trainer with `train_id` 0 will automatically write results onto the EFS volume.
-
-
-## PaddlePaddle on AWS with Kubernetes
-
-### Choose AWS Service Region
-This tutorial requires several AWS services to work in the same region. Before we create anything in AWS, please check the following link
-https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/
-Choose a region which has the following services available: EC2, EFS, CloudFormation, KMS, VPC, S3.
-In this tutorial, we use "Oregon (us-west-2)" as the example.
-
-### Create AWS Account and IAM Account
-
-Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user.
-
-To sign up for an AWS account, please
-follow
-[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
-To create IAM users and user groups under an AWS account, please
-follow
-[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
-
-Please be aware that this tutorial needs the following privileges for the user in IAM:
-
-- AmazonEC2FullAccess
-- AmazonS3FullAccess
-- AmazonRoute53FullAccess
-- AmazonRoute53DomainsFullAccess
-- AmazonElasticFileSystemFullAccess
-- AmazonVPCFullAccess
-- IAMUserSSHKeys
-- IAMFullAccess
-- NetworkAdministrator
-- AWSKeyManagementServicePowerUser
-
-
-### Download kube-aws and kubectl
-
-#### kube-aws
-
-[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS.
-##### Verify kube-aws integrity
-Note: if you are using a non-official release (e.g., an RC release) of kube-aws, you can skip this step.
-Import the CoreOS Application Signing Public Key:
-
-```
-gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
-```
-
-Validate the key fingerprint:
-
-```
-gpg2 --fingerprint FC8A365E
-```
-The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
-
-We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1.
-
-Validate the tarball's GPG signature:
-
-```
-PLATFORM=linux-amd64
- # Or
-PLATFORM=darwin-amd64
-
-gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
-```
-##### Install kube-aws
-Extract the binary:
-
-```
-tar zxvf kube-aws-${PLATFORM}.tar.gz
-```
-
-Add kube-aws to your path:
-
-```
-mv ${PLATFORM}/kube-aws /usr/local/bin
-```
-
-
-#### kubectl
-
-[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
-
-Download `kubectl` from the Kubernetes release artifact site with the `curl` tool.
-
-```
-# OS X
-curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
-
-# Linux
-curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
-```
-
-Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`):
-
-```
-chmod +x ./kubectl
-sudo mv ./kubectl /usr/local/bin/kubectl
-```
-
-### Configure AWS Credentials
-
-First check out [this guide](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface.
-
-And then configure your AWS account information:
-
-```
-aws configure
-```
-
-
-Fill in the required fields:
-
-
-```
-AWS Access Key ID: YOUR_ACCESS_KEY_ID
-AWS Secret Access Key: YOUR_SECRET_ACCESS_KEY
-Default region name: us-west-2
-Default output format: json
-```
-
-`YOUR_ACCESS_KEY_ID` and `YOUR_SECRET_ACCESS_KEY` are the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account).
-
-Verify that your credentials work by describing any instances you may already have running on your account:
-
-```
-aws ec2 describe-instances
-```
-
-### Define Cluster Parameters
-
-#### EC2 key pair
-
-The key pair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
-
-Follow the [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create an EC2 key pair.
-
-After creating a key pair, you will use the key pair name to configure the cluster.
-
-Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to create key pairs in that region (Oregon).
-
-Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later.
-
-
-#### KMS key
-
-Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
-
-You can create a KMS key with the aws command line tool:
-
-```
-aws kms --region=us-west-2 create-key --description="kube-aws assets"
-{
-    "KeyMetadata": {
-        "CreationDate": 1458235139.724,
-        "KeyState": "Enabled",
-        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
-        "AWSAccountId": "xxxxxxxxxxxxx",
-        "Enabled": true,
-        "KeyUsage": "ENCRYPT_DECRYPT",
-        "KeyId": "xxxxxxxxx",
-        "Description": "kube-aws assets"
-    }
-}
-```
-
-We will need to use the value of `Arn` later.
-
-And then let's add several inline policies to your IAM user permissions.
-
-Go to the [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on the `Users` button, click the user that we just created, then click on the `Add inline policy` button, and select `Custom Policy`.
-
-Paste in the following inline policy:
-
-```
-{
-    "Version": "2012-10-17",
-    "Statement": [
-        {
-            "Sid": "Stmt1482205552000",
-            "Effect": "Allow",
-            "Action": [
-                "kms:Decrypt",
-                "kms:Encrypt"
-            ],
-            "Resource": [
-                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
-            ]
-        },
-        {
-            "Sid": "Stmt1482205746000",
-            "Effect": "Allow",
-            "Action": [
-                "cloudformation:CreateStack",
-                "cloudformation:UpdateStack",
-                "cloudformation:DeleteStack",
-                "cloudformation:DescribeStacks",
-                "cloudformation:DescribeStackResource",
-                "cloudformation:GetTemplate",
-                "cloudformation:DescribeStackEvents"
-            ],
-            "Resource": [
-                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
-            ]
-        }
-    ]
-}
-```
-`Version`: its value has to be exactly "2012-10-17".
-`AWS_ACCOUNT_ID`: you can get it from the following command line:
-
-```
-aws sts get-caller-identity --output text --query Account
-```
-
-`MY_CLUSTER_NAME`: pick a MY_CLUSTER_NAME that you like; you will use it later as well.
-Please note, the stack name must satisfy the regular expression pattern `[a-zA-Z][-a-zA-Z0-9*]*`, which means no "_" in the stack name, or kube-aws will throw an error in later steps.
-
-#### External DNS name
-
-When the cluster is created, the controller will expose the TLS-secured API on a DNS name.
-
-The DNS name should have a CNAME that points to the cluster DNS name, or an A record that points to the cluster IP address.
-
-We will need to use the DNS name later in this tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate the cluster IP with that DNS name for your local machine. Also add a name service (Route 53) in AWS to associate the IP with `paddle` for the cluster. We will find the cluster IP in later steps.
-
-#### S3 bucket
-
-You need to create an S3 bucket before starting up the Kubernetes cluster.
-
-There are some bugs in the aws cli for creating an S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2).
-
-Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure the region is us-west-2 (Oregon).
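-
-Once the bucket exists, you can confirm it from the command line. This check is not part of the original walkthrough; `BUCKET_NAME` stands for the name you just picked:
-
-```
-# the bucket should be listable if it was created successfully
-aws s3 ls s3://BUCKET_NAME
-```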
-
-
-#### Initialize Assets
-
-Create a directory on your local machine to hold the generated assets:
-
-```
-$ mkdir my-cluster
-$ cd my-cluster
-```
-
-Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
-
-```
-kube-aws init \
---cluster-name=MY_CLUSTER_NAME \
---external-dns-name=MY_EXTERNAL_DNS_NAME \
---region=us-west-2 \
---availability-zone=us-west-2a \
---key-name=KEY_PAIR_NAME \
---kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
-```
-
-`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
-
-`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
-
-`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
-
-`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
-
-Here `us-west-2a` is used for the `--availability-zone` parameter, but the supported availability zones vary among AWS accounts.
-
-Please check whether `us-west-2a` is supported by `aws ec2 --region us-west-2 describe-availability-zones`; if not, switch to another supported availability zone (e.g., `us-west-2b` or `us-west-2c`).
-
-
-There will now be a `cluster.yaml` file in the asset directory. This is the main configuration file for your cluster.
-
-By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3.
-
-
-#### Render contents of the asset directory
-
-In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
-
-```
-kube-aws render credentials --generate-ca
-```
-
-The next command generates the default set of cluster assets in your asset directory.
-
-```
-kube-aws render stack
-```
-Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder.
-
-
-### Kubernetes Cluster Start Up
-
-#### Create the instances defined in the CloudFormation template
-
-Now let's create your cluster (choose any `PREFIX` for the command below):
-
-```
-kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
-```
-
-`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket)
-
-
-#### Configure DNS
-
-You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation.
-
-```
-$ kube-aws status
-Cluster Name:		paddle-cluster
-Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
-```
-
-If you own a DNS name, set the A record to any of the IPs resolved below. __Or__ you can set up a CNAME that points to the `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`).
-
-##### Find IP address
-
-Use the command `dig` on the load balancer hostname to get the IP addresses.
-
-```
-$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
-
-;; QUESTION SECTION:
-;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
-
-;; ANSWER SECTION:
-paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
-paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
-```
-
-In the above output, both IPs `54.241.164.52` and `54.67.102.112` will work.
-
-*If you own a DNS name*, set the A record to any of the above IPs. Then you can skip to the step "Access the cluster".
-
-*If you do not own a DNS name*:
-##### Update local DNS association
-Edit `/etc/hosts` to associate one of the above IPs with the DNS name.
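-
-For example, a minimal sketch of the association — the IP and `MY_EXTERNAL_DNS_NAME` below are placeholders; substitute one of the IPs resolved by `dig` and your own DNS name:
-
-```
-# append the association to /etc/hosts (values are placeholders)
-echo "54.241.164.52 MY_EXTERNAL_DNS_NAME" | sudo tee -a /etc/hosts
-```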
-##### Add Route53 private name service in VPC
- - Open the [Route53 Console](https://console.aws.amazon.com/route53/home)
- - Create a hosted zone with the following config
-   - Domain name: "paddle"
-   - Type: "Private hosted zone for amazon VPC"
-   - VPC ID: ``
-
-   ![route53 zone setting](src/route53_create_zone.png)
- - Add an A record
-   - Click on the zone "paddle" just created
-   - Click the button "Create record set"
-     - Name: leave blank
-     - type: "A"
-     - Value: ``
-
-   ![route53 create recordset](src/route53_create_recordset.png)
- - Verify the name service
-   - Connect to any instance created by kube-aws via ssh
-   - Run the command "host paddle" and see whether the IP returned is the private IP of the kube-controller
-
-#### Access the cluster
-
-Once the API server is running, you should see:
-
-```
-$ kubectl --kubeconfig=kubeconfig get nodes
-NAME                                       STATUS    AGE
-ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
-ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
-ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
-ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
-```
-
-
-### Setup Elastic File System for Cluster
-
-Training data is usually served on a distributed filesystem; we use Elastic File System (EFS) on AWS.
-
-1. Create a security group for EFS in the [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)
-   1. Look up the security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below)
![](src/worker_security_group.png)
- 2. Add a security group `paddle-efs` with an `ALL TCP` inbound rule whose custom source is the group id of `paddle-cluster-sg-worker`, in the VPC `paddle-cluster-vpc`. Make sure the availability zone is the same as the one you used in [Initialize Assets](#initialize-assets).
-
![](src/add_security_group.png)
-2. Create the Elastic File System in the [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with the `paddle-cluster-vpc` VPC. Make sure the subnet is `paddle-cluster-Subnet0` and the security group is `paddle-efs`.
-
![](src/create_efs.png)
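-
-After the file system is created, you will need its DNS name for the Kubernetes volume configuration in the next section. A quick way to look it up from the command line — an optional sketch, not part of the original console-based steps:
-
-```
-# list EFS file systems in us-west-2; a file system id such as fs-2cbf7385
-# corresponds to the mount DNS name fs-2cbf7385.efs.us-west-2.amazonaws.com
-aws efs describe-file-systems --region us-west-2
-```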
-
-
-### Start PaddlePaddle Training Demo on AWS
-
-#### Configure Kubernetes Volume that Points to EFS
-
-First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision the EFS volume.
-
-Save the following snippet as `pv.yaml`
-```
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: efsvol
-spec:
-  capacity:
-    storage: 100Gi
-  accessModes:
-    - ReadWriteMany
-  nfs:
-    server: EFS_DNS_NAME
-    path: "/"
-```
-
-`EFS_DNS_NAME`: DNS name as shown in the description of the `paddle-efs` file system that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`
-
-Run the following command to create the persistent volume:
-```
-kubectl --kubeconfig=kubeconfig create -f pv.yaml
-```
-
-Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume.
-
-Save the following snippet as `pvc.yaml`.
-```
-kind: PersistentVolumeClaim
-apiVersion: v1
-metadata:
-  name: efsvol
-spec:
-  accessModes:
-    - ReadWriteMany
-  resources:
-    requests:
-      storage: 50Gi
-```
-
-Run the following command to create the persistent volume claim:
-```
-kubectl --kubeconfig=kubeconfig create -f pvc.yaml
-```
-
-#### Prepare Training Data
-
-We will now launch a Kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volume that we just created.
-
-Save the following snippet as `paddle-data-job.yaml`
-```
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-data
-spec:
-  template:
-    metadata:
-      name: pi
-    spec:
-      containers:
-      - name: paddle-data
-        image: paddlepaddle/paddle-tutorial:k8s_data
-        imagePullPolicy: Always
-        volumeMounts:
-        - mountPath: "/efs"
-          name: efs
-        env:
-        - name: OUT_DIR
-          value: /efs/paddle-cluster-job
-        - name: SPLIT_COUNT
-          value: "3"
-      volumes:
-        - name: efs
-          persistentVolumeClaim:
-            claimName: efsvol
-      restartPolicy: Never
-```
-
-Run the following command to launch the job:
-```
-kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
-```
-
-The job may take 7 min to finish; use the following command to check the job status. Do not proceed until `SUCCESSFUL` for the `paddle-data` job is `1`
-```
-$ kubectl --kubeconfig=kubeconfig get jobs
-NAME          DESIRED   SUCCESSFUL   AGE
-paddle-data   1         1            6m
-```
-
-Data preparation is done by the docker image `paddlepaddle/paddle-tutorial:k8s_data`; see [here](src/k8s_data/README.md) for how to build this docker image and its source code.
-
-#### Start Training
-
-Now we are ready to start the paddle training job; an optional storage sanity check is sketched below.
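-
-Before launching the trainers, it can help to confirm that the persistent volume and claim created above are actually bound. This is an optional check, not part of the original walkthrough:
-
-```
-# both should report STATUS "Bound" before training starts
-kubectl --kubeconfig=kubeconfig get pv efsvol
-kubectl --kubeconfig=kubeconfig get pvc efsvol
-```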
Save the following snippet as `paddle-cluster-job.yaml`
-```
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-cluster-job
-spec:
-  parallelism: 3
-  completions: 3
-  template:
-    metadata:
-      name: paddle-cluster-job
-    spec:
-      volumes:
-      - name: efs
-        persistentVolumeClaim:
-          claimName: efsvol
-      containers:
-      - name: trainer
-        image: paddlepaddle/paddle-tutorial:k8s_train
-        command: ["bin/bash", "-c", "/root/start.sh"]
-        env:
-        - name: JOB_NAME
-          value: paddle-cluster-job
-        - name: JOB_PATH
-          value: /home/jobpath
-        - name: JOB_NAMESPACE
-          value: default
-        - name: TRAIN_CONFIG_DIR
-          value: quick_start
-        - name: CONF_PADDLE_NIC
-          value: eth0
-        - name: CONF_PADDLE_PORT
-          value: "7164"
-        - name: CONF_PADDLE_PORTS_NUM
-          value: "2"
-        - name: CONF_PADDLE_PORTS_NUM_SPARSE
-          value: "2"
-        - name: CONF_PADDLE_GRADIENT_NUM
-          value: "3"
-        - name: TRAINER_COUNT
-          value: "3"
-        volumeMounts:
-        - mountPath: "/home/jobpath"
-          name: efs
-        ports:
-        - name: jobport0
-          hostPort: 7164
-          containerPort: 7164
-        - name: jobport1
-          hostPort: 7165
-          containerPort: 7165
-        - name: jobport2
-          hostPort: 7166
-          containerPort: 7166
-        - name: jobport3
-          hostPort: 7167
-          containerPort: 7167
-      restartPolicy: Never
-```
-
-`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods.
-
-The `env` field represents the container's environment variables; we specify the PaddlePaddle parameters by environment variables.
-
-`ports` indicates that TCP ports 7164 - 7167 are exposed for communication between `pserver` and trainer. Ports are allocated continuously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse parameter updates to reduce update latency.
-
-Run the following command to launch the job.
-```
-kubectl --kubeconfig=kubeconfig create -f paddle-cluster-job.yaml
-```
-
-Inspect individual pods
-
-```
-$ kubectl --kubeconfig=kubeconfig get pods
-NAME                       READY     STATUS    RESTARTS   AGE
-paddle-cluster-job-cm469   1/1       Running   0          9m
-paddle-cluster-job-fnt03   1/1       Running   0          9m
-paddle-cluster-job-jx4xr   1/1       Running   0          9m
-```
-
-Inspect the console output of an individual pod
-```
-kubectl --kubeconfig=kubeconfig log -f POD_NAME
-```
-
-`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`).
-
-Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check the training job status. It will complete in around 20 minutes.
-
-The details of starting `pserver` and `trainer` are hidden inside the docker image `paddlepaddle/paddle-tutorial:k8s_train`; see [here](src/k8s_train/README.md) for how to build the docker image and its source code.
-
-#### Inspect Training Output
-
-Training output (model snapshots and logs) will be saved on EFS. We can ssh into a worker EC2 instance, mount EFS and check the training output.
-
-1. ssh into a worker EC2 instance
-```
-chmod 400 key-name.pem
-ssh -i key-name.pem core@INSTANCE_IP
-```
-
-`INSTANCE_IP`: public IP address of an EC2 Kubernetes worker node. Go to the [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check the `public IP` of any `paddle-cluster-kube-aws-worker` instance.
-
-2. Mount EFS
-```
-mkdir efs
-sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
-```
-
-`EFS_DNS_NAME`: DNS name as shown in the description of the `paddle-efs` file system that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`.
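-
-If the mount succeeded, the NFS export should show up in the mount table. A quick, optional check (assuming the `efs` directory used above):
-
-```
-# the EFS export should appear as an nfs4 mount on ./efs
-mount | grep nfs4
-df -h efs
-```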
-
-Now the folder `efs` will have a structure similar to:
-```
--- paddle-cluster-job
-    |-- ...
-    |-- output
-    |   |-- node_0
-    |   |   |-- server.log
-    |   |   `-- train.log
-    |   |-- node_1
-    |   |   |-- server.log
-    |   |   `-- train.log
-    |   |-- node_2
-    |   |   |-- server.log
-    |   |   `-- train.log
-    |   |-- pass-00000
-    |   |   |-- ___fc_layer_0__.w0
-    |   |   |-- ___fc_layer_0__.wbias
-    |   |   |-- done
-    |   |   |-- path.txt
-    |   |   `-- trainer_config.lr.py
-    |   |-- pass-00001...
-```
-`server.log` contains the log for `pserver`. `train.log` contains the log for `trainer`. Model descriptions and snapshots are stored in `pass-0000*`.
-(Caution: the node_0, node_1, node_2 directories represent PaddlePaddle nodes and their trainer_id, not Kubernetes nodes.)
-
-### Kubernetes Cluster Tear Down
-
-#### Delete EFS
-
-Go to the [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volume that we created.
-
-#### Delete security group
-
-Go to the [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete security group `paddle-efs`.
-
-
-#### Delete S3 Bucket
-
-Go to the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created.
-
-#### Destroy Cluster
-
-```
-kube-aws destroy
-```
-
-The command will return immediately, but it might take 5 min to tear down the whole cluster.
-
-You can go to the [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check the destroy process.
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_cn.md
deleted file mode 100644
index c1a11f7165a2f9da9dd044641274447e7943a597..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_cn.md
+++ /dev/null
@@ -1,206 +0,0 @@
-# Kubernetes单机训练
-
-在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。
-
-## 制作Docker镜像
-
-在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式PaddlePaddle训练任务中
-的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在
-PaddlePaddle的Docker Image里。为此,我们需要制作一个包含训练数据的PaddlePaddle镜像。
-
-PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo,
-(请注意,默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的,PaddlePaddle的各版本镜像可以参考
-[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)),
-下面我们使用这个镜像来下载数据到Docker Container中,并把这个包含了训练数据的Container保存为一个新的镜像。
-
-### 运行容器
-
-```
-$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
-```
-
-### 下载数据
-
-进入容器`/root/paddle/demo/quick_start/data`目录,使用`get_data.sh`下载数据
-
-```
-$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
-
-Downloading Amazon Electronics reviews data...
---2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
-Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
-Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
-HTTP request sent, awaiting response...
200 OK -Length: 495854086 (473M) [application/x-gzip] -Saving to: 'reviews_Electronics_5.json.gz' - - 10% [=======> ] 874,279 64.7KB/s eta 2h 13m - -``` - -### 修改启动脚本 - -下载完数据后,修改`/root/paddle/demo/quick_start/train.sh`文件,内容如下(增加了一条cd命令) -``` -set -e -cd /root/paddle/demo/quick_start -cfg=trainer_config.lr.py -#cfg=trainer_config.emb.py -#cfg=trainer_config.cnn.py -#cfg=trainer_config.lstm.py -#cfg=trainer_config.bidi-lstm.py -#cfg=trainer_config.db-lstm.py -paddle train \ - --config=$cfg \ - --save_dir=./output \ - --trainer_count=4 \ - --log_period=20 \ - --num_passes=15 \ - --use_gpu=false \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -### 提交镜像 - -修改启动脚本后,退出容器,使用`docker commit`命令创建新镜像。 - -``` -$ docker commit quick_start_data mypaddle/paddle:quickstart -``` - -## 使用 Kubernetes 进行训练 - ->针对任务运行完成后容器自动退出的场景,Kubernetes有Job类型的资源来支持。下文就是用Job类型的资源来进行训练。 - -### 编写yaml文件 - -在训练时,输出结果可能会随着容器的消耗而被删除,需要在创建容器前挂载卷以便我们保存训练结果。使用我们之前构造的镜像,可以创建一个 [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job),简单的yaml文件如下: - -``` -apiVersion: batch/v1 -kind: Job -metadata: - name: quickstart -spec: - parallelism: 1 - completions: 1 - template: - metadata: - name: quickstart - spec: - volumes: - - name: output - hostPath: - path: /home/work/paddle_output - containers: - - name: pi - image: mypaddle/paddle:quickstart - command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"] - volumeMounts: - - name: output - mountPath: /root/paddle/demo/quick_start/output - restartPolicy: Never -``` - -### 创建PaddlePaddle Job - -使用上文创建的yaml文件创建Kubernetes Job,命令为: - -``` -$ kubectl create -f paddle.yaml -``` - -查看job的详细情况: - -``` -$ kubectl get job -NAME DESIRED SUCCESSFUL AGE -quickstart 1 0 58s - -$ kubectl describe job quickstart -Name: quickstart -Namespace: default -Image(s): registry.baidu.com/public/paddle:cpu-demo-latest -Selector: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84 -Parallelism: 1 -Completions: 1 -Start Time: Mon, 31 Oct 2016 11:20:16 +0800 -Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart -Pods Statuses: 0 Running / 1 Succeeded / 0 Failed -Volumes: - output: - Type: HostPath (bare host directory volume) - Path: /home/work/paddle_output -Events: - FirstSeen LastSeen Count From SubobjectPath Type Reason Message - --------- -------- ----- ---- ------------- -------- ------ ------- - 1m 1m 1 {job-controller } Normal SuccessfulCreate Created pod: quickstart-fa0wx -``` - -### 查看训练结果 - -根据Job对应的Pod信息,可以查看此Pod运行的宿主机。 - -``` -kubectl describe pod quickstart-fa0wx -Name: quickstart-fa0wx -Namespace: default -Node: paddle-demo-let02/10.206.202.44 -Start Time: Mon, 31 Oct 2016 11:20:17 +0800 -Labels: controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart -Status: Succeeded -IP: 10.0.0.9 -Controllers: Job/quickstart -Containers: - quickstart: - Container ID: docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465 - Image: registry.baidu.com/public/paddle:cpu-demo-latest - Image ID: docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750 - Port: - Command: - bin/bash - -c - /root/paddle/demo/quick_start/train.sh - QoS Tier: - cpu: BestEffort - memory: BestEffort - State: Terminated - Reason: Completed - Exit Code: 0 - Started: Mon, 31 Oct 2016 11:20:20 +0800 - Finished: Mon, 31 Oct 2016 11:21:46 +0800 - Ready: False - Restart Count: 0 - Environment Variables: -Conditions: - Type Status - Ready False -Volumes: - output: - Type: HostPath (bare 
host directory volume)
-    Path:	/home/work/paddle_output
-```
-
-我们还可以登录到宿主机上查看训练结果。
-
-```
-[root@paddle-demo-let02 paddle_output]# ll
-total 60
-drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
-drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
-drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
-```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md
deleted file mode 100644
index 167089b8074b33e3b094fa3ec8e377630cec42ac..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_cn.md
+++ /dev/null
@@ -1,312 +0,0 @@
-# Kubernetes分布式训练
-
-前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务,进行分布式训练的方法;与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。
-
-## 整体方案
-
-在训练之前,用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统,需要使用其指定的方式挂载后并导入数据),训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下:
-
-![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
-
-上图描述了一个3节点的分布式训练场景,在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。
-
-根据前文的描述,要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练,按照下面步骤即可:
-
-1. [制作PaddlePaddle镜像](#制作镜像)
-1. [将训练文件与切分好的数据上传到共享存储](#上传训练文件)
-1. [编写本次训练的YAML文件,创建一个Kubernetes job](#创建Job)
-1. [训练结束后查看输出结果](#查看输出)
-
-下面就根据这几个步骤分别介绍。
-
-### 制作镜像
-
-PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境,用这个镜像创建的容器需要有以下两个功能:
-
-- 拷贝训练文件到容器内
-- 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练
-
-因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序,但是还没有上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。
-
-```bash
-$ cd doc/howto/usage/k8s/src/k8s_train
-$ docker build -t [YOUR_REPO]/paddle:mypaddle .
-```
-
-然后将构建成功的镜像上传到镜像仓库。
-
-```bash
-docker push [YOUR_REPO]/paddle:mypaddle
-```
-
-注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
-
-### 准备训练数据
-
-这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据,也可以通过修改[k8s_train](./src/k8s_train/README.md)的内容来定制image。
-
-在启动Job之前,需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下。
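-
-在提交下面的数据准备Job之前,可以先确认该pvc(本文示例中名为`mfs`)已经处于`Bound`状态。下面是一个简单的检查示例(非原文步骤,仅供参考):
-
-```bash
-# 确认 pvc 已绑定,STATUS 应为 Bound
-kubectl get pvc mfs
-```
-
-确认无误后,就可以用类似下面的Job来下载并切分数据: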
-
-```yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-data
-spec:
-  template:
-    metadata:
-      name: pi
-    spec:
-      hostNetwork: true
-      containers:
-      - name: paddle-data
-        image: paddlepaddle/paddle-tutorial:k8s_data
-        imagePullPolicy: Always
-        volumeMounts:
-        - mountPath: "/mnt"
-          name: nfs
-        env:
-        - name: OUT_DIR
-          value: /home/work/mfs/paddle-cluster-job
-        - name: SPLIT_COUNT
-          value: "3"
-      volumes:
-        - name: nfs
-          persistentVolumeClaim:
-            claimName: mfs
-      restartPolicy: Never
-```
-
-完成后volume中的文件内容大致如下:
-```
-[root@paddle-kubernetes-node0 nfsdir]$ tree -d
-.
-`-- paddle-cluster-job
-    |-- 0
-    |   `-- data
-    |-- 1
-    |   `-- data
-    |-- 2
-    |   `-- data
-    |-- output
-    |-- quick_start
-```
-
-目录中paddle-cluster-job是本次训练对应的job name,本次训练要求有3个PaddlePaddle节点,在paddle-cluster-job/data目录中存放切分好的数据,文件夹0,1,2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件,output文件夹存放训练结果与日志。
-
-### 创建Job
-
-Kubernetes可以通过YAML文件来创建相关对象,然后可以使用命令行工具创建job。
-
-Job YAML文件描述了这次训练使用的Docker镜像,需要启动的节点个数以及 `paddle pserver`与 `paddle train`进程启动的必要参数,也描述了容器需要使用的存储卷挂载的情况。YAML文件中各个字段的具体含义,可以查看[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job)。例如,本次训练的YAML文件可以写成:
-
-```yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-cluster-job
-spec:
-  parallelism: 3
-  completions: 3
-  template:
-    metadata:
-      name: paddle-cluster-job
-    spec:
-      volumes:
-      - name: jobpath
-        hostPath:
-          path: /home/work/mfs
-      containers:
-      - name: trainer
-        image: [YOUR_REPO]/paddle:mypaddle
-        command: ["bin/bash", "-c", "/root/start.sh"]
-        env:
-        - name: JOB_NAME
-          value: paddle-cluster-job
-        - name: JOB_PATH
-          value: /home/jobpath
-        - name: JOB_NAMESPACE
-          value: default
-        - name: TRAIN_CONFIG_DIR
-          value: recommendation
-        - name: CONF_PADDLE_NIC
-          value: eth0
-        - name: CONF_PADDLE_PORT
-          value: "7164"
-        - name: CONF_PADDLE_PORTS_NUM
-          value: "2"
-        - name: CONF_PADDLE_PORTS_NUM_SPARSE
-          value: "2"
-        - name: CONF_PADDLE_GRADIENT_NUM
-          value: "3"
-        volumeMounts:
-        - name: jobpath
-          mountPath: /home/jobpath
-      restartPolicy: Never
-```
-
-文件中,`metadata`下的`name`表示这个job的名字。`parallelism,completions`字段表示这个job会同时开启3个PaddlePaddle节点,成功训练且退出的pod数目为3时,这个job才算成功结束。然后声明一个存储卷`jobpath`,代表宿主机目录`/home/work/mfs`,在对容器的描述`containers`字段中,将此目录挂载为容器的`/home/jobpath`目录,这样容器的`/home/jobpath`目录就成为了共享存储,放在这个目录里的文件其实是保存到了MFS上。
-
-`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内:
-
-
-- JOB_PATH:共享存储挂载的路径
-- JOB_NAME:Job的名字
-- TRAIN_CONFIG_DIR:本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
-- CONF_PADDLE_NIC:`paddle pserver`进程需要的`--nics`参数,即网卡名
-- CONF_PADDLE_PORT:`paddle pserver`的`--port`参数
-- CONF_PADDLE_PORTS_NUM:稠密更新的端口数量,即`--ports_num`参数
-- CONF_PADDLE_PORTS_NUM_SPARSE:稀疏更新的端口数量,即`--ports_num_for_sparse`参数
-- CONF_PADDLE_GRADIENT_NUM:训练节点数量,即`--num_gradient_servers`参数
-
-这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
-
-编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。
-
-```bash
-kubectl create -f job.yaml
-```
-
-创建成功后,Kubernetes就会创建3个pod作为PaddlePaddle节点然后拉取镜像,启动容器开始训练。
-
-
-### 查看输出
-
-在训练过程中,可以在共享存储上查看输出的日志和模型,例如output目录下就存放了输出结果。注意node_0,node_1,node_2这几个目录表示PaddlePaddle节点与trainer_id,并不是Kubernetes中的node概念。
-
-```bash
-[root@paddle-kubernetes-node0 output]# tree -d
-.
-├── node_0
-│   ├── server.log
-│   └── train.log
-├── node_1
-│   ├── server.log
-│   └── train.log
-├── node_2
-......
-├── pass-00002
-│   ├── done
-│   ├── ___embedding_0__.w0
-│   ├── ___embedding_1__.w0
-......
-```
-
-我们可以通过日志查看容器训练的情况,例如:
-
-```bash
-[root@paddle-kubernetes-node0 node_0]# cat train.log
-I1116 09:10:17.123121    50 Util.cpp:155] commandline:
- /usr/local/bin/../opt/paddle/bin/paddle_trainer
-    --nics=eth0 --port=7164
-    --ports_num=2 --comment=paddle_process_by_paddle
-    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
-    --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0
-    --log_period=50 --dot_period=10 --saving_period=1
-    --local=0 --trainer_id=0
-    --save_dir=/home/jobpath/paddle-cluster-job/output
-I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
-I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
-[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
-[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
-I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
-I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
-I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
-I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
-I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
-I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
-I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
-I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
-I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
-```
-
-
-## 一些细节的补充
-
-### 使用环境变量
-
-使用容器方式运行训练任务的Kubernetes Job,通常会使用环境变量来配置Job的信息。`start_paddle.py`提供了一个启动脚本,将环境变量转换成paddle的命令行参数:
-```
-API = "/api/v1/namespaces/"
-JOBSELECTOR = "labelSelector=job-name="
-JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
-JOB_PATH_OUTPUT = JOB_PATH + "/output"
-JOBNAME = os.getenv("JOB_NAME")
-NAMESPACE = os.getenv("JOB_NAMESPACE")
-PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
-PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
-PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
-PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
-PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
-```
-
-### Pod间通信
-`start_paddle.py`脚本开始时,会先进行参数的初始化与解析。
-
-```python
-parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                 description='simple tool for k8s')
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-    podlist = getPodList()
-```
-
-然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态(容器都在运行)时,再通过函数`getIdMap(podlist)`获取trainer_id。
-
-```python
-    podlist = getPodList()
-    # need to wait until all pods are running
-    while not isPodAllRunning(podlist):
-        time.sleep(10)
-        podlist = getPodList()
-    idMap = getIdMap(podlist)
-```
-* *注意*: `getPodList()`会获取当前namespace下的所有pod,如果已经有pod在运行,可能会导致出错。这种集群节点管理方式会在将来使用[statefulsets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。
-
-在函数`getIdMap(podlist)`内部,我们通过读取`podlist`中每个pod的IP地址,将IP排序生成的序号作为trainer_id。
-
-```python
-def getIdMap(podlist):
-    '''
-    generate trainer_id by ip
-    '''
-    ips = []
-    for pod in podlist["items"]:
-        ips.append(pod["status"]["podIP"])
-    ips.sort()
-    idMap = {}
-    for i in range(len(ips)):
-        idMap[ips[i]] = i
-    return idMap
-```
-
-在得到`idMap`后,通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
-
-### 启动任务
-
-在函数`startPaddle`中,最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析,解析环境变量得到`PADDLE_NIC`,`PADDLE_PORT`,`PADDLE_PORTS_NUM`等参数,然后通过自身的IP地址在`idMap`中获取`trainerId`。
-
-```python
-    program = 'paddle train'
-    args = " --nics=" + PADDLE_NIC
-    args += " --port=" + str(PADDLE_PORT)
-    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for ip in idMap.keys():
-        ip_string += (ip + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    localIP = socket.gethostbyname(socket.gethostname())
-    trainerId = idMap[localIP]
-    args += " " + args_ext + " --trainer_id=" + \
-        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
-```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
deleted file mode 100644
index b2dc4da8451af317df76c5b3df328b6f58429610..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
+++ /dev/null
@@ -1,372 +0,0 @@
-# Distributed Training on Kubernetes
-
-We introduced how to create a PaddlePaddle Job with a single node on Kubernetes in the
-previous document.
-In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
-on a Kubernetes cluster.
-
-## Overall Architecture
-
-Before creating a training job, the users need to slice the training data and deploy
-the Python scripts along with it into the distributed file system
-(we can use different types of Kubernetes Volumes to mount different distributed
-file systems). Before training starts, the program will copy the training data into the
-Container, and it will also save the models at the same path during training. The global architecture
-is as follows:
-
-![PaddlePaddle on Kubernetes Architecture](src/k8s-paddle-arch.png)
-
-The above figure describes a distributed training architecture which contains 3 nodes; each
-Pod mounts a folder of the distributed file system to save training data and models
-by Kubernetes Volume. Kubernetes created 3 Pods for this training phase and scheduled these on
-3 nodes, and each Pod has a PaddlePaddle container. After the containers are created,
-PaddlePaddle starts up the communication between PServer and Trainer and reads the training
-data for this training job.
-
-As described above, we can start up a PaddlePaddle distributed training job on a
-Kubernetes-ready cluster with the following steps:
-
-1. [Build PaddlePaddle Docker Image](#build-a-docker-image)
-1. [Split training data and upload to the distributed file system](#prepare-training-data)
-1. [Edit a YAML file and create a Kubernetes Job](#create-a-job)
-1. [Check the output](#checkout-the-output)
-
-We will introduce these steps as follows:
-
-### Build a Docker Image
-
-The training docker image needs to package the paddle pserver and paddle trainer runtimes, as well as two more steps before we can kick off the training:
-
-- Copying the training data into the container.
-- Generating the initialization arguments for the `Paddle PServer` and `Paddle Trainer` processes.
-
-Since the paddlepaddle official docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the steps mentioned above to build our training image. For more detail, please refer to the following link:
-- https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
-
-
-```bash
-$ cd doc/howto/usage/k8s/src/k8s_train
-$ docker build -t [YOUR_REPO]/paddle:mypaddle .
-```
-
-And then upload the new Docker Image to a Docker hub:
-
-```bash
-docker push [YOUR_REPO]/paddle:mypaddle
-```
-
-**[NOTE]**: in the above command arguments, `[YOUR_REPO]` represents your Docker repository;
-you need to use your own repository instead. We will use `[YOUR_REPO]/paddle:mypaddle` to
-represent the Docker Image which was built in this step.
-
-### Prepare Training Data
-
-We can download and split the training data by creating a Kubernetes Job, or customize the image
-by editing [k8s_train](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train).
-
-Before creating a Job, we need to bind a [persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) according to the
-type of distributed file system in use; the generated dataset will be saved on this volume.
-
-```yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-data
-spec:
-  template:
-    metadata:
-      name: pi
-    spec:
-      hostNetwork: true
-      containers:
-      - name: paddle-data
-        image: paddlepaddle/paddle-tutorial:k8s_data
-        imagePullPolicy: Always
-        volumeMounts:
-        - mountPath: "/mnt"
-          name: nfs
-        env:
-        - name: OUT_DIR
-          value: /home/work/mfs/paddle-cluster-job
-        - name: SPLIT_COUNT
-          value: "3"
-      volumes:
-        - name: nfs
-          persistentVolumeClaim:
-            claimName: mfs
-      restartPolicy: Never
-```
-
-Create the Job with the following command:
-
-```bash
-> kubectl create -f xxx.yaml
-```
-
-If created successfully, you can see some information like this:
-
-```
-[root@paddle-kubernetes-node0 nfsdir]$ tree -d
-.
-`-- paddle-cluster-job
-    |-- 0
-    |   `-- data
-    |-- 1
-    |   `-- data
-    |-- 2
-    |   `-- data
-    |-- output
-    |-- quick_start
-```
-
-The `paddle-cluster-job` above is the job name for this training job; we need 3
-PaddlePaddle training nodes and save the split training data in the `paddle-cluster-job` path.
-The folders `0`, `1` and `2` represent the `training_id` of each node, the `quick_start` folder is used to store training data, and the `output` folder is used to store the models and logs.
-
-
-### Create a Job
-
-Kubernetes allows users to create objects with YAML files, and we can use a command-line tool
-to create them.
-
-The Job YAML file describes which Docker Image will be used in this training job, how many nodes will be created, the startup arguments of the `Paddle PServer/Trainer` processes and the type of Volumes. You can find the details of the YAML fields in the
-[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job).
-The following is an example for this training job:
-
-```yaml
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-cluster-job
-spec:
-  parallelism: 3
-  completions: 3
-  template:
-    metadata:
-      name: paddle-cluster-job
-    spec:
-      volumes:
-      - name: jobpath
-        hostPath:
-          path: /home/work/mfs
-      containers:
-      - name: trainer
-        image: [YOUR_REPO]/paddle:mypaddle
-        command: ["bin/bash", "-c", "/root/start.sh"]
-        env:
-        - name: JOB_NAME
-          value: paddle-cluster-job
-        - name: JOB_PATH
-          value: /home/jobpath
-        - name: JOB_NAMESPACE
-          value: default
-        - name: TRAIN_CONFIG_DIR
-          value: recommendation
-        - name: CONF_PADDLE_NIC
-          value: eth0
-        - name: CONF_PADDLE_PORT
-          value: "7164"
-        - name: CONF_PADDLE_PORTS_NUM
-          value: "2"
-        - name: CONF_PADDLE_PORTS_NUM_SPARSE
-          value: "2"
-        - name: CONF_PADDLE_GRADIENT_NUM
-          value: "3"
-        volumeMounts:
-        - name: jobpath
-          mountPath: /home/jobpath
-      restartPolicy: Never
-```
-
-In the above YAML file:
-- `metadata.name`, the job name.
-- `parallelism`, the Kubernetes Job will create `parallelism` Pods at the same time.
-- `completions`, the Job reaches the success status only when the number of successful Pods (those whose exit code is 0)
-  is equal to `completions`.
-- `volumeMounts`, the name field `jobpath` is a key, the `mountPath` field represents
-  the path in the container, and we can define the `jobpath` in the `volumes` field, using `hostPath`
-  to configure the host path we want to mount.
-- `env`, the environment variables in the Container; we pass some startup arguments by
-  this approach, some details are as follows:
-  - JOB_PATH: the mount path in the container
-  - JOB_NAME: the job name
-  - TRAIN_CONFIG_DIR: the job path in the container, we can find the training data path by
-    combining it with JOB_NAME.
-  - CONF_PADDLE_NIC: the argument `--nics` of the `Paddle PServer` process, the network
-    device name.
-  - CONF_PADDLE_PORT: the argument `--port` of the `Paddle PServer` process.
-  - CONF_PADDLE_PORTS_NUM: the argument `--ports_num` of `Paddle PServer`, the number of ports
-    for dense parameter updates.
-  - CONF_PADDLE_PORTS_NUM_SPARSE: the argument `--ports_num_for_sparse` of `Paddle PServer`,
-    the number of ports for sparse parameter updates.
-  - CONF_PADDLE_GRADIENT_NUM: the number of training nodes, the argument
-    `--num_gradient_servers` of `Paddle PServer` and `Paddle Trainer`.
-
-You can find more detailed information [here](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html).
-
-We can use the command-line tool of Kubernetes to create a Job when we finish the YAML file:
-
-```bash
-kubectl create -f job.yaml
-```
-
-Upon successful creation, Kubernetes will create 3 Pods as PaddlePaddle training nodes,
-pull the Docker image and begin to train.
-
-
-### Checkout the Output
-
-During the process of training, we can check the logs and the output models, which are stored in
-the `output` folder.
-
-**NOTE**: `node_0`, `node_1` and `node_2` represent the
-`trainer_id` of the PaddlePaddle training job rather than the node id of Kubernetes.
-
-```bash
-[root@paddle-kubernetes-node0 output]# tree -d
-.
-├── node_0
-│   ├── server.log
-│   └── train.log
-├── node_1
-│   ├── server.log
-│   └── train.log
-├── node_2
-......
-├── pass-00002
-│   ├── done
-│   ├── ___embedding_0__.w0
-│   ├── ___embedding_1__.w0
-......
-```
-
-We can check the status of each training Pod by viewing the logs:
-
-```bash
-[root@paddle-kubernetes-node0 node_0]# cat train.log
-I1116 09:10:17.123121    50 Util.cpp:155] commandline:
- /usr/local/bin/../opt/paddle/bin/paddle_trainer
-    --nics=eth0 --port=7164
-    --ports_num=2 --comment=paddle_process_by_paddle
-    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
-    --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0
-    --log_period=50 --dot_period=10 --saving_period=1
-    --local=0 --trainer_id=0
-    --save_dir=/home/jobpath/paddle-cluster-job/output
-I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
-I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
-[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
-[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
-I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
-I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
-I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
-I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
-I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
-I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
-I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
-I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
-I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
-```
-
-## Some Additional Details
-
-### Using Environment Variables
-
-Usually we use environment variables to configure the PaddlePaddle Job which runs in
-Kubernetes; `start_paddle.py` provides a start-up script to convert the environment variables
-to the start-up arguments of the PaddlePaddle process:
-
-```python
-API = "/api/v1/namespaces/"
-JOBSELECTOR = "labelSelector=job-name="
-JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
-JOB_PATH_OUTPUT = JOB_PATH + "/output"
-JOBNAME = os.getenv("JOB_NAME")
-NAMESPACE = os.getenv("JOB_NAMESPACE")
-PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
-PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
-PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
-PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
-PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
-```
-
-### Communication between Pods
-
-At the beginning of `start_paddle.py`, it initializes and parses the arguments.
-
-```python
-parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                 description='simple tool for k8s')
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-    podlist = getPodList()
-```
-
-And then it queries the status of all the other Pods of this Job by the function `getPodList()`, and fetches `trainer_id` by the function `getIdMap(podlist)` once all the Pods' status is `RUNNING`.
-
-```python
-    podlist = getPodList()
-    # need to wait until all pods are running
-    while not isPodAllRunning(podlist):
-        time.sleep(10)
-        podlist = getPodList()
-    idMap = getIdMap(podlist)
-```
-
-**NOTE**: `getPodList()` fetches all the Pods in the current namespace; if some
-Pods are already running, it may cause an error. We will use [StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
-Kubernetes Pods or ReplicaSets in the future.
-
-The function `getIdMap(podlist)` fetches the IP addresses of `podlist` and then sorts them
-to generate `trainer_id`.
-
-```python
-def getIdMap(podlist):
-    '''
-    generate trainer_id by ip
-    '''
-    ips = []
-    for pod in podlist["items"]:
-        ips.append(pod["status"]["podIP"])
-    ips.sort()
-    idMap = {}
-    for i in range(len(ips)):
-        idMap[ips[i]] = i
-    return idMap
-```
-
-After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`,
-so that we can start them up by `startPaddle(idMap, train_args_dict)`.
-
-### Create Job
-
-The main goal of `startPaddle` is generating the arguments of the `Paddle PServer` and
-`Paddle Trainer` processes. Take `Paddle Trainer` as an example: we parse the
-environment variables and then get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, etc.;
-finally, we find `trainerId` from `idMap` according to its IP address.
-
-```python
-    program = 'paddle train'
-    args = " --nics=" + PADDLE_NIC
-    args += " --port=" + str(PADDLE_PORT)
-    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for ip in idMap.keys():
-        ip_string += (ip + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    localIP = socket.gethostbyname(socket.gethostname())
-    trainerId = idMap[localIP]
-    args += " " + args_ext + " --trainer_id=" + \
-        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
-```
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_en.md
deleted file mode 100644
index 96ff652705726fc56fa0078593cd2a695fcdb5e2..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_en.md
+++ /dev/null
@@ -1,210 +0,0 @@
-# Kubernetes
-
-In this article, we will introduce how to run a PaddlePaddle training job on a single CPU machine using Kubernetes. In the next article, we will introduce how to run a PaddlePaddle training job on a distributed cluster.
-
-## Build Docker Image
-
-In a distributed Kubernetes cluster, we will use Ceph or another distributed
-storage system for storing training related data so that all processes in
-PaddlePaddle training can retrieve data from Ceph. In this example, we will
-only demo a training job on a single machine. In order to simplify the requirements
-of the environment, we will directly put training data into the PaddlePaddle Docker Image,
-so we need to create a PaddlePaddle Docker image that includes the training data.
-
-The production Docker Image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
-source code and demo.
(Caution: the default PaddlePaddle Docker Image `paddlepaddle/paddle:latest` doesn't include
-the source code; PaddlePaddle's different versions of Docker Images can be found here:
-[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_en.html)),
-so we run this Docker Image and download the training data, and then commit the whole
-Container to be a new Docker Image.
-
-### Run Docker Container
-
-```
-$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
-```
-
-### Download Training Data
-
-Get into the `/root/paddle/demo/quick_start/data` directory and use `get_data.sh` to download the training data.
-Then get into the `/root/paddle/demo/quick_start` directory and use `preprocess.sh` to pre-process the training data.
-
-```
-$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
-
-Downloading Amazon Electronics reviews data...
---2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
-Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
-Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
-HTTP request sent, awaiting response... 200 OK
-Length: 495854086 (473M) [application/x-gzip]
-Saving to: 'reviews_Electronics_5.json.gz'
-
- 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
-
-```
-
-### Modify Startup Script
-
-After downloading the data, modify the `/root/paddle/demo/quick_start/train.sh` file so that its contents are as follows (one more `cd` command is added):
-```
-set -e
-cd /root/paddle/demo/quick_start
-cfg=trainer_config.lr.py
-#cfg=trainer_config.emb.py
-#cfg=trainer_config.cnn.py
-#cfg=trainer_config.lstm.py
-#cfg=trainer_config.bidi-lstm.py
-#cfg=trainer_config.db-lstm.py
-paddle train \
-  --config=$cfg \
-  --save_dir=./output \
-  --trainer_count=4 \
-  --log_period=20 \
-  --num_passes=15 \
-  --use_gpu=false \
-  --show_parameter_stats_period=100 \
-  --test_all_data_in_one_period=1 \
-  2>&1 | tee 'train.log'
-```
-
-### Commit Docker Image
-
-```
-$ docker commit quick_start_data mypaddle/paddle:quickstart
-```
-
-## Use Kubernetes For Training
-
-We will use a Kubernetes job for the training process; the following steps show how to do the training with Kubernetes.
-
-### Create Yaml Files
-
-The output result in the container will be lost when the job finishes (the container stops running), so we need to mount a volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job); the yaml contents are as follows:
-
-```
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: quickstart
-spec:
-  parallelism: 1
-  completions: 1
-  template:
-    metadata:
-      name: quickstart
-    spec:
-      volumes:
-      - name: output
-        hostPath:
-          path: /home/work/paddle_output
-      containers:
-      - name: pi
-        image: mypaddle/paddle:quickstart
-        command: ["bin/bash", "-c", "/root/paddle/demo/quick_start/train.sh"]
-        volumeMounts:
-        - name: output
-          mountPath: /root/paddle/demo/quick_start/output
-      restartPolicy: Never
-```
-
-### Start PaddlePaddle Job
-
-Use the above yaml file to start the Kubernetes job.
```
$ kubectl create -f paddle.yaml
```

Get the detailed status of the job:

```
$ kubectl get job
NAME        DESIRED   SUCCESSFUL   AGE
quickstart  1         0            58s

$ kubectl describe job quickstart
Name:       quickstart
Namespace:  default
Image(s):   registry.baidu.com/public/paddle:cpu-demo-latest
Selector:   controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
Parallelism:    1
Completions:    1
Start Time: Mon, 31 Oct 2016 11:20:16 +0800
Labels:     controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
Pods Statuses:  0 Running / 1 Succeeded / 0 Failed
Volumes:
  output:
    Type:   HostPath (bare host directory volume)
    Path:   /home/work/paddle_output
Events:
  FirstSeen LastSeen    Count   From            SubobjectPath   Type        Reason          Message
  --------- --------    -----   ----            -------------   --------    ------          -------
  1m        1m      1   {job-controller }           Normal      SuccessfulCreate    Created pod: quickstart-fa0wx
```

### Get Training Result

We can use the kubectl command to take a look at the status of the related pod.

```
$ kubectl describe pod quickstart-fa0wx
Name:       quickstart-fa0wx
Namespace:  default
Node:       paddle-demo-let02/10.206.202.44
Start Time: Mon, 31 Oct 2016 11:20:17 +0800
Labels:     controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
Status:     Succeeded
IP:     10.0.0.9
Controllers:    Job/quickstart
Containers:
  quickstart:
    Container ID:   docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
    Image:      registry.baidu.com/public/paddle:cpu-demo-latest
    Image ID:       docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
    Port:
    Command:
      bin/bash
      -c
      /root/paddle/demo/quick_start/train.sh
    QoS Tier:
      cpu:  BestEffort
      memory:   BestEffort
    State:      Terminated
      Reason:       Completed
      Exit Code:    0
      Started:      Mon, 31 Oct 2016 11:20:20 +0800
      Finished:     Mon, 31 Oct 2016 11:21:46 +0800
    Ready:      False
    Restart Count:  0
    Environment Variables:
Conditions:
  Type      Status
  Ready     False
Volumes:
  output:
    Type:   HostPath (bare host directory volume)
    Path:   /home/work/paddle_output
```

We can also ssh to the Kubernetes node to take a look at the training result.
```
[root@paddle-demo-let02 paddle_output]# ll
total 60
drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
```
diff --git a/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md b/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md deleted file mode 100644 index 954b2215cc3136ec5b3e1cdc2f6d3f508f814516..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/openmpi_cn.md +++ /dev/null @@ -1,41 +0,0 @@
# 在OpenMPI集群中启动训练

## 准备OpenMPI集群

执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点:

```bash
cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
kubectl create -f head.yaml
kubectl create -f mpi-nodes.yaml
```

然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。

## 启动集群作业

您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务:

```bash
# 获得head和node节点的IP地址
kubectl get po -o wide
# 将node节点的IP地址保存到machines文件中
kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
# 拷贝必要的文件到head节点
scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
# ssh 登录到head节点
ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
# --------------- 以下操作均在head节点中执行 ---------------
# 准备训练数据
python prepare.py
# 拷贝训练程序和字典文件到每台MPI节点
cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
# 创建日志目录
mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
# 拷贝训练数据到各自的节点
scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
# 启动训练任务
mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
```
diff --git a/doc/v2/howto/cluster/multi_cluster/openmpi_en.md b/doc/v2/howto/cluster/multi_cluster/openmpi_en.md deleted file mode 100644 index a5c02b336b8a974f546499acae32edac24219be9..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/openmpi_en.md +++ /dev/null @@ -1,41 +0,0 @@
# OpenMPI

## Prepare an OpenMPI cluster

Run the following command to start a 3-node MPI cluster and one "head" node.

```bash
cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
kubectl create -f head.yaml
kubectl create -f mpi-nodes.yaml
```

Then you can log in to every OpenMPI node using ssh without entering any passwords.
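Before submitting a job, it can be worth confirming that SSH really works without a password prompt. A minimal sketch, assuming it is run from a machine with `kubectl` access and the SSH key set up, and that the MPI pods carry an `mpi-nodes` name prefix (adjust the grep pattern and the `tutorial` user to your setup):

```bash
# Probe each MPI node once; BatchMode makes ssh fail instead of prompting.
for ip in $(kubectl get po -o wide | grep mpi-nodes | awk '{print $6}'); do
  ssh -o BatchMode=yes tutorial@"$ip" hostname || echo "cannot reach $ip"
done
```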
## Launching Cluster Job

Follow these steps to launch a PaddlePaddle training job in the OpenMPI cluster:

```bash
# find out node IP addresses
kubectl get po -o wide
# generate a "machines" file containing node IP addresses
kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
# copy necessary files onto "head" node
scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
# login to head node using ssh
ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
# --------------- in head node ---------------
# prepare training data
python prepare.py
# copy training data and dict file to MPI nodes
cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
# create a directory for storing log files
mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
# copy training data to every node
scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
# start the job
mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh
```
diff --git a/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png b/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png deleted file mode 100644 index bd34f46c9b0ada7027fd53e553e7d033255d25fc..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/multi_cluster/src/add_security_group.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/create_efs.png b/doc/v2/howto/cluster/multi_cluster/src/create_efs.png deleted file mode 100644 index e5f1526033d1daf401700989af1d25919bcb7675..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/multi_cluster/src/create_efs.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png b/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png deleted file mode 100644 index b3800c4fe81302d35e49f7dbacb9221c4dfa5cde..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/multi_cluster/src/k8s-paddle-arch.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile deleted file mode 100644 index 6d3a12ae393aa594b8e6e9a5f726109426937284..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@
FROM alpine

RUN apk update && apk upgrade && apk add coreutils
ADD quick_start /quick_start
ADD get_data.sh /bin/
RUN chmod +x /bin/get_data.sh
ENTRYPOINT ["/bin/get_data.sh"]
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md deleted file mode 100644 index 83cef7affd0ac4d3a1ca08ea5b046fa81e1bc630..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/README.md +++ /dev/null @@ -1,6 +0,0 @@
To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:

```
cp -r ../../../../../../demo/quick_start .
docker build . -t prepare-data-image-name
```
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh b/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh deleted file mode 100755 index d187ba5ac8d03f69dfdefd4f63610ed7921575be..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/src/k8s_data/get_data.sh +++ /dev/null @@ -1,26 +0,0 @@
#!/bin/sh

out_dir=$OUT_DIR
split_count=$SPLIT_COUNT

set -e

mkdir -p $out_dir
cp -r /quick_start $out_dir/

mkdir -p $out_dir/0/data
cd $out_dir/0/data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
tar zxvf preprocessed_data.tar.gz
rm preprocessed_data.tar.gz

split -d --number=l/$split_count -a 5 train.txt train.
mv train.00000 train.txt

cd $out_dir
end=$(expr $split_count - 1)
for i in $(seq 1 $end); do
  mkdir -p $i/data
  cp -r 0/data/* $i/data
  mv $i/data/train.`printf %05d $i` $i/data/train.txt
done;
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile deleted file mode 100644 index 77f021a89a70d934bf70424eaa3c6dc3f7c93a28..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@
FROM paddlepaddle/paddle:latest

COPY start.sh /root/
COPY start_paddle.py /root/
RUN chmod +x /root/start.sh
CMD ["bash", "-c", "/root/start.sh"]
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md deleted file mode 100644 index 96bf65497ffa23e90c4c9350504f86367b48daf2..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/README.md +++ /dev/null @@ -1,5 +0,0 @@
To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:

```
docker build . -t train-image-name
```
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh deleted file mode 100755 index 12dfe1e6386885a6989d3887f21c6922f137a9ae..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start.sh +++ /dev/null @@ -1,19 +0,0 @@
#!/bin/sh

set -eu

jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
cd /root
cp -rf $jobconfig/* .

python /root/start_paddle.py \
  --dot_period=10 \
  --ports_num=$CONF_PADDLE_PORTS_NUM \
  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
  --log_period=50 \
  --num_passes=10 \
  --trainer_count=$TRAINER_COUNT \
  --saving_period=1 \
  --local=0 \
  --config=trainer_config.lr.py \
  --use_gpu=0
diff --git a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py b/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py deleted file mode 100755 index 935c12bb67e1fe08bc135a7a2220fcd43c548482..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/multi_cluster/src/k8s_train/start_paddle.py +++ /dev/null @@ -1,170 +0,0 @@
#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import requests -import time -import socket -import os -import argparse - -# configuration for cluster -API = "/api/v1/namespaces/" -JOBSELECTOR = "labelSelector=job-name=" -JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") -JOB_PATH_OUTPUT = JOB_PATH + "/output" -JOBNAME = os.getenv("JOB_NAME") -NAMESPACE = os.getenv("JOB_NAMESPACE") -PADDLE_NIC = os.getenv("CONF_PADDLE_NIC") -PADDLE_PORT = os.getenv("CONF_PADDLE_PORT") -PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") -PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") -PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") - -tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token' - - -def refine_unknown_args(cmd_args): - ''' - refine unknown parameters to handle some special parameters - ''' - new_args = [] - for arg in cmd_args: - if arg.startswith("--") and arg.find("=") != -1: - equal_pos = arg.find("=") # find first = pos - arglist = list(arg) - arglist[equal_pos] = " " - arg = "".join(arglist) - arg = arg.lstrip("-") - new_args += arg.split(" ") - elif arg.startswith("--") and arg.find("=") == -1: - arg = arg.lstrip("-") - new_args.append(arg) - else: - new_args.append(arg) - return new_args - - -def isPodAllRunning(podlist): - ''' - check all pod is running - ''' - require = len(podlist["items"]) - running = 0 - for pod in podlist["items"]: - if pod["status"]["phase"] == "Running": - running += 1 - print "waiting for pods running, require:", require, "running:", running - if require == running: - return True - return False - - -def getPodList(): - ''' - get all container status of the job - ''' - apiserver = "https://" + \ - os.getenv("KUBERNETES_SERVICE_HOST") + ":" + \ - os.getenv("KUBERNETES_SERVICE_PORT_HTTPS") - - pod = API + NAMESPACE + "/pods?" 
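    # The request assembled below has the form (placeholders, not real values):
    #   https://<KUBERNETES_SERVICE_HOST>:<KUBERNETES_SERVICE_PORT_HTTPS>/api/v1/namespaces/<namespace>/pods?labelSelector=job-name=<job>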
- job = JOBNAME - if os.path.isfile(tokenpath): - tokenfile = open(tokenpath, mode='r') - token = tokenfile.read() - Bearer = "Bearer " + token - headers = {"Authorization": Bearer} - return requests.get(apiserver + pod + JOBSELECTOR + job, - headers=headers, - verify=False).json() - else: - return requests.get(apiserver + pod + JOBSELECTOR + job, - verify=False).json() - - -def getIdMap(podlist): - ''' - generate tainer_id by ip - ''' - ips = [] - for pod in podlist["items"]: - ips.append(pod["status"]["podIP"]) - ips.sort() - idMap = {} - for i in range(len(ips)): - idMap[ips[i]] = i - return idMap - - -def startPaddle(idMap={}, train_args_dict=None): - ''' - start paddle pserver and trainer - ''' - program = 'paddle train' - args = " --nics=" + PADDLE_NIC - args += " --port=" + str(PADDLE_PORT) - args += " --ports_num=" + str(PADDLE_PORTS_NUM) - args += " --comment=" + "paddle_process_by_paddle" - ip_string = "" - for ip in idMap.keys(): - ip_string += (ip + ",") - ip_string = ip_string.rstrip(",") - args += " --pservers=" + ip_string - args_ext = "" - for key, value in train_args_dict.items(): - args_ext += (' --' + key + '=' + value) - localIP = socket.gethostbyname(socket.gethostname()) - trainerId = idMap[localIP] - args += " " + args_ext + " --trainer_id=" + \ - str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT - logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId) - if not os.path.exists(JOB_PATH_OUTPUT): - os.makedirs(JOB_PATH_OUTPUT) - if not os.path.exists(logDir): - os.mkdir(logDir) - copyCommand = 'cp -rf ' + JOB_PATH + \ - "/" + str(trainerId) + "/data/*" + " ./data/" - os.system(copyCommand) - startPserver = 'nohup paddle pserver' + \ - " --port=" + str(PADDLE_PORT) + \ - " --ports_num=" + str(PADDLE_PORTS_NUM) + \ - " --ports_num_for_sparse=" + str(PADDLE_PORTS_NUM_SPARSE) + \ - " --nics=" + PADDLE_NIC + \ - " --comment=" + "paddle_process_by_paddle" + \ - " --num_gradient_servers=" + str(PADDLE_SERVER_NUM) +\ - " > " + logDir + "/server.log 2>&1 &" - print startPserver - os.system(startPserver) - # wait until pservers completely start - time.sleep(20) - startTrainer = program + args + " 2>&1 | tee " + \ - logDir + "/train.log" - print startTrainer - os.system(startTrainer) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - prog="start_paddle.py", description='simple tool for k8s') - args, train_args_list = parser.parse_known_args() - train_args = refine_unknown_args(train_args_list) - train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) - podlist = getPodList() - # need to wait until all pods are running - while not isPodAllRunning(podlist): - time.sleep(20) - podlist = getPodList() - idMap = getIdMap(podlist) - startPaddle(idMap, train_args_dict) diff --git a/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png b/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png deleted file mode 100644 index f41fe48920590333ad332bb51eb18e03dc251541..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/multi_cluster/src/pserver_and_trainer.png and /dev/null differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png b/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png deleted file mode 100644 index 34e476c7beac30fcdde13fccc4cc8d08b4be3d35..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/multi_cluster/src/route53_create_recordset.png and /dev/null differ diff --git a/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png 
b/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png deleted file mode 100644 index 25b7ddb831c5cba97f4b2edddd27da3234d621af..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/multi_cluster/src/route53_create_zone.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png b/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png deleted file mode 100644 index 57eb0265a34ad4223b69600d2a3dd355482e0bf5..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/multi_cluster/src/worker_security_group.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/preparations_cn.md b/doc/v2/howto/cluster/preparations_cn.md deleted file mode 100644 index ce40697e703503b66f6306e15ebdb0ce1329991d..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/preparations_cn.md +++ /dev/null @@ -1,16 +0,0 @@
## 环境准备

1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被称为一个“节点”。
1. 我们需要在集群的所有节点上安装 PaddlePaddle。如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。

安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0, compiled with
    with_avx: ON
    with_gpu: OFF
    with_double: OFF
    with_python: ON
    with_rdma: OFF
    with_timer: OFF
```
diff --git a/doc/v2/howto/cluster/preparations_en.md b/doc/v2/howto/cluster/preparations_en.md deleted file mode 100644 index 4b77b293907ae0548134fc65ceed3aa0ed0b845d..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/preparations_en.md +++ /dev/null @@ -1,17 +0,0 @@
## Preparations

1. Prepare your computer cluster. It is normally a group of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster are called "nodes".
2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install the proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
After installation, you can check the version by typing the command below (run a docker container first if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):

```bash
$ paddle version
PaddlePaddle 0.10.0rc, compiled with
    with_avx: ON
    with_gpu: OFF
    with_double: OFF
    with_python: ON
    with_rdma: OFF
    with_timer: OFF
```
diff --git a/doc/v2/howto/cluster/src/Dockerfile b/doc/v2/howto/cluster/src/Dockerfile deleted file mode 100644 index e178bf4da0f32fca9586b5b69a2c7419de5d9cb1..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/src/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@
FROM paddlepaddle/paddle:latest

MAINTAINER zjsxzong89@gmail.com

COPY start.sh /root/
COPY start_paddle.py /root/
CMD ["bash", "-c", "/root/start.sh"]
\ No newline at end of file
diff --git a/doc/v2/howto/cluster/src/efs_mount.png b/doc/v2/howto/cluster/src/efs_mount.png deleted file mode 100644 index 0f9e3cab98445707e5e9baa18ddabe15cdf04576..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/src/efs_mount.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/src/managed_policy.png b/doc/v2/howto/cluster/src/managed_policy.png deleted file mode 100644 index c7ecda555b81d7750e9292a9ab72d2f517f76a2a..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/src/managed_policy.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/src/ps_cn.png b/doc/v2/howto/cluster/src/ps_cn.png deleted file mode 100644 index f9525739cc8bc6506adde642aafa0a85ae3ebebc..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/src/ps_cn.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/src/ps_en.png b/doc/v2/howto/cluster/src/ps_en.png deleted file mode 100644 index 6537d3d56589ca9f19a77a50a970e4b5275e6ce0..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/src/ps_en.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/src/trainer.png b/doc/v2/howto/cluster/src/trainer.png deleted file mode 100644 index 6537d3d56589ca9f19a77a50a970e4b5275e6ce0..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/src/trainer.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/src/trainer_cn.png b/doc/v2/howto/cluster/src/trainer_cn.png deleted file mode 100644 index f9525739cc8bc6506adde642aafa0a85ae3ebebc..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/cluster/src/trainer_cn.png and /dev/null differ
diff --git a/doc/v2/howto/cluster/src/word2vec/api_train_v2.py b/doc/v2/howto/cluster/src/word2vec/api_train_v2.py deleted file mode 100644 index 9107e24c175f1fbf29d86e222e4b66031a5b505e..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/src/word2vec/api_train_v2.py +++ /dev/null @@ -1,114 +0,0 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
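# This example trains a word2vec-style 5-gram neural language model on the
# imikolov dataset using the PaddlePaddle v2 API; set cluster_train = True in
# main() below to exercise the distributed code path instead of local training.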
- -import gzip -import math - -import paddle.v2 as paddle - -embsize = 32 -hiddensize = 256 -N = 5 - - -def wordemb(inlayer): - wordemb = paddle.layer.embedding( - input=inlayer, - size=embsize, - param_attr=paddle.attr.Param( - name="_proj", - initial_std=0.001, - learning_rate=1, - l2_rate=0, - sparse_update=True)) - return wordemb - - -def main(): - # for local training - cluster_train = False - - if not cluster_train: - paddle.init(use_gpu=False, trainer_count=1) - else: - paddle.init( - use_gpu=False, - trainer_count=2, - port=7164, - ports_num=1, - ports_num_for_sparse=1, - num_gradient_servers=1) - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - firstword = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(dict_size)) - secondword = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(dict_size)) - thirdword = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(dict_size)) - fourthword = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(dict_size)) - nextword = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(dict_size)) - - Efirst = wordemb(firstword) - Esecond = wordemb(secondword) - Ethird = wordemb(thirdword) - Efourth = wordemb(fourthword) - - contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) - hidden1 = paddle.layer.fc(input=contextemb, - size=hiddensize, - act=paddle.activation.Sigmoid(), - layer_attr=paddle.attr.Extra(drop_rate=0.5), - bias_attr=paddle.attr.Param(learning_rate=2), - param_attr=paddle.attr.Param( - initial_std=1. / math.sqrt(embsize * 8), - learning_rate=1)) - predictword = paddle.layer.fc(input=hidden1, - size=dict_size, - bias_attr=paddle.attr.Param(learning_rate=2), - act=paddle.activation.Softmax()) - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - with gzip.open("batch-" + str(event.batch_id) + ".tar.gz", - 'w') as f: - trainer.save_parameter_to_tar(f) - result = trainer.test( - paddle.batch( - paddle.dataset.imikolov.test(word_dict, N), 32)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - - cost = paddle.layer.classification_cost(input=predictword, label=nextword) - - parameters = paddle.parameters.create(cost) - adagrad = paddle.optimizer.AdaGrad( - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) - trainer = paddle.trainer.SGD(cost, - parameters, - adagrad, - is_local=not cluster_train) - trainer.train( - paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), - num_passes=30, - event_handler=event_handler) - - -if __name__ == '__main__': - main() diff --git a/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py b/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py deleted file mode 100644 index 791504094f3ecae925226ff1d90f20f91d4c018d..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/src/word2vec/api_train_v2_cluster.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os -import paddle.v2 as paddle -import pickle - -embsize = 32 -hiddensize = 256 -N = 5 -cluster_train_file = "./train_data_dir/train/train.txt" -cluster_test_file = "./test_data_dir/test/test.txt" -node_id = os.getenv("OMPI_COMM_WORLD_RANK") -if not node_id: - raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") - - -def wordemb(inlayer): - wordemb = paddle.layer.embedding( - input=inlayer, - size=embsize, - param_attr=paddle.attr.Param( - name="_proj", - initial_std=0.001, - learning_rate=1, - l2_rate=0, - sparse_update=True)) - return wordemb - - -def cluster_reader_cluster(filename, node_id): - def cluster_reader(): - with open("-".join([filename, "%05d" % int(node_id)]), "r") as f: - for l in f: - csv_data = [int(cell) for cell in l.split(",")] - yield tuple(csv_data) - - return cluster_reader - - -def main(): - # get arguments from env - - # for local training - TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"] - cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH - use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") - - if not cluster_train: - paddle.init( - use_gpu=use_gpu, - trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1"))) - else: - paddle.init( - use_gpu=use_gpu, - trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")), - port=int(os.getenv("PADDLE_INIT_PORT", "7164")), - ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")), - ports_num_for_sparse=int( - os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")), - num_gradient_servers=int( - os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")), - trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")), - pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1")) - fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") - word_dict = pickle.load(fn) - fn.close() - dict_size = len(word_dict) - firstword = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(dict_size)) - secondword = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(dict_size)) - thirdword = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(dict_size)) - fourthword = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(dict_size)) - nextword = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(dict_size)) - - Efirst = wordemb(firstword) - Esecond = wordemb(secondword) - Ethird = wordemb(thirdword) - Efourth = wordemb(fourthword) - - contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) - hidden1 = paddle.layer.fc(input=contextemb, - size=hiddensize, - act=paddle.activation.Sigmoid(), - layer_attr=paddle.attr.Extra(drop_rate=0.5), - bias_attr=paddle.attr.Param(learning_rate=2), - param_attr=paddle.attr.Param( - initial_std=1. 
/ math.sqrt(embsize * 8), - learning_rate=1)) - predictword = paddle.layer.fc(input=hidden1, - size=dict_size, - bias_attr=paddle.attr.Param(learning_rate=2), - act=paddle.activation.Softmax()) - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - result = trainer.test( - paddle.batch( - cluster_reader_cluster(cluster_test_file, node_id), 32)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - - cost = paddle.layer.classification_cost(input=predictword, label=nextword) - parameters = paddle.parameters.create(cost) - adagrad = paddle.optimizer.AdaGrad( - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) - trainer = paddle.trainer.SGD(cost, - parameters, - adagrad, - is_local=not cluster_train) - trainer.train( - paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32), - num_passes=30, - event_handler=event_handler) - - -if __name__ == '__main__': - main() diff --git a/doc/v2/howto/cluster/src/word2vec/prepare.py b/doc/v2/howto/cluster/src/word2vec/prepare.py deleted file mode 100644 index a42548fbf03a0298e1e397c868e4d531801ec89a..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cluster/src/word2vec/prepare.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
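# This helper builds the imikolov word dictionary, saves it to
# word_dict.pickle, writes the train/test sets out as CSV lines, and then
# splits each file into SPLIT_COUNT shards (train.txt-00000, ...) so that
# each cluster node can read its own shard.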
import paddle.v2 as paddle
import tarfile
import os
import pickle

SPLIT_COUNT = 3
N = 5


def file_len(fd):
    for i, l in enumerate(fd):
        pass
    return i + 1


def split_from_reader_by_line(filename, reader, split_count):
    fn = open(filename, "w")
    for batch_id, batch_data in enumerate(reader()):
        batch_data_str = [str(d) for d in batch_data]
        fn.write(",".join(batch_data_str))
        fn.write("\n")
    fn.close()

    fn = open(filename, "r")
    total_line_count = file_len(fn)
    fn.close()
    per_file_lines = total_line_count / split_count + 1
    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
    os.system(cmd)


word_dict = paddle.dataset.imikolov.build_dict()
with open("word_dict.pickle", "w") as dict_f:
    pickle.dump(word_dict, dict_f)

split_from_reader_by_line("train.txt",
                          paddle.dataset.imikolov.train(word_dict, N),
                          SPLIT_COUNT)
split_from_reader_by_line("test.txt",
                          paddle.dataset.imikolov.test(word_dict, N),
                          SPLIT_COUNT)
diff --git a/doc/v2/howto/cmd_parameter/arguments_cn.md b/doc/v2/howto/cmd_parameter/arguments_cn.md deleted file mode 100644 index 2dea231ca5487978d59a4d0a570431722ed6b3bf..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cmd_parameter/arguments_cn.md +++ /dev/null @@ -1,394 +0,0 @@
# 参数概述

虽然Paddle看起来包含了众多参数,但是大部分参数是为开发者提供的,或者已经在集群提交环境中自动设置,因此用户并不需要关心它们。在此,根据这些参数的使用场合,我们将它们划分为不同的类别。例如,`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中,而有些参数需要在集群多机训练中使用等。
下表按类别列出全部参数;每个参数适用于本地训练、集群训练、本地测试、集群测试四种场合中的一种或多种,各参数的具体适用范围和含义请参考后面的细节描述。

| 参数类别 | 参数 |
| -------- | ---- |
| 通用 | `job`、`use_gpu`、`local`、`config`、`config_args`、`num_passes`、`trainer_count`、`version`、`show_layer_stat` |
| 训练 | `dot_period`、`test_period`、`saving_period`、`show_parameter_stats_period`、`init_model_path`、`load_missing_parameter_strategy`、`saving_period_by_batches`、`use_old_updater`、`enable_grad_share`、`grad_share_block_num`、`log_error_clipping`、`log_clipping`、`save_only_one`、`start_pass` |
| 训练/测试 | `save_dir` |
| 训练过程中测试 | `test_period`、`average_test_period` |
| 测试 | `model_list`、`test_wait`、`test_pass`、`predict_output_dir`、`distribute_test` |
| Auc/正负对验证(PnpairValidation) | `predict_file` |
| GPU | `gpu_id`、`parallel_nn`、`allow_only_one_model_on_one_gpu`、`cudnn_dir`、`cuda_dir`、`cudnn_conv_workspace_limit_in_mb` |
| 递归神经网络(RNN) | `beam_size`、`rnn_use_batch`、`prev_batch_state`、`diy_beam_search_prob_so` |
| 参数服务器(PServer) | `start_pserver`、`pservers`、`port`、`ports_num`、`ports_num_for_sparse`、`nics`、`rdma_tcp`、`small_messages`、`loadsave_parameters_in_pserver`、`log_period_server`、`pserver_num_threads`、`sock_send_buf_size`、`sock_recv_buf_size`、`num_gradient_servers`、`parameter_block_size`、`parameter_block_size_for_sparse` |
| 异步随机梯度下降(Async SGD) | `async_count`、`async_lagged_ratio_min`、`async_lagged_ratio_default` |
| 性能调优(Performance Tuning) | `log_barrier_abstract`、`log_barrier_lowest_nodes`、`log_barrier_show_log`、`check_sparse_distribution_batches`、`check_sparse_distribution_ratio`、`check_sparse_distribution_unbalance_degree`、`check_sparse_distribution_in_pserver`、`show_check_sparse_distribution_log` |
| 数据提供器(Data Provider) | `memory_threshold_on_load_data` |
| 随机数 | `seed`、`thread_local_rand_use_global_seed` |
| 单元测试 | `checkgrad_eps` |
| 矩阵/向量 | `enable_parallel_vector` |
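在进入细节描述之前,下面给出一个示意性的命令行,展示上表中`通用`与`训练`类别的部分参数如何组合使用(参数取值仅作演示,`trainer_config.py`为假设的网络配置文件名):

```bash
# 参数取值仅作演示,各参数含义见细节描述
paddle train \
  --config=trainer_config.py \
  --use_gpu=0 \
  --trainer_count=4 \
  --num_passes=10 \
  --save_dir=./output \
  --log_period=100
```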
diff --git a/doc/v2/howto/cmd_parameter/arguments_en.md b/doc/v2/howto/cmd_parameter/arguments_en.md deleted file mode 100644 index d1963067bda949b11ececefed3db7db1432c6223..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cmd_parameter/arguments_en.md +++ /dev/null @@ -1,394 +0,0 @@
# Argument Outline

It looks like there are a lot of arguments. However, most of them are for developers or are already set automatically in the cluster submitting environment, so users do not need to care about them. Here, we divide these arguments into several classes according to the scenario they are used in. For example, the arguments in `common` can be used in all scenarios, some arguments can only be used in certain layers, some are needed by multi-machine training in a cluster, etc.
The table below lists all arguments by class; each argument applies to one or more of the four scenarios (local train, cluster train, local test, cluster test), and the detailed description gives each argument's exact scope and meaning.

| Class | Arguments |
| ----- | --------- |
| common | `job`, `use_gpu`, `local`, `config`, `config_args`, `num_passes`, `trainer_count`, `version`, `show_layer_stat` |
| train | `dot_period`, `test_period`, `saving_period`, `show_parameter_stats_period`, `init_model_path`, `load_missing_parameter_strategy`, `saving_period_by_batches`, `use_old_updater`, `enable_grad_share`, `grad_share_block_num`, `log_error_clipping`, `log_clipping`, `save_only_one`, `start_pass` |
| train/test | `save_dir` |
| testing during training | `test_period`, `average_test_period` |
| test | `model_list`, `test_wait`, `test_pass`, `predict_output_dir`, `distribute_test` |
| Auc/PnpairValidation | `predict_file` |
| GPU | `gpu_id`, `parallel_nn`, `allow_only_one_model_on_one_gpu`, `cudnn_dir`, `cuda_dir`, `cudnn_conv_workspace_limit_in_mb` |
| RNN | `beam_size`, `rnn_use_batch`, `prev_batch_state`, `diy_beam_search_prob_so` |
| PServer | `start_pserver`, `pservers`, `port`, `ports_num`, `ports_num_for_sparse`, `nics`, `rdma_tcp`, `small_messages`, `loadsave_parameters_in_pserver`, `log_period_server`, `pserver_num_threads`, `sock_send_buf_size`, `sock_recv_buf_size`, `num_gradient_servers`, `parameter_block_size`, `parameter_block_size_for_sparse` |
| Async SGD | `async_count`, `async_lagged_ratio_min`, `async_lagged_ratio_default` |
| Performance Tuning | `log_barrier_abstract`, `log_barrier_lowest_nodes`, `log_barrier_show_log`, `check_sparse_distribution_batches`, `check_sparse_distribution_ratio`, `check_sparse_distribution_unbalance_degree`, `check_sparse_distribution_in_pserver`, `show_check_sparse_distribution_log` |
| Data Provider | `memory_threshold_on_load_data` |
| RandomNumber | `seed`, `thread_local_rand_use_global_seed` |
| UnitTest | `checkgrad_eps` |
| Matrix/Vector | `enable_parallel_vector` |
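As a quick orientation, the sketch below shows how a few of the `common` and `train` arguments above combine on a command line; the flag values are illustrative only, and `trainer_config.py` is a hypothetical network configuration file:

```bash
# Illustrative values; see the detail description for each flag.
paddle train \
  --config=trainer_config.py \
  --use_gpu=0 \
  --trainer_count=4 \
  --num_passes=10 \
  --save_dir=./output \
  --log_period=100
```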
- diff --git a/doc/v2/howto/cmd_parameter/detail_introduction_cn.md b/doc/v2/howto/cmd_parameter/detail_introduction_cn.md deleted file mode 100644 index b4625ba68cf23e5697554ba94efaf0b873f2c1de..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cmd_parameter/detail_introduction_cn.md +++ /dev/null @@ -1,323 +0,0 @@ -# 细节描述 - -## 通用 - -* `--job` - - 工作模式,包括: **train, test, checkgrad**,其中checkgrad主要为开发者使用,使用者不需要关心。 - - 类型: string (默认: train) - -* `--config` - - 用于指定网络配置文件。 - - 类型: string (默认: null). - -* `--use_gpu` - - 训练过程是否使用GPU,设置为true使用GPU模式,否则使用CPU模式。 - - 类型: bool (默认: 1). - -* `--local` -  - 训练过程是否为本地模式,设置为true使用本地训练或者使用集群上的一个节点,否则使用多机训练。 - - 类型: bool (默认: 1). - -* `--trainer_count` - - 指定一台机器上使用的线程数。例如,trainer_count = 4, 意思是在GPU模式下使用4个GPU,或者在CPU模式下使用4个线程。每个线程(或GPU)分配到当前数据块样本数的四分之一。也就是说,如果在训练配置中设置batch_size为512,每个线程分配到128个样本用于训练。 - - 类型: int32 (默认: 1). - -* `--num_passes` - - 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时,意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。 - - 类型: int32 (默认: 100). - -* `--config_args` - - 传递给配置文件的参数。格式: key1=value1,key2=value2. - - 类型: string (默认: null). - -* `--version` - - 是否打印版本信息。 - - 类型: bool (默认: 0). - -* `--show_layer_stat` - - 是否显示**每个批次数据**中每层的数值统计. - - 类型: bool (默认: 0). - -## 训练 - -* `--log_period` - - 每log_period个批次打印日志进度. - - 类型: int32 (默认: 100). - -* `--dot_period` - - 每dot_period个批次输出符号'.'. - - 类型: int32 (默认: 1). - -* `--saving_period` - - 每saving_period轮保存训练参数. - - 类型: int32 (默认: 1). - -* `--save_dir` - - 保存模型参数的目录,需要明确指定,但不需要提前创建。 - - 类型: string (默认: null). - -* `--start_pass` - - 从start_pass轮开始训练,会加载上一轮的参数。 - - 类型: int32 (默认: 0). - -* `--show_parameter_stats_period` - - 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。 - - 类型: int32 (默认: 0). - -* `--save_only_one` - - 只保存最后一轮的参数,而之前的参数将会被删除。 - - 类型: bool (默认: 0). - -* `--load_missing_parameter_strategy` - - 当模型参数不存在时,指定加载的方式。目前支持fail/rand/zero三种操作. - - `fail`: 程序直接退出. - - `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数. - - `zero`: 所有参数置为零. - - 类型: string (默认: fail). - -* `--init_model_path` - - 初始化模型的路径。如果设置该参数,start\_pass将不起作用。同样也可以在测试模式中指定模型路径。 - - 类型: string (默认: null). - -* `--saving_period_by_batches` - - 在一轮中每saving_period_by_batches个批次保存一次参数。 - - 类型: int32 (默认: 0). - -* `--log_error_clipping` - - 当在网络层配置中设置**error_clipping_threshold**时,该参数指示是否打印错误截断日志。如果为true,**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**. - - 类型: bool (默认: 0). - -* `--log_clipping` - - 当在训练配置中设置**gradient_clipping_threshold**时,该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**. - - 类型: bool (默认: 0). - -* `--use_old_updater` - - 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater,主要为开发者使用,使用者通常无需关心. - - 类型: bool (默认: 0). - -* `--enable_grad_share` - - 启用梯度参数的阈值,在多CPU训练时共享该参数. - - 类型: int32 (默认: 100 \* 1024 \* 1024). - -* `--grad_share_block_num` - - 梯度参数的分块数目,在多CPU训练时共享该参数. - - 类型: int32 (默认: 64). - -## 测试 - -* `--test_pass` - - 加载test_pass轮的模型用于测试. - - 类型: int32 (默认: -1). - -* `--test_period` - - 如果为0,每轮结束时对所有测试数据进行测试;如果不为0,每test_period个批次对所有测试数据进行测试. - - 类型: int32 (默认: 0). - -* `--test_wait` - - 指示当指定轮的测试模型不存在时,是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试,可以使用该参数. - - 类型: bool (默认: 0). - -* `--model_list` - - 测试时指定的存储模型列表的文件. - - 类型: string (默认: "", null). - -* `--predict_output_dir` - - 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定,默认为null,意思是不保存结果。在测试阶段,如果你想要保存某些层的特征图,请指定该目录。需要注意的是,网络层的输出是经过激活函数之后的值. - - 类型: string (默认: "", null). 
- -* `--average_test_period` - - 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除,默认为0,意思是不使用平均参数执行测试. - - 类型: int32 (默认: 0). - -* `--distribute_test` - - 在分布式环境中测试,将多台机器的测试结果合并. - - 类型: bool (默认: 0). - -* `--predict_file` - - 保存预测结果的文件名。该参数默认为null,意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层,每轮都会保存预测结果. - - 类型: string (默认: "", null). - -## GPU - -* `--gpu_id` - - 指示使用哪个GPU核. - - 类型: int32 (默认: 0). - -* `--allow_only_one_model_on_one_gpu` - - 如果为true,一个GPU设备上不允许配置多个模型. - - 类型: bool (默认: 1). - -* `--parallel_nn` - - 指示是否使用多线程来计算一个神经网络。如果为false,设置gpu_id指定使用哪个GPU核(训练配置中的设备属性将会无效)。如果为true,GPU核在训练配置中指定(gpu_id无效). - - 类型: bool (默认: 0). - -* `--cudnn_dir` - - 选择路径来动态加载NVIDIA CuDNN库,例如,/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH - - 类型: string (默认: "", null) - -* `--cuda_dir` - - 选择路径来动态加载NVIDIA CUDA库,例如,/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH - - 类型: string (默认: "", null) - -* `--cudnn_conv_workspace_limit_in_mb` - - 指定cuDNN的最大工作空间容限,单位是MB,默认为4096MB=4GB. - - 类型: int32 (默认: 4096MB=4GB) - -## 自然语言处理(NLP): RNN/LSTM/GRU -* `--rnn_use_batch` - - 指示在简单的RecurrentLayer层的计算中是否使用批处理方法. - - 类型: bool (默认: 0). - -* `--prev_batch_state` - - 标识是否为连续的batch计算. - - 类型: bool (默认: 0). - -* `--beam_size` - - 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上,都会产生当前层状态的所有继承结果,按启发式损失的大小递增排序。然而,每层上只能保存固定数目个最好的状态,该数目是提前定义好的,称之为集束大小. - - 类型: int32 (默认: 1). - -* `--diy_beam_search_prob_so` -  - 用户可以自定义beam search的方法,编译成动态库,供PaddlePaddle加载。 该参数用于指定动态库路径. - - 类型: string (默认: "", null). - -## 数据支持(DataProvider) - -* `--memory_threshold_on_load_data` - - 内存容限阈值,当超过该阈值时,停止加载数据. - - 类型: double (默认: 1.0). - -## 单元测试 - -* `--checkgrad_eps` - - 使用checkgrad模式时的参数变化大小. - - 类型: double (默认: 1e-05). - -## 参数服务器和分布式通信 - -* `--start_pserver` - - 指示是否开启参数服务器(parameter server). - - 类型: bool (默认: 0). - -* `--pservers` - - 参数服务器的IP地址,以逗号间隔. - - 类型: string (默认: "127.0.0.1"). - -* `--port` - - 参数服务器的监听端口. - - 类型: int32 (默认: 20134). - -* `--ports_num` - - 发送参数的端口号,根据默认端口号递增. - - 类型: int32 (默认: 1). - -* `--trainer_id` -  - 在分布式训练中,每个训练节点必须指定一个唯一的id号,从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数. - - 类型: int32 (默认: 0). - -* `--num_gradient_servers` - - 梯度服务器的数量,该参数在集群提交环境中自动设置. - - 类型: int32 (默认: 1). - -* `--small_messages` - - 如果消息数据太小,建议将该参数设为true,启动快速应答,无延迟. - - 类型: bool (默认: 0). - -* `--sock_send_buf_size` - - 限制套接字发送缓冲区的大小。如果仔细设置的话,可以有效减小网络的阻塞. - - 类型: int32 (默认: 1024 \* 1024 \* 40). - -* `--sock_recv_buf_size` - - 限制套接字接收缓冲区的大小. - - 类型: int32 (默认: 1024 \* 1024 \* 40). - -* `--parameter_block_size` - - 参数服务器的参数分块大小。如果未设置,将会自动计算出一个合适的值. - - 类型: int32 (默认: 0). - -* `--parameter_block_size_for_sparse` - - 参数服务器稀疏更新的参数分块大小。如果未设置,将会自动计算出一个合适的值. - - 类型: int32 (默认: 0). - -* `--log_period_server` - - 在参数服务器终端每log_period_server个批次打印日志进度. - - 类型: int32 (默认: 500). - -* `--loadsave_parameters_in_pserver` - - 在参数服务器上加载和保存参数,只有当设置了sparse_remote_update参数时才有效. - - 类型: bool (默认: 0). - -* `--pserver_num_threads` - - 同步执行操作的线程数. - - 类型: bool (默认: 1). - -* `--ports_num_for_sparse` - - 发送参数的端口号,根据默认值递增(port + ports_num),用于稀疏训练中. - - 类型: int32 (默认: 0). - -* `--nics` - - 参数服务器的网络设备名称,已经在集群提交环境中完成设置. - - 类型: string (默认: "xgbe0,xgbe1"). - -* `--rdma_tcp` - - 使用rdma还是tcp传输协议,该参数已经在集群提交环境中完成设置. - - 类型: string (默认: "tcp"). - -## 异步随机梯度下降(Async SGD) -* `--async_count` - - 定义异步训练的长度,如果为0,则使用同步训练. - - 类型: int32 (默认: 0). - -* `--async_lagged_ratio_min` - - 控制`config_.async_lagged_grad_discard_ratio()`的最小值. - - 类型: double (默认: 1.0). - -* `--async_lagged_ratio_default` - - 如果在网络配置中未设置async_lagged_grad_discard_ratio,则使用该参数作为默认值. - - 类型: double (默认: 1.5). 
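下面给出一个示意性的命令,展示上述参数服务器相关参数在手动启动pserver进程时的组合方式(取值仅作演示,实际取值请参考集群提交环境的设置):

```bash
# 手动启动一个参数服务器进程,取值仅作演示
paddle pserver \
  --port=20134 \
  --ports_num=1 \
  --ports_num_for_sparse=1 \
  --nics=eth0 \
  --comment=paddle_process_by_paddle \
  --num_gradient_servers=2
```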
## 性能调优(Performance Tuning)

* `--log_barrier_abstract`
  - 如果为true,则显示阻隔性能的摘要信息.
  - 类型: bool (默认: 1).

* `--log_barrier_show_log`
  - 如果为true,则总会显示阻隔摘要信息,即使间隔很小.
  - 类型: bool (默认: 0).

* `--log_barrier_lowest_nodes`
  - 最少显示多少个节点.
  - 类型: int32 (默认: 5).

* `--check_sparse_distribution_in_pserver`
  - 指示是否检查所有参数服务器上的稀疏参数的分布是均匀的.
  - 类型: bool (默认: 0).

* `--show_check_sparse_distribution_log`
  - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
  - 类型: bool (默认: 0).

* `--check_sparse_distribution_batches`
  - 每运行多少个批次执行一次稀疏参数分布的检查.
  - 类型: int32 (默认: 100).

* `--check_sparse_distribution_ratio`
  - 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio * check_sparse_distribution_batches次,程序停止.
  - 类型: double (默认: 0.6).

* `--check_sparse_distribution_unbalance_degree`
  - 不同参数服务器上数据大小的最大值与最小值的比率.
  - 类型: double (默认: 2).

## 矩阵/向量/随机数
* `--enable_parallel_vector`
  - 启动并行向量的阈值.
  - 类型: int32 (默认: 0).

* `--seed`
  - 随机数的种子。设为0时使用srand(time)作为种子.
  - 类型: int32 (默认: 1)

* `--thread_local_rand_use_global_seed`
  - 是否将全局种子应用于本地线程的随机数.
  - 类型: bool (默认: 0).
diff --git a/doc/v2/howto/cmd_parameter/detail_introduction_en.md b/doc/v2/howto/cmd_parameter/detail_introduction_en.md deleted file mode 100644 index b681ebc81a355dfc1a7638a4463dff6979929a45..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cmd_parameter/detail_introduction_en.md +++ /dev/null @@ -1,327 +0,0 @@
```eval_rst
.. _cmd_detail_introduction:
```

# Detail Description

## Common

* `--job`
  - Job mode, including: **train, test, checkgrad**, where checkgrad is mainly for developers and users do not need to care about it.
  - type: string (default: train)

* `--config`
  - Used to specify the network configuration file.
  - type: string (default: null).

* `--use_gpu`
  - Whether to use the GPU for training; false is CPU mode and true is GPU mode.
  - type: bool (default: 1).

* `--local`
  - Whether the training is in local mode or not. True when training locally or using one node in a cluster. False when using multiple machines in a cluster.
  - type: bool (default: 1).

* `--trainer_count`
  - Defines the number of threads used on one machine. For example, trainer_count = 4 means using 4 GPUs in GPU mode and 4 threads in CPU mode. Each thread (or GPU) is assigned 1/4 of the samples in the current batch. That is to say, if batch_size is set to 512 in the trainer config, each thread trains on 128 samples.
  - type: int32 (default: 1).

* `--num_passes`
  - When `--job=train`, it means training for num_passes passes. One pass means training on all samples in the dataset one time. When `--job=test`, it means testing data from the model of test_pass to the model of (num_passes - 1).
  - type: int32 (default: 100).

* `--config_args`
  - Arguments passed to the config file. Format: key1=value1,key2=value2.
  - type: string (default: null).

* `--version`
  - Whether to print version information.
  - type: bool (default: 0).

* `--show_layer_stat`
  - Whether to show the statistics of each layer **per batch**.
  - type: bool (default: 0).

## Train

* `--log_period`
  - Log progress every log_period batches.
  - type: int32 (default: 100).

* `--dot_period`
  - Print '.' every dot_period batches.
  - type: int32 (default: 1).

* `--saving_period`
  - Save parameters every saving_period passes.
  - type: int32 (default: 1).

* `--save_dir`
  - Directory for saving model parameters. It needs to be specified, but does not need to be created in advance.
  - type: string (default: null).

* `--start_pass`
  - Start training from this pass. It will load the parameters from the previous pass.
  - type: int32 (default: 0).

* `--show_parameter_stats_period`
  - Show parameter statistics during training every show_parameter_stats_period batches. They are not shown by default.
  - type: int32 (default: 0).

* `--save_only_one`
  - Save the parameters only for the last pass, while the previous parameters will be removed.
  - type: bool (default: 0).

* `--load_missing_parameter_strategy`
  - Specify the loading operation when the model file is missing. Now supports the three operations fail/rand/zero.
    - `fail`: the program will exit.
    - `rand`: uniform or normal distribution according to **initial\_strategy** in the network config. The uniform range is: **[mean - std, mean + std]**, where mean and std are configured in the trainer config.
    - `zero`: all parameters are zero.
  - type: string (default: fail).

* `--init_model_path`
  - Path of the initialization model. If it is set, start\_pass will be ignored. It can be used to specify the model path in testing mode as well.
  - type: string (default: null).

* `--saving_period_by_batches`
  - Save parameters every saving_period_by_batches batches in one pass.
  - type: int32 (default: 0).

* `--log_error_clipping`
  - Whether to print the error clipping log when **error_clipping_threshold** is set in a layer config. If true, the log will be printed in backward propagation **per batch**. This clipping affects the **gradient of the output**.
  - type: bool (default: 0).

* `--log_clipping`
  - Whether to print the log clipping message when **gradient_clipping_threshold** is set in the trainer config. This clipping affects the **gradient w.r.t. (with respect to) the weight**.
  - type: bool (default: 0).

* `--use_old_updater`
  - Whether to use the old RemoteParameterUpdater. ConcurrentRemoteParameterUpdater is used by default. It is mainly for developers, and users usually do not need to care about it.
  - type: bool (default: 0).

* `--enable_grad_share`
  - Threshold for enabling the gradient parameter, which is shared in batch multi-CPU training.
  - type: int32 (default: 100 \* 1024 \* 1024).

* `--grad_share_block_num`
  - Block number of the gradient parameter, which is shared in batch multi-CPU training.
  - type: int32 (default: 64).

## Test

* `--test_pass`
  - Load the parameters from this pass to test.
  - type: int32 (default: -1).

* `--test_period`
  - If 0, test on all test data at the end of each pass; if non-zero, test on all test data every test_period batches.
  - type: int32 (default: 0).

* `--test_wait`
  - Whether to wait for the parameters of a pass if they do not exist yet. It can be used when the user launches another process to perform testing during the training process.
  - type: bool (default: 0).

* `--model_list`
  - File that saves the model list for testing.
  - type: string (default: "", null).

* `--predict_output_dir`
  - Directory that saves the layer output. It is configured in Outputs() in the network config. By default this argument is null, meaning save nothing. Specify this directory if you want to save the feature map of some layers in testing mode. Note that layer outputs are the values after the activation function.
  - type: string (default: "", null).

* `--average_test_period`
  - Test on the averaged parameters every `average_test_period` batches. It MUST be divisible by FLAGS_log_period. The default 0 means do not test on the averaged parameters.
  - type: int32 (default: 0).

* `--distribute_test`
  - Testing in a distributed environment will merge the results from multiple machines.
  - type: bool (default: 0).
* `--predict_file`
  - File name for saving the predicted result. By default this argument is null, meaning save nothing. Currently this argument is only used by AucValidationLayer and PnpairValidationLayer, which save the predicted result every pass.
  - type: string (default: "", null).

## GPU

* `--gpu_id`
  - Which GPU core to use.
  - type: int32 (default: 0).

* `--allow_only_one_model_on_one_gpu`
  - If true, do not allow multiple models on one GPU device.
  - type: bool (default: 1).

* `--parallel_nn`
  - Whether to use multiple threads to calculate one neural network or not. If false, gpu_id specifies which GPU core to use (the device property in the trainer config will be ignored). If true, the GPU core is specified in the trainer config (gpu_id will be ignored).
  - type: bool (default: 0).

* `--cudnn_dir`
  - Choose the path to dynamically load the NVIDIA cuDNN library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
  - type: string (default: "", null)

* `--cuda_dir`
  - Choose the path to dynamically load the NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
  - type: string (default: "", null)

* `--cudnn_conv_workspace_limit_in_mb`
  - Specify the cuDNN max workspace limit in MB, 4096MB=4GB by default.
  - type: int32 (default: 4096MB=4GB)

## NLP: RNN/LSTM/GRU
* `--rnn_use_batch`
  - Whether to use the batch method for calculation in a simple RecurrentLayer.
  - type: bool (default: 0).

* `--prev_batch_state`
  - Whether the batch is a continuation of the previous batch.
  - type: bool (default: 0).

* `--beam_size`
  - Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level, sorting them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level (called the beam size).
  - type: int32 (default: 1).

* `--diy_beam_search_prob_so`
  - Specify a shared dynamic library implementing a custom beam search method. It can be defined outside of Paddle by the user.
  - type: string (default: "", null).

## DataProvider

* `--memory_threshold_on_load_data`
  - Stop loading data when memory is not sufficient.
  - type: double (default: 1.0).

## Unit Test

* `--checkgrad_eps`
  - Parameter change size for checkgrad.
  - type: double (default: 1e-05).

## Parameter Server and Distributed Communication

* `--start_pserver`
  - Whether to start the pserver (parameter server).
  - type: bool (default: 0).

* `--pservers`
  - Comma-separated IP addresses of pservers.
  - type: string (default: "127.0.0.1").

* `--port`
  - Listening port for the pserver.
  - type: int32 (default: 20134).

* `--ports_num`
  - The number of ports for parameter sending, incremented from the default port number.
  - type: int32 (default: 1).

* `--trainer_id`
  - In distributed training, each trainer must be given a unique id ranging from 0 to num_trainers-1. Trainer 0 is the master trainer. Users do not need to care about this flag.
  - type: int32 (default: 0).

* `--num_gradient_servers`
  - Number of gradient servers. This argument is set automatically in the cluster submitting environment.
  - type: int32 (default: 1).

* `--small_messages`
  - If the message size is small, it is recommended to set this to true to enable quick ACK with no delay.
  - type: bool (default: 0).

* `--sock_send_buf_size`
  - Restrict the socket send buffer size. It can reduce network congestion if set carefully.
  - type: int32 (default: 1024 \* 1024 \* 40).

* `--sock_recv_buf_size`
  - Restrict the socket receive buffer size.
  - type: int32 (default: 1024 \* 1024 \* 40).

* `--parameter_block_size`
  - Parameter block size for the pserver; a suitable value will be calculated automatically if it is not set.
  - type: int32 (default: 0).

* `--parameter_block_size_for_sparse`
  - Parameter block size for the sparse update pserver; a suitable value will be calculated automatically if it is not set.
  - type: int32 (default: 0).

* `--log_period_server`
  - Log progress every log_period_server batches on the pserver end.
  - type: int32 (default: 500).

* `--loadsave_parameters_in_pserver`
  - Load and save parameters in the pserver. Only works when the parameter is set with sparse_remote_update.
  - type: bool (default: 0).

* `--pserver_num_threads`
  - Number of threads for synchronous op execution.
  - type: bool (default: 1).

* `--ports_num_for_sparse`
  - The number of ports for parameter sending, incremented from the default (port + ports_num). It is used by sparse training.
  - type: int32 (default: 0).

* `--nics`
  - Network device name for pservers, already set in the cluster submitting environment.
  - type: string (default: "xgbe0,xgbe1").

* `--rdma_tcp`
  - Use the rdma or tcp transport protocol, already set in the cluster submitting environment.
  - type: string (default: "tcp").

## Async SGD
* `--async_count`
  - Defines the asynchronous training length; if 0, synchronized training is used.
  - type: int32 (default: 0).

* `--async_lagged_ratio_min`
  - Controls the minimum value of `config_.async_lagged_grad_discard_ratio()`.
  - type: double (default: 1.0).

* `--async_lagged_ratio_default`
  - If async_lagged_grad_discard_ratio is not set in the network config, use this as the default value.
  - type: double (default: 1.5).

## Performance Tuning

* `--log_barrier_abstract`
  - If true, show the abstract barrier performance information.
  - type: bool (default: 1).

* `--log_barrier_show_log`
  - If true, always show the barrier abstract even with a small gap.
  - type: bool (default: 0).

* `--log_barrier_lowest_nodes`
  - How many of the lowest nodes will be logged.
  - type: int32 (default: 5).

* `--check_sparse_distribution_in_pserver`
  - Whether to check that the distribution of sparse parameters on all pservers is balanced.
  - type: bool (default: 0).

* `--show_check_sparse_distribution_log`
  - Show log details for the sparse parameter distribution in the pserver.
  - type: bool (default: 0).

* `--check_sparse_distribution_batches`
  - Run the sparse parameter distribution check every this many batches.
  - type: int32 (default: 100).

* `--check_sparse_distribution_ratio`
  - If the parameters dispatched to different pservers have an unbalanced distribution check_sparse_distribution_ratio * check_sparse_distribution_batches times, crash the program.
  - type: double (default: 0.6).

* `--check_sparse_distribution_unbalance_degree`
  - The ratio of the maximum data size to the minimum data size across different pservers.
  - type: double (default: 2).

## Matrix/Vector/RandomNumber
* `--enable_parallel_vector`
  - Threshold for enabling the parallel vector.
  - type: int32 (default: 0).

* `--seed`
  - Random number seed. 0 means using srand(time).
  - type: int32 (default: 1)

* `--thread_local_rand_use_global_seed`
  - Whether to use the global seed for thread-local random number generation.
  - type: bool (default: 0).
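To make the parameter-server flags above concrete, here is a sketch of starting a pserver process by hand; the values are illustrative only and mirror the ones used by the tutorial's start_paddle.py script, not a recommended production setup:

```bash
# Illustrative values; see the flag descriptions above.
paddle pserver \
  --port=20134 \
  --ports_num=1 \
  --ports_num_for_sparse=1 \
  --nics=eth0 \
  --comment=paddle_process_by_paddle \
  --num_gradient_servers=2
```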
diff --git a/doc/v2/howto/cmd_parameter/index_cn.rst b/doc/v2/howto/cmd_parameter/index_cn.rst
deleted file mode 100644
index 6900bb1443e611d326e8d5640e794ac2b9079beb..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cmd_parameter/index_cn.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-.. _cmd_line_index:
-
-命令行参数设置
-===============
-深度学习算法的实现有着多样化的特点,运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。
-
-在这一部分,首先以几个实际场景为例,展示了部分命令行参数的使用:
-
-.. toctree::
-   :maxdepth: 1
-
-   use_case_cn.md
-
-接着对所有参数的使用场合进行概述和分类:
-
-.. toctree::
-   :maxdepth: 1
-
-   arguments_cn.md
-
-最后给出细节描述,详细解释这些参数的属性和意义:
-
-.. toctree::
-   :maxdepth: 1
-
-   detail_introduction_cn.md
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
deleted file mode 100644
index f49683948ef78f363e2439cc25332431830eeb24..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cmd_parameter/index_en.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-.. _cmd_line_index:
-
-Set Command-line Parameters
-===========================
-The implementation of deep learning algorithms involves many varying factors, such as the running environment, the running stage, the model structure, and the training strategy. PaddlePaddle lets users flexibly set various command-line parameters to control the model training or prediction process.
-
-In this part, we first take several practical scenarios as examples to show the use of some command-line parameters:
-
-.. toctree::
-   :maxdepth: 1
-
-   use_case_en.md
-
-Then, we summarize and classify the use of all command-line parameters:
-
-.. toctree::
-   :maxdepth: 1
-
-   arguments_en.md
-
-Finally, detailed descriptions are given, explaining the properties and significance of these command-line parameters:
-
-..
toctree:: - :maxdepth: 1 - - detail_introduction_en.md diff --git a/doc/v2/howto/cmd_parameter/use_case_cn.md b/doc/v2/howto/cmd_parameter/use_case_cn.md deleted file mode 100644 index db8c39d950771726346ff9c9481990abc13036cf..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cmd_parameter/use_case_cn.md +++ /dev/null @@ -1,182 +0,0 @@ -# 使用案例 - -## 本地训练 - -本地训练的实验,诸如图像分类,自然语言处理等,通常都会使用下面这些命令行参数。 - -``` -paddle train \ - --use_gpu=1/0 \ #1:GPU,0:CPU(默认为1) - --config=network_config \ - --save_dir=output \ - --trainer_count=COUNT \ #(默认为1) - --test_period=M \ #(默认为0) - --num_passes=N \ #(默认为100) - --log_period=K \ #(默认为100) - --dot_period=1000 \ #(默认为1) - #[--show_parameter_stats_period=100] \ #(默认为0) - #[--saving_period_by_batches=200] \ #(默认为0) -``` -根据你的任务,可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。 - -### 1) 将命令参数传给网络配置 - -`config_args`是一个很有用的参数,用于将参数传递给网络配置。 - -``` ---config_args=generating=1,beam_size=5,layer_num=10 \ -``` -`get_config_arg`可用于在网络配置中解析这些参数,如下所示: - -``` -generating = get_config_arg('generating', bool, False) -beam_size = get_config_arg('beam_size', int, 3) -layer_num = get_config_arg('layer_num', int, 8) -``` - -`get_config_arg`: - -``` -get_config_arg(name, type, default_value) -``` -- name: `--config_args`中指定的名字 -- type: 值类型,包括bool, int, str, float等 -- default_value: 默认值 - -### 2) 使用模型初始化网络 - -增加如下参数: - -``` ---init_model_path=model_path ---load_missing_parameter_strategy=rand -``` - -## 本地测试 - -方法一: - -``` -paddle train --job=test \ - --use_gpu=1/0 \ - --config=network_config \ - --trainer_count=COUNT \ - --init_model_path=model_path \ -``` -- 使用init\_model\_path指定测试的模型 -- 只能测试单个模型 - -方法二: - -``` -paddle train --job=test \ - --use_gpu=1/0 \ - --config=network_config \ - --trainer_count=COUNT \ - --model_list=model.list \ -``` -- 使用model_list指定测试的模型列表 -- 可以测试多个模型,文件model.list如下所示: - -``` -./alexnet_pass1 -./alexnet_pass2 -``` - -方法三: - -``` -paddle train --job=test \ - --use_gpu=1/0 \ - --config=network_config \ - --trainer_count=COUNT \ - --save_dir=model \ - --test_pass=M \ - --num_passes=N \ -``` -这种方式必须使用Paddle存储的模型路径格式,如:`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如,M=12,N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。 - -## 稀疏训练 - -当输入是维度很高的稀疏数据时,通常使用稀疏训练来加速计算过程。例如,输入数据的字典维数是1百万,但是每个样本仅包含几个词。在Paddle中,稀疏矩阵的乘积应用于前向传播过程,而稀疏更新在反向传播之后的权重更新时进行。 - -### 1) 本地训练 - -用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。 - -### 2) 集群训练 - -在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。 - -``` ---ports_num_for_sparse=1 #(默认为0) -``` - -## parallel_nn -用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说,你可以将网络配置成某些层使用GPU计算,而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算,这样可以减小GPU内存,或者采用并行计算来加速某些层的更新。 - -如果你想使用这些特性,你需要在网络配置中指定设备的ID号(表示为deviceId),并且加上下面的命令行参数: - -``` ---parallel_nn=true -``` -### 案例一:GPU和CPU混合使用 -请看下面的例子: - -``` -#command line: -paddle train --use_gpu=true --parallel_nn=true trainer_count=COUNT - -default_device(0) - -fc1=fc_layer(...) -fc2=fc_layer(...) 
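-# 说明:上面的 fc1、fc2 未指定 device,因此使用 default_device(0),在 GPU 上计算;
-# 下面的 fc3 通过 layer_attr=ExtraAttr(device=-1) 指定在 CPU 上计算。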
-fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1)) - -``` -- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外,其他所有层都会使用GPU计算,每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此,fc1和fc2层在GPU上计算。 - -- device=-1: fc3层使用CPU计算。 - -- trainer_count: - - trainer_count=1: 如果未设置gpu\_id,那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。 - - - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如,trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。 - -### 案例二:在不同设备上指定层 - -``` -#command line: -paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT - -#network: -fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...) -fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...) -fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...) -``` -在本例中,我们假设一台机器上有4个GPU。 - -- trainer_count=1: - - 使用0号GPU计算fc2层。 - - 使用1号GPU计算fc3层。 - - 使用CPU计算fc4层。 - -- trainer_count=2: - - 使用0号和1号GPU计算fc2层。 - - 使用2号和3号GPU计算fc3层。 - - 使用CPU两线程计算fc4层。 - -- trainer_count=4: - - 运行失败(注意到我们已经假设机器上有4个GPU),因为参数`allow_only_one_model_on_one_gpu`默认设置为真。 - -**当`device!=-1`时设备ID号的分配:** - -``` -(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_ - -deviceId: 在层中指定 -gpu_id: 默认为0 -threadId: 线程ID号,范围: 0,1,..., trainer_count-1 -numDevices_: 机器的设备(GPU)数目 -numLogicalDevices_: min(max(deviceId + 1), numDevices_) -``` diff --git a/doc/v2/howto/cmd_parameter/use_case_en.md b/doc/v2/howto/cmd_parameter/use_case_en.md deleted file mode 100644 index e287f0c4b9617cbc6504596512bf408c56dc10f9..0000000000000000000000000000000000000000 --- a/doc/v2/howto/cmd_parameter/use_case_en.md +++ /dev/null @@ -1,182 +0,0 @@ -# Use Case - -## Local Training - -These command line arguments are commonly used by local training experiments, such as image classification, natural language processing, et al. - -``` -paddle train \ - --use_gpu=1/0 \ #1:GPU,0:CPU(default:true) - --config=network_config \ - --save_dir=output \ - --trainer_count=COUNT \ #(default:1) - --test_period=M \ #(default:0) - --num_passes=N \ #(defalut:100) - --log_period=K \ #(default:100) - --dot_period=1000 \ #(default:1) - #[--show_parameter_stats_period=100] \ #(default:0) - #[--saving_period_by_batches=200] \ #(default:0) -``` -`show_parameter_stats_period` and `saving_period_by_batches` are optional according to your task. - -### 1) Pass Command Argument to Network config - -`config_args` is a useful parameter to pass arguments to network config. - -``` ---config_args=generating=1,beam_size=5,layer_num=10 \ -``` -And `get_config_arg` can be used to parse these arguments in network config as follows: - -``` -generating = get_config_arg('generating', bool, False) -beam_size = get_config_arg('beam_size', int, 3) -layer_num = get_config_arg('layer_num', int, 8) -``` - -`get_config_arg`: - -``` -get_config_arg(name, type, default_value) -``` -- name: the name specified in the `--config_args` -- type: value type, bool, int, str, float etc. -- default_value: default value if not set. - -### 2) Use Model to Initialize Network - -add argument: - -``` ---init_model_path=model_path ---load_missing_parameter_strategy=rand -``` - -## Local Testing - -Method 1: - -``` -paddle train --job=test \ - --use_gpu=1/0 \ - --config=network_config \ - --trainer_count=COUNT \ - --init_model_path=model_path \ -``` -- use init\_model\_path to specify test model. -- only can test one model. 
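-
-For concreteness, a Method 1 invocation might look like the sketch below; the config file name and model path are hypothetical placeholders, not values mandated by Paddle.
-
-```
-# test the single model saved at output/pass-00099 (hypothetical paths)
-paddle train --job=test \
-             --use_gpu=1 \
-             --config=trainer_config.py \
-             --trainer_count=4 \
-             --init_model_path=output/pass-00099
-```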
-
-Method 2:
-
-```
-paddle train --job=test \
-             --use_gpu=1/0 \
-             --config=network_config \
-             --trainer_count=COUNT \
-             --model_list=model.list \
-```
-- use model_list to specify the models to test
-- several models can be tested; the file model.list looks like:
-
-```
-./alexnet_pass1
-./alexnet_pass2
-```
-
-Method 3:
-
-```
-paddle train --job=test \
-             --use_gpu=1/0 \
-             --config=network_config \
-             --trainer_count=COUNT \
-             --save_dir=model \
-             --test_pass=M \
-             --num_passes=N \
-```
-This method requires the model path format that Paddle uses when saving, i.e. `model/pass-%5d`. The models tested run from the M-th pass to the (N-1)-th pass. For example, M=12 and N=14 will test `model/pass-00012` and `model/pass-00013`.
-
-## Sparse Training
-
-Sparse training is usually used to accelerate computation when the input is high-dimensional sparse data. For example, the dictionary dimension of the input data may be 1 million, while each sample contains only a few words. In Paddle, sparse matrix multiplication is used in forward propagation, and sparse updating is performed on the weights after backward propagation.
-
-### 1) Local training
-
-You need to set **sparse\_update=True** in the network config. Check the network config documentation for more details.
-
-### 2) Cluster training
-
-Add the following argument for cluster training of a sparse model. At the same time you need to set **sparse\_remote\_update=True** in the network config. Check the network config documentation for more details.
-
-```
---ports_num_for_sparse=1 #(default: 0)
-```
-
-## parallel_nn
-`parallel_nn` can be set to mix GPUs and CPUs when computing layers. That is to say, you can configure the network so that a GPU computes some layers and a CPU computes the others. Another option is to split layers across different GPUs, which can **reduce GPU memory usage** or **accelerate some layers with parallel computation**.
-
-If you want to use these features, you need to specify the device ID in the network config (denoted as deviceId) and add the command-line argument:
-
-```
---parallel_nn=true
-```
-### Case 1: Mixed Use of GPU and CPU
-Consider the following example:
-
-```
-#command line:
-paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
-
-default_device(0)
-
-fc1=fc_layer(...)
-fc2=fc_layer(...)
-fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
-
-```
-- default_device(0): set the default device ID to 0. This means that except for the layers with device=-1, all layers will use a GPU; the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layers fc1 and fc2 are computed on the GPU.
-
-- device=-1: use the CPU for layer fc3.
-
-- trainer_count:
-  - trainer_count=1: if gpu\_id is not set, the first GPU is used to compute layers fc1 and fc2. Otherwise the GPU with gpu\_id is used.
-
-  - trainer_count>1: use trainer\_count GPUs to compute one layer with data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layers fc1 and fc2.
-
-### Case 2: Specify Layers in Different Devices
-
-```
-#command line:
-paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
-
-#network:
-fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
-fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
-fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
-```
-In this case, we assume that there are 4 GPUs in one machine.
-
-- trainer_count=1:
-  - Use GPU 0 to compute layer fc2.
-  - Use GPU 1 to compute layer fc3.
-  - Use CPU to compute layer fc4.
- -- trainer_count=2: - - Use GPU 0 and 1 to compute layer fc2. - - Use GPU 2 and 3 to compute layer fc3. - - Use CPU to compute fc4 in two threads. - -- trainer_count=4: - - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default. - -**Allocation of device ID when `device!=-1`**: - -``` -(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_ - -deviceId: specified in layer. -gpu_id: 0 by default. -threadId: thread ID, range: 0,1,..., trainer_count-1 -numDevices_: device (GPU) count in machine. -numLogicalDevices_: min(max(deviceId + 1), numDevices_) -``` diff --git a/doc/v2/howto/index_cn.rst b/doc/v2/howto/index_cn.rst deleted file mode 100644 index b0268907bceb11cd53a4630c3f8b8e0424abe247..0000000000000000000000000000000000000000 --- a/doc/v2/howto/index_cn.rst +++ /dev/null @@ -1,37 +0,0 @@ -进阶使用 -======== - -PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。使用方式请参考: - -.. toctree:: - :maxdepth: 1 - - cmd_parameter/index_cn.rst - -PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务,具体环境配置和使用说明请参考: - -.. toctree:: - :maxdepth: 1 - - cluster/index_cn.rst - -PaddlePaddle提供了用于预测的C-API,关于C-API的使用,我们提供了如下指南: - -.. toctree:: - :maxdepth: 1 - - capi/index_cn.rst - -PaddlePaddle支持多种灵活和高效的循环神经网络,具体配置使用方式请参考: - -.. toctree:: - :maxdepth: 1 - - rnn/index_cn.rst - -关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优,请参考: - -.. toctree:: - :maxdepth: 1 - - optimization/gpu_profiling_cn.rst diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst deleted file mode 100644 index 35ef197f58f1f865e2cdbdebb567d5637284637a..0000000000000000000000000000000000000000 --- a/doc/v2/howto/index_en.rst +++ /dev/null @@ -1,37 +0,0 @@ -HOW TO -======== - -PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle: - -.. toctree:: - :maxdepth: 1 - - cmd_parameter/index_en.rst - -PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to: - -.. toctree:: - :maxdepth: 1 - - cluster/index_en.rst - -PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API: - -.. toctree:: - :maxdepth: 1 - - capi/index_en.rst - -PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to: - -.. toctree:: - :maxdepth: 1 - - rnn/index_en.rst - -How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to: - -.. toctree:: - :maxdepth: 1 - - optimization/gpu_profiling_en.rst diff --git a/doc/v2/howto/optimization/gpu_profiling_cn.rst b/doc/v2/howto/optimization/gpu_profiling_cn.rst deleted file mode 100644 index f2396716bddd4810fa77c738d41f5482aa6d6055..0000000000000000000000000000000000000000 --- a/doc/v2/howto/optimization/gpu_profiling_cn.rst +++ /dev/null @@ -1,242 +0,0 @@ -============ -GPU性能调优 -============ - -.. contents:: - -此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。 - -- 什么是性能分析? -- 为什么需要性能分析? -- 如何进行性能分析? -- 性能分析工具介绍 -- 详细教程 -- 性能分析小技巧 - -什么是性能分析? -================ -在软件工程的范畴里,性能分析(Profiling)是一个动态程序分析的术语,它可以指测量一个程序的空间(内存)复杂度或时间复杂度, -也可以说是某些特定指令的使用情况,或者是函数调用的频率和耗时等。通常情况下,分析得到的信息用于协助进行程序的优化。 - -简单来说,性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为,那程序分析工具是必不可少的利器。简单的性能分析,可以告诉您某个操作到底花了多长时间?而更深入的分析,甚至能解释为什么某个操作花了很长时间? - -为什么需要性能分析? 
-============================
-训练好一个深层神经网络通常要耗费非常长的时间,所以性能也就逐步变成了深度学习领域最重要的指标。
-而优化性能的首要任务,是需要了解哪些步骤拖慢了整体。
-如果某一块根本就不怎么耗时,那也就不需要急着优化性能啦!
-
-如何进行性能分析?
-========================
-为了达到性能最优,您可以采用下面五个步骤:
-
-- 对代码进行性能分析
-- 找到运行慢的部分
-- 找到运行慢的原因
-- 修改成更快的版本
-- 再次对代码进行性能分析
-
-通常情况下,处理器有两个关键性能限制:一个是浮点计算量,另一个是内存操作量。
-GPU则还需要高并行性,才能发挥其全部能力。这正是它们速度快的原因。
-
-性能分析工具介绍
-======================
-就通常的GPU性能分析来说,市面上已经有NVIDIA或第三方提供的众多工具。
-
-**nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
-在这个教程中,我们主要会介绍nvprof和nvvp。
-
-:code:`paddle/legacy/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
-
-.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
-   :language: c++
-   :lines: 137-151
-   :linenos:
-
-上述的代码片段包含了两种方法,您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
-
-1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装,可以用来计算CPU函数或cuda内核的时间消耗。
-
-2. :code:`REGISTER_GPU_PROFILER` 是一个封装对象,封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作;同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
-
-您会在接下来的部分中获得更多的细节介绍。
-
-详细教程
-============
-
-内置定时器
-------------
-
-如果想要启用PaddlePaddle的内置定时器,您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
-接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
-下面举个简单的例子:
-
-1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。
-
-   .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
-      :language: c++
-      :lines: 137-151
-      :emphasize-lines: 8-12,14
-      :linenos:
-
-2. cmake配置中将 **WITH_TIMER** 打开,重新编译PaddlePaddle。
-
-   .. code-block:: bash
-
-      cmake .. -DWITH_TIMER=ON
-      make
-
-3. 执行您的代码,并观察结果(如高亮部分)。
-
-   .. code-block:: bash
-      :emphasize-lines: 1,12-15
-
-      > ./paddle/legacy/math/tests/test_GpuProfiler
-      I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
-      I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
-      I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
-      [==========] Running 1 test from 1 test case.
-      [----------] Global test environment set-up.
-      [----------] 1 test from Profiler
-      [ RUN      ] Profiler.BilinearFwdBwd
-      I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"
-      I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
-      I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
-      I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1
-      I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
-      I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
-      [       OK ] Profiler.BilinearFwdBwd (136 ms)
-      [----------] 1 test from Profiler (136 ms total)
-
-      [----------] Global test environment tear-down
-      [==========] 1 test from 1 test case ran. (136 ms total)
-      [  PASSED  ] 1 test.
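-
-如果只想从上面的输出里快速查看定时器统计行,可以对日志做一次简单过滤。下面的命令只是一个示例:假设在构建目录下执行,且日志打印到标准错误(因此需要 2>&1 重定向):
-
-.. code-block:: bash
-
-   ./paddle/legacy/math/tests/test_GpuProfiler 2>&1 | grep "Stat="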
- -nvprof 工具 ----------------- - -要使用命令行分析工具 **nvprof**,您按如下步骤操作即可: - -1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 - - .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp - :language: c++ - :lines: 137-151 - :emphasize-lines: 6-7 - :linenos: - -2. cmake中将 **WITH_PROFILER** 配置打开,重新编译PaddlePaddle。 - - .. code-block:: bash - - cmake .. -DWITH_PROFILER=ON - make - -3. 使用 **nvprof** 来分析执行文件。 - - .. code-block:: bash - - nvprof ./paddle/legacy/math/tests/test_GpuProfiler - -然后,您就能获得如下的分析结果: - -.. code-block:: bash - - ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler - ==78544== Profiling result: - Time(%) Time Calls Avg Min Max Name - 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] - 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw - 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw - 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] - - ==78544== API calls: - Time(%) Time Calls Avg Min Max Name - 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags - 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree - 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate - 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy - 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize - 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc - 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc - 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice - 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags - 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute - 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount - 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties - 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch - 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName - 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem - 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice - 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate - 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute - 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart - 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall - 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError - 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument - 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet - 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount - 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion - 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit - 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion - - -nvvp 工具 --------------- - -如果想使用可视化的分析器 **nvvp**,您可以导入 :code:`nvprof -o ...` 的输出,或者从工具的界面里运行您的应用。 - -**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启) - -.. image:: nvvp1.png - :align: center - :scale: 33% - -从内核函数的角度, **nvvp** 可以精确说明一个长耗时操作的具体原因。 -同时,如下图所示, **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。 - - -.. image:: nvvp2.png - :align: center - :scale: 33% - -而从应用的角度, **nvvp** 可以帮您提供一些定位性能瓶颈的建议。 -例如,下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议,为您做性能调优提供了方向。 - -.. image:: nvvp3.png - :align: center - :scale: 33% - -.. 
image:: nvvp4.png - :align: center - :scale: 33% - -性能分析小技巧 -================== - -- 开始阶段,从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。 -- 接下来可以考虑下时间线的分析。 -- 如果真想挖掘内核深处的某个秘密,您最好先确认:这一块的耗时比例真的太高,值得深入分析。 -- 可能的情况下,试着让输出的分析数据和理论值对应。 - - 1) 例如,如果我知道内核花了10ms来移动1GB数据,那我会期望分析工具统计到速度是100GB/s。 - 2) 若有不一致之处,很有可能实际应用就是没有按照您的预期情况运行。 -- 了解您的硬件:如果您的GPU理论可以达到6 TFLOPs(6万亿次浮点运算每秒),而当前已经有5.5 TFLOPs了,那估计这里的潜力就没啥好挖的了…… - -性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果! -当然,具体情况因人而异。 - -参考资料 -=========== -Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/v2/howto/optimization/gpu_profiling_en.rst b/doc/v2/howto/optimization/gpu_profiling_en.rst deleted file mode 100644 index 6e439be9bba8935cdd65f1c131cfd3725530ec0e..0000000000000000000000000000000000000000 --- a/doc/v2/howto/optimization/gpu_profiling_en.rst +++ /dev/null @@ -1,240 +0,0 @@ -==================== -Tune GPU Performance -==================== - -.. contents:: - -This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**. - -- What is profiling? -- Why we need profiling? -- How to do profiling? -- Profile tools -- Hands-on Tutorial -- Profiling tips - -What's profiling? -================= -In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time -complexity of a program, the usage of particular instructions, or the frequency and duration of function calls. -Most commonly, profiling information serves to aid program optimization. - -Briefly, profiler is used to measure application performance. Program analysis tools are extremely important for -understanding program behavior. Simple profiling can tell you that how long does an operation take? For advanced -profiling, it can interpret why does an operation take a long time? - -Why we need profiling? -====================== -Since training deep neural network typically take a very long time to get over, performance is gradually becoming -the most important thing in deep learning field. The first step to improve performance is to understand what parts -are slow. There is no point in improving performance of a region which doesn’t take much time! - - -How to do profiling? -==================== -To achieve maximum performance, there are five steps you can take to reach your goals. - -- Profile the code -- Find the slow parts -- Work out why they’re slow -- Make them fast -- Profile the code again - -Usually, processor has two key performance limits include float point throughput and -memory throughput. For GPU, it also need more parallelism to fulfill its potential. -This is why they can be so fast. - -Profiler Tools -============== -For general GPU profiling, a bunch of tools are provided from both NVIDIA and third party. - -**nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler. -In this tutorial, we will focus on nvprof and nvvp. - -:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate -above profilers. - -.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp - :language: c++ - :lines: 137-151 - :linenos: - -The above code snippet includes two methods, you can use any of them to profile the regions of interest. - -1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels. - -2. 
:code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid -program crashes when CPU version of PaddlePaddle invokes them. - -You can find more details about how to use both of them in the next session. - -Hands-on Approach -================= - -Built-in Timer --------------- - -To enable built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_TIMER_INFO` into the regions of you interest. -Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function. -As a simple example, consider the following: - -1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines). - - .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp - :language: c++ - :lines: 137-151 - :emphasize-lines: 8-12,14 - :linenos: - -2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle. - - .. code-block:: bash - - cmake .. -DWITH_TIMER=ON - make - -3. Execute your code and observe the results (see the emphasize-lines). - - .. code-block:: bash - :emphasize-lines: 1,12-15 - - > ./paddle/legacy/math/tests/test_GpuProfiler - I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler - I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions - I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. - [==========] Running 1 test from 1 test case. - [----------] Global test environment set-up. - [----------] 1 test from Profiler - [ RUN ] Profiler.BilinearFwdBwd - I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im - gSizeX = 64, imgSizeY = 64" - I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 - I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== - I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 - I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== - I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- - [ OK ] Profiler.BilinearFwdBwd (136 ms) - [----------] 1 test from Profiler (136 ms total) - - [----------] Global test environment tear-down - [==========] 1 test from 1 test case ran. (136 ms total) - [ PASSED ] 1 test. - -nvprof profiler ---------------- - -To use this command line profiler **nvprof**, you can simply issue the following command: - -1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines). - - .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp - :language: c++ - :lines: 137-151 - :emphasize-lines: 6-7 - :linenos: - -2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle. - - .. code-block:: bash - - cmake .. -DWITH_PROFILER=ON - make - -3. Use Nvidia profiler **nvprof** to profile the binary. - - .. code-block:: bash - - nvprof ./paddle/legacy/math/tests/test_GpuProfiler - -Then, you can get the following profiling result: - -.. 
code-block:: bash - - ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler - ==78544== Profiling result: - Time(%) Time Calls Avg Min Max Name - 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] - 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw - 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw - 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] - - ==78544== API calls: - Time(%) Time Calls Avg Min Max Name - 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags - 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree - 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate - 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy - 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize - 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc - 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc - 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice - 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags - 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute - 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount - 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties - 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch - 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName - 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem - 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice - 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate - 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute - 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart - 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall - 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError - 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument - 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet - 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount - 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion - 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit - 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion - - -nvvp profiler -------------- - -For visual profiler **nvvp**, you can either import the output of :code:`nvprof –o ...` or -run application through GUI. - -**Note: nvvp also support CPU profiling** (Click the box in nvvp to enable profile execution on CPU). - -.. image:: nvvp1.png - :align: center - :scale: 33% - -From the perspective of kernel functions, **nvvp** can even illustrate why does an operation take a long time? -As shown in the following figure, kernel's block usage, register usage and shared memory usage from :code:`nvvp` -allow us to fully utilize all warps on the GPU. - -.. image:: nvvp2.png - :align: center - :scale: 33% - -From the perspective of application, **nvvp** can give you some suggestions to address performance bottleneck. -For instance, some advice in data movement and compute utilization from the below figure can guide you to tune performance. - -.. image:: nvvp3.png - :align: center - :scale: 33% - -.. image:: nvvp4.png - :align: center - :scale: 33% - -Profiling tips -============== - -- The **nvprof** and **nvvp** output is a very good place to start. -- The timeline is a good place to go next. -- Only dig deep into a kernel if it’s taking a significant amount of your time. -- Where possible, try to match profiler output with theory. - 1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s. 
- 2) Discrepancies are likely to mean your application isn’t doing what you thought it was. -- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster! - - -Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance. -Your mileage may vary! - -Reference -========= -Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/v2/howto/optimization/nvvp1.png b/doc/v2/howto/optimization/nvvp1.png deleted file mode 100644 index 1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/optimization/nvvp1.png and /dev/null differ diff --git a/doc/v2/howto/optimization/nvvp2.png b/doc/v2/howto/optimization/nvvp2.png deleted file mode 100644 index 177c9db708da6863d1075f3e615f5962dbe18b29..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/optimization/nvvp2.png and /dev/null differ diff --git a/doc/v2/howto/optimization/nvvp3.png b/doc/v2/howto/optimization/nvvp3.png deleted file mode 100644 index d8f393667d6569b6f1e61ffccac43fae5888b6db..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/optimization/nvvp3.png and /dev/null differ diff --git a/doc/v2/howto/optimization/nvvp4.png b/doc/v2/howto/optimization/nvvp4.png deleted file mode 100644 index 51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/optimization/nvvp4.png and /dev/null differ diff --git a/doc/v2/howto/rnn/hierarchical_layer_cn.rst b/doc/v2/howto/rnn/hierarchical_layer_cn.rst deleted file mode 100644 index 2f8f408b40299890da694862a7b9418cf9ff07f2..0000000000000000000000000000000000000000 --- a/doc/v2/howto/rnn/hierarchical_layer_cn.rst +++ /dev/null @@ -1,89 +0,0 @@ -########################### -支持双层序列作为输入的Layer -########################### - -.. contents:: - -概述 -==== - -在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 - -双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 - -我们可以按照如下层次定义非序列,单层序列,以及双层序列。 - -+ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 -+ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 -+ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 - -在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 - -pooling -======== - -pooling 的使用示例如下。 - -.. code-block:: bash - - seq_pool = pooling(input=layer, - pooling_type=pooling.Max(), - agg_level=AggregateLevel.TO_SEQUENCE) - -- `pooling_type` 目前支持两种,分别是:pooling.Max()和pooling.Avg()。 - -- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): - - - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 - - 输入:一个双层序列,或一个单层序列 - - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) - -- `agg_level=AggregateLevel.TO_SEQUENCE` 时: - - - 作用:一个双层序列经过运算变成一个单层序列 - - 输入:必须是一个双层序列 - - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) - -last_seq 和 first_seq -===================== - -last_seq 的使用示例如下(first_seq 类似)。 - -.. code-block:: bash - - last = last_seq(input=layer, - agg_level=AggregateLevel.TO_SEQUENCE) - -- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): - - - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 - - 输入:一个双层序列或一个单层序列 - - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 - -- `agg_level=AggregateLevel.TO_SEQUENCE` 时: - - 作用:一个双层序列经过运算变成一个单层序列 - - 输入:必须是一个双层序列 - - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 - -expand -====== - -expand 的使用示例如下。 - -.. 
code-block:: bash
-
-    ex = expand(input=layer1,
-                expand_as=layer2,
-                expand_level=ExpandLevel.FROM_NO_SEQUENCE)
-
-- `expand_level=ExpandLevel.FROM_NO_SEQUENCE` 时(默认值):
-
-  - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列
-  - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2 可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
-  - 输出:一个单层序列或一个双层序列,输出序列的类型(双层序列或单层序列)和序列中含有元素的数目同 layer2 一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
-
-- `expand_level=ExpandLevel.FROM_SEQUENCE` 时:
-
-  - 作用:一个单层序列经过运算扩展成一个双层序列
-  - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2 必须是一个双层序列,提供扩展的长度信息
-  - 输出:一个双层序列,序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目(0层序列)和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个 subseq 。
diff --git a/doc/v2/howto/rnn/hierarchical_layer_en.rst b/doc/v2/howto/rnn/hierarchical_layer_en.rst
deleted file mode 100644
index fb668f1babb47f49b2dab6d2411565e99599d8b0..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/hierarchical_layer_en.rst
+++ /dev/null
@@ -1,89 +0,0 @@
-###################################################
-Layers that Support Hierarchical Sequences as Input
-###################################################
-
-.. contents::
-
-Overview
-========
-
-A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input, or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence.
-
-A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct complex input information.
-
-We can define non-sequences, single-level sequences, and double-level sequences as follows.
-
-+ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle;
-+ Single-level sequence: multiple elements arranged in a row, where each element is a 0-level sequence. The order of the elements is an important piece of input information;
-+ Double-level sequence: multiple elements arranged in a row, where each element is a single-level sequence, called a subseq of the double-level sequence; each element of a subseq is a 0-level sequence.
-
-In PaddlePaddle, the following layers accept double-level sequences as input and perform the corresponding calculations.
-
-`pooling`
-=========
-
-The use of pooling is as follows:
-
-.. code-block:: bash
-
-    seq_pool = pooling(input=layer,
-                       pooling_type=pooling.Max(),
-                       agg_level=AggregateLevel.TO_SEQUENCE)
-
-- `pooling_type` currently supports two types: pooling.Max() and pooling.Avg().
-
-- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
-
-  - Effect: a double-level sequence input is reduced to a 0-level sequence, and a single-level sequence is reduced to a 0-level sequence
-  - Input: a double-level sequence or a single-level sequence
-  - Output: a 0-level sequence, which is the average (or maximum) of the entire input sequence (single- or double-level)
-
-- When `agg_level=AggregateLevel.TO_SEQUENCE`:
-
-  - Effect: a double-level sequence is transformed into a single-level sequence
-  - Input: must be a double-level sequence
-  - Output: a single-level sequence, where each element is the average (or maximum) value of each subseq of the original double-level sequence
-
-`last_seq` and `first_seq`
-==========================
-
-An example of using `last_seq` is as follows (the usage of `first_seq` is similar).
-
-.. code-block:: bash
-
-    last = last_seq(input=layer,
-                    agg_level=AggregateLevel.TO_SEQUENCE)
-
-- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
-
-  - Effect: a double-level sequence input is reduced to a 0-level sequence, and a single-level sequence is reduced to a 0-level sequence
-  - Input: a double-level sequence or a single-level sequence
-  - Output: a 0-level sequence, which is the last (or first) element of the input sequence (double- or single-level)
-
-- When `agg_level=AggregateLevel.TO_SEQUENCE`:
-
-  - Effect: a double-level sequence is transformed into a single-level sequence
-  - Input: must be a double-level sequence
-  - Output: a single-level sequence, in which each element is the last (or first) element of each subseq of the double-level sequence
-
-`expand`
-========
-
-The use of expand is as follows.
-
-.. code-block:: bash
-
-    ex = expand(input=layer1,
-                expand_as=layer2,
-                expand_level=ExpandLevel.FROM_NO_SEQUENCE)
-
-- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
-
-  - Effect: a 0-level sequence is expanded into a single-level sequence or a double-level sequence
-  - Input: layer1 must be a 0-level sequence, the data to be expanded; layer2 can be a single-level or a double-level sequence that provides the expansion length information
-  - Output: a single-level or a double-level sequence; the type of the output sequence and the number of elements it contains are the same as those of layer2. If the output is a single-level sequence, each of its elements is a copy of the layer1 element; if the output is a double-level sequence, each element in each subseq is a copy of the layer1 element
-
-- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
-
-  - Effect: a single-level sequence is expanded into a double-level sequence
-  - Input: layer1 must be a single-level sequence, the data to be expanded; layer2 must be a double-level sequence providing the expansion length information
-  - Output: a double-level sequence with the same number of elements as layer2. The number of elements in the single-level sequence must equal the number of subseqs in the double-level sequence. The i-th element of the single-level sequence (a 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output double-level sequence.
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
deleted file mode 100644
index 9d6d417075485dceb1ee71f527b408aa6a6638ea..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ /dev/null
@@ -1,226 +0,0 @@
-..
_algo_hrnn_rnn_api_compare: - -##################### -单双层RNN API对比介绍 -##################### - -本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 - -示例1:双层RNN,子序列间无Memory -================================ - -在双层RNN中的经典情况是将内层的每一个时间序列数据,分别进行序列操作;并且内层的序列操作之间独立无依赖,即不需要使用Memory\ 。 - -在本示例中,单层RNN和双层RNN的网络配置,都是将每一句分好词后的句子,使用LSTM作为encoder,压缩成一个向量。区别是RNN使用两层序列模型,将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下: - -* 单层RNN\: `sequence_layer_group.conf `_ -* 双层RNN\: `sequence_nest_layer_group.conf `_ - - -读取双层序列数据 ----------------- - -首先,本示例中使用的原始数据如下\: - -- 本例中的原始数据一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。 - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg - :language: text - - -- 双层序列数据一共有4个样本。 每个样本间用空行分开,整体数据和原始数据完全一样。但于双层序列的LSTM来说,第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。 - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest - :language: text - -其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py `_)\: - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py - :language: python - :lines: 21-39 - :linenos: - -- 这是普通的单层时间序列的DataProvider代码,其说明如下: - - * DataProvider共返回两个数据,分别是words和label。即上述代码中的第19行。 - - - words是原始数据中的每一句话,所对应的词表index数组。它是integer_value_sequence类型的,即整数数组。words即为这个数据中的单层时间序列。 - - label是原始数据中对于每一句话的分类标签,它是integer_value类型的。 - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py - :language: python - :lines: 42-71 - :linenos: - -- 对于同样的数据,双层时间序列的DataProvider的代码。其说明如下: - - - DataProvider共返回两组数据,分别是sentences和labels。即在双层序列的原始数据中,每一组内的所有句子和labels - - sentences是双层时间序列的数据。由于它内部包含了每组数据中的所有句子,且每个句子表示为对应的词表索引数组,因此它是integer_value_sub_sequence 类型的,即双层时间序列。 - - labels是每组内每个句子的标签,故而是一个单层时间序列。 - - -模型配置的模型配置 ------------------------------------------- - -首先,我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中,RNN对于每一个时间步通过了一个LSTM网络。 - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf - :language: python - :lines: 38-63 - :linenos: - :emphasize-lines: 9-15 - - -其次,我们看一下语义相同的双层RNN的网络配置\: - -* PaddlePaddle中的许多layer并不在意输入是否是时间序列,例如\ :code:`embedding_layer`\ 。在这些layer中,所有的操作都是针对每一个时间步来进行的。 - -* 在该配置的7-26行(高亮部分),将双层时间序列数据先变换成单层时间序列数据,再对每一个单层时间序列进行处理。 - - * 使用\ :code:`recurrent_group`\ 这个函数进行变换,在变换时需要将输入序列传入。由于我们想要的变换是双层时间序列=> 单层时间序列,所以我们需要将输入数据标记成\ :code:`SubsequenceInput`\ 。 - - * 在本例中,我们将原始数据的每一组,通过\ :code:`recurrent_group`\ 进行拆解,拆解成的每一句话再通过一个LSTM网络。这和单层RNN的配置是等价的。 - -* 与单层RNN的配置类似,我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同,我们是对每一个子序列取最后一个元素,因此\ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ 。 - -* 至此,\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。 - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf - :language: python - :lines: 38-64 - :linenos: - :emphasize-lines: 7-26 - -示例2:双层RNN,子序列间有Memory -================================ - -本示例意图使用单层RNN和双层RNN实现两个完全等价的全连接RNN。 - -* 对于单层RNN,输入数据为一个完整的时间序列,例如\ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ 。 - -* 对于双层RNN,输入数据为在单层RNN数据里面,任意将一些数据组合成双层时间序列,例如\ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`。 - -模型配置的模型配置 ------------------- - -我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 - -- 单层RNN:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 - -.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf - :language: python - :lines: 36-48 - -- 双层RNN,外层memory是一个元素: - - - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 - - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。 - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf - :language: python - :lines: 39-66 - -.. warning:: - PaddlePaddle目前只支持在每个时间步中,Memory的时间序列长度一致的情况。 - -示例3:双层RNN,输入不等长 -========================== - -.. role:: red - -.. raw:: html - - - -**输入不等长** 是指recurrent_group的多个输入序列,在每个时间步的子序列长度可以不相等。但序列输出时,需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致,默认指定第一个输入。 - -示例3的配置分别为\ `单层不等长RNN `_\ 和\ `双层不等长RNN `_\ 。 - -示例3对于单层RNN和双层RNN数据完全相同。 - -* 对于单层RNN的数据一共有两个样本,他们分别是\ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ 和\ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ 。对于每一个单层RNN的数据,均有两组特征。 - -* 在单层数据的基础上,双层RNN数据随意加了一些隔断,例如将第一条数据转化为\ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ 。 - -* 需要注意的是PaddlePaddle目前只支持子序列数目一样的多输入双层RNN。例如本例中的两个特征,均有三个子序列。每个子序列长度可以不一致,但是子序列的数目必须一样。 - - -模型配置 --------- - -和示例2中的配置类似,示例3的配置使用了单层RNN和双层RNN,实现两个完全等价的全连接RNN。 - -* 单层RNN\: - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py - :language: python - :lines: 42-59 - :linenos: - -* 双层RNN\ \: - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py - :language: python - :lines: 41-80 - :linenos: - -在上面代码中,单层和双层序列的使用和示例2中的示例类似,区别是同时处理了两个输入。而对于双层序列,两个输入的子序列长度也并不相同。但是,我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状,和\ :code:`emb2`\ 的序列形状一致。 - - -词汇表 -====== - -.. _glossary_memory: - -Memory ------- - -Memory是PaddlePaddle实现RNN时候使用的一个概念。RNN即时间递归神经网络,通常要求时间步之间具有一些依赖性,即当前时间步下的神经网络依赖前一个时间步神经网络中某一个神经元输出。如下图所示。 - -.. graphviz:: src/glossary_rnn.dot - -上图中虚线的连接,即是跨越时间步的网络连接。PaddlePaddle在实现RNN的时候,将这种跨越时间步的连接用一个特殊的神经网络单元实现。这个神经网络单元就叫Memory。Memory可以缓存上一个时刻某一个神经元的输出,然后在下一个时间步输入给另一个神经元。使用Memory的RNN实现便如下图所示。 - -.. graphviz:: src/glossary_rnn_with_memory.dot - -使用这种方式,PaddlePaddle可以比较简单的判断哪些输出是应该跨越时间步的,哪些不是。 - -.. _glossary_timestep: - -时间步 ------- - -参考时间序列。 - - -.. _glossary_sequence: - -时间序列 --------- - -时间序列(time series)是指一系列的特征数据。这些特征数据之间的顺序是有意义的。即特征的数组,而不是特征的集合。而这每一个数组元素,或者每一个系列里的特征数据,即为一个时间步(time step)。值得注意的是,时间序列、时间步的概念,并不真正的和『时间』有关。只要一系列特征数据中的『顺序』是有意义的,即为时间序列的输入。 - -举例说明,例如文本分类中,我们通常将一句话理解成一个时间序列。比如一句话中的每一个单词,会变成词表中的位置。而这一句话就可以表示成这些位置的数组。例如 :code:`[9, 2, 3, 5, 3]` 。 - -关于时间序列(time series)的更详细准确的定义,可以参考 `维基百科页面 Time series `_ 或者 `维基百科中文页面 时间序列 `_ 。 - -另外,Paddle中经常会将时间序列成为 :code:`Sequence` 。他们在Paddle的文档和API中是一个概念。 - -.. _glossary_RNN: - -RNN ---- - -RNN 在PaddlePaddle的文档中,一般表示 :code:`Recurrent neural network`,即时间递归神经网络。详细介绍可以参考 `维基百科页面 Recurrent neural network `_ 或者 `中文维基百科页面 `_ 中关于时间递归神经网络的介绍。 - -RNN 一般在PaddlePaddle中,指对于一个时间序列输入数据,每一个时间步之间的神经网络具有一定的相关性。例如,某一个神经元的一个输入为上一个时间步网络中某一个神经元的输出。或者,从每一个时间步来看,神经网络的网络结构中具有有向环结构。 - -.. 
_glossary_双层RNN: - -双层RNN -------- - -双层RNN顾名思义,即RNN之间有一次嵌套关系。输入数据整体上是一个时间序列,而对于每一个内层特征数据而言,也是一个时间序列。即二维数组,或者数组的数组这个概念。 而双层RNN是可以处理这种输入数据的网络结构。 - -例如,对于段落的文本分类,即将一段话进行分类。我们将一段话看成句子的数组,每个句子又是单词的数组。这便是一种双层RNN的输入数据。而将这个段落的每一句话用lstm编码成一个向量,再对每一句话的编码向量用lstm编码成一个段落的向量。再对这个段落向量进行分类,即为这个双层RNN的网络结构。 - diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst deleted file mode 100644 index a4485f7b5edf21871444801230ab1ee191b1137b..0000000000000000000000000000000000000000 --- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst +++ /dev/null @@ -1,226 +0,0 @@ -.. _algo_hrnn_rnn_api_compare: - -##################### -API comparision between RNN and hierarchical RNN -##################### - -This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp `_\ 。 - -Example 1:Hierarchical RNN without Memory between subsequences -================================ - -The classical case in the hierarchical RNN is to perform sequence operations on each time series data in the inner layers seperately. And the sequence operations in the inner layers is independent, that is, it does not need to use Memory. - -In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows: - -* RNN\: `sequence_layer_group.conf `_ -* Hierarchical RNN\: `sequence_nest_layer_group.conf `_ - - -Reading hierarchical sequence data ----------------- - -Firstly, the original data in this example is as follows \: - -- The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well. - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg - :language: text - - -- The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ . - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest - :language: text - -Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py `_)\: - -.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py - :language: python - :lines: 21-39 - :linenos: - -- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows: - - * DataProvider returns two parts, that are "words" and "label",as line 19 in the above code. - - - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data. - - "label" is the categorical label of each sentence, whose data type is integer_value. - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py - :language: python - :lines: 42-71 - :linenos: - -- As for the same data, the DataProvider code for hierarchical time series. Its description is as follows: - - - DataProvider returns two lists of data, that are "sentences" and "labels", corresponding to the sentences and labels in each group in the original data of hierarchical time series. - - "sentences" comes from the hierarchical time series original data. As it contains every sentences in each group internally, and each sentences are represented by a list of word table indices, so its data type is integer_value_sub_sequence, which is hierarchical time series. - - "labels" is the categorical lable of each sentence, so it is a sigle-layer time series. - - -Model configuration ------------------------------------------- - -Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf - :language: python - :lines: 38-63 - :linenos: - :emphasize-lines: 9-15 - - -Secondly, let's look at the model configuration of hierarchical RNN which has the same semantic meaning. \: - -* Most layers in PaddlePaddle do not care about whether the input is time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step. - -* In the hightlighted part of line 7 to line 26 of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series. - - * Use the function \ :code:`recurrent_group`\ to transform. Input sequences need to be passed in when transforming. As we want to transform hierarchical time series into single-layer sequences, we need to lable the input data as \ :code:`SubsequenceInput`\ . - - * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network. This is equivalent to single-layer RNN configuration. - -* Similar to single-layer RNN configuration, we only use the last vector after the encode of LSTM. So we use the operation of \ :code:`last_seq`\ to \ :code:`recurrent_group`\ . But unlike single-layer RNN, we use the last element of every subsequence, so we need to set \ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ . - -* Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. - -.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf - :language: python - :lines: 38-64 - :linenos: - :emphasize-lines: 7-26 - -Example 2:Hierarchical RNN with Memory between subsequences -================================ - -This example is intended to implement two fully-equivalent fully-connected RNNs using single-layer RNN and hierarchical RNN. - -* As for single-layer RNN, input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ . - -* As for hierarchical RNN, input is a hierarchical time series which elements are arbitrarily combination of data in single-layer RNN, e.g. \ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`. - -model configuration ------------------- - -We select the different parts between single-layer RNN and hierarchical RNN configurations, to compare and analyze the reason why they have same semantic meanings. - -- single-layer RNN:passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf - :language: python - :lines: 36-48 - -- hierarchical RNN, the outer layer's memory is an element. - - - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. - - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. - -.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf - :language: python - :lines: 39-66 - -.. warning:: - Currently PaddlePaddle only supports the case that the lengths of the time series of Memory in each time step are the same. - -Example 3:hierarchical RNN with unequal length inputs -========================== - -.. role:: red - -.. raw:: html - - - -**unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. - -The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ . - -The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. - -* For the single-layer RNN, the data has two samples, which are \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each of the data for the single-layer RNN has two group of features. - -* On the basis of the single-layer's data, hierarchical RNN's data randomly adds some partitions. 
-
-Example 3: Hierarchical RNN with Unequal-Length Inputs
-======================================================
-
-.. role:: red
-
-**Unequal-length inputs** means that, among the multiple input sequences of a recurrent_group, the lengths of the subsequences can be unequal, but the output sequence needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ you can specify which input sequence the output should be consistent with; by default it is the first input.
-
-The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ .
-
-The data for the single-layer RNN and hierarchical RNN configurations of Example 3 are exactly the same.
-
-* For the single-layer RNN, the data has two samples, namely \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each sample of the single-layer RNN data has two groups of features.
-
-* On the basis of the single-layer data, the hierarchical RNN data randomly adds some partitions. For example, the first sample is transformed to \ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ .
-
-* Note that PaddlePaddle currently only supports hierarchical RNNs whose multiple inputs have the same number of subsequences. In this example, both features have 3 subsequences. Although the length of each subsequence can differ, the number of subsequences must be the same.
-
-
-Model configuration
--------------------
-
-Similar to Example 2, the configuration of Example 3 uses a single-layer RNN and a hierarchical RNN to implement two fully-equivalent fully-connected RNNs.
-
-* Single-layer RNN\:
-
-.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
-   :language: python
-   :lines: 42-59
-   :linenos:
-
-* Hierarchical RNN\ \:
-
-.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
-   :language: python
-   :lines: 41-80
-   :linenos:
-
-In the above code, the usage of the single-layer and hierarchical RNNs is similar to Example 2, the difference being that it processes two inputs simultaneously. For the hierarchical RNN, the lengths of the subsequences of the two inputs are not equal, but we use the parameter \ :code:`targetInlink` \ to set the output format of the outer layer's \ :code:`recurrent_group` \ , so the shape of the outer layer's output is the same as the shape of \ :code:`emb2`\ .
-
-
-Glossary
-========
-
-.. _glossary_memory:
-
-Memory
-------
-
-Memory is a concept used when implementing RNN in PaddlePaddle. An RNN usually requires some dependency between time steps; that is, the neural network of the current time step depends on one of the neurons of the neural network of a previous time step, as the following figure shows:
-
-.. graphviz:: src/glossary_rnn.dot
-
-The dotted connections in the figure are the network connections across time steps. When implementing RNN, PaddlePaddle realizes this connection across time steps with a special neural network unit called Memory. Memory can cache the output of one of the neurons at the previous time step, which can then be passed to another neuron at the next time step. The implementation of an RNN using Memory is as follows:
-
-.. graphviz:: src/glossary_rnn_with_memory.dot
-
-With this method, PaddlePaddle can easily determine which outputs should cross time steps and which should not.
-
-.. _glossary_timestep:
-
-time step
----------
-
-Refers to an element of a time series; see the entry for time series below.
-
-.. _glossary_sequence:
-
-time series
------------
-
-A time series is a series of featured data whose order is meaningful. It is a list of features, not a set of features. Each element of this list, i.e., each featured datum of the series, is called a time step. It must be noted that the concepts of time series and time step are not necessarily related to "time": as long as the "order" of a series of featured data is meaningful, it can be a time series input.
-
-For example, in a text classification task, we regard a sentence as a time series. Each word in the sentence becomes an index into the word table, so the sentence can be represented as a list of these indices, e.g. :code:`[9, 2, 3, 5, 3]` .
-
-For a more detailed and accurate definition of a time series, please refer to `Wikipedia of Time series `_ or `Chinese Wikipedia of time series `_ .
-
-In addition, Paddle always refers to a time series as a :code:`Sequence` . They are the same concept in Paddle's documentation and APIs.
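-
-As a concrete illustration of the two sequence levels, the corresponding data types in the v2 API might be declared as follows (a sketch; :code:`vocab_size` is an assumed parameter):
-
-.. code-block:: python
-
-   import paddle.v2 as paddle
-
-   # a single-layer time series: one sentence as a list of word indices
-   words = paddle.data_type.integer_value_sequence(vocab_size)
-   # a two-level time series: a group of sentences
-   sentences = paddle.data_type.integer_value_sub_sequence(vocab_size)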
-
-.. _glossary_RNN:
-
-RNN
----
-
-In PaddlePaddle's documentation, RNN usually stands for :code:`Recurrent neural network` . For more information, please refer to `Wikipedia Recurrent neural network `_ or `Chinese Wikipedia `_ .
-
-In PaddlePaddle, RNN usually means that, for time series input data, the neural networks of the individual time steps are related. For example, the input of a certain neuron can be the output of a certain neuron of the neural network of the last time step; or, for each time step, the network structure contains a directed cycle.
-
-.. _glossary_hierarchical_RNN:
-
-hierarchical RNN
-----------------
-
-A hierarchical RNN, as the name suggests, is an RNN with a nested relationship. The input data is a time series, but each of its inner featured data is itself a time series, namely a two-dimensional array, or an array of arrays. A hierarchical RNN is a neural network that can process this type of input data.
-
-For example, consider the task of classifying a paragraph of sentences. We can treat the paragraph as an array of sentences, and each sentence as an array of words. This is a type of input data for the hierarchical RNN. We encode each sentence of the paragraph into a vector using an LSTM, then encode the encoded vectors into a paragraph vector using another LSTM, and finally use the paragraph vector to perform classification. This is the neural network structure of this hierarchical RNN.
-
diff --git a/doc/v2/howto/rnn/index_cn.rst b/doc/v2/howto/rnn/index_cn.rst
deleted file mode 100644
index 2032fb9e296ab024c68da1348064580c8c88d5be..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/index_cn.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-RNN Models
-===========
-Recurrent neural networks (RNN) are an important tool for modeling sequential data. PaddlePaddle provides flexible interfaces to support the construction of complex recurrent neural networks.
-The following four parts describe in detail how to build recurrent neural networks with PaddlePaddle.
-
-The first part gives an overview, from simple to complex, of building recurrent neural networks with PaddlePaddle: it first uses a simple recurrent neural network (vanilla RNN) as an example to show how to wrap and configure recurrent neural network components, and then walks through the sequence to sequence model step by step to explain how to build a complete and complex recurrent neural network model.
-
-.. toctree::
-  :maxdepth: 1
-
-  rnn_config_cn.rst
-
-Recurrent Group is the key to implementing complex recurrent neural networks in PaddlePaddle. The second part explains the concepts and principles of Recurrent Group in PaddlePaddle and gives a detailed description of the Recurrent Group interface. It also introduces the two-level RNN (whose input is a two-level sequence) and the usage of Recurrent Group in it.
-
-.. toctree::
-  :maxdepth: 1
-
-  recurrent_group_cn.md
-
-The third part explains the two-level sequence, lists the layers in PaddlePaddle that support a two-level sequence as input, and describes the usage of each.
-
-.. toctree::
-  :maxdepth: 1
-
-  hierarchical_layer_cn.rst
-
-The fourth part takes the network configurations from PaddlePaddle's two-level RNN unit tests as examples, with equivalent single-layer RNN configurations for comparison, to explain the usage of the two-level RNN in various situations.
-
-.. toctree::
-  :maxdepth: 1
-
-  hrnn_rnn_api_compare_cn.rst
diff --git a/doc/v2/howto/rnn/index_en.rst b/doc/v2/howto/rnn/index_en.rst
deleted file mode 100644
index 6e8b5c61b23ca2725dc0c9761c8dd4165033973c..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/index_en.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-RNN Models
-==========
-Recurrent neural networks (RNN) are an important tool to model sequential data. PaddlePaddle provides flexible interfaces for building complex recurrent neural networks. We will demonstrate how to use PaddlePaddle to build RNN models in the following 4 parts.
-
-In the first part, we will show you how to configure recurrent neural networks in PaddlePaddle, from simple to complex. First, we will use a vanilla recurrent neural network as an example to show how to configure a recurrent neural network architecture. Then we will use the sequence to sequence model as an example to demonstrate how you can configure complex recurrent neural network models gradually.
-
-.. toctree::
-  :maxdepth: 1
-
-  rnn_config_en.rst
-
-Recurrent Group is the key unit for building complex recurrent neural network models. The second part describes the related concepts and basic principles of Recurrent Group, and gives a detailed description of the Recurrent Group API. In addition, it introduces the sequence-level RNN (with hierarchical sequences as input) and the usage of Recurrent Group in it.
-
-.. toctree::
-  :maxdepth: 1
-
-  recurrent_group_en.md
-
-In the third part, the two-level sequence is briefly explained, and then the layers supporting a two-level sequence as input are listed and described respectively.
-
-.. toctree::
-  :maxdepth: 1
-
-  hierarchical_layer_en.rst
-
-In the last part, the unit tests of the hierarchical RNN are presented as examples to explain how to use the hierarchical RNN. As network configurations, the unit tests use two-level sequence RNNs together with single-layer sequence RNNs that have the same effect, for comparison.
-
-.. toctree::
-  :maxdepth: 1
-
-  hrnn_rnn_api_compare_en.rst
-
diff --git a/doc/v2/howto/rnn/recurrent_group_cn.md b/doc/v2/howto/rnn/recurrent_group_cn.md
deleted file mode 100644
index 06dc9e089ab2b2b926fcb1bd034262f2c846f06f..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/recurrent_group_cn.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# Recurrent Group Tutorial
-
-## Overview
-
-Sequential data is one of the main input data types faced by natural language processing tasks.
-
-A sentence is a sequence of words, and multiple sentences further form a paragraph. A paragraph can therefore be viewed as a nested, two-level sequence, each element of which is itself a sequence.
-
-The two-level sequence is a very flexible data organization supported by PaddlePaddle. It helps us better describe more complex language data such as paragraphs and multi-turn dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN that encodes the input data at the word and sentence levels respectively, and we can also introduce more complex memory mechanisms to better accomplish complex language understanding tasks.
-
-In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the computation that the RNN completes within one time step; PaddlePaddle takes care of propagating information and errors over the time series.
-
-Furthermore, `recurrent_group` can be extended to the processing of two-level sequences. By defining the computations needed at the clause level and at the word level with two nested `recurrent_group` operations, a hierarchical, complex RNN is finally achieved.
-
-Currently, `recurrent_group` and some layers in PaddlePaddle can process two-level sequences. For details, refer to the document on layers that support a two-level sequence as input.
-
-## Related Concepts
-
-### Basic Principle
-`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on designing the computation that the RNN completes within a single time step; PaddlePaddle is responsible for propagating information and gradients over the time series.
-
-In PaddlePaddle, a simple call to `recurrent_group` looks as follows:
-
-``` python
-recurrent_group(step, input, reverse)
-```
-- step: a callable function that defines the computation completed by the RNN unit within one time step
-- input: the input, which must be a single-layer sequence or a two-level sequence
-- reverse: whether to process the input sequence in reverse order
-
-The core of using `recurrent_group` is designing the computation logic of the step function. Inside the step function, the various layers supported by PaddlePaddle can be freely combined to realize arbitrary computation logic. The input of `recurrent_group` (i.e., input) becomes the input of the step function. Since the step function only cares about the computation within one time step of the RNN, `recurrent_group` performs the splitting of the original input data for us.
-
-### Input
-The input sequences processed by `recurrent_group` fall into three main types:
-
-- **Data input**: a two-level sequence entering `recurrent_group` is disassembled into single-layer sequences, and a single-layer sequence entering `recurrent_group` is disassembled into non-sequences, which are then handed to the step function; this process is completely transparent to the user. There are two possible kinds: 1) user input obtained through data_layer; 2) the output of other layers.
-
-- **Read-only memory input**: `StaticInput` defines a read-only memory. The input specified by `StaticInput` is not disassembled by `recurrent_group`, and every time step of the `recurrent_group` loop can always reference the whole input, which can be a non-sequence or a single-layer sequence.
-
-- **Input of sequence generation tasks**: `GeneratedInput` is only used to specify the input data in sequence generation tasks.
-
-### Input Example
-
-Sequence generation tasks mostly follow the encoder-decoder architecture. The encoder and decoder can be any neural network unit capable of processing sequences, and RNN is the most popular choice.
-
-Given the encoder output and the current word, the decoder predicts the next most likely word at each step. In this structure, the decoder accepts two inputs:
-
-- The target sequence to be generated: a data input of the decoder and the basis of the decoder's loop unrolling; `recurrent_group` disassembles this kind of input.
-
-- The encoder output, which can be a non-sequence or a single-layer sequence: an unbounded memory. Every time step of the decoder loop references the entire result, which should not be disassembled; this kind of input must be specified through `StaticInput`. For more discussion of unbounded memory, please refer to the paper [Neural Turing Machines](https://arxiv.org/abs/1410.5401).
-
-In sequence generation tasks, the decoder RNN always takes the word vector of the word predicted at the previous time step as the input of the current time step. `GeneratedInput` completes this process automatically.
-
-### Output
-The `step` function must return the output of one or more layers. The output of this layer becomes the final output of the whole `recurrent_group`. In the output process, `recurrent_group` concatenates the output of every time step, which is also transparent to the user.
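-
-Putting these conventions together, a minimal step function might look as follows. This is an illustrative sketch (the size 128 and the names `rnn_state` and `emb` are assumptions), closely following the vanilla RNN example in the RNN configuration tutorial:
-
-``` python
-def step(ipt):
-    # memory holding the previous time step's output of the layer
-    # named 'rnn_state'
-    state = paddle.layer.memory(name='rnn_state', size=128)
-    # combine the current input with the previous state; naming this
-    # layer 'rnn_state' feeds its output back into the memory above
-    return paddle.layer.mixed(
-        name='rnn_state',
-        size=128,
-        act=paddle.activation.Tanh(),
-        input=[paddle.layer.full_matrix_projection(input=ipt),
-               paddle.layer.full_matrix_projection(input=state)])
-
-# emb is assumed to be a sequence layer defined elsewhere:
-# rnn = paddle.layer.recurrent_group(step=step, input=emb)
-```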
-### Memory
-Memory can only be defined and used inside `recurrent_group`. A memory cannot exist independently; it must point to a layer defined by PaddlePaddle. Referencing the memory yields this layer's output at the previous time step, so a memory can be understood as a delay operation.
-
-You can explicitly specify a layer's output to initialize a memory. When not specified, the memory is initialized to 0 by default.
-
-## Two-level RNN Introduction
-`recurrent_group` performs the splitting of the input sequence, the merging of the output, and the loop unrolling of the computation logic over the sequence for us.
-
-Using this feature, two nested `recurrent_group` operations can process two-level sequences, implementing a two-level RNN structure at the word and sentence levels.
-
-- Single-layer (word-level) RNN: each state corresponds to a word.
-- Two-level (sequence-level) RNN: a two-level RNN consists of multiple single-layer RNNs; each single-layer RNN (i.e., each state of the two-level RNN) corresponds to a subsequence (subseq).
-
-For convenience of description, the following takes NLP tasks as an example: a paragraph containing subsequences (subseq) is defined as a two-level sequence, a sentence containing words is defined as a single-layer sequence, and a zero-level sequence is then a word.
-
-## Usage of the Two-level RNN
-
-### Usage in the Training Process
-Using `recurrent_group` requires following these conventions:
-
-- **Single-in, single-out**: both input and output are single-layer sequences.
-  - If there are multiple inputs, the numbers of words in the different input sequences must be strictly equal.
-  - A single-layer sequence is output, with the same number of words as the input sequence.
-  - memory: in the step function, define a memory that points to a layer; referencing the memory yields that layer's output at the previous time step, forming the recurrent connection. The memory's is_seq parameter must be false. If no memory is defined, the computations within the individual time steps are independent.
-  - boot_layer: the initial state of the memory, 0 by default. The memory's is_seq parameter must be false.
-
-- **Double-in, double-out**: both input and output are two-level sequences.
-  - If there are multiple input sequences, the numbers of subsequences (subseq) in the different inputs must be strictly equal, but the numbers of words in the subsequences may differ.
-  - A two-level sequence is output; its number of subsequences and their word counts agree with one designated input sequence, by default the first input.
-  - memory: in the step function, define a memory that points to a layer; referencing the memory yields that layer's output at the previous time step, forming the recurrent connection. A memory defined in the step function of the outer `recurrent_group` can record the state of the previous subsequence; it can be a single-layer sequence (only as read-only memory) or a word. If no memory is defined, the computations of the individual subsequences are independent.
-  - boot_layer: the initial state of the memory; it can be a single-layer sequence (only as read-only memory) or a vector. Unset by default, i.e., the initial state is 0.
-
-- **Double-in, single-out**: not supported yet; it raises the error "In hierachical RNN, all out links should be from sequences now".
-
-### Usage in the Generation Process
-Using `beam_search` requires following these conventions:
-
-- Single-layer RNN: generate the next word from a word.
-- Two-level RNN: the subsequences generated by the single-layer RNN are concatenated into a new two-level sequence. Semantically, there is no case in which a subsequence directly generates the next subsequence.
diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md
deleted file mode 100644
index de6b60f29eb97029a54609cd2194bb7faf3ffec5..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/recurrent_group_en.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# Recurrent Group Tutorial
-
-## Overview
-
-Sequential data is common in natural language processing.
-
-A sentence is a sequence of words, and many sentences further form a paragraph. Therefore, a paragraph can be viewed as a nested sequence with two levels, where each element of the sequence is another sequence. That is to say, sequential data can be recursive. An example of two-level recursive sequential data is an article composed of a sequence of sentences, each sentence being a sequence of words.
-
-PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data organization, which helps us better describe more complex language data such as paragraphs and multi-turn dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data at the word and sentence levels. For the support of arbitrary levels, please refer to PaddlePaddle Fluid.
-
-In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and errors over the time series.
-
-Furthermore, `recurrent_group` can also be extended to handle two-level sequences. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved.
-
-Currently, `recurrent_group` and some layers in PaddlePaddle can process two-level sequences. For details, refer to the document on layers that support a two-level sequence as input.
-
-## Related Concepts
-
-### Basic Principle
-`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. PaddlePaddle is responsible for completing the propagation of information and gradients over time.
-
-In PaddlePaddle, a simple call to `recurrent_group` is as follows:
-
-``` python
-recurrent_group(step, input, reverse)
-```
-- step: a callable function that defines the calculations completed by the RNN unit within a time step
-- input: the input, which must be a single-layer sequence or a two-level sequence
-- reverse: whether to process the input sequence in reverse order
-
-The core of using `recurrent_group` is to design the logic of the step function. Inside the step function, the various layers supported by PaddlePaddle can be freely combined to implement arbitrary computation logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of the RNN, `recurrent_group` completes the splitting of the original input data for us.
-
-### Input
-The input sequences processed by `recurrent_group` are mainly divided into the following three types:
-
-- **Input Data**: When a two-level sequence is put into `recurrent_group`, it will be disassembled into single-level sequences. When a single-level sequence is put into `recurrent_group`, it will be disassembled into non-sequences and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) user input via data_layer; 2) the output of other layers.
-
-- **Read-only Memory Input**: `StaticInput` defines a read-only memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop can always reference the whole input. It may be a non-sequence or a single-layer sequence.
-
-- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
-
-### Input Example
-
-Sequence generation tasks mostly follow the encoder-decoder architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences, and RNN is the most popular choice.
-
-Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
-
-- The target sequence to be generated: an input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this type of input.
-
-- The encoder output, a non-sequence or a single-layer sequence: an unbounded memory. Each time step in the decoder loop references the entire result, so it should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion of unbounded memory, please refer to the paper [Neural Turing Machines](https://arxiv.org/abs/1410.5401).
-
-In a sequence generation task, the decoder RNN always takes the word vector of the word predicted at the previous time step as the input of the current time step. `GeneratedInput` will automate this process.
-
-### Output
-The `step` function must return the output of one or more layers. The output of this layer becomes the final output of the entire `recurrent_group`. In the output process, `recurrent_group` concatenates the output of every time step, which is also transparent to the user.
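-
-Putting these conventions together, a minimal step function might look as follows. This is an illustrative sketch (the size 128 and the names `rnn_state` and `emb` are assumptions), closely following the vanilla RNN example in the RNN configuration tutorial:
-
-``` python
-def step(ipt):
-    # memory holding the previous time step's output of the layer
-    # named 'rnn_state'
-    state = paddle.layer.memory(name='rnn_state', size=128)
-    # combine the current input with the previous state; naming this
-    # layer 'rnn_state' feeds its output back into the memory above
-    return paddle.layer.mixed(
-        name='rnn_state',
-        size=128,
-        act=paddle.activation.Tanh(),
-        input=[paddle.layer.full_matrix_projection(input=ipt),
-               paddle.layer.full_matrix_projection(input=state)])
-
-# emb is assumed to be a sequence layer defined elsewhere:
-# rnn = paddle.layer.recurrent_group(step=step, input=emb)
-```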
-
-### Memory
-Memory can only be defined and used in `recurrent_group`. A memory cannot exist independently and must point to a layer defined by PaddlePaddle. Referencing the memory yields this layer's output at the previous time step, so a memory can be interpreted as a delay operation.
-
-The user can explicitly specify the output of a layer to initialize the memory. When not specified, the memory is initialized to 0 by default.
-
-## Sequence-level RNN Introduction
-
-`recurrent_group` helps us split the input sequence, merge the outputs, and unroll the computation logic over the sequence.
-
-Using this feature, two nested `recurrent_group` operations can handle nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
-
-- Word-level RNN: each state corresponds to a word.
-- Sequence-level RNN: a sequence-level RNN consists of multiple word-level RNNs. Each word-level RNN (i.e., each state of the sequence-level RNN) corresponds to a subsequence.
-
-For convenience of description, the following takes an NLP task as an example: a paragraph containing subsequences is defined as a two-level sequence, a sentence containing words is defined as a single-layer sequence, and the zero-level sequence is then a word.
-
-## Usage of Sequence-level RNN
-
-### Usage of Training Process
-Using `recurrent_group` requires the following conventions:
-
-- **Single-input Single-output**: Both input and output are single-layer sequences.
-  - If there are multiple inputs, the number of words in the different input sequences must be exactly equal.
-  - A single-layer sequence is output, and the number of words in the output sequence is the same as in the input sequence.
-  - memory: in the step function, define a memory pointing to a layer; referencing the memory yields that layer's output at the previous time step, forming a recurrent connection. The is_seq parameter of the memory must be false. If no memory is defined, the operations within the individual time steps are independent.
-  - boot_layer: the initial state of the memory, 0 by default. is_seq of the memory must be false.
-
-- **Double-input Double-output**: Both input and output are two-level sequences.
-  - If there are multiple input sequences, the numbers of subsequences contained in the different inputs must be strictly equal, but the numbers of words in the subsequences may differ.
-  - A two-level sequence is output; its number of subsequences and their word counts are the same as in one designated input sequence, by default the first input.
-  - memory: in the step function, define a memory pointing to a layer; referencing the memory yields that layer's output at the previous time step, forming a recurrent connection. A memory defined in the step function of the outer `recurrent_group` can record the state of the previous subsequence, either as a single-layer sequence (only as read-only memory) or as a word. If no memory is defined, the operations of the individual subsequences are independent.
-  - boot_layer: the initial state of the memory. It is either a single-layer sequence (only as read-only memory) or a vector. Unset by default, that is, the initial state is 0.
-
-- **Double-input Single-output**: not supported for now; it raises the error "In hierachical RNN, all out links should be from sequences now".
-
-### Usage of Generation Process
-Using `beam_search` requires the following conventions:
-
-- Word-level RNN: generate the next word from a word.
-- Sequence-level RNN: the subsequences generated by the single-layer RNN are concatenated into a new two-level sequence. Semantically, there is no case in which a subsequence directly generates the next subsequence.
diff --git a/doc/v2/howto/rnn/rnn_config_cn.rst b/doc/v2/howto/rnn/rnn_config_cn.rst
deleted file mode 100644
index 63fa161fafed0f3a8ec8799af21304cbec62d813..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/rnn_config_cn.rst
+++ /dev/null
@@ -1,261 +0,0 @@
-RNN Configuration
-=================
-
-This tutorial will show you how to configure a recurrent neural network (RNN) in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configurations. In this tutorial, you will learn how to:
-
-- configure a recurrent neural network architecture.
-- generate sequences with a trained recurrent neural network model.
-
-We will use a vanilla recurrent neural network and the sequence to sequence model to guide you through these steps. The code of the sequence to sequence model can be found at `book/08.machine_translation `_ .
-The provider of the wmt14 data is in `python/paddle/v2/dataset/wmt14.py `_ .
-
-Configure Recurrent Neural Network Architecture
------------------------------------------------
-
-Simple Gated Recurrent Neural Network
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A recurrent neural network processes a sequence one time step at a time. An example of the LSTM architecture is listed below.
-
-.. image:: src/bi_lstm.jpg
-   :align: center
-
-Generally speaking, a recurrent network performs the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
-
-.. math::
-
-   x_{t+1} = f_x(x_t), y_t = f_y(x_t)
-
-where :math:`f_x(.)` is called the **step function** (the function executed at a single time step), and :math:`f_y(.)` is called the **output function**. In a vanilla recurrent neural network, both the step function and the output function are very simple. However, PaddlePaddle supports the configuration of very complex architectures by modifying these two functions. We will use the sequence to sequence model to demonstrate how to configure complex recurrent neural network models. In this section, we use a simple vanilla recurrent neural network as an example of configuring a simple recurrent neural network with ``recurrent_group``.
-Note that if you only need to use a simple RNN, GRU, or LSTM, ``grumemory`` and ``lstmemory`` are recommended, because they are computationally more efficient than ``recurrent_group``.
-
-For a vanilla RNN, at each time step, the **step function** is:
-
-.. math::
-
-   x_{t+1} = W_x x_t + W_i I_t + b
-
-where :math:`x_t` is the RNN state, :math:`I_t` is the input, :math:`W_x` and :math:`W_i` are transformation matrices for the RNN state and the input, respectively, and :math:`b` is the bias. Its **output function** simply takes :math:`x_t` as the output.
-
-``recurrent_group`` is the most important tool for constructing a recurrent neural network. It defines the **step function**, the **output function**, and the inputs of the recurrent neural network. Note that the ``step`` argument of this function implements both the ``step function`` and the ``output function``:
-
-.. code:: python
-
-    def simple_rnn(input,
-                   size=None,
-                   name=None,
-                   reverse=False,
-                   rnn_bias_attr=None,
-                   act=None,
-                   rnn_layer_attr=None):
-        def __rnn_step__(ipt):
-            out_mem = paddle.layer.memory(name=name, size=size)
-            rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
-                                                  paddle.layer.full_matrix_projection(input=out_mem)],
-                                         name = name,
-                                         bias_attr = rnn_bias_attr,
-                                         act = act,
-                                         layer_attr = rnn_layer_attr,
-                                         size = size)
-            return rnn_out
-        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
-                                            step=__rnn_step__,
-                                            reverse=reverse,
-                                            input=input)
-
-PaddlePaddle uses "Memory" to implement the step function. **Memory** is the most important concept when constructing a recurrent neural network in PaddlePaddle. A memory is a state that is used recurrently in the step function, such as :math:`x_{t+1} = f_x(x_t)`. A memory contains an **output** and an **input**: the output of the memory at the current time step is used as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is used as the initial value of the memory. In our case, the output of the gated recurrent unit is used as the output of the memory. Note that the name of the layer ``rnn_out`` is the same as the name of ``out_mem``. This means that the output of the layer ``rnn_out`` (:math:`x_{t+1}`) is used as the **output** of the ``out_mem`` memory.
-
-A memory can also be a sequence. In that case, at each time step we have a sequence as the state of the recurrent neural network, which can be useful when constructing very complex recurrent neural networks. Other advanced features include defining multiple memories, and defining a hierarchical recurrent neural network architecture using subsequences.
-
-We return ``rnn_out`` at the end of the function. This means that the output of the layer ``rnn_out`` is used as the **output** function of the gated recurrent neural network.
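-
-As a usage sketch (not part of the original tutorial), the helper above might be applied to an embedding sequence as follows; ``emb`` and the size are assumed names:
-
-.. code:: python
-
-    # emb is assumed to be a sequence layer, e.g. a word embedding
-    rnn_out = simple_rnn(input=emb, size=128, name='vanilla_rnn',
-                         act=paddle.activation.Tanh())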
-
-Sequence to Sequence Model with Attention
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-We will use the sequence to sequence model with attention as an example to demonstrate how to configure a complex recurrent neural network model. An illustration of the model is shown in the following figure.
-
-.. image:: src/encoder-decoder-attention-model.png
-   :align: center
-
-In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network. The hidden states :math:`H_S = \{H_1, \dots, H_T\}` of the bidirectional gated recurrent neural network are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum is used to generate :math:`y_t`.
-
-The encoder part of the model is listed below. It calls ``grumemory`` to represent the gated recurrent neural network. This is the recommended way of using a recurrent neural network if the network architecture is simple, because it is faster than ``recurrent_group``. We have implemented most of the commonly used recurrent neural network architectures; refer to :ref:`api_trainer_config_helpers_layers` for more details.
-
-We also project the encoder vector to a ``decoder_size``-dimensional space. This is done by taking the first instance of the backward recurrent network and projecting it to a ``decoder_size``-dimensional space:
-
-.. code:: python
-
-    # Define the data layer of the source sentence.
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
-    # Calculate the word embedding of each word.
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
-    # Apply the forward recurrent neural network.
-    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
-    # Apply the backward recurrent neural network (reverse=True means backward).
-    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    # Mix the forward and backward parts of the recurrent neural network together.
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
-
-    # Project the encoder vector to decoder_size.
-    encoded_proj = paddle.layer.mixed(
-        size=decoder_size,
-        input=paddle.layer.full_matrix_projection(encoded_vector))
-
-    # Compute the first instance of the backward RNN.
-    backward_first = paddle.layer.first_seq(input=src_backward)
-
-    # Project the first instance of the backward RNN to decoder_size.
-    decoder_boot = paddle.layer.mixed(
-        size=decoder_size,
-        act=paddle.activation.Tanh(),
-        input=paddle.layer.full_matrix_projection(backward_first))
-
-The decoder uses ``recurrent_group`` to define the recurrent neural network. The step and output functions are defined in ``gru_decoder_with_attention``:
-
-.. code:: python
-
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For a decoder equipped with an attention mechanism, in training,
-    # the target embedding (the ground truth) is the data input,
-    # while the encoded source sequence is accessed as an unbounded memory.
-    # StaticInput means the same value is used at different time steps;
-    # otherwise, it is a sequence input and the inputs differ between time steps.
-    # All input sequences should have the same length.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines the attention, the gated recurrent unit step function, and the output function:
-
-.. code:: python
-
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-        # Define the memory of the decoder.
-        # The output of this memory is defined in gru_step.
-        # Note that gru_step should have the same name as this memory.
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-        # Compute the attention-weighted encoder vector.
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-        # Mix the current word embedding and the attention-weighted encoder vector.
-        decoder_inputs = paddle.layer.mixed(
-            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
-        # Define the gated recurrent unit step function.
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-        # Define the output function.
-        out = paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax(),
-            input=paddle.layer.full_matrix_projection(input=gru_step))
-        return out
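-
-To train the decoder defined above, its output is typically wired to a cost layer over the next-word labels. The following is a sketch with assumed names (the label layer name follows the wmt14 dataset convention) and is not part of the original configuration:
-
-.. code:: python
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)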
-
-Generate Sequence
------------------
-
-After training the model, we can use it to generate sequences. A common practice is to use **beam search** to generate sequences. The following code snippet defines the beam search algorithm. Note that the ``beam_search`` function assumes that the output function of ``step`` returns a softmax-normalized probability vector of the next token. We made the following changes to the model.
-
-- Use ``GeneratedInput`` for trg_embedding. ``GeneratedInput`` takes the vector of the word generated at the previous time step as the input of the current time step.
-- Use the ``beam_search`` function. This function needs to set:
-
-  - ``bos_id``: the start token. Every sentence starts with the start token.
-  - ``eos_id``: the end token. Every sentence ends with the end token.
-  - ``beam_size``: the beam size used in the beam search algorithm.
-  - ``max_length``: the maximum length of the generated sequences.
-
-The code is as follows:
-
-.. code:: python
-
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-    # In generation, the decoder predicts the next target word based on
-    # the encoded source sequence and the last generated target word.
-    # The encoded source sequence (the encoder's output) must be specified
-    # by StaticInput, which is a read-only memory.
-    # Here, GeneratedInput automatically fetches the last generated word,
-    # initialized by a start mark, such as .
-    trg_embedding = paddle.layer.GeneratedInput(
-        size=target_dict_dim,
-        embedding_name='_target_language_embedding',
-        embedding_size=word_vector_dim)
-    group_inputs.append(trg_embedding)
-    beam_gen = paddle.layer.beam_search(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs,
-        bos_id=0,  # Beginning token.
-        eos_id=1,  # End of sentence token.
-        beam_size=beam_size,
-        max_length=max_length)
-
-    return beam_gen
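-
-Once training has produced a ``parameters`` object, generation might be invoked roughly as follows. This is a sketch: ``parameters`` and the input batch ``gen_data`` are assumed to have been prepared elsewhere, following the v2 inference API:
-
-.. code:: python
-
-    beam_result = paddle.infer(
-        output_layer=beam_gen,
-        parameters=parameters,
-        input=gen_data,
-        field=['prob', 'id'])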
-
-Note that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment `_ for more details.
-
-The full configuration file is at `book/08.machine_translation/train.py `_ .
diff --git a/doc/v2/howto/rnn/rnn_config_en.rst b/doc/v2/howto/rnn/rnn_config_en.rst
deleted file mode 100644
index f92edd108ff5c10a31b5f181f0f6dcb7a3f119f3..0000000000000000000000000000000000000000
--- a/doc/v2/howto/rnn/rnn_config_en.rst
+++ /dev/null
@@ -1,235 +0,0 @@
-RNN Configuration
-=================
-
-This tutorial will show you how to configure a recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configurations. In this tutorial, you will learn how to:
-
-- configure a recurrent neural network architecture.
-- generate sequences with learned recurrent neural network models.
-
-We will use the vanilla recurrent neural network and the sequence to sequence model to guide you through these steps. The code of the sequence to sequence model can be found at `book/08.machine_translation `_ .
-And the data preparation of this model can be found at `python/paddle/v2/dataset/wmt14.py `_ .
-
-===============================================
-Configure Recurrent Neural Network Architecture
-===============================================
-
--------------------------------------
-Simple Gated Recurrent Neural Network
--------------------------------------
-
-A recurrent neural network processes a sequence one time step at a time. An example of the architecture of LSTM is listed below.
-
-.. image:: src/bi_lstm.jpg
-   :align: center
-
-Generally speaking, a recurrent network performs the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
-
-.. math::
-
-   x_{t+1} = f_x(x_t), y_t = f_y(x_t)
-
-where :math:`f_x(.)` is called the **step function**, and :math:`f_y(.)` is called the **output function**. In a vanilla recurrent neural network, both the step function and the output function are very simple. However, PaddlePaddle supports the configuration of very complex architectures by modifying these two functions. We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. In this section, we will use a simple vanilla recurrent neural network as an example of configuring a simple recurrent neural network using :code:`recurrent_group`. Notice that if you only need to use a simple RNN, GRU, or LSTM, then :code:`grumemory` and :code:`lstmemory` are recommended, because they are more computationally efficient than :code:`recurrent_group`.
-
-For a vanilla RNN, at each time step, the **step function** is:
-
-.. math::
-
-   x_{t+1} = W_x x_t + W_i I_t + b
-
-where :math:`x_t` is the RNN state, and :math:`I_t` is the input; :math:`W_x` and :math:`W_i` are transformation matrices for the RNN states and the inputs, respectively. :math:`b` is the bias.
-Its **output function** simply takes :math:`x_t` as the output.
-
-:code:`recurrent_group` is the most important tool for constructing recurrent neural networks. It defines the **step function**, the **output function** and the inputs of the recurrent neural network. Notice that the :code:`step` argument of this function implements both the :code:`step function` and the :code:`output function`:
-
-.. code-block:: python
-
-    def simple_rnn(input,
-                   size=None,
-                   name=None,
-                   reverse=False,
-                   rnn_bias_attr=None,
-                   act=None,
-                   rnn_layer_attr=None):
-        def __rnn_step__(ipt):
-            out_mem = paddle.layer.memory(name=name, size=size)
-            rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
-                                                  paddle.layer.full_matrix_projection(input=out_mem)],
-                                         name = name,
-                                         bias_attr = rnn_bias_attr,
-                                         act = act,
-                                         layer_attr = rnn_layer_attr,
-                                         size = size)
-            return rnn_out
-        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
-                                            step=__rnn_step__,
-                                            reverse=reverse,
-                                            input=input)
-
-PaddlePaddle uses memory to construct the step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and an **input**. The output of the memory at the current time step is utilized as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output of the memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of the :code:`out_mem` memory.
-
-A memory can also be a sequence. In this case, at each time step, we have a sequence as the state of the recurrent neural network. This can be useful when constructing very complex recurrent neural networks. Other advanced functions include defining multiple memories, and defining hierarchical recurrent neural network architectures using sub-sequences.
-
-We return :code:`rnn_out` at the end of the function. It means that the output of the layer :code:`rnn_out` is utilized as the **output** function of the gated recurrent neural network.
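-
-As a usage sketch (not part of the original tutorial), the helper above might be applied to an embedding sequence as follows; :code:`emb` and the size are assumed names:
-
-.. code-block:: python
-
-    # emb is assumed to be a sequence layer, e.g. a word embedding
-    rnn_out = simple_rnn(input=emb, size=128, name='vanilla_rnn',
-                         act=paddle.activation.Tanh())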
-
------------------------------------------
-Sequence to Sequence Model with Attention
------------------------------------------
-We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
-
-.. image:: src/encoder-decoder-attention-model.png
-   :align: center
-
-In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
-
-The encoder part of the model is listed below. It calls :code:`grumemory` to represent the gated recurrent neural network. This is the recommended way of using a recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures; you can refer to :ref:`api_trainer_config_helpers_layers` for more details.
-
-We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space:
-
-.. code-block:: python
-
-    # Define the data layer of the source sentence.
-    src_word_id = paddle.layer.data(
-        name='source_language_word',
-        type=paddle.data_type.integer_value_sequence(source_dict_dim))
-    # Calculate the word embedding of each word.
-    src_embedding = paddle.layer.embedding(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
-    # Apply the forward recurrent neural network.
-    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
-    # Apply the backward recurrent neural network. reverse=True means backward.
-    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    # Mix the forward and backward parts of the recurrent neural network together.
-    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
-
-    # Project the encoder vector to decoder_size.
-    encoded_proj = paddle.layer.mixed(
-        size=decoder_size,
-        input=paddle.layer.full_matrix_projection(encoded_vector))
-
-    # Compute the first instance of the backward RNN.
-    backward_first = paddle.layer.first_seq(input=src_backward)
-
-    # Project the first instance of the backward RNN to decoder_size.
-    decoder_boot = paddle.layer.mixed(
-        size=decoder_size,
-        act=paddle.activation.Tanh(),
-        input=paddle.layer.full_matrix_projection(backward_first))
-
-
-The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
-
-.. code-block:: python
-
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For a decoder equipped with an attention mechanism, in training,
-    # the target embedding (the ground truth) is the data input,
-    # while the encoded source sequence is accessed as an unbounded memory.
-    # StaticInput means the same value is utilized at different time steps.
-    # Otherwise, it is a sequence input; inputs at different time steps are different.
-    # All sequence inputs should have the same length.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-
-The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines the attention, the gated recurrent unit step function, and the output function:
-
-.. code-block:: python
-
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-        # Defines the memory of the decoder.
-        # The output of this memory is defined in gru_step.
-        # Notice that the name of gru_step should be the same as the name of this memory.
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-        # Compute the attention-weighted encoder vector.
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-        # Mix the current word embedding and the attention-weighted encoder vector.
-        decoder_inputs = paddle.layer.mixed(
-            size=decoder_size * 3,
-            input=[
-                paddle.layer.full_matrix_projection(input=context),
-                paddle.layer.full_matrix_projection(input=current_word)
-            ])
-        # Define the gated recurrent unit step function.
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-        # Defines the output function.
-        out = paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax(),
-            input=paddle.layer.full_matrix_projection(input=gru_step))
-        return out
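-
-To train the decoder defined above, its output is typically wired to a cost layer over the next-word labels. The following is a sketch with assumed names (the label layer name follows the wmt14 dataset convention) and is not part of the original configuration:
-
-.. code-block:: python
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)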
-
-
-=================
-Generate Sequence
-=================
-After training the model, we can use it to generate sequences. A common practice is to use **beam search** to generate sequences. The following code snippet defines a beam search algorithm. Notice that the :code:`beam_search` function assumes that the output function of :code:`step` returns a softmax-normalized probability vector of the next token. We made the following changes to the model.
-
-* use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` takes the embedding of the token generated at the last time step as the input at the current time step.
-* use the :code:`beam_search` function. This function needs to set:
-
-  - :code:`bos_id`: the start token. Every sentence starts with the start token.
-  - :code:`eos_id`: the end token. Every sentence ends with the end token.
-  - :code:`beam_size`: the beam size used in beam search.
-  - :code:`max_length`: the maximum length of the generated sentences.
-
-The code is listed below:
-
-.. code-block:: python
-
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-    # In generation, the decoder predicts the next target word based on
-    # the encoded source sequence and the last generated target word.
-    # The encoded source sequence (the encoder's output) must be specified by
-    # StaticInput, which is a read-only memory.
-    # Here, GeneratedInput automatically fetches the last generated word,
-    # which is initialized by a start mark, such as .
-    trg_embedding = paddle.layer.GeneratedInput(
-        size=target_dict_dim,
-        embedding_name='_target_language_embedding',
-        embedding_size=word_vector_dim)
-    group_inputs.append(trg_embedding)
-    beam_gen = paddle.layer.beam_search(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs,
-        bos_id=0,  # Beginning token.
-        eos_id=1,  # End of sentence token.
-        beam_size=beam_size,
-        max_length=max_length)
-
-    return beam_gen
-
-
-Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment `_ for more details.
-
-The full configuration file is located at `book/08.machine_translation/train.py `_ .
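-
-For completeness, once training has produced a :code:`parameters` object, generation might be invoked roughly as follows. This is a sketch: :code:`parameters` and the input batch :code:`gen_data` are assumed to have been prepared elsewhere, following the v2 inference API:
-
-.. code-block:: python
-
-    beam_result = paddle.infer(
-        output_layer=beam_gen,
-        parameters=parameters,
-        input=gen_data,
-        field=['prob', 'id'])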
diff --git a/doc/v2/howto/rnn/src/bi_lstm.jpg b/doc/v2/howto/rnn/src/bi_lstm.jpg deleted file mode 100644 index adec1606d64d6e35ffe7e62abfa9a09309b05c84..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/rnn/src/bi_lstm.jpg and /dev/null differ diff --git a/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png b/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png deleted file mode 100644 index 79f911d4ba12ac0c0d1a936c9df639c302786914..0000000000000000000000000000000000000000 Binary files a/doc/v2/howto/rnn/src/encoder-decoder-attention-model.png and /dev/null differ diff --git a/doc/v2/howto/rnn/src/glossary_rnn.dot b/doc/v2/howto/rnn/src/glossary_rnn.dot deleted file mode 100644 index 2cd0fb1820c44b0e8e0b869f9d39fcad27efa758..0000000000000000000000000000000000000000 --- a/doc/v2/howto/rnn/src/glossary_rnn.dot +++ /dev/null @@ -1,42 +0,0 @@ -digraph G{ - subgraph cluster_timestep0 { - label="recurrent timestep i-1" - bgcolor=lightgray - node [style=filled,color=white] - fc0_0 [label="fc 0"] - fc0_1 [label="fc 1"] - fc0_2 [label="fc 2"] - - fc0_0 -> fc0_1 - fc0_1 -> fc0_2 - } - - subgraph cluster_timestep1 { - label="recurrent timestep i" - node [style=filled]; - fc1_0 [label="fc 0"] - fc1_1 [label="fc 1"] - fc1_2 [label="fc 2"] - color=blue - - fc1_0 -> fc1_1 - fc1_1 -> fc1_2 - } - - subgraph cluster_timestep2 { - label="recurrent timestep i+1" - bgcolor=lightgray - node [style=filled,color=white] - fc2_0 [label="fc 0"] - fc2_1 [label="fc 1"] - fc2_2 [label="fc 2"] - - fc2_0 -> fc2_1 - fc2_1 -> fc2_2 - } - - - fc0_1 -> fc1_1 [style="dotted" constraint=false] - fc1_1 -> fc2_1 [style="dotted" constraint=false] - -} \ No newline at end of file diff --git a/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot b/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot deleted file mode 100644 index 0f101ec2d8f15aec76c57f328046b6b55cf0c7eb..0000000000000000000000000000000000000000 --- a/doc/v2/howto/rnn/src/glossary_rnn_with_memory.dot +++ /dev/null @@ -1,48 +0,0 @@ -digraph G{ - subgraph cluster_timestep0 { - label="recurrent timestep i-1" - bgcolor=lightgray - node [style=filled,color=white] - fc0_0 [label="fc 0"] - fc0_1 [label="fc 1"] - fc0_2 [label="fc 2"] - m0 [label="memory"] - fc0_0 -> fc0_1 - fc0_1 -> fc0_2 - fc0_1 -> m0 - m0 -> fc0_1 - } - - subgraph cluster_timestep1 { - label="recurrent timestep i" - node [style=filled]; - fc1_0 [label="fc 0"] - fc1_1 [label="fc 1"] - fc1_2 [label="fc 2"] - m1 [label="memory"] - color=blue - fc1_0 -> fc1_1 - fc1_1 -> fc1_2 - fc1_1 -> m1 - m1 -> fc1_1 - } - - subgraph cluster_timestep2 { - label="recurrent timestep i+1" - bgcolor=lightgray - node [style=filled,color=white] - fc2_0 [label="fc 0"] - fc2_1 [label="fc 1"] - fc2_2 [label="fc 2"] - m2 [label="memory"] - fc2_0 -> fc2_1 - fc2_1 -> fc2_2 - fc2_1 -> m2 - m2 -> fc2_1 - } - - - m0 -> m1 [style="dotted" constraint=false] - m1 -> m2 [style="dotted" constraint=false] - -} \ No newline at end of file diff --git a/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot b/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot deleted file mode 100644 index ff278a0323bb2c3ef07bf6f016a3a8df05783581..0000000000000000000000000000000000000000 --- a/doc/v2/howto/rnn/src/simple_full_hierarchical_recurrent.dot +++ /dev/null @@ -1,30 +0,0 @@ -digraph G { - rankdir=LR; - - subgraph cluster_t0 { - a [label="4"] - b [label="5"] - c [label="2"] - } - - subgraph cluster_t1 { - d [label="0"] - e [label="9"] - } - - subgraph cluster_t2 { - f [label="8"] - g [label="1"] - h [label="4"] - } - - 
a -> b; - b -> c; - c -> d [constraint=false]; - - d -> e; - e -> f [constraint=false]; - - f -> g; - g -> h; -} \ No newline at end of file diff --git a/doc/v2/howto/rnn/src/simple_full_recurrent.dot b/doc/v2/howto/rnn/src/simple_full_recurrent.dot deleted file mode 100644 index cee281fbac993afbd0cc3416570f95965cdf0a59..0000000000000000000000000000000000000000 --- a/doc/v2/howto/rnn/src/simple_full_recurrent.dot +++ /dev/null @@ -1,19 +0,0 @@ -digraph G { - rankdir=LR; - a [label="4"] - b [label="5"] - c [label="2"] - d [label="0"] - e [label="9"] - f [label="8"] - g [label="1"] - h [label="4"] - - a -> b; - b -> c; - c -> d; - d -> e; - e -> f; - f -> g; - g -> h; -} \ No newline at end of file diff --git a/doc/v2/images/FullyConnected.jpg b/doc/v2/images/FullyConnected.jpg deleted file mode 100644 index b2241f401434e527f95ee4e0e541a3f2ff78fd1e..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/FullyConnected.jpg and /dev/null differ diff --git a/doc/v2/images/add_security_group.png b/doc/v2/images/add_security_group.png deleted file mode 100644 index bd34f46c9b0ada7027fd53e553e7d033255d25fc..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/add_security_group.png and /dev/null differ diff --git a/doc/v2/images/bi_lstm.jpg b/doc/v2/images/bi_lstm.jpg deleted file mode 100644 index adec1606d64d6e35ffe7e62abfa9a09309b05c84..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/bi_lstm.jpg and /dev/null differ diff --git a/doc/v2/images/checkpointing.png b/doc/v2/images/checkpointing.png deleted file mode 100644 index c221e8474f90f37e31416cbb19c9452207a0d14c..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/checkpointing.png and /dev/null differ diff --git a/doc/v2/images/create_efs.png b/doc/v2/images/create_efs.png deleted file mode 100644 index e5f1526033d1daf401700989af1d25919bcb7675..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/create_efs.png and /dev/null differ diff --git a/doc/v2/images/csr.png b/doc/v2/images/csr.png deleted file mode 100644 index 3dc10b8de4f6d3f517624956b1694b689405a031..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/csr.png and /dev/null differ diff --git a/doc/v2/images/data_dispatch.png b/doc/v2/images/data_dispatch.png deleted file mode 100644 index 5bdcc24d6a6d193cb014f8c38b362451fded5e54..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/data_dispatch.png and /dev/null differ diff --git a/doc/v2/images/dataset.graffle b/doc/v2/images/dataset.graffle deleted file mode 100644 index c10a423ed16a23229a9ee33d11bfc82bb59646c8..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/dataset.graffle and /dev/null differ diff --git a/doc/v2/images/dataset.png b/doc/v2/images/dataset.png deleted file mode 100644 index 2fb7f1cce3b6dd21489392557826e95a9f207c34..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/dataset.png and /dev/null differ diff --git a/doc/v2/images/doc_en.png b/doc/v2/images/doc_en.png deleted file mode 100644 index ed6b9178fba91a3bdf45ae797a9924f84146fbc8..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/doc_en.png and /dev/null differ diff --git a/doc/v2/images/efs_mount.png b/doc/v2/images/efs_mount.png deleted file mode 100644 index 0f9e3cab98445707e5e9baa18ddabe15cdf04576..0000000000000000000000000000000000000000 Binary files a/doc/v2/images/efs_mount.png and /dev/null differ diff --git a/doc/v2/images/encoder-decoder-attention-model.png 
b/doc/v2/images/encoder-decoder-attention-model.png
deleted file mode 100644
index 79f911d4ba12ac0c0d1a936c9df639c302786914..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/encoder-decoder-attention-model.png and /dev/null differ
diff --git a/doc/v2/images/engine.png b/doc/v2/images/engine.png
deleted file mode 100644
index 1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/engine.png and /dev/null differ
diff --git a/doc/v2/images/file_storage.graffle b/doc/v2/images/file_storage.graffle
deleted file mode 100644
index 50a17e70fa255495337c529a3bf12a5c0024a5be..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/file_storage.graffle and /dev/null differ
diff --git a/doc/v2/images/file_storage.png b/doc/v2/images/file_storage.png
deleted file mode 100644
index fccb4e3e7e738224c7f1584326bd5f351ce799aa..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/file_storage.png and /dev/null differ
diff --git a/doc/v2/images/glossary_rnn.dot b/doc/v2/images/glossary_rnn.dot
deleted file mode 100644
index 2cd0fb1820c44b0e8e0b869f9d39fcad27efa758..0000000000000000000000000000000000000000
--- a/doc/v2/images/glossary_rnn.dot
+++ /dev/null
@@ -1,42 +0,0 @@
-digraph G{
-    subgraph cluster_timestep0 {
-        label="recurrent timestep i-1"
-        bgcolor=lightgray
-        node [style=filled,color=white]
-        fc0_0 [label="fc 0"]
-        fc0_1 [label="fc 1"]
-        fc0_2 [label="fc 2"]
-
-        fc0_0 -> fc0_1
-        fc0_1 -> fc0_2
-    }
-
-    subgraph cluster_timestep1 {
-        label="recurrent timestep i"
-        node [style=filled];
-        fc1_0 [label="fc 0"]
-        fc1_1 [label="fc 1"]
-        fc1_2 [label="fc 2"]
-        color=blue
-
-        fc1_0 -> fc1_1
-        fc1_1 -> fc1_2
-    }
-
-    subgraph cluster_timestep2 {
-        label="recurrent timestep i+1"
-        bgcolor=lightgray
-        node [style=filled,color=white]
-        fc2_0 [label="fc 0"]
-        fc2_1 [label="fc 1"]
-        fc2_2 [label="fc 2"]
-
-        fc2_0 -> fc2_1
-        fc2_1 -> fc2_2
-    }
-
-
-    fc0_1 -> fc1_1 [style="dotted" constraint=false]
-    fc1_1 -> fc2_1 [style="dotted" constraint=false]
-
-}
\ No newline at end of file
diff --git a/doc/v2/images/glossary_rnn_with_memory.dot b/doc/v2/images/glossary_rnn_with_memory.dot
deleted file mode 100644
index 0f101ec2d8f15aec76c57f328046b6b55cf0c7eb..0000000000000000000000000000000000000000
--- a/doc/v2/images/glossary_rnn_with_memory.dot
+++ /dev/null
@@ -1,48 +0,0 @@
-digraph G{
-    subgraph cluster_timestep0 {
-        label="recurrent timestep i-1"
-        bgcolor=lightgray
-        node [style=filled,color=white]
-        fc0_0 [label="fc 0"]
-        fc0_1 [label="fc 1"]
-        fc0_2 [label="fc 2"]
-        m0 [label="memory"]
-        fc0_0 -> fc0_1
-        fc0_1 -> fc0_2
-        fc0_1 -> m0
-        m0 -> fc0_1
-    }
-
-    subgraph cluster_timestep1 {
-        label="recurrent timestep i"
-        node [style=filled];
-        fc1_0 [label="fc 0"]
-        fc1_1 [label="fc 1"]
-        fc1_2 [label="fc 2"]
-        m1 [label="memory"]
-        color=blue
-        fc1_0 -> fc1_1
-        fc1_1 -> fc1_2
-        fc1_1 -> m1
-        m1 -> fc1_1
-    }
-
-    subgraph cluster_timestep2 {
-        label="recurrent timestep i+1"
-        bgcolor=lightgray
-        node [style=filled,color=white]
-        fc2_0 [label="fc 0"]
-        fc2_1 [label="fc 1"]
-        fc2_2 [label="fc 2"]
-        m2 [label="memory"]
-        fc2_0 -> fc2_1
-        fc2_1 -> fc2_2
-        fc2_1 -> m2
-        m2 -> fc2_1
-    }
-
-
-    m0 -> m1 [style="dotted" constraint=false]
-    m1 -> m2 [style="dotted" constraint=false]
-
-}
\ No newline at end of file
diff --git a/doc/v2/images/gradients.png b/doc/v2/images/gradients.png
deleted file mode 100644
index f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/gradients.png and /dev/null differ
diff --git a/doc/v2/images/init_lock.graffle b/doc/v2/images/init_lock.graffle
deleted file mode 100644
index fa9149f21b1311eed48ef72ec55e556559d0fc94..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/init_lock.graffle and /dev/null differ
diff --git a/doc/v2/images/init_lock.png b/doc/v2/images/init_lock.png
deleted file mode 100644
index 92404ee6d6c0f9a7727952bae3c869ba338ecd7f..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/init_lock.png and /dev/null differ
diff --git a/doc/v2/images/k8s-paddle-arch.png b/doc/v2/images/k8s-paddle-arch.png
deleted file mode 100644
index b3800c4fe81302d35e49f7dbacb9221c4dfa5cde..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/k8s-paddle-arch.png and /dev/null differ
diff --git a/doc/v2/images/layers.png b/doc/v2/images/layers.png
deleted file mode 100644
index 306f79b7a844610915eb8944128f57d2b7a3065a..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/layers.png and /dev/null differ
diff --git a/doc/v2/images/managed_policy.png b/doc/v2/images/managed_policy.png
deleted file mode 100644
index c7ecda555b81d7750e9292a9ab72d2f517f76a2a..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/managed_policy.png and /dev/null differ
diff --git a/doc/v2/images/matrix.png b/doc/v2/images/matrix.png
deleted file mode 100644
index c33ce9cf0335e47cc8c1253304d0fe179186e6f2..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/matrix.png and /dev/null differ
diff --git a/doc/v2/images/nvvp1.png b/doc/v2/images/nvvp1.png
deleted file mode 100644
index 1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/nvvp1.png and /dev/null differ
diff --git a/doc/v2/images/nvvp2.png b/doc/v2/images/nvvp2.png
deleted file mode 100644
index 177c9db708da6863d1075f3e615f5962dbe18b29..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/nvvp2.png and /dev/null differ
diff --git a/doc/v2/images/nvvp3.png b/doc/v2/images/nvvp3.png
deleted file mode 100644
index d8f393667d6569b6f1e61ffccac43fae5888b6db..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/nvvp3.png and /dev/null differ
diff --git a/doc/v2/images/nvvp4.png b/doc/v2/images/nvvp4.png
deleted file mode 100644
index 51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/nvvp4.png and /dev/null differ
diff --git a/doc/v2/images/overview.png b/doc/v2/images/overview.png
deleted file mode 100644
index 8fb7bbb9dd654bf363d701d0c8cd4a557043d188..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/overview.png and /dev/null differ
diff --git a/doc/v2/images/paddle-cloud-in-data-center.png b/doc/v2/images/paddle-cloud-in-data-center.png
deleted file mode 100644
index da5d1a77562480ad1d886f5f21dbd84001d3d508..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-cloud-in-data-center.png and /dev/null differ
diff --git a/doc/v2/images/paddle-etcd.graffle b/doc/v2/images/paddle-etcd.graffle
deleted file mode 100644
index f973dc9b9dbf72e9bc31e2d32822916cd281f8d9..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-etcd.graffle and /dev/null differ
diff --git a/doc/v2/images/paddle-etcd.png b/doc/v2/images/paddle-etcd.png
deleted file mode 100644
index 57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-etcd.png and /dev/null differ
diff --git a/doc/v2/images/paddle-model-sharding.graffle b/doc/v2/images/paddle-model-sharding.graffle
deleted file mode 100644
index fba30f0ca2b47f0d202a432821d95e55aac37ec8..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-model-sharding.graffle and /dev/null differ
diff --git a/doc/v2/images/paddle-model-sharding.png b/doc/v2/images/paddle-model-sharding.png
deleted file mode 100644
index 8c3f6724ef46c6527e63a4cd8cb0b50fe0167124..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-model-sharding.png and /dev/null differ
diff --git a/doc/v2/images/paddle-ps-0.png b/doc/v2/images/paddle-ps-0.png
deleted file mode 100644
index 47ef32806f182cab003da77f1556823b3f6d1721..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-ps-0.png and /dev/null differ
diff --git a/doc/v2/images/paddle-ps-1.png b/doc/v2/images/paddle-ps-1.png
deleted file mode 100644
index f3125db73096c52bac6e7c60e1675552857c0774..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-ps-1.png and /dev/null differ
diff --git a/doc/v2/images/paddle-ps.graffle b/doc/v2/images/paddle-ps.graffle
deleted file mode 100644
index 0e536ffdd91cd696008b4c01bad3cb53edebdc16..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-ps.graffle and /dev/null differ
diff --git a/doc/v2/images/paddle-task-queues.graffle b/doc/v2/images/paddle-task-queues.graffle
deleted file mode 100644
index 4263ed8bfd2ef0e55058828bf23f2fac3595e5fd..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-task-queues.graffle and /dev/null differ
diff --git a/doc/v2/images/paddle-task-queues.png b/doc/v2/images/paddle-task-queues.png
deleted file mode 100644
index 5f980266795776752cebd0c346b85c4a75a47780..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-task-queues.png and /dev/null differ
diff --git a/doc/v2/images/paddle-task-states.graffle b/doc/v2/images/paddle-task-states.graffle
deleted file mode 100644
index cf1a0b9246d9386a949d2dbb8c32fe84f72eea83..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-task-states.graffle and /dev/null differ
diff --git a/doc/v2/images/paddle-task-states.png b/doc/v2/images/paddle-task-states.png
deleted file mode 100644
index 4ae43cb66c071aee9eb90d875e2373b29af9c3e0..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/paddle-task-states.png and /dev/null differ
diff --git a/doc/v2/images/ps_cn.png b/doc/v2/images/ps_cn.png
deleted file mode 100644
index f9525739cc8bc6506adde642aafa0a85ae3ebebc..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/ps_cn.png and /dev/null differ
diff --git a/doc/v2/images/ps_en.png b/doc/v2/images/ps_en.png
deleted file mode 100644
index 6537d3d56589ca9f19a77a50a970e4b5275e6ce0..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/ps_en.png and /dev/null differ
diff --git a/doc/v2/images/pserver_and_trainer.png b/doc/v2/images/pserver_and_trainer.png
deleted file mode 100644
index f41fe48920590333ad332bb51eb18e03dc251541..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/pserver_and_trainer.png and /dev/null differ
diff --git a/doc/v2/images/pserver_init.graffle b/doc/v2/images/pserver_init.graffle
deleted file mode 100644
index 5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/pserver_init.graffle and /dev/null differ
diff --git a/doc/v2/images/pserver_init.png b/doc/v2/images/pserver_init.png
deleted file mode 100644
index dfe491ff98dd7db1c336093c80964a260df2cd90..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/pserver_init.png and /dev/null differ
diff --git a/doc/v2/images/route53_create_recordset.png b/doc/v2/images/route53_create_recordset.png
deleted file mode 100644
index 34e476c7beac30fcdde13fccc4cc8d08b4be3d35..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/route53_create_recordset.png and /dev/null differ
diff --git a/doc/v2/images/route53_create_zone.png b/doc/v2/images/route53_create_zone.png
deleted file mode 100644
index 25b7ddb831c5cba97f4b2edddd27da3234d621af..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/route53_create_zone.png and /dev/null differ
diff --git a/doc/v2/images/sequence_data.png b/doc/v2/images/sequence_data.png
deleted file mode 100644
index 6e47a46b8955dfe977e85898fe3c9f33ed28de7e..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/sequence_data.png and /dev/null differ
diff --git a/doc/v2/images/simple_full_hierarchical_recurrent.dot b/doc/v2/images/simple_full_hierarchical_recurrent.dot
deleted file mode 100644
index ff278a0323bb2c3ef07bf6f016a3a8df05783581..0000000000000000000000000000000000000000
--- a/doc/v2/images/simple_full_hierarchical_recurrent.dot
+++ /dev/null
@@ -1,30 +0,0 @@
-digraph G {
-    rankdir=LR;
-
-    subgraph cluster_t0 {
-        a [label="4"]
-        b [label="5"]
-        c [label="2"]
-    }
-
-    subgraph cluster_t1 {
-        d [label="0"]
-        e [label="9"]
-    }
-
-    subgraph cluster_t2 {
-        f [label="8"]
-        g [label="1"]
-        h [label="4"]
-    }
-
-    a -> b;
-    b -> c;
-    c -> d [constraint=false];
-
-    d -> e;
-    e -> f [constraint=false];
-
-    f -> g;
-    g -> h;
-}
\ No newline at end of file
diff --git a/doc/v2/images/simple_full_recurrent.dot b/doc/v2/images/simple_full_recurrent.dot
deleted file mode 100644
index cee281fbac993afbd0cc3416570f95965cdf0a59..0000000000000000000000000000000000000000
--- a/doc/v2/images/simple_full_recurrent.dot
+++ /dev/null
@@ -1,19 +0,0 @@
-digraph G {
-    rankdir=LR;
-    a [label="4"]
-    b [label="5"]
-    c [label="2"]
-    d [label="0"]
-    e [label="9"]
-    f [label="8"]
-    g [label="1"]
-    h [label="4"]
-
-    a -> b;
-    b -> c;
-    c -> d;
-    d -> e;
-    e -> f;
-    f -> g;
-    g -> h;
-}
\ No newline at end of file
diff --git a/doc/v2/images/submit-job.graffle b/doc/v2/images/submit-job.graffle
deleted file mode 100644
index 677cdfb6d9a32168bf71729eb841fa1ca0dd31d6..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/submit-job.graffle and /dev/null differ
diff --git a/doc/v2/images/submit-job.png b/doc/v2/images/submit-job.png
deleted file mode 100644
index 3046a460a7ba708079e88a560debaa215a694680..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/submit-job.png and /dev/null differ
diff --git a/doc/v2/images/trainer.graffle b/doc/v2/images/trainer.graffle
deleted file mode 100644
index 43415ed8cf61a5acfa34f8e56b9577f338dbf254..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/trainer.graffle and /dev/null differ
diff --git a/doc/v2/images/trainer.png b/doc/v2/images/trainer.png
deleted file mode 100644
index 6537d3d56589ca9f19a77a50a970e4b5275e6ce0..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/trainer.png and /dev/null differ
diff --git a/doc/v2/images/trainer_cn.png b/doc/v2/images/trainer_cn.png
deleted file mode 100644
index f9525739cc8bc6506adde642aafa0a85ae3ebebc..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/trainer_cn.png and /dev/null differ
diff --git a/doc/v2/images/worker_security_group.png b/doc/v2/images/worker_security_group.png
deleted file mode 100644
index 57eb0265a34ad4223b69600d2a3dd355482e0bf5..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/worker_security_group.png and /dev/null differ
diff --git a/doc/v2/images/workflow_of_CAPI.png b/doc/v2/images/workflow_of_CAPI.png
deleted file mode 100644
index a4399ade048b3fe10d2d9c714bc34333ca068edb..0000000000000000000000000000000000000000
Binary files a/doc/v2/images/workflow_of_CAPI.png and /dev/null differ
diff --git a/doc/v2/index_cn.rst b/doc/v2/index_cn.rst
deleted file mode 100644
index 0f645db6fc5d0f84bbe0cbb335677752e3a355ea..0000000000000000000000000000000000000000
--- a/doc/v2/index_cn.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle 文档
-======================
-
-.. toctree::
-   :maxdepth: 1
-
-   getstarted/index_cn.rst
-   build_and_install/index_cn.rst
-   howto/index_cn.rst
-   dev/index_cn.rst
-   faq/index_cn.rst
diff --git a/doc/v2/index_en.rst b/doc/v2/index_en.rst
deleted file mode 100644
index 909f035cca3db2a02fd38462acc451375eceff40..0000000000000000000000000000000000000000
--- a/doc/v2/index_en.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle Documentation
-==========================
-
-.. toctree::
-   :maxdepth: 1
-
-   getstarted/index_en.rst
-   build_and_install/index_en.rst
-   howto/index_en.rst
-   dev/index_en.rst
-   faq/index_en.rst