diff --git a/.clang-format b/.clang-format index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644 --- a/.clang-format +++ b/.clang-format @@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false ... - diff --git a/.gitignore b/.gitignore index 351b8204100dfd71e94cb3efa2e946b44b9e4285..ac56a3320ec85769d2c87c072512f5217eca0c24 100644 --- a/.gitignore +++ b/.gitignore @@ -21,10 +21,11 @@ third_party/ cmake-build-* # generated while compiling -python/paddle/v2/framework/core.so +python/paddle/v2/fluid/core.so paddle/pybind/pybind.h CMakeFiles cmake_install.cmake paddle/.timestamp python/paddlepaddle.egg-info/ paddle/pybind/pybind.h +python/paddle/version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 83fe9af768964003130d02b7d913ad1c2102dd1d..59661c9c1da53a2ddac0127ed1827fedde811a1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,6 +31,3 @@ - id: go-fmt types: - go - - id: gometalinter - types: - - go diff --git a/.travis.yml b/.travis.yml index d0e2696f100e55f320e410afd6a3038db647f76f..e2d49daa1981396628efa5d16459eb70e9e76884 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,7 @@ addons: - automake - libtool - ccache + ssh_known_hosts: 52.76.173.135 before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python @@ -41,7 +42,15 @@ before_install: script: - | timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; + RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi; + - | + if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; + if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; + if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; + export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh + export DOCS_DIR=`pwd` + cd .. + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc notifications: email: on_success: change diff --git a/CMakeLists.txt b/CMakeLists.txt index 4921226ec1c90a969fa1cfc383823820500c7757..b309ff37e52b4fd28b14925bdd7e3740e1e2fa47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,10 +16,14 @@ cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") include(system) project(paddle CXX C Go) +message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION}) +message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION}) find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) @@ -36,8 +40,7 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) +option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -55,7 +58,9 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) +option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -68,9 +73,6 @@ if(ANDROID OR IOS) if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") - elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - # TODO: support glog for Android api 16 ~ 19 in the future - message(WARNING "Using the unofficial git repository instead") endif() endif() @@ -82,10 +84,18 @@ if(ANDROID OR IOS) "Disable PYTHON when cross-compiling for Android and iOS" FORCE) set(WITH_RDMA OFF CACHE STRING "Disable RDMA when cross-compiling for Android and iOS" FORCE) - set(WITH_MKLDNN OFF CACHE STRING - "Disable MKLDNN when cross-compiling for Android and iOS" FORCE) - set(WITH_MKLML OFF CACHE STRING - "Disable MKLML package when cross-compiling for Android and iOS" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_GOLANG OFF CACHE STRING + "Disable golang when cross-compiling for Android and iOS" FORCE) + + # Compile PaddlePaddle mobile inference library + if (NOT WITH_C_API) + set(WITH_C_API ON CACHE STRING + "Always compile the C_API when cross-compiling for Android and iOS" FORCE) + endif() + set(MOBILE_INFERENCE ON) + add_definitions(-DPADDLE_MOBILE_INFERENCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING @@ -97,6 +107,20 @@ if (WITH_C_API AND WITH_PYTHON) "different Python interpreter from compiling.") endif() +if(MOBILE_INFERENCE) + set(THIRD_PARTY_BUILD_TYPE MinSizeRel) +else() + set(THIRD_PARTY_BUILD_TYPE Release) +endif() + +set(WITH_MKLML ${WITH_MKL}) +if (WITH_MKL AND AVX2_FOUND) + set(WITH_MKLDNN ON) +else() + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + set(WITH_MKLDNN OFF) +endif() + ######################################################################################## include(external/mklml) # download mklml package @@ -112,7 +136,10 @@ include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any include(external/eigen) # download eigen3 -include(external/pybind11) # download pybind11 +include(external/pybind11) # download pybind11 +include(external/nccl) +include(external/cares) +include(external/grpc) include(cudnn) # set cudnn libraries, must before configure include(configure) # add paddle env configuration @@ -143,14 +170,15 @@ set(EXTERNAL_LIBS ) if(WITH_GPU) - list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) - if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) - endif(NOT WITH_DSO) + include(cuda) endif(WITH_GPU) +if(WITH_MKLML) + list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) +endif() + if(WITH_MKLDNN) - list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB}) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() if(USE_NNPACK) @@ -160,9 +188,11 @@ endif(USE_NNPACK) add_subdirectory(proto) -# "add_subdirectory(go)" should be placed after the following loine, -# because it depends on paddle/optimizer. -add_subdirectory(paddle/optimizer) +if(NOT MOBILE_INFERENCE) + # "add_subdirectory(go)" should be placed after the following loine, + # because it depends on paddle/optimizer. + add_subdirectory(paddle/optimizer) +endif() # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be # placed after this block, because they depends on it. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0d4bb973ae87bb45ef4386a63c26ed62602f2cee..a60453ff4e3bba6e6cb3b3de915dd69afd3a1ec3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1 +1,157 @@ -./doc/howto/dev/contribute_to_paddle_en.md +# Contribute Code + +We sincerely appreciate your contribution. This document explains our workflow and work style. + +## Workflow + +PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions. + +1. Fork + + Our development community has been growing fastly; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/). + +1. Clone + + To make a copy of your fork to your local computers, please run + + ```bash + git clone https://github.com/your-github-account/paddle + cd paddle + ``` + +1. Create the local feature branch + + For daily works like adding a new feature or fixing a bug, please open your feature branch before coding: + + ```bash + git checkout -b my-cool-stuff + ``` + +1. Commit + + Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands: + + ```bash + pip install pre-commit + pre-commit install + ``` + + Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + + Once installed, `pre-commit` checks the style of code and documentation in every commit. We will see something like the following when you run `git commit`: + + ``` + ➜ git commit + CRLF end-lines remover...............................(no files to check)Skipped + yapf.................................................(no files to check)Skipped + Check for added large files..............................................Passed + Check for merge conflicts................................................Passed + Check for broken symlinks................................................Passed + Detect Private Key...................................(no files to check)Skipped + Fix End of Files.....................................(no files to check)Skipped + clang-formater.......................................(no files to check)Skipped + [my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 + ``` + +1. Build and test + + Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). + +1. Keep pulling + + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts. + + ```bash + git remote add upstream https://github.com/PaddlePaddle/Paddle + git pull upstream develop + ``` + +1. Push and file a pull request + + You can "push" your local work into your forked repo: + + ```bash + git push origin my-cool-stuff + ``` + + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one. + + To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). + + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. Github would close the issue when the owners merge your pull request. + + Please remember to specify some reviewers for your pull request. If you don't know who are the right ones, please follow Github's recommendation. + + +1. Delete local and remote branches + + To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + git push origin :my-cool-stuff + git checkout develop + git pull upstream develop + git branch -d my-cool-stuff + ``` + +### Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce the unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + + +## Coding Standard + +### Code Style + +Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html). + +Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). + +Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default. This flag is on + +Please install pre-commit, which automatically reformat the changes to C/C++ and Python code whenever we run `git commit`. To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43). + +### Unit Tests + +Please remember to add related unit tests. + +- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + + +### Writing Logs + +We use [glog](https://github.com/google/glog) for logging in our C/C++ code. + +For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ). + +`VLOG` requires a *verbose level* parameter. For example: + +```c++ +VLOG(3) << "Operator FC is taking " << num_inputs << "inputs." +``` + +When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example: + +```bash +GLOG_vmodule=buddy_allocator=2 \ +GLOG_v=10 \ +python \ +../python/paddle/v2/framework/tests/test_recurrent_op.py +``` + +This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows: + +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) diff --git a/Dockerfile b/Dockerfile index 136db772cc6a24b8084120fa6bab666bc1eda78e..857d3f3e5f64791146741ffb29feabfcb2ecbb84 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,14 +22,14 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ - git python-pip python-dev openssh-server bison \ + git python-pip python-dev openssh-server bison libnccl-dev \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - net-tools && \ + net-tools libtool && \ apt-get clean -y # Install Go and glide diff --git a/RELEASE.cn.md b/RELEASE.cn.md index 5deaf230a8f5dd3089993f0fc79b9460fd049750..494c59730dd3c2830514e8924aa3d59a34ac412e 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,3 +1,62 @@ +# v0.11.0版本 + +## PaddlePaddle Fluid + +- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 +源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 + + +## 新特点 + +* 发布 `PaddlePaddle Fluid`。 +* 增加了用于模型预测的C-API。 +* 用Fluid API实现了一个简单的GAN的例子。 +* 增加了关于性能调优的文档。 +* 为`paddle.v2.dataset`下载数据集提供了重试机制. +* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 +* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment). +* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。 +* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 + - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet + - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 +* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) +* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) +* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) +* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) +* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) +* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) +* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* 增加移动端友好的网页 + +## 改进 + +* 使用一个Python`whl`包即可安装. +* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 +* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 +* 删除了有一些bug的BarrierStat。 +* 清理和删除了paddle::Parameter中未使用的函数。 +* 删除了ProtoDataProvider。 +* Huber loss同时支持回归和分类。 +* 为sequence pooling 层增加`stride`参数。 +* v2 API自动使用cudnn batch normalization。 +* 可以使用一个固定的参数名共享BN层的参数。 +* 2D convolution operation支持variable-dimension input特性。 +* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 +* 优化网页导航。 + +## 错误修复 + +* 修复ROI pooling的Bug. cc9a761 +* 修复当label是dense vector是AUC变成0的问题. #5274 +* 修复WarpCTC 层的Bug. + + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 diff --git a/RELEASE.md b/RELEASE.md index 146f7afa7dfbc152500b82fde28445ae3155c16c..5a62c955131007c9f3329d162c20d1b462550019 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,75 @@ +# Release v0.11.0 + +## PaddlePaddle Fluid + +- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is + designed to allow users to program like PyTorch and TensorFlow Eager Execution. + In these systems, there is no longer the concept *model* and applications + do not include a symbolic description of a graph of operators nor a sequence + of layers. Instead, applications look exactly like a usual program that + describes a process of training or inference. The difference between + Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's + control-flow, `if-then-else` nor `for`. Instead, Fluid provides its + C++ implementations and their Python binding using the `with` statement. For an example + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program. +Executor works like an interpreter. In future version, we will improve +`Executor` into a debugger like GDB, and we might provide some compilers, +which, for example, takes an application like the above one, and outputs +an equivalent C++ source program, which can be compiled using +[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) +to generate binaries that use CUDA, or using +[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries +that make full use of Intel CPUs. + +## New Features + +* Release `PaddlePaddle Fluid`. +* Add C-API for model inference +* Use fluid API to create a simple GAN demo. +* Add develop guide about performance tunning. +* Add retry when download `paddle.v2.dataset`. +* Linking protobuf-lite not protobuf in C++. Reduce the binary size. +* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. +* A new style cmake functions for Paddle. It is based on Bazel API. +* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. + - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet + - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. +* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). +* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). +* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). +* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). +* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). +* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). +* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* Add mobile friendly webpages. + +## Improvements + +* Build and install using a single `whl` package. +* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). +* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. +* Remove buggy BarrierStat. +* Clean and remove unused functions in paddle::Parameter. +* Remove ProtoDataProvider. +* Huber loss supports both regression and classification. +* Add the `stride` parameter for sequence pooling layers. +* Enable v2 API use cudnn batch normalization automatically. +* The BN layer's parameter can be shared by a fixed the parameter name. +* Support variable-dimension input feature for 2D convolution operation. +* Refine cmake about CUDA to automatically detect GPU architecture. +* Improved website navigation. + +## Bug Fixes + +* Fix bug in ROI pooling. cc9a761 +* Fix AUC is zero when label is dense vector. #5274 +* Fix bug in WarpCTC layer. + # Release v0.10.0 We are glad to release version 0.10.0. In this version, we are happy to release the new diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md new file mode 100644 index 0000000000000000000000000000000000000000..26930a76377397cc36af8cf411b49b02e4f67748 --- /dev/null +++ b/benchmark/IntelOptimizedPaddle.md @@ -0,0 +1,57 @@ +# Benchmark + +Machine: + +- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket +- Laptop: TBD + +System: CentOS release 6.3 (Final), Docker 1.12.1. + +PaddlePaddle: (TODO: will rerun after 0.11.0) +- paddlepaddle/paddle:latest (for MKLML and MKL-DNN) + - MKL-DNN tag v0.11 + - MKLML 2018.0.1.20171007 +- paddlepaddle/paddle:latest-openblas (for OpenBLAS) + - OpenBLAS v0.2.20 + +On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. + +## Benchmark Model + +### Server +Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz + +Input image size - 3 * 224 * 224, Time: images/second + +- VGG-19 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| -----| --------| +| OpenBLAS | 7.80 | 9.00 | 10.80 | +| MKLML | 12.12 | 13.70 | 16.18 | +| MKL-DNN | 28.46 | 29.83 | 30.44 | + + + + - ResNet-50 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 25.22 | 25.68 | 27.12 | +| MKLML | 32.52 | 31.89 | 33.12 | +| MKL-DNN | 81.69 | 82.35 | 84.08 | + + + + - GoogLeNet + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 89.52 | 96.97 | 108.25 | +| MKLML | 128.46| 137.89| 158.63 | +| MKL-DNN     | 250.46| 264.83| 269.50 | + + + +### Laptop +TBD diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..c3f67faf096fe9b45dd815f294b41679dc7c9e54 Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..b96ecd5ff940c0d000613b1ed1f11fb16796cf47 Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..f830ca6a87d10b72a5113636dd5686ab25a2e864 Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index bc893bab98c4d2e07c62fbd012d51a0939db4766..7059c13bd2c2b98eb3fbcf633a6f7064e54d5402 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -5,10 +5,22 @@ height = 224 width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) - -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +use_gpu = get_config_arg('use_gpu', bool, True) +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -16,6 +28,8 @@ settings( learning_method=MomentumOptimizer(0.9), regularization=L2Regularization(0.0005 * batch_size)) +conv_projection = conv_projection if use_gpu else img_conv_layer + def inception2(name, input, channels, \ filter1, filter3R, filter3, @@ -138,12 +152,11 @@ def inception(name, input, channels, \ cat = concat_layer( name=name, input=[cov1, cov3, cov5, covprj], - bias_attr=True, + bias_attr=True if use_gpu else False, act=ReluActivation()) return cat -lab = data_layer(name="label", size=1000) data = data_layer(name="input", size=3 * height * width) # stage 1 @@ -221,6 +234,10 @@ pool5 = img_pool_layer( dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) out3 = fc_layer( name="output3", input=dropout, size=1000, act=SoftmaxActivation()) -loss3 = cross_entropy(name='loss3', input=out3, label=lab) -outputs(loss3) +if is_infer: + outputs(out3) +else: + lab = data_layer(name="label", size=num_class) + loss3 = cross_entropy(name='loss3', input=out3, label=lab) + outputs(loss3) diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 4703944c8722552d56ba80a8e0663de5fb4df53d..927b1759941f362ef4b5ffe84dd01332986d9306 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs): settings.data_size = settings.height * settings.width * 3 else: settings.data_size = settings.height * settings.width - - settings.slots = [dense_vector(settings.data_size), integer_value(1)] + settings.is_infer = kwargs.get('is_infer', False) + if settings.is_infer: + settings.slots = [dense_vector(settings.data_size)] + else: + settings.slots = [dense_vector(settings.data_size), integer_value(1)] @provider( init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(1024): + for i in xrange(2560 if settings.is_infer else 1024): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() - lab = random.randint(0, settings.num_class - 1) - yield img.astype('float32'), int(lab) + if settings.is_infer: + yield img.astype('float32') + else: + lab = random.randint(0, settings.num_class - 1) + yield img.astype('float32'), int(lab) diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f --- /dev/null +++ b/benchmark/paddle/image/resnet.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +from paddle.trainer_config_helpers import * + +height = 224 +width = 224 +num_class = 1000 +batch_size = get_config_arg('batch_size', int, 64) +layer_num = get_config_arg("layer_num", int, 50) +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} +define_py_data_sources2( + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) + +settings( + batch_size=batch_size, + learning_rate=0.01 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) + + +#######################Network Configuration ############# +def conv_bn_layer(name, + input, + filter_size, + num_filters, + stride, + padding, + channels=None, + active_type=ReluActivation()): + """ + A wrapper for conv layer with batch normalization layers. + Note: + conv layer has no activation. + """ + + tmp = img_conv_layer( + name=name + "_conv", + input=input, + filter_size=filter_size, + num_channels=channels, + num_filters=num_filters, + stride=stride, + padding=padding, + act=LinearActivation(), + bias_attr=False) + return batch_norm_layer( + name=name + "_bn", + input=tmp, + act=active_type, + use_global_stats=is_infer) + + +def bottleneck_block(name, input, num_filters1, num_filters2): + """ + A wrapper for bottlenect building block in ResNet. + Last conv_bn_layer has no activation. + Addto layer has activation of relu. + """ + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=1, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[input, last_name], act=ReluActivation()) + + +def mid_projection(name, input, num_filters1, num_filters2, stride=2): + """ + A wrapper for middile projection in ResNet. + projection shortcuts are used for increasing dimensions, + and other shortcuts are identity + branch1: projection shortcuts are used for increasing + dimensions, has no activation. + branch2x: bottleneck building block, shortcuts are identity. + """ + # stride = 2 + branch1 = conv_bn_layer( + name=name + '_branch1', + input=input, + filter_size=1, + num_filters=num_filters2, + stride=stride, + padding=0, + active_type=LinearActivation()) + + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=stride, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[branch1, last_name], act=ReluActivation()) + + +img = data_layer(name='image', size=height * width * 3) + + +def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): + """ + A wrapper for 50,101,152 layers of ResNet. + res2_num: number of blocks stacked in conv2_x + res3_num: number of blocks stacked in conv3_x + res4_num: number of blocks stacked in conv4_x + res5_num: number of blocks stacked in conv5_x + """ + # For ImageNet + # conv1: 112x112 + tmp = conv_bn_layer( + "conv1", + input=img, + filter_size=7, + channels=3, + num_filters=64, + stride=2, + padding=3) + tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2) + + # conv2_x: 56x56 + tmp = mid_projection( + name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1) + for i in xrange(2, res2_num + 1, 1): + tmp = bottleneck_block( + name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256) + + # conv3_x: 28x28 + tmp = mid_projection( + name="res3_1", input=tmp, num_filters1=128, num_filters2=512) + for i in xrange(2, res3_num + 1, 1): + tmp = bottleneck_block( + name="res3_" + str(i), + input=tmp, + num_filters1=128, + num_filters2=512) + + # conv4_x: 14x14 + tmp = mid_projection( + name="res4_1", input=tmp, num_filters1=256, num_filters2=1024) + for i in xrange(2, res4_num + 1, 1): + tmp = bottleneck_block( + name="res4_" + str(i), + input=tmp, + num_filters1=256, + num_filters2=1024) + + # conv5_x: 7x7 + tmp = mid_projection( + name="res5_1", input=tmp, num_filters1=512, num_filters2=2048) + for i in xrange(2, res5_num + 1, 1): + tmp = bottleneck_block( + name="res5_" + str(i), + input=tmp, + num_filters1=512, + num_filters2=2048) + + tmp = img_pool_layer( + name='avgpool', + input=tmp, + pool_size=7, + stride=1, + pool_type=AvgPooling()) + + return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation()) + + +if layer_num == 50: + resnet = deep_res_net(3, 4, 6, 3) +elif layer_num == 101: + resnet = deep_res_net(3, 4, 23, 3) +elif layer_num == 152: + resnet = deep_res_net(3, 8, 36, 3) +else: + print("Wrong layer number.") + +if is_infer: + outputs(resnet) +else: + lbl = data_layer(name="label", size=num_class) + loss = cross_entropy(name='loss', input=resnet, label=lbl) + outputs(loss) diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh deleted file mode 100755 index e31fec1cd850157d90ddcab2d559d52381ecd317..0000000000000000000000000000000000000000 --- a/benchmark/paddle/image/run_mkldnn.sh +++ /dev/null @@ -1,51 +0,0 @@ -set -e - -function train() { - unset OMP_NUM_THREADS MKL_NUM_THREADS - export OMP_DYNAMIC="FALSE" - export KMP_AFFINITY="granularity=fine,compact,0,0" - topology=$1 - bs=$2 - use_mkldnn=$3 - if [ $3 == "True" ]; then - thread=1 - log="logs/${topology}-mkldnn-${bs}.log" - elif [ $3 == "False" ]; then - thread=`nproc` - # each trainer_count use only 1 core to avoid conflict - export OMP_NUM_THREADS=1 - export MKL_NUM_THREADS=1 - log="logs/${topology}-${thread}mklml-${bs}.log" - else - echo "Wrong input $3, use True or False." - exit 0 - fi - args="batch_size=${bs}" - config="${topology}.py" - paddle train --job=time \ - --config=$config \ - --use_mkldnn=$use_mkldnn \ - --use_gpu=False \ - --trainer_count=$thread \ - --log_period=10 \ - --test_period=100 \ - --config_args=$args \ - 2>&1 | tee ${log} -} - -if [ ! -d "train.list" ]; then - echo " " > train.list -fi -if [ ! -d "logs" ]; then - mkdir logs -fi - -#========== mkldnn ==========# -train vgg 64 True -train vgg 128 True -train vgg 256 True - -#========== mklml ===========# -train vgg 64 False -train vgg 128 False -train vgg 256 False diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..d795bcab1b7d098295066f79189d17e8299d28fb --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn_infer.sh @@ -0,0 +1,86 @@ +set -e + +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'` +} + +function infer() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + if [ $thread -gt $bs ]; then + thread=$bs + fi + log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ ! -d $models_in ]; then + echo "Training model ${topology}_${layer_num}" + paddle train --job=train \ + --config="${topology}.py" \ + --use_mkldnn=True \ + --use_gpu=False \ + --trainer_count=1 \ + --num_passes=1 \ + --save_dir="models/${topology}-${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num}" \ + > /dev/null 2>&1 + echo "Done" + fi + log_period=$((256 / bs)) + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=$log_period \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} + + # calculate the last 5 logs period time of 1280 samples, + # the time before are burning time. + start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + start_sec=`clock_to_seconds $start` + end_sec=`clock_to_seconds $end` + fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` + echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi +if [ ! -d "models" ]; then + mkdir -p models +fi + +# inference benchmark +for use_mkldnn in True False; do + for batchsize in 1 2 4 8 16; do + infer googlenet v1 $batchsize $use_mkldnn + infer resnet 50 $batchsize $use_mkldnn + infer vgg 19 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkldnn_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..320206239ae960bd088b05d3b10934a98da741b1 --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn_train.sh @@ -0,0 +1,47 @@ +set -e + +function train() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + # each trainer_count use only 1 core to avoid conflict + log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + args="batch_size=${bs},layer_num=${layer_num}" + config="${topology}.py" + paddle train --job=time \ + --config=$config \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=10 \ + --test_period=100 \ + --config_args=$args \ + 2>&1 | tee ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi + +# training benchmark +for use_mkldnn in True False; do + for batchsize in 64 128 256; do + train vgg 19 $batchsize $use_mkldnn + train resnet 50 $batchsize $use_mkldnn + train googlenet v1 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index b8429975f5c83df6996e71478fe276b246e8b77b..8d0a1e97a451cd52ef17e4e326673cc90059ef3c 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -6,14 +6,25 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg('layer_num', int, 19) +is_infer = get_config_arg("is_infer", bool, False) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, - learning_rate=0.01 / batch_size, + learning_rate=0.001 / batch_size, learning_method=MomentumOptimizer(0.9), regularization=L2Regularization(0.0005 * batch_size)) @@ -98,6 +109,9 @@ elif layer_num == 19: else: print("Wrong layer number.") -lab = data_layer('label', num_class) -loss = cross_entropy(input=vgg, label=lab) -outputs(loss) +if is_infer: + outputs(vgg) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=vgg, label=lab) + outputs(loss) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 8fdc382f0c1c453a01dba884a3dad216e1c3092c..b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -1,17 +1,12 @@ # Find the CBlas and lapack libraries # -# It will search MKL, atlas, OpenBlas, reference-cblas in order. +# It will search MKLML, atlas, OpenBlas, reference-cblas in order. # # If any cblas implementation found, the following variable will be set. -# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE +# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE # CBLAS_INC_DIR # the include directory for cblas. # CBLAS_LIBS # a list of libraries should be linked by paddle. # # Each library should be full path to object file. -# -# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT -# during cmake. If none of them set, it will try to find cblas implementation in -# system paths. -# set(CBLAS_FOUND OFF) @@ -30,44 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) return() endif() -## Then find MKL. -set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs") -set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL") - -set(MKL_INCLUDE_SEARCH_PATHS - ${MKL_ROOT}/include - ${INTEL_MKL_ROOT}/include) -set(MKL_LIB_SEARCH_PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64 - ${INTEL_MKL_ROOT}/lib - ${INTEL_MKL_ROOT}/lib/intel64) - -find_path(MKL_INC_DIR mkl.h PATHS - ${MKL_INCLUDE_SEARCH_PATHS}) -find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS - ${MKL_INCLUDE_SEARCH_PATHS}) -find_library(MKL_CORE_LIB NAMES mkl_core PATHS - ${MKL_LIB_SEARCH_PATHS}) -find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS - ${MKL_LIB_SEARCH_PATHS}) -find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS - ${MKL_LIB_SEARCH_PATHS}) - -if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER MKL) - set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR}) - set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB}) - - add_definitions(-DPADDLE_USE_MKL) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})") - return() -endif() - ## Then find atlas. set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas") set(ATLAS_INCLUDE_SEARCH_PATHS diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 51c3b918cc4ef4cf6c8052ccc14028a872309fcf..5c6bcfde76a1201f792d04766d698db8cd395a49 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -24,6 +24,15 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + +if(WITH_TESTING) + add_definitions(-DPADDLE_WITH_TESTING) +endif(WITH_TESTING) + if(NOT WITH_TIMER) add_definitions(-DPADDLE_DISABLE_TIMER) endif(NOT WITH_TIMER) @@ -49,19 +58,20 @@ if(NOT WITH_GOLANG) endif(NOT WITH_GOLANG) if(NOT WITH_GPU) - add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() + add_definitions(-DPADDLE_WITH_CUDA) + FIND_PACKAGE(CUDA REQUIRED) if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) - message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile") + message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") endif() if(NOT CUDNN_FOUND) - message(FATAL_ERROR "Paddle need cudnn to compile") + message(FATAL_ERROR "Paddle needs cudnn to compile") endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") @@ -71,27 +81,14 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) -if(WITH_MKLDNN) - add_definitions(-DPADDLE_USE_MKLDNN) - if (WITH_MKLML AND MKLDNN_IOMP_DIR) - message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}") - set(OPENMP_FLAGS "-fopenmp") - set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") - else() - find_package(OpenMP) - if(OPENMP_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - else() - message(WARNING "Can not find OpenMP." - "Some performance features in MKLDNN may not be available") - endif() - endif() - -endif(WITH_MKLDNN) +if (WITH_MKLML AND MKLML_IOMP_LIB) + message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") +endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 0b38943952f7fb9052368fe95eb31dd7592d8a47..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -76,12 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") # Set the architecture for iOS if(NOT DEFINED IOS_ARCH) if(IOS_PLATFORM STREQUAL "OS") - # FIXME(liuyiqun): support "armv7;armv7s;arm64" future - set(IOS_ARCH "arm64") + set(IOS_ARCH "armv7;armv7s;arm64") elseif(IOS_PLATFORM STREQUAL "SIMULATOR") set(IOS_ARCH "i386;x86_64") - elseif(IOS_PLATFORM STREQUAL "WATCHOS") - set(IOS_ARCH armv7k) endif() endif() set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") @@ -249,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_ # Hidden visibilty is required for cxx on iOS set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") +set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake new file mode 100644 index 0000000000000000000000000000000000000000..6bea7cf3022242ce48cc882915f7e71810937283 --- /dev/null +++ b/cmake/cuda.cmake @@ -0,0 +1,188 @@ +if(NOT WITH_GPU) + return() +endif() + +set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs7 "30 35 50 52") +set(paddle_known_gpu_archs8 "30 35 50 52 60 61") + +###################################################################################### +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# detect_installed_gpus(out_variable) +function(detect_installed_gpus out_variable) + if(NOT CUDA_gpu_detect_output) + set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${cufile} "" + "#include \n" + "int main() {\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device) {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + + execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" + "--run" "${cufile}" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(nvcc_res EQUAL 0) + # only keep the last line of nvcc_out + STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") + STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") + list(GET nvcc_out -1 nvcc_out) + string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") + set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(STATUS "Automatic GPU detection failed. Building for all known architectures.") + set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +######################################################################## +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# select_nvcc_arch_flags(out_variable) +function(select_nvcc_arch_flags out_variable) + # List of arch names + set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(archs_name_default "All") + if(NOT CMAKE_CROSSCOMPILING) + list(APPEND archs_names "Auto") + endif() + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " archs_names "${archs_names}") + message(FATAL_ERROR "Only ${archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(cuda_arch_bin ${paddle_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") + detect_installed_gpus(cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + list(REMOVE_DUPLICATES cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_ptx) + + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +if (${CUDA_VERSION} LESS 7.0) + set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) +elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + # CUDA 8 may complain that sm_20 is no longer supported. Suppress the + # warning for now. + list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") +endif() + +include_directories(${CUDA_INCLUDE_DIRS}) +list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) +if(NOT WITH_DSO) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) +endif(NOT WITH_DSO) + +# setting nvcc arch flags +select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. +# So, don't set these flags here. +list(APPEND CUDA_NVCC_FLAGS "-std=c++11") +list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +# Set :expt-relaxed-constexpr to suppress Eigen warnings +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) +endif() + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake new file mode 100644 index 0000000000000000000000000000000000000000..aec51410b33669f8a549f2eca193cc6aa2d07a13 --- /dev/null +++ b/cmake/external/cares.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: c-ares is needed when linking with grpc. + +SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares) +SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares) +SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE) + +ExternalProject_Add( + extern_cares + GIT_REPOSITORY "https://github.com/c-ares/c-ares.git" + GIT_TAG "cares-1_13_0" + PREFIX ${CARES_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make -j8 + INSTALL_COMMAND make install +) + +ADD_LIBRARY(cares STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION + "${CARES_INSTALL_DIR}/lib/libcares.a") + +include_directories(${CARES_INCLUDE_DIR}) +ADD_DEPENDENCIES(cares extern_cares) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f7483f6be9169eb58f0148cd3a956a8c881e1fe3..96fc886a342cae38d5b804266d3af7bc909a4da2 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -8,7 +8,7 @@ ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "master" + GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 957f8271e4841836956b0c3f2cf3d8c88a31192a..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,14 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(yiwang): The annoying warnings mentioned in - # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by - # gflags. I fired a PR https://github.com/gflags/gflags/pull/230 - # to fix it. Before it gets accepted by the gflags team, we use - # my personal fork, which contains above fix, temporarily. Let's - # change this back to the official Github repo once my PR is - # merged. - GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -45,11 +39,11 @@ ExternalProject_Add( -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index b3fef738ccc0b5886bb0a32501bb7b7adade0ff1..0c6b3aafcb4e990b9d4549820137474e5968a7aa 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -26,11 +26,21 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) +IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + # Using the unofficial glog for Android API < 21 + SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") + SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") +ELSE() + SET(GLOG_REPOSITORY "https://github.com/google/glog.git") + SET(GLOG_TAG "v0.3.5") +ENDIF() + ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags - GIT_REPOSITORY "https://github.com/google/glog.git" + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -43,12 +53,12 @@ ExternalProject_Add( -DWITH_GFLAGS=ON -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..abee6698e30b7e76ca42825ed225876bf2ba5ec0 --- /dev/null +++ b/cmake/external/grpc.cmake @@ -0,0 +1,66 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) +SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) +SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) +SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) +IF(APPLE) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) +ELSE() + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) +ENDIF() + +ExternalProject_Add( + extern_grpc + DEPENDS protobuf zlib + GIT_REPOSITORY "https://github.com/grpc/grpc.git" + GIT_TAG "v1.7.x" + PREFIX ${GRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + # NOTE(yuyang18): + # Disable -Werror, otherwise the compile will fail in MacOS. + # It seems that we cannot configure that by make command. + # Just dry run make command and remove `-Werror`, then use a shell to run make commands + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install +) + +# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them. +ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") + +ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") +ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgpr.a") + +ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") + +include_directories(${GRPC_INCLUDE_DIR}) +ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 6a2a79b7631b32e8a099797de509af64533bbb95..5a4aa7a5b71a4fdfd556a46037e6d1846d668fc4 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -56,11 +56,11 @@ IF(WITH_TESTING) -DBUILD_GMOCK=ON -Dgtest_disable_pthreads=ON -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9686df00219001769d074ee815d9cc8db0258496..fc52d339d7a336b44c97f2e0a9fc8d6604854365 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -40,28 +40,32 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) - SET(MKLDNN_MKLROOT ${MKLML_ROOT}) - SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB}) - SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR}) - MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}") + MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}") +ELSE() + MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() +SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow") +SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow") ExternalProject_Add( ${MKLDNN_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "v0.10" + GIT_TAG "v0.11" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} - CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT} + CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} + CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} - -DMKLROOT:PATH=${MKLDNN_MKLROOT} + -DMKLROOT:PATH=${MKLML_ROOT} ) ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) -MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}") +MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") +add_definitions(-DPADDLE_USE_MKLDNN) LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 74f3279831357c21038df133df0f5a432a6dfd20..20dbc32a738d982df2d3f035206279c82c8de264 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -27,8 +27,8 @@ ENDIF() INCLUDE(ExternalProject) SET(MKLML_PROJECT "extern_mklml") -SET(MKLML_VER "mklml_lnx_2018.0.20170720") -SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz") +SET(MKLML_VER "mklml_lnx_2018.0.1.20171007") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz") SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "mklml") diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fc43766efafc3d3e16f2906ce7f9a3d692c8e4ff --- /dev/null +++ b/cmake/external/nccl.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_GPU) + return() +endif() + +include(ExternalProject) + +set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl) + +include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src) + +if(WITH_DSO) + # If we use DSO, we do not build nccl, just download the dependencies + set(NCCL_BUILD_COMMAND "") + set(NCCL_INSTALL_COMMAND "") + set(NCCL_INSTALL_DIR "") +else() + # otherwise, we build nccl and link it. + set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl) + # Note: cuda 8.0 is needed to make nccl + # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root + set(NCCL_BUILD_COMMAND "make -j 8") + set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}") +endif() + +ExternalProject_Add( + extern_nccl + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git" + GIT_TAG "v1.3.4-1" + PREFIX "${NCCL_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "${NCCL_BUILD_COMMAND}" + INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}" + INSTALL_DIR "${NCCL_INSTALL_DIR}" + TEST_COMMAND "" +) + +if(WITH_DSO) + if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";") + add_library(nccl STATIC ${dummyfile}) + else() + add_library(nccl INTERFACE) + endif() +else() + add_library(nccl STATIC IMPORTED GLOBAL) + set_property(TARGET nccl PROPERTY IMPORTED_LOCATION + ${NCCL_INSTALL_DIR}/lib/libnccl_static.a) +endif() + +add_dependencies(nccl extern_nccl) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 143b57a954e4e6b2bf273535ebdf0fa8e3dab768..97857a686b38d935b19f510ecdcb66bcca91fe03 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND}) "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER}") + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") IF(CMAKE_CROSSCOMPILING) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) @@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND}) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) ENDIF() ELSEIF(IOS) - # FIXME(liuyiqun): support multiple architectures - SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7") - SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) + ELSE() + MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " + "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") ENDIF() ELSEIF(RPI) # use hardfp @@ -86,7 +85,7 @@ IF(NOT ${CBLAS_FOUND}) UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) - + SET(CBLAS_PROVIDER openblas) IF(WITH_C_API) INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) # Because libopenblas.a is a symbolic link of another library, thus need to @@ -98,7 +97,7 @@ IF(NOT ${CBLAS_FOUND}) ENDIF() INSTALL(CODE "execute_process( COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib - destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} + ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} )" ) INSTALL(CODE "MESSAGE(STATUS \"Installing: \" @@ -115,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") -IF(${CBLAS_PROVIDER} MATCHES MKL) - ADD_LIBRARY(cblas SHARED ${dummyfile}) -ELSE() - ADD_LIBRARY(cblas STATIC ${dummyfile}) -ENDIF() +ADD_LIBRARY(cblas STATIC ${dummyfile}) TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) IF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7cf7ba85cca4c248dcc74e078124c0b3815ee380..fab2af362bb070a54987b6499748056f3d12a56b 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -15,7 +15,18 @@ INCLUDE(ExternalProject) # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp FIND_PACKAGE(Protobuf QUIET) -SET(PROTOBUF_FOUND "OFF") +macro(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +endmacro() +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. function(protobuf_generate_python SRCS) @@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) - FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) @@ -128,11 +138,11 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") if (NOT "${PROTOBUF_ROOT}" STREQUAL "") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) - find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) - find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") SET_PROTOBUF_VERSION() @@ -178,32 +188,48 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + IF(MOBILE_INFERENCE) + # The reason why the official version is not used is described in + # https://github.com/PaddlePaddle/Paddle/issues/6114 + SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") + SET(PROTOBUF_TAG "v3.2.0") + IF(NOT BUILD_FOR_HOST) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") + ENDIF() + ENDIF() + ExternalProject_Add( ${TARGET_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PROTOBUF_SOURCES_DIR} UPDATE_COMMAND "" DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + GIT_REPOSITORY ${PROTOBUF_REPO} + GIT_TAG ${PROTOBUF_TAG} CONFIGURE_COMMAND ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${OPTIONAL_ARGS} -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON ${OPTIONAL_CACHE_ARGS} ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1) +IF(NOT MOBILE_INFERENCE) + SET(PROTOBUF_VERSION 3.1) +ELSE() + SET(PROTOBUF_VERSION 3.2) +ENDIF() IF(CMAKE_CROSSCOMPILING) build_protobuf(protobuf_host TRUE) LIST(APPEND external_project_dependencies protobuf_host) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 9391c285c7544669a5b1a078b7473d7a656c1bb4..4e87dc49d8956d1fa6dec777efc5a63c6b0f79a5 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -1,8 +1,26 @@ -INCLUDE(ExternalProject) +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind) +if(NOT WITH_PYTHON) + return() +endif() + +include(ExternalProject) -INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include) +set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind) + +include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include) ExternalProject_Add( extern_pybind @@ -17,14 +35,12 @@ ExternalProject_Add( TEST_COMMAND "" ) -if (${CMAKE_VERSION} VERSION_LESS "3.3.0") +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c) - file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";") add_library(pybind STATIC ${dummyfile}) else() add_library(pybind INTERFACE) endif() add_dependencies(pybind extern_pybind) - -LIST(APPEND external_project_dependencies pybind) diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake index ce088ae7eaa3355f2f9761e8c421da0d7ef89fa7..9db457c7b2d61228e5d5af6827c4cda11a20a463 100644 --- a/cmake/external/swig.cmake +++ b/cmake/external/swig.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index bb258c7b5581fc22b44f4fe15c119f8081f4767e..a8e1aca49c97df256b1269c286b0bce7732fa932 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +IF(MOBILE_INFERENCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) @@ -35,6 +39,7 @@ ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" + GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09 PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -48,9 +53,9 @@ ExternalProject_Add( -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON -DBUILD_SHARED=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c496a52b780364f3014f8fa3dfbc944a7aa7430e..1638cd8fdfc34575132462859e056a1907f0b2f1 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -42,14 +42,16 @@ ExternalProject_Add( -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_MACOSX_RPATH=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) LIST(APPEND external_project_dependencies zlib) +ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) IF(WITH_C_API) INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 4593ae6180b6d7deb61d897eb634b17ac0bb1683..1120677a37e0d44163816b66600121c8f0d545af 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -111,6 +111,8 @@ set(COMMON_FLAGS -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=parentheses-equality # Warnings in pybind11 + -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 + -Wno-error=terminate # Warning in PADDLE_ENFORCE ) set(GPU_COMMON_FLAGS @@ -149,58 +151,3 @@ endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() - - -set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. -# So, don't set these flags here. -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) -LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) -endif() - -function(specify_cuda_arch cuda_version cuda_arch) - if(${cuda_version} VERSION_GREATER "8.0") - foreach(capability 61 62) - if(${cuda_arch} STREQUAL ${capability}) - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() - endforeach() - elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53") - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() -endfunction() - -# Common gpu architectures: Kepler, Maxwell -foreach(capability 30 35 50) - list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") -endforeach() - -if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0") - list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") -endif() - -# Modern gpu architectures: Pascal -if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") - list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") - list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr) -endif() - -# Custom gpu architecture -set(CUDA_ARCH) - -if(CUDA_ARCH) - specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH}) -endif() - -set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) - diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ff9868fc4e0d970b11e4763d2e0c8581f4f85907..66c8e3ad7ef7c80c1f388c25983425a0db5c0220 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) if(NOT APPLE AND NOT ANDROID) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) - set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt") + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif(NOT APPLE AND NOT ANDROID) function(merge_static_libs TARGET_NAME) @@ -227,8 +227,8 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) @@ -288,8 +288,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) @@ -389,13 +389,60 @@ function(go_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endfunction(go_test) +# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support +# Usage: +# paddle_protobuf_generate_cpp( ) + +function(paddle_protobuf_generate_cpp SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files") + return() + endif() + + set(${SRCS}) + set(${HDRS}) + + if (MOBILE_INFERENCE) + set(EXTRA_FLAG "lite:") + else() + set(EXTRA_FLAG "") + endif() + + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${_protobuf_protoc_src}") + list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") + + add_custom_command( + OUTPUT "${_protobuf_protoc_src}" + "${_protobuf_protoc_hdr}" + + COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + -I${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() + + function(proto_library TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(proto_srcs) set(proto_hdrs) - protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) + paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) endfunction() @@ -412,11 +459,58 @@ function(py_test TARGET_NAME) if(WITH_TESTING) set(options STATIC static SHARED shared) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python - python2 ${py_test_SRCS} + ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction() + +# grpc_library generate grpc code using grpc_cpp_plugin and protoc +# then build the generated protobuf code and grpc code with your +# implementation source codes together. Use SRCS argument for your +# implementation source files and PROTO argument for your .proto +# files. +# +# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep) + +function(grpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating grpc ${grpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") + set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") + set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") + cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") + + add_custom_command( + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + + # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it + # as compiler warnings instead of error. Should try remove the warnings also. + set_source_files_properties( + ${grpc_grpc_srcs} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") + + set_source_files_properties( + ${grpc_library_SRCS} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") +endfunction() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 46035a908b588861607a25d3a21cf34b7b6fd4b8..53c2de332ea74b06d1bd6e5bb119cad6af27ed01 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -1,27 +1,28 @@ # This file is use to check all support level of AVX on your machine # so that PaddlePaddle can unleash the vectorization power of muticore. -INCLUDE(CheckCXXSourceRuns) -INCLUDE(CheckCXXSourceCompiles) +include(CheckCXXSourceRuns) +include(CheckCXXSourceCompiles) -IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(MMX_FLAG "-mmmx") set(SSE2_FLAG "-msse2") set(SSE3_FLAG "-msse3") - SET(AVX_FLAG "-mavx") - SET(AVX2_FLAG "-mavx2") -ELSEIF(MSVC) + set(AVX_FLAG "-mavx") + set(AVX2_FLAG "-mavx2") +elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") set(SSE3_FLAG "/arch:SSE3") SET(AVX_FLAG "/arch:AVX") SET(AVX2_FLAG "/arch:AVX2") -ENDIF() +endif() set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) +set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -32,6 +33,7 @@ int main() # Check SSE2 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG}) +set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -42,6 +44,7 @@ int main() # Check SSE3 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG}) +set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -55,6 +58,7 @@ int main() # Check AVX set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -67,6 +71,7 @@ int main() # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() diff --git a/cmake/util.cmake b/cmake/util.cmake index d1aee3e170a2d143ac06b438725e907e96f041c8..0dc33ce385175d1e2dc454d41db467d4b9d9cf9a 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -73,32 +73,50 @@ function(link_paddle_exe TARGET_NAME) generate_rdma_links() endif() - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_pserver - paddle_trainer_lib - paddle_network - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - paddle_optimizer - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) + if(MOBILE_INFERENCE) + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + else() + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + paddle_optimizer + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + endif() if(ANDROID) target_link_libraries(${TARGET_NAME} log) endif(ANDROID) - if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR) - target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed") + if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif() add_dependencies(${TARGET_NAME} ${external_project_dependencies}) @@ -150,17 +168,3 @@ function(create_resources res_file output_file) COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) endfunction() - - -# Create a python unittest using run_python_tests.sh, -# which takes care of making correct running environment -function(add_python_test TEST_NAME) - foreach(arg ${ARGN}) - get_filename_component(py_fn ${arg} NAME_WE) - set(TRG_NAME ${TEST_NAME}_${py_fn}) - add_test(NAME ${TRG_NAME} - COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} - python2 ${arg} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - endforeach() -endfunction() diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index 25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332..e6f632e1a5b9c4b50b7c6aa96a120030bd6ce338 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -7,3 +7,4 @@ API v2/model_configs.rst v2/data.rst v2/run_logic.rst + v2/fluid.rst diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst index eca3ce03bcdc599edca802d8dfca48d4f28275a2..5317e66b64bbd85c61f19700a9d2c1d239dee573 100644 --- a/doc/api/v2/config/activation.rst +++ b/doc/api/v2/config/activation.rst @@ -99,3 +99,10 @@ STanh .. automodule:: paddle.v2.activation :members: STanh :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2..c3f9c18d0663a7a24880b441981875c1e4f015aa 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -54,7 +54,7 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. autoclass:: paddle.v2.layer.context_projection :noindex: @@ -70,7 +70,7 @@ Image Pooling Layer img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -82,6 +82,11 @@ maxout .. autoclass:: paddle.v2.layer.maxout :noindex: +roi_pool +-------- +.. autoclass:: paddle.v2.layer.roi_pool + :noindex: + Norm Layer ========== @@ -99,7 +104,7 @@ sum_to_one_norm --------------- .. autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm @@ -109,7 +114,7 @@ row_l2_norm ----------- .. autoclass:: paddle.v2.layer.row_l2_norm :noindex: - + Recurrent Layers ================ @@ -330,6 +335,16 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dot_prod +--------- +.. autoclass:: paddle.v2.layer.dot_prod + :noindex: + +out_prod +-------- +.. autoclass:: paddle.v2.layer.out_prod + :noindex: + power ----- .. autoclass:: paddle.v2.layer.power @@ -367,6 +382,11 @@ cos_sim .. autoclass:: paddle.v2.layer.cos_sim :noindex: +l2_distance +----------- +.. autoclass:: paddle.v2.layer.l2_distance + :noindex: + trans ----- .. autoclass:: paddle.v2.layer.trans @@ -395,6 +415,13 @@ multiplex .. autoclass:: paddle.v2.layer.multiplex :noindex: +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: Slicing and Joining Layers ========================== diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst index 6e813ab1a820d068ea3e54cad6178f1cf928eadc..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644 --- a/doc/api/v2/config/networks.rst +++ b/doc/api/v2/config/networks.rst @@ -125,3 +125,8 @@ simple_attention :members: simple_attention :noindex: +dot_product_attention +--------------------- +.. automodule:: paddle.v2.networks + :members: dot_product_attention + :noindex: diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644 --- a/doc/api/v2/data.rst +++ b/doc/api/v2/data.rst @@ -2,112 +2,9 @@ Data Reader Interface and DataSets ================================== +.. toctree:: + :maxdepth: 1 -DataTypes -========= - -.. automodule:: paddle.v2.data_type - :members: - :noindex: - -DataFeeder -========== - -.. automodule:: paddle.v2.data_feeder - :members: - :noindex: - -Reader -====== - -.. automodule:: paddle.v2.reader - :members: - :noindex: - -.. automodule:: paddle.v2.reader.creator - :members: - :noindex: - -minibatch -========= - -.. automodule:: paddle.v2.minibatch - :members: - :noindex: - -Dataset -======= - -.. automodule:: paddle.v2.dataset - :members: - :noindex: - -mnist -+++++ - -.. automodule:: paddle.v2.dataset.mnist - :members: - :noindex: - -cifar -+++++ - -.. automodule:: paddle.v2.dataset.cifar - :members: - :noindex: - -conll05 -+++++++ - -.. automodule:: paddle.v2.dataset.conll05 - :members: get_dict,get_embedding,test - :noindex: - -imdb -++++ - -.. automodule:: paddle.v2.dataset.imdb - :members: - :noindex: - -imikolov -++++++++ - -.. automodule:: paddle.v2.dataset.imikolov - :members: - :noindex: - -movielens -+++++++++ - -.. automodule:: paddle.v2.dataset.movielens - :members: - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.MovieInfo - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.UserInfo - :noindex: - -sentiment -+++++++++ - -.. automodule:: paddle.v2.dataset.sentiment - :members: - :noindex: - -uci_housing -+++++++++++ - -.. automodule:: paddle.v2.dataset.uci_housing - :members: - :noindex: - -wmt14 -+++++ - -.. automodule:: paddle.v2.dataset.wmt14 - :members: - :noindex: - + data/data_reader.rst + data/image.rst + data/dataset.rst diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e --- /dev/null +++ b/doc/api/v2/data/data_reader.rst @@ -0,0 +1,36 @@ +===================== +Data Reader Interface +===================== + + +DataTypes +========= + +.. automodule:: paddle.v2.data_type + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.v2.reader + :members: + :noindex: + +.. automodule:: paddle.v2.reader.creator + :members: + :noindex: + +minibatch +========= + +.. automodule:: paddle.v2.minibatch + :members: + :noindex: diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst new file mode 100644 index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365 --- /dev/null +++ b/doc/api/v2/data/dataset.rst @@ -0,0 +1,75 @@ +Dataset +======= + +.. automodule:: paddle.v2.dataset + :members: + :noindex: + +mnist ++++++ + +.. automodule:: paddle.v2.dataset.mnist + :members: + :noindex: + +cifar ++++++ + +.. automodule:: paddle.v2.dataset.cifar + :members: + :noindex: + +conll05 ++++++++ + +.. automodule:: paddle.v2.dataset.conll05 + :members: get_dict,get_embedding,test + :noindex: + +imdb +++++ + +.. automodule:: paddle.v2.dataset.imdb + :members: + :noindex: + +imikolov +++++++++ + +.. automodule:: paddle.v2.dataset.imikolov + :members: + :noindex: + +movielens ++++++++++ + +.. automodule:: paddle.v2.dataset.movielens + :members: + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.UserInfo + :noindex: + +sentiment ++++++++++ + +.. automodule:: paddle.v2.dataset.sentiment + :members: + :noindex: + +uci_housing ++++++++++++ + +.. automodule:: paddle.v2.dataset.uci_housing + :members: + :noindex: + +wmt14 ++++++ + +.. automodule:: paddle.v2.dataset.wmt14 + :members: + :noindex: diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst new file mode 100644 index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49 --- /dev/null +++ b/doc/api/v2/data/image.rst @@ -0,0 +1,5 @@ +Image Interface +=============== + +.. automodule:: paddle.v2.image + :members: diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..43fc19dc492bbc119f2356034b81c65e443db2fa --- /dev/null +++ b/doc/api/v2/fluid.rst @@ -0,0 +1,18 @@ +====================== +Fluid +====================== + +.. toctree:: + :maxdepth: 1 + + fluid/layers.rst + fluid/data_feeder.rst + fluid/executor.rst + fluid/initializer.rst + fluid/evaluator.rst + fluid/nets.rst + fluid/optimizer.rst + fluid/param_attr.rst + fluid/profiler.rst + fluid/regularizer.rst + diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst new file mode 100644 index 0000000000000000000000000000000000000000..0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa --- /dev/null +++ b/doc/api/v2/fluid/data_feeder.rst @@ -0,0 +1,9 @@ +=========== +DataFeeder +=========== + +DataFeeder +----------- +.. automodule:: paddle.v2.fluid.data_feeder + :members: DataFeeder + :noindex: diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst new file mode 100644 index 0000000000000000000000000000000000000000..a23f3301d0331e0ea3733f06444515eb4680cd31 --- /dev/null +++ b/doc/api/v2/fluid/evaluator.rst @@ -0,0 +1,9 @@ +=========== +Evaluator +=========== + +Evaluator +----------- +.. automodule:: paddle.v2.fluid.evaluator + :members: Evaluator + :noindex: diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst new file mode 100644 index 0000000000000000000000000000000000000000..3a283538c120cfa1ef646c390bb71c6251c23675 --- /dev/null +++ b/doc/api/v2/fluid/executor.rst @@ -0,0 +1,9 @@ +=========== +Executor +=========== + +Executor +----------- +.. automodule:: paddle.v2.fluid.executor + :members: Executor + :noindex: diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst new file mode 100644 index 0000000000000000000000000000000000000000..8f587837e9873370722062404f511654a9460587 --- /dev/null +++ b/doc/api/v2/fluid/initializer.rst @@ -0,0 +1,50 @@ +=========== +Initializer +=========== + + + +Initializer +----------- +.. automodule:: paddle.v2.fluid.initializer + :members: Initializer + :noindex: + + + +ConstantInitializer +------------------- +.. automodule:: paddle.v2.fluid.initializer + :members: ConstantInitializer + :noindex: + + + +UniformInitializer +------------------ +.. automodule:: paddle.v2.fluid.initializer + :members: UniformInitializer + :noindex: + + + +NormalInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: NormalInitializer + :noindex: + + +XavierInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: XavierInitializer + :noindex: + + +MSRAInitializer +--------------- +.. automodule:: paddle.v2.fluid.initializer + :members: MSRAInitializer + :noindex: + diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst new file mode 100644 index 0000000000000000000000000000000000000000..89e5fec13bf9062dc7a7187b1334c8f5486a980b --- /dev/null +++ b/doc/api/v2/fluid/layers.rst @@ -0,0 +1,302 @@ +========== +Layers +========== + + +fc +--- +.. autofunction:: paddle.v2.fluid.layers.fc + :noindex: + +embedding +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + +dynamic_lstm +------------ +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm + :noindex: + +data +--------- +.. autofunction:: paddle.v2.fluid.layers.data + :noindex: + +mean +--------- +.. autofunction:: paddle.v2.fluid.layers.mean + :noindex: + +mul +--------- +.. autofunction:: paddle.v2.fluid.layers.mul + :noindex: + +elementwise_add +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_add + :noindex: + +elementwise_div +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_div + :noindex: + + +dropout +--------- +.. autofunction:: paddle.v2.fluid.layers.dropout + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +sigmoid +--------- +.. autofunction:: paddle.v2.fluid.layers.sigmoid + :noindex: + + +scale +--------- +.. autofunction:: paddle.v2.fluid.layers.scale + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.transpose + :noindex: + + +sigmoid_cross_entropy_with_logits +--------- +.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits + :noindex: + + +cast +--------- +.. autofunction:: paddle.v2.fluid.layers.cast + :noindex: + + +concat +--------- +.. autofunction:: paddle.v2.fluid.layers.concat + :noindex: + + +sums +--------- +.. autofunction:: paddle.v2.fluid.layers.sums + :noindex: + + +linear_chain_crf +--------- +.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf + :noindex: + + +assign +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + + +split_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor + :noindex: + + +merge_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor + :noindex: + +cos_sim +--------- +.. autofunction:: paddle.v2.fluid.layers.cos_sim + :noindex: + + +cross_entropy +--------- +.. autofunction:: paddle.v2.fluid.layers.cross_entropy + :noindex: + + + +square_error_cost +--------- +.. autofunction:: paddle.v2.fluid.layers.square_error_cost + :noindex: + + +accuracy +--------- +.. autofunction:: paddle.v2.fluid.layers.accuracy + :noindex: + + +sequence_conv +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_conv + :noindex: + + +conv2d +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d + :noindex: + + +sequence_pool +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_pool + :noindex: + + +pool2d +--------- +.. autofunction:: paddle.v2.fluid.layers.pool2d + :noindex: + + +batch_norm +--------- +.. autofunction:: paddle.v2.fluid.layers.batch_norm + :noindex: + + +beam_search_decode +--------- +.. autofunction:: paddle.v2.fluid.layers.beam_search_decode + :noindex: + + +lstm +--------- +.. autofunction:: paddle.v2.fluid.layers.lstm + :noindex: + + +lod_rank_table +--------- +.. autofunction:: paddle.v2.fluid.layers.lod_rank_table + :noindex: + + +max_sequence_len +--------- +.. autofunction:: paddle.v2.fluid.layers.max_sequence_len + :noindex: + + +topk +--------- +.. autofunction:: paddle.v2.fluid.layers.topk + :noindex: + + +lod_tensor_to_array +--------- +.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array + :noindex: + + + +array_to_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor + :noindex: + + + + +fill_constant +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant + :noindex: + + + +fill_constant_batch_size_like +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like + :noindex: + + +ones +--------- +.. autofunction:: paddle.v2.fluid.layers.ones + :noindex: + + +zeros +--------- +.. autofunction:: paddle.v2.fluid.layers.zeros + :noindex: + + +increment +--------- +.. autofunction:: paddle.v2.fluid.layers.increment + :noindex: + + +array_write +--------- +.. autofunction:: paddle.v2.fluid.layers.array_write + :noindex: + + + +create_array +--------- +.. autofunction:: paddle.v2.fluid.layers.create_array + :noindex: + + +less_than +--------- +.. autofunction:: paddle.v2.fluid.layers.less_than + :noindex: + + +array_read +--------- +.. autofunction:: paddle.v2.fluid.layers.array_read + :noindex: + + +shrink_memory +--------- +.. autofunction:: paddle.v2.fluid.layers.shrink_memory + :noindex: + + +array_length +--------- +.. autofunction:: paddle.v2.fluid.layers.array_length + :noindex: + + +conv2d_transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose + :noindex: + diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst new file mode 100644 index 0000000000000000000000000000000000000000..2c3d075422de29c96e25458e831133a30270dd39 --- /dev/null +++ b/doc/api/v2/fluid/nets.rst @@ -0,0 +1,22 @@ +=========== +Nets +=========== + +simple_img_conv_pool +----------- +.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool + :noindex: + + +img_conv_group +----------- +.. autofunction:: paddle.v2.fluid.nets.img_conv_group + :noindex: + + +sequence_conv_pool +----------- +.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool + :noindex: + + diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..233762fcdfb39e592740adef6721a556fae3feef --- /dev/null +++ b/doc/api/v2/fluid/optimizer.rst @@ -0,0 +1,54 @@ +=========== +Optimizer +=========== + +Optimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: Optimizer + :noindex: + + +SGDOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: SGDOptimizer + :noindex: + + + +MomentumOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: MomentumOptimizer + :noindex: + + + +AdagradOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdagradOptimizer + :noindex: + + +AdamOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamOptimizer + :noindex: + + +AdamaxOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamaxOptimizer + :noindex: + + +DecayedAdagradOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: DecayedAdagradOptimizer + :noindex: + diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50 --- /dev/null +++ b/doc/api/v2/fluid/param_attr.rst @@ -0,0 +1,11 @@ +=========== +ParamAttr +=========== + + + +ParamAttr +----------- +.. automodule:: paddle.v2.fluid.param_attr + :members: ParamAttr + :noindex: diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..7d4042d1f41c12c4a551ba6576559d612116872a --- /dev/null +++ b/doc/api/v2/fluid/profiler.rst @@ -0,0 +1,10 @@ +=========== +Profiler +=========== + + + +Profiler +----------- +.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler + :noindex: diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..3af2b07d2ae55d99df705fbf1ad2402eee05c435 --- /dev/null +++ b/doc/api/v2/fluid/regularizer.rst @@ -0,0 +1,25 @@ +=========== +Regularizer +=========== + +WeightDecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: WeightDecayRegularizer + :noindex: + + +L2DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L2DecayRegularizer + :noindex: + + + +L1DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L1DecayRegularizer + + diff --git a/doc/design/block.md b/doc/design/block.md index be8800122035984df281692fc40009c397565046..4066122c0e8dfa33776796c3d205ba5aec9e0f52 100644 --- a/doc/design/block.md +++ b/doc/design/block.md @@ -5,12 +5,12 @@ Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation: - Caffe, Torch, and Paddle: sequences of layers. -- TensorFlow, Caffe2, Mxnet: graphs of operators. +- TensorFlow, Caffe2, Mxnet: graph of operators. - PaddlePaddle: nested blocks, like C++ and Java programs. ## Block in Programming Languages and Deep Learning -In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions, or operators. +In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators. Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: @@ -24,14 +24,14 @@ A key difference is that a C++ program describes a one pass computation, whereas ## Stack Frames and the Scope Hierarchy -The existence of the backward makes the execution of a block of traditional programs and PaddlePaddle different to each other: +The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: -| programming languages | PaddlePaddle | -|-----------------------|-------------------------------| -| stack | scope hierarchy | -| stack frame | scope | -| push at entering block| push at entering block | -| pop at leaving block | destroy at minibatch completes| +| programming languages | PaddlePaddle | +|-----------------------|---------------------------------| +| stack | scope hierarchy | +| stack frame | scope | +| push at entering block| push at entering block | +| pop at leaving block | destroy when minibatch completes| 1. In traditional programs: @@ -42,9 +42,9 @@ The existence of the backward makes the execution of a block of traditional prog 1. In PaddlePaddle - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables. - - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are to be used by the backward pass. So it has a stack forest known as a *scope hierarchy*. + - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*. - The height of the highest tree is the maximum depth of nested blocks. - - After the process of a minibatch, PaddlePaddle destroys the scope hierarchy. + - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy. ## Use Blocks in C++ and PaddlePaddle Programs @@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples. The following C++ programs shows how blocks are used with the `if-else` structure: ```c++ +namespace pd = paddle; + int x = 10; -int y = 20; -int out; +int y = 1; +int z = 10; bool cond = false; +int o1, o2; if (cond) { int z = x + y; - out = softmax(z); + o1 = z; + o2 = pd::layer::softmax(z); } else { - int z = fc(x); - out = z; + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; } + ``` An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows: @@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator ```python import paddle as pd -x = var(10) -y = var(20) -cond = var(false) -ie = pd.create_ifelseop(inputs=[x], output_num=1) +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() with ie.true_block(): - x = ie.inputs(true, 0) - z = operator.add(x, y) - ie.set_output(true, 0, operator.softmax(z)) + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) with ie.false_block(): - x = ie.inputs(false, 0) - z = layer.fc(x) - ie.set_output(true, 0, operator.softmax(z)) -out = b(cond) + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) ``` -In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`. +In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` . + +The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. -A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values. ### Blocks with `for` and `RNNOp` -The following RNN model from the [RNN design doc](./rnn.md) +The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) : ```python -x = sequence([10, 20, 30]) -m = var(0) -W = tensor() -U = tensor() - -rnn = create_rnn(inputs=[input]) -with rnn.stepnet() as net: - x = net.set_inputs(0) - h = net.add_memory(init=m) - fc_out = pd.matmul(W, x) - hidden_out = pd.matmul(U, h.pre(n=1)) - sum = pd.add_two(fc_out, hidden_out) - act = pd.sigmoid(sum) - h.update(act) # update memory with act - net.set_outputs(0, act, hidden_out) # two outputs - +x = sequence([10, 20, 30]) # shape=[None, 1] +m = var(0) # shape=[1] +W = var(0.314, param=true) # shape=[1] +U = var(0.375, param=true) # shape=[1] + +rnn = pd.rnn() +with rnn.step(): + h = rnn.memory(init = m) + h_prev = rnn.previous_memory(h) + a = layer.fc(W, x) + b = layer.fc(U, h_prev) + s = pd.add(a, b) + act = pd.sigmoid(s) + rnn.update_memory(h, act) + rnn.output(a, b) o1, o2 = rnn() -print o1, o2 ``` - has its equivalent C++ program as follows ```c++ int* x = {10, 20, 30}; -int m = 0; -int W = some_value(); -int U = some_other_value(); +int* m = {0}; +int* W = {0.314}; +int* U = {0.375}; int mem[sizeof(x) / sizeof(x[0]) + 1]; int o1[sizeof(x) / sizeof(x[0]) + 1]; @@ -131,25 +135,21 @@ int o2[sizeof(x) / sizeof(x[0]) + 1]; for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) { int x = x[i-1]; if (i == 1) mem[0] = m; - int fc_out = W * x; - int hidden_out = Y * mem[i-1]; - int sum = fc_out + hidden_out; + int a = W * x; + int b = Y * mem[i-1]; + int s = fc_out + hidden_out; int act = sigmoid(sum); mem[i] = act; o1[i] = act; o2[i] = hidden_out; } - -print_array(o1); -print_array(o2); ``` - ## Compilation and Execution -Like TensorFlow programs, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference. +Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference. -The generation of this protobuf message is like what a compiler generates a binary executable file. The execution of the message that the OS executes the binary file. +The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file. ## The "Binary Executable File Format" @@ -186,10 +186,10 @@ Also, the RNN operator in above example is serialized into a protobuf message of ``` OpDesc { - inputs = {0} // the index of x - outputs = {5, 3} // indices of act and hidden_out + inputs = {0} // the index of x in vars of BlockDesc above + outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above attrs { - "memories" : {1} // the index of h + "states" : {1} // the index of h "step_net" : } }; @@ -203,32 +203,32 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope. -Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example +Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example: ```python -a = pd.Varaible(shape=[20, 20]) +a = pd.Variable(shape=[20, 20]) b = pd.fc(a, params=["fc.w", "fc.b"]) rnn = pd.create_rnn() -with rnn.stepnet() as net: - x = net.set_inputs(a) +with rnn.stepnet(): + x = a.as_step_input() # reuse fc's parameter fc_without_b = pd.get_variable("fc.w") - net.set_outputs(fc_without_b) + rnn.output(fc_without_b) out = rnn() ``` -the method `pd.get_variable` can help retrieve a Variable by a name, a Variable may store in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. +The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc. To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers. -`SymbolTable` can do the following stuff: +`SymbolTable` can do the following: - store the definitions (some names and attributes) of variables and operators, -- to verify if a variable was declared, -- to make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). +- verify if a variable was declared, +- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). ```c++ @@ -240,19 +240,18 @@ class SymbolTable { OpDesc* NewOp(const string& name=""); - // TODO determine whether name is generated by python or C++ - // currently assume that a unique name will be generated by C++ if the - // argument name left default. - VarDesc* NewVar(const string& name=""); + // TODO determine whether name is generated by python or C++. + // Currently assume that a unique name will be generated by C++ if the + // argument name is left default. + VarDesc* Var(const string& name=""); - // find a VarDesc by name, if recursive true, find parent's SymbolTable + // find a VarDesc by name, if recursive is true, find parent's SymbolTable // recursively. // this interface is introduced to support InferShape, find protobuf messages // of variables and operators, pass pointers into InferShape. - // operator // // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should - // be proposed and embedded into pybind to enable python operate on C++ pointers. + // be proposed and embedded into pybind to enable python operation on C++ pointers. VarDesc* FindVar(const string& name, bool recursive=true); OpDesc* FindOp(const string& name); @@ -270,7 +269,7 @@ class SymbolTable { After all the description of variables and operators is added into SymbolTable, the block has enough information to run. -The `Block` class takes a `BlockDesc` as input, and provide `Run` and `InferShape` functions. +The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions. ```c++ @@ -302,7 +301,7 @@ public: void CreateVariables(const framework::Scope& scope); void CreateOperators(); - // some other necessary interfaces of NetOp are list below + // some other necessary interfaces of NetOp are listed below // ... private: @@ -316,15 +315,14 @@ private: Block inherits from OperatorBase, which has a Run method. Block's Run method will run its operators sequentially. -There is another important interface called `Eval`, which take some arguments called targets, and generate a minimal graph which takes targets as the end points and creates a new Block, -after `Run`, `Eval` will get the latest value and return the targets. +There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets. The definition of Eval is as follows: ```c++ // clean a block description by targets using the corresponding dependency graph. // return a new BlockDesc with minimal number of operators. -// NOTE not return a Block but the block's description so that this can be distributed +// NOTE: The return type is not a Block but the block's description so that this can be distributed // to a cluster. BlockDesc Prune(const BlockDesc& desc, vector targets); diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle index 42384a3f059966e22e22f5fa4295cc9ead5cef83..43415ed8cf61a5acfa34f8e56b9577f338dbf254 100644 Binary files a/doc/design/cluster_train/src/trainer.graffle and b/doc/design/cluster_train/src/trainer.graffle differ diff --git a/doc/design/dcgan.png b/doc/design/dcgan.png new file mode 100644 index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28 Binary files /dev/null and b/doc/design/dcgan.png differ diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md new file mode 100644 index 0000000000000000000000000000000000000000..11cc129d56905a9ee666da92fbe6f8559c6d325a --- /dev/null +++ b/doc/design/evaluator.md @@ -0,0 +1,58 @@ +## Evaluator Design + +### Problem Statement + +During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants. + +### Evaluator Design +Currently, every operation is expressed in the graph. We divide the evaluator process into three steps. + +1. Initialize the metric state and add it into the block. + +2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once. + + +3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices. + +### Implementation +This design is shown in the Python API. +Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. + + +```python +class Evaluator(object): + """ + Evaluator Base class. + """ + def __init__(self, name, **kwargs): + """ + Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts. + Auc need four variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program + + The initialization of Evaluator should be responsible for: + create metric states and append to the main_program + """ + pass + + def _update_ops(self, input, label, **kwargs) + """ + Add mini-batch evaluator caculate operators to the main_program. + Add increment operator to accumulate the metric states. + """ + + + def reset(self, executor, reset_program=None): + """ + Reset metric states at the begin of each pass/user specified batch number. + Execute the reset_program to reset the states. + """ + + + def eval(self, executor, eval_program=None): + """ + Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. + Execute the eval_program and return the result. + """ + return eval_result +``` diff --git a/doc/design/executor.md b/doc/design/executor.md new file mode 100644 index 0000000000000000000000000000000000000000..b5fb6c5c3c1da3c112ce63878322083dd5c42b70 --- /dev/null +++ b/doc/design/executor.md @@ -0,0 +1,23 @@ +# Executor Design Doc + +## Motivation + +We use executor to do the runtime evaluation of a `ProgramDesc`. + +## Overview + +An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs. + +### What does executor do? + +It evaluates all the operators in the `block_id`th block of a `ProgramDesc`. + +### What does executor NOT do? + +It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run. + +It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices. + +## Implementation + +`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc) diff --git a/doc/design/float16.md b/doc/design/float16.md new file mode 100644 index 0000000000000000000000000000000000000000..1ea95ed6b5d6792171569b6ff76d09be92fcb13e --- /dev/null +++ b/doc/design/float16.md @@ -0,0 +1,105 @@ +# Design Doc: float16 + +## Why float16 +Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. + +When high precision computation is not required, using float16 data type could potentially + +- reduce storage space, memory bandwidth, and power usages; +- increase the chance of data fitting into a smaller cache of lower latency; +- provide arithmetic speed up if supported by hardware. + +## Survey of current float16 support +A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info. + +The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. + +### Compiler +- nvcc supports `__half` data type after CUDA 7.5. +- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4. +- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9. + +### Hardware +- `__half` is supported on GPU with compute capability >= 5.3. +- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above. +- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018). + +### Libraries +- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. +- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). + +### CUDA version issue +There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. +CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: +``` +typedef struct __align__(2) { + unsigned short x; +} __half; + +typedef __half half; +``` +This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types: +``` +__global__ void Add() { + half a, b, c; + c = __hadd(a, b); // correct + c = a + b; // compiler error: no operator "+" matches these operands +} +``` +CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). + +Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: +``` +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; +public: + // constructors and conversion operators from/to + // __half_raw and other built-in data types +} + +typedef __half half; + +__device__ __forceinline__ +__half operator+(const __half &lh, const __half &rh) { + return __hadd(lh, rh); +} + +// Other overloaded operators +``` +This new design makes `c = a + b` work correctly for CUDA half data type. + +## Implementation +The float16 class holds a 16-bit `uint16_t` data internally. +``` +struct float16 { + uint16_t x; +}; +``` + +float16 supports the following features: + - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. + - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen. + - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. + - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. + +To support the above features, two fundamental conversion functions are provided: +``` +float16 float_to_half_rn(float f); // convert to half precision in round-to-nearest-even mode +float half_to_float(float16 h); +``` +which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion. + +## To do +After float16 class is available, some of the future items are below: + +- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. + +- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16. + +- Create a type-casting operator that can convert the data type in tensor between float16 and other types. diff --git a/doc/design/gan_api.md b/doc/design/gan_api.md new file mode 100644 index 0000000000000000000000000000000000000000..fb41df8615f73d9fd4c32995eab265833eac1a55 --- /dev/null +++ b/doc/design/gan_api.md @@ -0,0 +1,253 @@ +# Design for GAN + +GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. + +It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth. + +In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation. + +

+
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run. +

+ +The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563. + +

+
+Figure 2. Photo borrowed from the original DC-GAN paper. +

+ +## The Conditional-GAN might be a class. +This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure: + +- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API: + +- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well. + +- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen. +Returns a generated image. + +- discriminator(image): +Given an image, decide if it is from a real source or a fake one. +Returns a 0/1 binary label. + +- build_model(self): +build the whole GAN model, define training loss for both generator and discrimator. + +## Discussion on Engine Functions required to build GAN +- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can'be be trained correctly) +- Different optimizers responsible for optimizing different loss. + +To be more detailed, we introduce our design of DCGAN as following: + +### Class member Function: Initializer +- Set up hyper-parameters, including condtional dimension, noise dimension, batch size and so forth. +- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G. +```python +class DCGAN(object): + def __init__(self, y_dim=None): + + # hyper parameters + self.y_dim = y_dim # conditional gan or not + self.batch_size = 100 + self.z_dim = z_dim # input noise dimension + + # define parameters of discriminators + self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer()) + self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.D_W2 = pd.Varialble(np.random.rand(128, 1)) + self.D_b2 = pd.Variable(np.zeros(128)) + self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2] + + # define parameters of generators + self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.G_W2 = pd.Varialble(np.random.rand(128, 1)) + self.G_b2 = pd.Variable(np.zeros(128)) + self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2] +``` + +### Class member Function: Generator +- Given a noisy input z, returns a fake image. +- Concatenation, batch-norm, FC operations required; +- Deconv layer required, which is missing now... +```python +class DCGAN(object): + def generator(self, z, y = None): + # input z: the random noise + # input y: input data label (optional) + # output G_im: generated fake images + + if not self.y_dim: + z = pd.layer.concat(1, [z, y]) + + G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0) + G_h0_bn = pd.layer.batch_norm(G_h0) + G_h0_relu = pd.layer.relu(G_h0_bn) + + G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1) + G_h1_bn = pd.layer.batch_norm(G_h1) + G_h1_relu = pd.layer.relu(G_h1_bn) + + G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)) + G_im = pd.layer.tanh(G_im) + return G_im +``` + +### Class member function: Discriminator +- Given a noisy input z, returns a fake image. +- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required; +```python +class DCGAN(object): + def discriminator(self, image): + # input image: either generated images or real ones + # output D_h2: binary logit of the label + + D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0) + D_h0_bn = pd.layer.batchnorm(h0) + D_h0_relu = pd.layer.lrelu(h0_bn) + + D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1) + D_h1_bn = pd.layer.batchnorm(D_h1) + D_h1_relu = pd.layer.lrelu(D_h1_bn) + + D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2) + return D_h2 +``` + +### Class member function: Build the model +- Define data readers as placeholders to hold the data; +- Build generator and discriminators; +- Define two training losses for discriminator and generator, respectively. +If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self): + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_t = self.discriminator(self.images) + # generated fake images + self.sampled = self.sampler(self.z, self.y) + self.D_f = self.discriminator(self.G) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_t = self.discriminator(self.images) + # generate fake images + self.sampled = self.sampler(self.z) + self.D_f = self.discriminator(self.images) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake + + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie)) +``` + +If we do not have dependency engine but blocks, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self, default_block): + # input data in the default block + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + with pd.default_block().g_block(): + if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_g = self.discriminator(self.G, self.y) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_g = self.discriminator(self.G, self.y) + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie)) + + with pd.default_block().d_block(): + if self.y_dim: # if conditional GAN, includes label + self.D_t = self.discriminator(self.images, self.y) + self.D_f = self.discriminator(self.G, self.y) + else: # original version of GAN + self.D_t = self.discriminator(self.images) + self.D_f = self.discriminator(self.G) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake +``` +Some small confusion and problems with this design: +- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph. +- Requires ability to create a block anytime, rather than in if-else or rnn only; + +## Main function for the demo: +Generally, the user of GAN just need to the following things: +- Define an object as DCGAN class; +- Build the DCGAN model; +- Specify two optimizers for two different losses with respect to different parameters. +```python +# pd for short, should be more concise. +from paddle.v2 as pd +import numpy as np +import logging + +if __name__ == "__main__": + # dcgan class in the default graph/block + # if we use dependency engine as tensorflow + # the codes, will be slightly different like: + # dcgan = DCGAN() + # dcgan.build_model() + with pd.block() as def_block: + dcgan = DCGAN() + dcgan.build_model(def_block) + + # load mnist data + data_X, data_y = self.load_mnist() + + # Two subgraphs required!!! + with pd.block().d_block(): + d_optim = pd.train.Adam(lr = .001, beta= .1) + d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D) + with pd.block.g_block(): + g_optim = pd.train.Adam(lr = .001, beta= .1) + g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G) + + # executor + sess = pd.executor() + + # training + for epoch in xrange(10000): + for batch_id in range(N / batch_size): + idx = ... + # sample a batch + batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size] + # sample z + batch_z = np.random.uniform(-1., 1., [batch_size, z_dim]) + + if batch_id % 2 == 0: + sess.run(d_step, + feed_dict = {dcgan.images: batch_im, + dcgan.y: batch_label, + dcgan.z: batch_z}) + else: + sess.run(g_step, + feed_dict = {dcgan.z: batch_z}) +``` + +# More thinking about dependency engine v.s. block design: +- What if we just want to run an intermediate result? Do we need to run the whole block/graph? +- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage? diff --git a/doc/design/graph_survey.md b/doc/design/graph_survey.md new file mode 100644 index 0000000000000000000000000000000000000000..6c6db08f463ae0a2b94fc4546f123a1d7c151870 --- /dev/null +++ b/doc/design/graph_survey.md @@ -0,0 +1,232 @@ +## Survey on Graph + +Neural network framework often provides symbolic API for users to write network topology conveniently. This doc manily focus on symbolic API in most popular neural network frameworks, and try to find out how to parse symbolic configuration to a portable file, such as protobuf or json. + +### Mxnet + +The core concept of symbolic API is `Symbol`. Mxnet implements `Symbol` class in C++, and export to Python using C-API. Please refer to the comments in Mxnet: + + +`Symbol` is help class used to represent the operator node in Graph. +`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value. + + +A simple network topology wrote by Symbol is as follows: + +```python +def get_symbol(num_classes=10, **kwargs): + data = mx.symbol.Variable('data') + data = mx.symbol.Flatten(data=data) + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) + act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) + mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') + return mlp +``` + + + +Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null. + +Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph. + +And Symbol can be saved to a Json file. + +Here is a detailed example: + +``` +>>> import mxnet as mx +>>> data = mx.symbol.Variable('data') +>>> print data.debug_str() +Variable:data + +>>> data = mx.symbol.Flatten(data=data) +>>> print data.debug_str() +Symbol Outputs: + output[0]=flatten0(0) +Variable:data +-------------------- +Op:Flatten, Name=flatten0 +Inputs: + arg[0]=data(0) version=0 + +>>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) +>>> print fc1.debug_str() +Symbol Outputs: + output[0]=fc1(0) +Variable:data +-------------------- +Op:Flatten, Name=flatten0 +Inputs: + arg[0]=data(0) version=0 +Variable:fc1_weight +Variable:fc1_bias +-------------------- +Op:FullyConnected, Name=fc1 +Inputs: + arg[0]=flatten0(0) + arg[1]=fc1_weight(0) version=0 + arg[2]=fc1_bias(0) version=0 +Attrs: + num_hidden=128 + +``` + + +### TensorFlow + + +The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow: + +A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session). + +A simple example is as follows: + +```python + # Build a dataflow graph. + c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) + d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) + e = tf.matmul(c, d) + + # Construct a `Session` to execute the graph. + sess = tf.Session() + + # Execute the graph and store the value that `e` represents in `result`. + result = sess.run(e) +``` + + +The main method of `Tensor` is as follows: + + +```python +@property +def op(self): + """The `Operation` that produces this tensor as an output.""" + return self._op + +@property +def dtype(self): + """The `DType` of elements in this tensor.""" + return self._dtype + +@property +def graph(self): + """The `Graph` that contains this tensor.""" + return self._op.graph + +@property +def name(self): + """The string name of this tensor.""" + if not self._op.name: + raise ValueError("Operation was not named: %s" % self._op) + return "%s:%d" % (self._op.name, self._value_index) + +@property +def device(self): + """The name of the device on which this tensor will be produced, or None.""" + return self._op.device +``` + + +Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency. + + +Here is a detailed example: + + +``` +>>> import tensorflow as tf +>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) +>>> print c.graph + +>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) +>>> print d.graph + +>>> e = tf.matmul(c, d) +>>> print e.graph + +``` + +### Dynet + + +The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++. + + +A simple example is as follows: + +```cpp +ComputationGraph cg; +Expression W = parameter(cg, pW); + +Expression in = input(cg, xs[i]); +Expression label = input(cg, ys[i]); +Expression pred = W * in; +Expression loss = square(pred - label); +``` + +The input data and parameter are also represented by Expression. Every basci Expression corresponds to a Node. And input data is also a Node. + +Expression has a data member ComputationGraph, and ComputationGraph will be modified in users' configuring process. Expression can be a running target, beacuse Expression contains all dependency. + + +Here is a detailed example: + +write topology in C++ + +``` +ComputationGraph cg; +Expression W = parameter(cg, pW); +cg.print_graphviz(); + +Expression pred = W * xs[i]; +cg.print_graphviz(); + +Expression loss = square(pred - ys[i]); +cg.print_graphviz(); +``` + +compile and print + +``` +# first print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; +} +# second print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; +} +# third print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; + N2 [label="v2 = -1.88387 - v1"]; + N1 -> N2; + N3 [label="v3 = -v2"]; + N2 -> N3; + N4 [label="v4 = square(v3)"]; + N3 -> N4; +} +``` + +### Conclusion + + +Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are the same level concepts. We use a unified name Expression here, this level concept has following features: + +- Users wirte topoloy with symbolic API, and all return value is Expression, including input data and parameter. +- Expression corresponds with a global Graph, and Expression can also be composed. +- Expression tracks all dependency and can be taken as a run target diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md index 954a19c0733358c235eae3cffe134c23dac94c95..26d140f06db4ecefa86be015eaa731ffddc6910c 100644 --- a/doc/design/if_else_op.md +++ b/doc/design/if_else_op.md @@ -1,41 +1,51 @@ -IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack(). +# The `IfElse` Operator -```python -import paddle as pd +PaddlePaddle's `IfElse` operator differs from TensorFlow's: -x = var() -y = var() -cond = var() -default_value = var() -b = pd.create_ifelseop(inputs=[x], output_num=1) -with b.true_block(): - x = b.inputs(0) - z = operator.add(x, y) - b.set_output(0, operator.softmax(z)) - -with b.false_block(): - x = b.inputs(0) - z = layer.fc(x) - b.set_output(0, operator.softmax(z)) - -out = b(cond) -``` +- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas +- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch. + +## Example + +The following PaddlePaddle program shows the usage of the IfElse operator: -If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as: ```python import paddle as pd -x = var() -y = var() -cond = var() -default_value = var() -b = pd.create_ifelseop(inputs=[x], output_num=1, default_value) - -with b.true_block(): - x = b.inputs(0) - z = operator.add(x, y) - b.set_output(0, operator.softmax(z)) +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` -out = b(cond) +A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch. + +An equivalent C++ program is as follows: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int d = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} ``` -where default_value is a list of vars for `cond` == False. diff --git a/doc/design/images/asgd.gif b/doc/design/images/asgd.gif new file mode 100644 index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e Binary files /dev/null and b/doc/design/images/asgd.gif differ diff --git a/doc/design/images/feed_forward.png b/doc/design/images/feed_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b Binary files /dev/null and b/doc/design/images/feed_forward.png differ diff --git a/doc/design/images/feed_forward_regularized.png b/doc/design/images/feed_forward_regularized.png new file mode 100644 index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447 Binary files /dev/null and b/doc/design/images/feed_forward_regularized.png differ diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot index 8d1b673abf6b78c851676fa379dc850c4818f0e5..e115f9844bae6ad24f638c8ed4749cea8aff06a9 100644 --- a/doc/design/images/graph_construction_example.dot +++ b/doc/design/images/graph_construction_example.dot @@ -33,7 +33,6 @@ digraph ImageClassificationGraph { cost -> MSE_Grad [color=red]; d_cost -> MSE_Grad [color=red]; - x -> MSE_Grad [color=red]; l -> MSE_Grad [color=red]; y -> MSE_Grad -> d_y [color=red]; diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png index 181187503472d15779b87284105841168b3945c4..261611a5721f9aa97874f7e6d897fe48cf667db2 100644 Binary files a/doc/design/images/graph_construction_example_all.png and b/doc/design/images/graph_construction_example_all.png differ diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png index 3049a9315fd616464dec54e33064cb75598ca536..4c69687f4a6a181138f3df72ce5e8aa48487b5be 100644 Binary files a/doc/design/images/graph_construction_example_forward_backward.png and b/doc/design/images/graph_construction_example_forward_backward.png differ diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png index 25d19088cbf0b5f68cf734f2ff21eba8af4a2860..e668c16e0cac73acb4e5dc2b1827557ae77126b4 100644 Binary files a/doc/design/images/graph_construction_example_forward_only.png and b/doc/design/images/graph_construction_example_forward_only.png differ diff --git a/doc/design/images/l1_regularization.png b/doc/design/images/l1_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972 Binary files /dev/null and b/doc/design/images/l1_regularization.png differ diff --git a/doc/design/images/l2_regularization.png b/doc/design/images/l2_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298 Binary files /dev/null and b/doc/design/images/l2_regularization.png differ diff --git a/doc/design/images/loss_equation.png b/doc/design/images/loss_equation.png new file mode 100644 index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e Binary files /dev/null and b/doc/design/images/loss_equation.png differ diff --git a/doc/design/images/theta_star.gif b/doc/design/images/theta_star.gif new file mode 100644 index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2 Binary files /dev/null and b/doc/design/images/theta_star.gif differ diff --git a/doc/design/infer_var_type.md b/doc/design/infer_var_type.md new file mode 100644 index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a --- /dev/null +++ b/doc/design/infer_var_type.md @@ -0,0 +1,78 @@ +# Design Doc: InferVarType + +## The Problem Posed + +The variable in our design can hold variant types. Such as `LoDTensor` and `SelectedRows`. An operator should be able to inference the variable types of its output. + +For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output. + +The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator in compile time. + +## Proposed Solution + +The `InferVarType` is a compile-time function which is registered to each operator. The inferface of that function is: + + +```c++ +using InferVarTypeFN = std::function< + void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>; +``` + +It takes an operator description as its input and will write the output variable type and store them in block description. + +The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be + +```cpp +struct OpInfo { + InferVarTypeFN infer_var_type_; + ... +}; +``` + +The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`. + +```cpp +void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) { + // set the output type of variable as `LoDTensor`. + // ... +} + +struct OpInfo { + InferVarTypeFN infer_var_type_; + InferVarTypeFN GetInferVarType() const { + if (infer_var_type_) { + return infer_var_type_; + } else { + return DefaultInferVarType; + } + } +}; +``` + +## Register InferVarType + +We provide a thin base class for registering an `InferVarTypeFN`. To use a base class will ease the implementation of registry since we can detect the registry entry is an `InferVarTypeFN` or not. + +```cpp +class VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0; +} +``` + +Operator developers can write the specialize `VarTypeInferer` as follow. + +```cpp +class SpecialVarTypeInferer : public VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const { + // .. own logic + } +} +``` + +Then user can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`. + +``` +REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...); +``` diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index fe8da907d9d45a2164031430ac5b7a3d5523967a..61d453de243c25defc56161641bc4a888a88a3b7 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -1,64 +1,163 @@ # Intel® MKL-DNN on PaddlePaddle: Design Doc -我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 +我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn) +(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle, +充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 -我们短期内的基本目标是: +
+
+Figure 1. PaddlePaddle on IA +
-- 完成常用layer的MKL-DNN实现。 +近期目标 + +- 完成常用Layer的MKL-DNN实现。 - 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 +目前的优化,主要针对PaddlePaddle在重构之前的代码框架以及V1的API。 +具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。 ## Contents - [Overview](#overview) - [Actions](#actions) - [CMake](#cmake) + - [Matrix](#matrix) - [Layers](#layers) - [Activations](#activations) + - [Parameters](#parameters) + - [Gradients](#gradients) - [Unit Tests](#unit-tests) - - [Protobuf Messages](#protobuf-messages) - [Python API](#python-api) - - [Demos](#demos) - [Benchmarking](#benchmarking) - [Others](#others) - [Design Concerns](#design-concerns) ## Overview -我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图 +我们会把MKL-DNN会作为第三方库集成进PaddlePaddle,与其他第三方库一样,会在编译PaddlePaddle的时候下载并编译MKL-DNN。 + +同时,为了进一步提升PaddlePaddle在基本数学运算的计算速度,我们也将MKLML即(MKL small library\[[1](#references)\]) +作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。 + +MKL,MKLML以及MKL-DNN三者关系如下表: + +| Name | Open Source | License | Descriptions | +| :---------- | :--------------- | :---------- | :------------ | +| MKL | No | Proprietary | Accelerate math processing routines | +| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning | +| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks | + +MKLML可以与MKL-DNN共同使用,以此达到最好的性能。 +
-
-Figure 1. PaddlePaddle on IA. +
+Figure 2. PaddlePaddle with MKL Engines
## Actions -我们把集成方案大致分为了如下几个方面。 + +添加的相关文件和目录结构如下: + +```txt +PaddlePaddle/Paddle +├── ... +├── cmake/ +│ ├── external/ +│ │ ├── ... +│ │ ├── mkldnn.cmake +│ │ └── mklml.cmake +└── paddle/ + ├── ... + ├── math/ + │ ├── ... + │ └── MKLDNNMatrix.* + └── gserver/ + ├── ... + ├── layers/ + │ ├── ... + │ └── MKLDNN*Layer.* + ├── activations/ + │ ├── ... + │ └── MKLDNNActivations.* + └── tests/ + ├── ... + ├── MKLDNNTester.* + └── test_MKLDNN.cpp +``` ### CMake -我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能。 +在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN -同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。 +- `WITH_MKLML` 控制是否使用MKLML库。 +当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 +编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。 +MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。 +- `WITH_MKLDNN` 控制是否使用MKL-DNN。 +当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。 +编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。 +MKL-DNN的库目前只有动态库`libmkldnn.so`。 -所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 +### Matrix +目前在PaddlePaddle中数据都是以`NCHW`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 +所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。 -**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑。 +
+
+Figure 3. MKLDNNMatrix +
### Layers -所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 -`paddle/gserver/layers`中,并且文件名都会一以*Mkldnn*开头。 +所有MKL-DNN的Layers都会继承于`MKLDNNLayer`,该类继承于PaddlePaddle的基类`Layer`。 +在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑, +子类只需要使用定义好的接口,实现具体的函数功能即可。 -所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 +
+
+Figure 4. MKLDNNLayer +
-### Activations -由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口,实现方法还是会在`ActivationFunction.cpp`文件。 +每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix: -### Unit Tests -会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。 +- 内部存储(internel memory):`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表输入数据,输入梯度,输出数据和输出梯度。 +- 外部存储(external memory):都是以ext开头,比如`extInVal_`和`extInGrad_`,它们主要是用于, +当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时,转换内存的工作。 +需要注意的是,PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, +所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, +如果不需要外部存储用于转换,那么对应的内部存储也会与它们共享内存。 +- 转换函数(resetXXX): 包括`resetInValue`,`resetInGrad`,`resetOutValue`和`resetOutGrad`, +表示对输入数据,输入梯度,输出数据和输出梯度的转换。 +这些函数会根据输入参数重新设置内部和外部存储,当然这两者也可以相等,即表示不需要转换。 + +注意:每个`MKLDNNlayer`的子类只需要使用内部存储就可以了,所有外部的转换工作都会在reset系列函数中都准备好。 + +### Activations +在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是共用一块内存, +所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`。 + +### Parameters +对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。 +如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式, +在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。 +这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。 + +### Gradients +由于MKL-DNN的操作都是直接覆盖的形式,也就是说输出的结果不会在原来的数据上累加, +这样带来的好处就是不需要一直清空memory,节省了不必要的操作。 +但是注意的是,当网络出现分支且在`backward`的时候,需要累加不同Layer传过来的梯度。 +所以在`MKLDNNlayer`中实现了一个merge的方法,此时每个小分支的`Input Gradient` +会先临时保存在`MKLDNNMatrix`中,由分支处的Layer负责求和,并把结果放到当前层的`output_.grad`中。 +所以整体上,在实现每个子类的时候就不需要关心分支的事情了。 -Activation的测试,计划在PaddlePaddle原有的测试文件上直接添加新的测试type。 +
+
+Figure 5. Merge Gradients +
-### Protobuf Messages -根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。 +### Unit Tests +我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 +测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 +每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 ### Python API 目前只考虑**v1 API**。 @@ -73,39 +172,40 @@ if use_mkldnn self.layer_type = mkldnn_* ``` -所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。 +所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 -并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。 - -### Demos - -会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 +同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 ### Benchmarking -会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`,添加使用MKL-DNN的测试。 +会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image),用于测试和对比在使用MKL-DNN前后的CNN网络性能。 +测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) ### Others -1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。 +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。 2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 ## Design Concerns -为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。 +为了更好的符合PaddlePaddle的代码风格\[[3](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。 我们总结出一些特别需要注意的点: -1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MkldnnLayer`特有的设备ID。 -2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 -3. 创建`MkldnnMatrix`,用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 -4. 创建`MkldnnBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`,和未来可能还会用到`FPGAEngine`等。 -5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。 -6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。 -7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 -8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况,所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面,一直保存的是0,所以可以充分利用这个信息,定义一个枚举处理所有MKLDNN的参数格式,从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。 +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数, +我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 +2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 +包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 +4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, +同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。 +在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。 ## References - -1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN") -2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 -3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 +1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 +主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 +2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。 +目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。 +3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。 +但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 +所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkldnn/image/engine.png new file mode 100644 index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b Binary files /dev/null and b/doc/design/mkldnn/image/engine.png differ diff --git a/doc/design/mkldnn/image/gradients.png b/doc/design/mkldnn/image/gradients.png new file mode 100644 index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c Binary files /dev/null and b/doc/design/mkldnn/image/gradients.png differ diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkldnn/image/layers.png new file mode 100644 index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a Binary files /dev/null and b/doc/design/mkldnn/image/layers.png differ diff --git a/doc/design/mkldnn/image/matrix.png b/doc/design/mkldnn/image/matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2 Binary files /dev/null and b/doc/design/mkldnn/image/matrix.png differ diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png index 84b455c28230703599a2529f014cfbb222138fef..8fb7bbb9dd654bf363d701d0c8cd4a557043d188 100644 Binary files a/doc/design/mkldnn/image/overview.png and b/doc/design/mkldnn/image/overview.png differ diff --git a/doc/design/model_format.md b/doc/design/model_format.md new file mode 100644 index 0000000000000000000000000000000000000000..e29129fddf775939c9f7a8b49d850d523e6e5a45 --- /dev/null +++ b/doc/design/model_format.md @@ -0,0 +1,36 @@ +# Design Doc: Model Format + +## Motivation + +A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code. + +As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. + +## Implementation + +The topology is saved as a plain text in a detailed self-contain protobuf file. + +The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task. + +As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, + +The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format. + +|field name | type | description | +| --- | --- | --- | +| version | uint32_t | Version of saved file. Always 0 now. | +| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. | +| tensor desc | void* | TensorDesc protobuf binary message | +| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` | +| lod_level | uint64_t | Level of LoD | +| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. | +| data of lod[0] | uint64_t* | [Optional] lod[0].data() | +| ... | ... | ... | + + + +## Summary + +- We introduce a model format. +- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message. +- A bunch of specified format binary tensors describe the **parameters**. diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot index a498e882a3d85a33d44dbad7474fa2a340e33976..5d77865061ca7bbbfcf254dd938f09aef5553505 100644 --- a/doc/design/ops/images/2_level_rnn.dot +++ b/doc/design/ops/images/2_level_rnn.dot @@ -1,6 +1,6 @@ digraph G { - rnn [label="1-th level RNN" shape=box] + rnn [label="1st level RNN" shape=box] subgraph cluster0 { label = "time step 0" @@ -8,7 +8,7 @@ digraph G { sent0 [label="sentence"] sent1 [label="sentence"] - rnn1 [label="2-th level RNN" shape=box] + rnn1 [label="2nd level RNN" shape=box] sent0 -> rnn1 sent1 -> rnn1 @@ -20,7 +20,7 @@ digraph G { sent2 [label="sentence"] sent3 [label="sentence"] - rnn2 [label="2-th level RNN" shape=box] + rnn2 [label="2nd level RNN" shape=box] sent2 -> rnn2 sent3 -> rnn2 @@ -32,7 +32,7 @@ digraph G { sent4 [label="sentence"] sent5 [label="sentence"] - rnn3 [label="2-th level RNN" shape=box] + rnn3 [label="2nd level RNN" shape=box] sent4 -> rnn3 sent5 -> rnn3 diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455 Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md index a78eea7d45e9e9553d153170aa31da55ec6e8289..2f4854793fa1f0b02e4dc17b51a48a972be61c06 100644 --- a/doc/design/ops/rnn.md +++ b/doc/design/ops/rnn.md @@ -1,62 +1,62 @@ # RNNOp design -This document is about an RNN operator which requires that instances in a mini-batch have the same length. We will have a more flexible RNN operator. +This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future. ## RNN Algorithm Implementation -

+

The above diagram shows an RNN unrolled into a full network. -There are several important concepts: +There are several important concepts here: -- *step-net*: the sub-graph to run at each step, -- *memory*, $h_t$, the state of the current step, -- *ex-memory*, $h_{t-1}$, the state of the previous step, -- *initial memory value*, the ex-memory of the first step. +- *step-net*: the sub-graph that runs at each step. +- *memory*, $h_t$, the state of the current step. +- *ex-memory*, $h_{t-1}$, the state of the previous step. +- *initial memory value*, the memory of the first (initial) step. ### Step-scope -There could be local variables defined in step-nets. PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step. +There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. -

+


-Figure 2 the RNN's data flow +Figure 2 illustrates the RNN's data flow

-Please be aware that all steps run the same step-net. Each step +Please be aware that every step runs the same step-net. Each step does the following: -1. creates the step-scope, -2. realizes local variables, including step-outputs, in the step-scope, and -3. runs the step-net, which could use these variables. +1. Creates the step-scope. +2. Initializes the local variables including step-outputs, in the step-scope. +3. Runs the step-net, which uses the above mentioned variables. -The RNN operator will compose its output from step outputs in step scopes. +The RNN operator will compose its output from step outputs in each of the step scopes. ### Memory and Ex-memory -Let's give more details about memory and ex-memory via a simply example: +Let's give more details about memory and ex-memory using a simple example: $$ h_t = U h_{t-1} + W x_t $$, -where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$'s respectively. +where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively. -In the implementation, we can make an ex-memory variable either "refers to" the memory variable of the previous step, -or copy the value of the previous memory value to the current ex-memory variable. +In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step, +or copy the memory value of the previous step to the current ex-memory variable. ### Usage in Python For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). -We can define an RNN's step-net using Block: +We can define an RNN's step-net using a Block: ```python import paddle as pd -X = some_op() # x is some operator's output, and is a LoDTensor +X = some_op() # x is some operator's output and is a LoDTensor a = some_op() # declare parameters @@ -68,7 +68,7 @@ with rnn.stepnet(): x = rnn.add_input(X) # declare a memory (rnn's step) h = rnn.add_memory(init=a) - # h.pre_state() means previous memory of rnn + # h.pre_state(), the previous memory of rnn new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) # update current memory h.update(new_state) @@ -80,19 +80,19 @@ out = rnn() Python API functions in above example: -- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs. -- `rnn.add_memory` creates a variable used as the memory. -- `rnn.add_outputs` mark the variables that will be concatenated across steps into the RNN output. +- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs. +- `rnn.add_memory`: creates a variable used as the memory. +- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output. ### Nested RNN and LoDTensor An RNN whose step-net includes other RNN operators is known as an *nested RNN*. -For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. +For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level. -The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text. +The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. -

+

@@ -110,7 +110,7 @@ a = some_op() # chapter_data is a set of 128-dim word vectors # the first level of LoD is sentence -# the second level of LoD is chapter +# the second level of LoD is a chapter chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) def lower_level_rnn(paragraph): @@ -138,14 +138,14 @@ with top_level_rnn.stepnet(): pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) top_level_rnn.add_outputs(h) -# just output the last step +# output the last step chapter_out = top_level_rnn(output_all_steps=False) ``` -in above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. +In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. -By default, the `RNNOp` will concatenate the outputs from all the time steps, -if the `output_all_steps` set to False, it will only output the final time step. +By default, the `RNNOp` will concatenate the outputs from all the time steps. +If the `output_all_steps` is set to False, it will only output the final time step.

diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md new file mode 100644 index 0000000000000000000000000000000000000000..9db5fb8e9a9f89b004bf71ddc064cd976c0d0bee --- /dev/null +++ b/doc/design/ops/sequence_decoder.md @@ -0,0 +1,229 @@ +# Design: Sequence Decoder Generating LoDTensors +In tasks such as machine translation and visual captioning, +a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time. + +This documentation describes how to implement the sequence decoder as an operator. + +## Beam Search based Decoder +The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. + +In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users. + +There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. + +During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . + +For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; +the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. + +## Changing LoD's absolute offset to relative offsets +The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level. + +The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, +let's call this format the **absolute-offset LoD** for clarity. + +The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows +```python +[[0, 3, 9] + [0, 2, 3, 3, 3, 9]] +``` +The first level tells that there are two sequences: +- the first's offset is `[0, 3)` +- the second's offset is `[3, 9)` + +while on the second level, there are several empty sequences that both begin and end at `3`. +It is impossible to tell how many empty second-level sequences exist in the first-level sequences. + +There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix. + +So let's introduce another format of LoD, +it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. + +For example, to represent the same sequences of the above data + +```python +[[0, 3, 6] + [0, 2, 3, 3, 3, 9]] +``` + +the first level represents that there are two sequences, +their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. + +The second level is the same with the relative offset example because the lower level is a tensor. +It is easy to find out the second sequence in the first-level LoD has two empty sequences. + +The following examples are based on relative-offset LoD. + +## Usage in a simple machine translation model +Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it. + +The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences. + +**Encoder** +```python +import paddle as pd + +dict_size = 8000 +source_dict_size = dict_size +target_dict_size = dict_size +word_vector_dim = 128 +encoder_dim = 128 +decoder_dim = 128 +beam_size = 5 +max_length = 120 + +# encoder +src_word_id = pd.data( + name='source_language_word', + type=pd.data.integer_value_sequence(source_dict_dim)) +src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim) + +src_word_vec = pd.lookup(src_embedding, src_word_id) + +encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim) + +encoder_ctx = pd.last_seq(encoder_out_seq) +# encoder_ctx_proj is the learned semantic vector +encoder_ctx_proj = pd.fc( + encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None) +``` + +**Decoder** + +```python +def generate(): + decoder = pd.while_loop() + with decoder.step(): + decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory + generated_ids = decoder.memory() # TODO init to batch_size s + generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s + + target_word = pd.lookup(trg_embedding, gendrated_ids) + # expand encoder_ctx's batch to fit target_word's lod + # for example + # decoder_mem.lod is + # [[0 1 3], + # [0 1 3 6]] + # its tensor content is [a1 a2 a3 a4 a5] + # which means there are 2 sentences to translate + # - the first sentence has 1 translation prefixes, the offsets are [0, 1) + # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) + # the target_word.lod is + # [[0, 1, 6] + # [0, 2, 4, 7, 9 12]] + # which means 2 sentences to translate, each has 1 and 5 prefixes + # the first prefix has 2 candidates + # the following has 2, 3, 2, 3 candidates + # the encoder_ctx_expanded's content will be + # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5] + encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) + decoder_input = pd.fc( + act=pd.activation.Linear(), + input=[target_word, encoder_ctx], + size=3 * decoder_dim) + gru_out, cur_mem = pd.gru_step( + decoder_input, mem=decoder_mem, size=decoder_dim) + scores = pd.fc( + gru_out, + size=trg_dic_size, + bias=None, + act=pd.activation.Softmax()) + # K is an config + topk_scores, topk_ids = pd.top_k(scores, K) + topk_generated_scores = pd.add_scalar(topk_scores, generated_scores) + + selected_ids, selected_generation_scores = decoder.beam_search( + topk_ids, topk_generated_scores) + + # update the states + decoder_mem.update(cur_mem) # tells how to update state + generated_ids.update(selected_ids) + generated_scores.update(selected_generation_scores) + + decoder.output(selected_ids) + decoder.output(selected_generation_scores) + +translation_ids, translation_scores = decoder() +``` +The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates, +returns the result of the beam search algorithm. + +In this way, users can customize anything on the input or output of beam search, for example: + +1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. +2. Remove some specific candidate in `selected_ids`. +3. Get the final `translation_ids`, remove the translation sequence in it. + +The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), +so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). + +Both of them are two-level `LoDTensors`: + +- The first level represents `batch_size` of (source) sentences. +- The second level represents the candidate ID sets for translation prefix. + +For example, 3 source sentences to translate, and has 2, 3, 1 candidates. + +Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. + +For example, the previous state: + +* LoD is `[0, 1, 3][0, 2, 5, 6]` +* content of tensor is `a1 a2 b1 b2 b3 c1` + +the current state is stored in `encoder_ctx_expanded`: + +* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` +* the content is + - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) + - a2 a2 + - b1 b1 b1 + - b2 + - b3 b3 + - None (c1 has 0 candidates, so c1 is dropped) + +The benefit from the relative offset LoD is that the empty candidate set can be represented naturally. + +The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is: + +```python +decoder.output(selected_ids) +decoder.output(selected_generation_scores) +``` + +The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences. + +Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate. + +Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. + +## LoD and shape changes during decoding +

+ +

+ +According to the image above, the only phase that changes the LoD is beam search. + +## Beam search design +The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs: + +1. `topk_ids`, the top K candidate ids for each prefix. +2. `topk_scores`, the corresponding scores for `topk_ids` +3. `generated_scores`, the score of the prefixes. + +All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. + +It will return three variables: + +1. `selected_ids`, the final candidate beam search function selected for the next step. +2. `selected_scores`, the scores for the candidates. +3. `generated_scores`, the updated scores for each prefix (with the new candidates appended). + +## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` +The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step, +so it is natural to store them in arrays. + +Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`. + +The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors. +It needs some extensions to support the packing or unpacking an array of `LoDTensors`. diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..202b4b65103c0b7c536a9cb466c4120ce134d8c3 --- /dev/null +++ b/doc/design/optimizer.md @@ -0,0 +1,91 @@ +## Optimizer Design + +### The Problem + +A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: + +1. the forward pass, which computes intermediate results and the cost(s), +1. the backward pass, which derives gradients from intermediate results and costs, and +1. the optimization pass, which update model parameters to optimize the cost(s). + +These works rely on three kinds of operators: + +1. forward operators, +1. gradient operators, and +1. optimization operators. + +It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically. + +In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. + + +### High-level Python API to describe the training process + +1. User write code to describe the network: + + ```python + images = layer.data("images") + labels = layer.data("labels") + w1 = pd.var("w1") + b1 = pd.var("b1") + hidden = layer.fc(images, w=w1, b=b1) + cost = layer.mse(hidden, labels) + ``` + + The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + + +2. Users create a certain kind of Optimizer with some argument. + + ```python + optimizer = AdagradOptimizer(learing_rate=0.001) + ``` + +3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. + + ```python + opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) + ``` + The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session. + +4. Users use Session/Executor to run this opt_op_list as target to do training. + + ```python + sess.run(target= opt_op_list, ...) + ``` + +#### Optimizer Python interface: + +```python +class Optimizer(object): + """Optimizer Base class. + + """ + + def __init__(self): + pass + + def create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to variables. + + Args: + parameters_and_grads: a list of (variable, gradient) pair to update. + + Returns: + optmization_op_list: a list of optimization operator that will update parameter using gradient. + """ + return None + + def minimize(self, loss, parameter_list): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `append_backward_ops()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list) + update_ops = self.create_optimization_pass(params_grads) + return update_ops + +``` + +Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. diff --git a/doc/design/parameter_average.md b/doc/design/parameter_average.md new file mode 100644 index 0000000000000000000000000000000000000000..2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f --- /dev/null +++ b/doc/design/parameter_average.md @@ -0,0 +1,72 @@ +# Averaging Parameter in PaddlePaddle + +## Why Averaging +In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can. + +Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. + +Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for
. The averaging is done as follows: + +
+ +We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above. + +### How to perform Parameter Averaging in PaddlePaddle + +Parameter Averaging in PaddlePaddle works in the following way during training : +1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer +2. The optimizer itself is responsible for updating the parameters. +3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: + 1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches. + 2. However, saving all the N instances of the parameters in memory is not feasible. + 3. Therefore, an approximation algorithm is used. + +Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. + +During the testing/ saving the model phase, we perform the following steps: +1. Perform the delayed operations. +2. Save current values of the parameters to a temporary variable. +3. Replace the values of the parameters with the averaged values. +4. Perform testing and/or save the parameters. +5. Restore the values of the parameters once done. + +### How to implement Averaging of Parameter in PaddlePaddle + +We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training. + + **Advantages**: + - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op. + - Makes it easy for the users to customize and extend the framework. + + **Disadvantages**: + - Implementation requires re-writing the averaging methodology in Python. + +### Low-Level implementation + +In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input: +- the optimizer +- the window_size to keep the updates + +The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. + +The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. + +### Python API implementation for ParameterAverageOptimizer + +Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following: +- Any optimizer (RMSProp , AdaGrad etc.) +- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision. + +Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. +We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc) + +#### Creation of the ParameterAverageOptimizer operator +There are two ways for creating the ParameterAverageOptimizer op: +1. We create the op immediately while building the computation graph. +2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added. + +The proposal is to add the op immediately while building the computation graph. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. diff --git a/doc/design/program.md b/doc/design/program.md index fb8f86ac07af403c9fee015f2a3adbfaa3c6d631..bd2456787c4e336d357a65255a8274a7c9e465cc 100644 --- a/doc/design/program.md +++ b/doc/design/program.md @@ -1,8 +1,10 @@ -# Design Doc: ProgramDesc +# Design Doc: PaddlePaddle Programs -The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. +## Compile and Execution + +A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. -As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program +A simple example PaddlePaddle program can be found in [graph.md](./graph.md): ```python x = layer.data("images") @@ -13,36 +15,112 @@ optimize(cost) train(cost, reader=mnist.train()) ``` -generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message: +The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message. The last line runs it. -```protobuf -message ProgramDesc { - repeated BlockDesc blocks = 1; +## Programs and Blocks + +The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. + +- program: some nested blocks +- [block](./block.md): + - some local variable definitions, and + - a sequence of operators + +The concept of block comes from usual programs. For example, the following C++ program has three blocks: + +```c++ +int main() { // block 0 + int i = 0; + if (i < 10) { // block 1 + for (int j = 0; j < 10; j++) { // block 2 + } + } + return 0; } +``` + +The following PaddlePaddle program has three blocks: + +```python +import paddle as pd // block 0 + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] +ie = pd.ifelse() +with ie.true_block(): // block 1 + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): // block 2 + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +## `BlockDesc` and `ProgramDesc` + +All protobuf messages are defined in `framework.proto`. + +`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`. + +```protobuf message BlockDesc { required int32 parent = 1; repeated VarDesc vars = 2; repeated OpDesc ops = 3; } +``` + +The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks. + +All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array. + +```protobuf +message ProgramDesc { + repeated BlockDesc blocks = 1; +} +``` + + +### Global Block +The global block is the first one in the above array. + +## Operators that Use Blocks + +In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch. + +The definition of `OpDesc` shows that an operator could have some attributes: + +```protobuf message OpDesc { AttrDesc attrs = 1; ... } +``` + +and an attribute could be of type block, which is, in fact, a block ID as described above: +``` message AttrDesc { - required AttrType type = 1; + required string name = 1; - // index into ProgramDesc::blocks when type==BLOCK - optional int32 block = 2; + enum AttrType { + INT = 1, + STRING = 2, + ... + BLOCK = ... + } + required AttrType type = 2; + + optional int32 block = 10; // when type == BLOCK ... } ``` -When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions. This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks. This requires that we can trace the parent of a block. - -A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp. In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`. So that `AttrDesc::block` could be an integer block ID. +## InferShape With this design, the InferShape function should take the following parameters: diff --git a/doc/design/prune.md b/doc/design/prune.md new file mode 100644 index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a --- /dev/null +++ b/doc/design/prune.md @@ -0,0 +1,63 @@ +# Prune + +## Motivation + +We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement +`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` +and generate a pruned `ProgramDesc`. + +## Challenge + +Pruning need to support both variables and operators being evaluation targets. Consider the following +different situations. + +```python +# Case 1: run foward pass. +cost_np = session.run(target=cost) +# Case 2: run backward passing. +opts_np, _ = session.run(target=[cost, opt]) +# Case 3: run checkpointing +_ = session.run(target=checkpoint) +``` + +## Solution + +To support evaluation of operators, we add `is_target` field in the `OpDesc`. + +```c++ +message OpDesc { + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; +``` + +To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599). +For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being +`fetch_op`'s input. Then we also set `fetch_op` is a target. + +### Algorithm + +If an operator needs to be run, it must fall into one of the following cases: + +1. It is the target. +2. It is depended by some other ops, meaning its output is some other op's input. + +The first case can be checked by `op_desc.is_traget()` . The second case can be implement as + +```c++ +bool HasDependentVar(const OpDesc& op_desc, const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} +``` + +Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc). diff --git a/doc/design/python_api.md b/doc/design/python_api.md index 5c68354274d577c29e054fdd1d3b17c7ea9b4cf9..cb5fdc765b7126fc66a1c8978d4b96c0dc5a9f2c 100644 --- a/doc/design/python_api.md +++ b/doc/design/python_api.md @@ -15,14 +15,14 @@ Please be aware that these Python classes need to maintain some construction-tim ### Program -A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator needs to be able to access variables in its ancessor blocks. +A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. -Whenever we create a block, we need set its parent block to the current block, so the Python class `Program` needs to maintain a data member `current_block`. +Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. ```python class Program(objects): def __init__(self): - self.proto = core.NewProgram() # a C++ ProgramDesc pointer. + self.desc = core.NewProgram() # a C++ ProgramDesc pointer. self.blocks = vector() self.blocks.append(Block(self, -1)) # the global block self.current_block = 0 # initialized to the global block @@ -57,7 +57,7 @@ A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.m ```python class Block(objects): def __init__(self, program, parent_idx): - self.proto = core.NewBlock(program.proto) + self.desc = core.NewBlock(program.desc) self.program = program self.vars = map() self.ops = vector() @@ -81,13 +81,13 @@ class Block(objects): self.ops.prepend(Operator(self, ...)) ``` -`create_parameter` is necessary because parameters are global variables, those defined in the global block, but can be created in some sub-blocks, e.g., an FC layer in the step block of an RNN operator. +`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. -`prepand_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. +`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. ### Operator -The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer output shape from input shape. +The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. ```python class Operator(object): @@ -98,14 +98,14 @@ class Operator(object): outputs,# dict attrs # dict ): - self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs) - core.infer_shape(self.proto, inputs, outputs) + self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs) + core.infer_shape(self.desc, inputs, outputs) def type(self): - return self.proto.type() + return self.desc.type() ``` -`Operator` creates the `OpDesc` message in C++ space, so could it call the `InferShape` function, which is in C++. +`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. ### Variable @@ -124,11 +124,11 @@ class Variable(object): name = unique_name_generator() self.name = name self.block = block - self.proto = core.NewVarDesc(block.proto, name, shape, lod_level) + self.desc = core.NewVarDesc(block.desc, name, shape, lod_level) self.writer = None ``` -Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each writes to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. +Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. ### Parameter @@ -155,7 +155,7 @@ class Parameter(Variable): initialize_op_attrs) ``` -When users create a parameter, s/he can call +When users create a parameter, they can call ```python program.create_parameter( @@ -179,38 +179,106 @@ init_attr={ `optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. -## Layer Functions +## Layer Function -A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. +A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. -### Data Layer +Layer functions take `Variable` and configuration parameters as its input and return the output variable(s). + +For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. + + +### Necessity for reusing code between layer functions + +There are a lot of code that can be reused. Such as + +* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero. +* Append the activation operator. +* Create a temporary variable. +* Create parameter. +* Generate a unique name. +* Add a bias. +* ... + +A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. + + + +### Comparision between global functions and helper class + +The `FullyConnected` layer will be as follow when we provide global functions: ```python -def data_layer(name, type, column_name): - block = the_current_program.glolal_block() - var = block.create_global_var( - name=name, - shape=[None] + type.dims(), - dtype=type.dtype) - block.prepend_operator(block, - type="Feed", - inputs = None, - outputs = [var], - {column_name: column_name}) - return var +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + if name is None: + name = unique_name("fc") + input = multiple_input(input) + param_attr = default_param_attr(param_attr) + param_attr = multiple_param_attr(param_attr, len(input)) + + # mul + mul_results = [] + for ipt, attr in zip(input, param_attr): + shape = ipt.shape[1:] + [size] + w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) + tmp = create_tmp_var(name) + g_program.current_block().append_op("mul", {ipt, w}, {tmp}) + mul_results.append(tmp) + + # add sum + ... + # add bias + ... + # add activation + ... + return out ``` -The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md). +We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions: -### FC Layer +1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use. +2. Global functions will force layer developers to pass its parameter time by time. + +So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow. + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + helper = LayerHelper(locals()) # pass all parameter to LayerHelper + + mul_results = [] + for ipt, param in helper.iter_multiple_input_and_param(): + w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype) + tmp = helper.create_tmp_variable() + helper.append_op('mul', {ipt, w}, {tmp}) + mul_results.append(tmp) + + pre_bias = helper.add_sum(mul_results) + pre_activation = helper.add_bias(pre_bias) + return helper.add_activation(pre_activation) +``` + +We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor. + + +### Implementation of layer helper + +We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are: ```python -def fc_layer(input, size, ...): - block = program.current_block() - w = block.create_parameter(...) - b = block.create_parameter(...) - out = block.create_var() - op = block.append_operator("FC", X=input, W=w, b=b, out=out) - out.writer = op - return out +class LayerHelper(object): + def __init__(self, **kwargs): # kwargs is short for `keyword arguments` + self.kwargs = kwargs + + def add_activation(self, input_var): + act = self.kwargs.get("act", None) # default value is None + if act is None: # do nothing if no act + return input_var + + tmp = self.create_tmp_var(self) + self.append_op(type=act, input=input_var, output=tmp) + return tmp ``` + +## Optimizer + +[Optimizer Design Doc](./optimizer.md) diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 320dccec3ddc7bfe6042f4e65b2518ea7b1ad24a..2cd4b6225b61cf374458e40afabad7745f61ba71 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -1,25 +1,25 @@ # Python Data Reader Design Doc -At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: -- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. -- A *reader creator* is a function that returns a reader function. -- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. -- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide function which converts reader to batch reader, frequently used reader creators and reader decorators. +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. ## Data Reader Interface -Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`): +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: ``` iterable = data_reader() ``` -Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int) +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) -An example implementation for single item data reader creator: +An example implementation for single item data reader creator is as follows: ```python def reader_creator_random_image(width, height): @@ -29,7 +29,7 @@ def reader_creator_random_image(width, height): return reader ``` -An example implementation for multiple item data reader creator: +An example implementation for multiple item data reader creator is as follows: ```python def reader_creator_random_image_and_label(width, height, label): def reader(): @@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label): ## Batch Reader Interface -*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: -Here are valid outputs: ```python # a mini batch of three data items. Each data item consist three columns of data, each of which is 1. [(1, 1, 1), @@ -58,20 +59,22 @@ Here are valid outputs: Please note that each item inside the list must be a tuple, below is an invalid output: ```python # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three column of datas, each of which is 1. + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. [[1,1,1], [2,2,2], [3,3,3]] ``` -It's easy to convert from reader to batch reader: +It is easy to convert from a reader to a batch reader: + ```python mnist_train = paddle.dataset.mnist.train() mnist_train_batch_reader = paddle.batch(mnist_train, 128) ``` -Also easy to create custom batch reader: +It is also straight forward to create a custom batch reader: + ```python def custom_batch_reader(): while True: @@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader ## Usage -batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: ```python # two data layer is created: @@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ## Data Reader Decorator -*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax. +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. -Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples: +Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: ### Prefetch Data -Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data. +Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. Use `paddle.reader.buffered` to prefetch data: @@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ### Compose Multiple Data Readers -For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). -We can do: +We can do the following : ```python def reader_creator_random_image(width, height): @@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False) reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. -# And we don't care second item at this time. +# And we don't care about the second item at this time. paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle -Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read. +Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. Example: ```python @@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ## Q & A -### Why reader return only a single entry, but not a mini batch? +### Why does a reader return only a single entry, and not a mini batch? -Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). -We provide function `paddle.batch` to turn (single entry) reader into batch reader. +We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. -### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient? +### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? -In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically. +In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. -### Why use a dictionary but not a list to provide mapping? +### Why use a dictionary instead of a list to provide mapping? -We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`). +Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). -### How to create custom data reader creator +### How to create a custom data reader creator ? ```python def image_reader_creator(image_path, label_path, n): @@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) ### How is `paddle.train` implemented -An example implementation of paddle.train could be: +An example implementation of paddle.train is: ```python def train(batch_reader, mapping, batch_size, total_pass): diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md index ac7e98ccf1aadbb973a4801fde842375cf63448c..d9fe7d6bbb0eeb73fcdca3ee749a4f10bcdda682 100644 --- a/doc/design/refactor/distributed_architecture.md +++ b/doc/design/refactor/distributed_architecture.md @@ -2,106 +2,70 @@ ## Abstract -PaddlePaddle v0.10.0 uses the "trainer-parameter server" -architecture. We run multiple replicated instances of trainers (runs -the same code written by the user) and parameter servers for -distributed training. This architecture served us well, but has some -limitations: +PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations: -1. Need to write special code to handle tasks which should only be run - by a single trainer. E.g., initializing model and saving model. +1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc. -2. Model parallelism is hard: need to write if-else branches conditioned - on the trainer ID to partition model onto each trainer, and manually - write the inter-model-shard communication code. +2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers. -3. The user can not directly specify the parameter update rule: need - to modify the parameter server C++ code and compile a new - binary. This adds complication for researchers: A lot of extra - effort is required. Besides, the training job submission program - may not allow running arbitrary binaries. +3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries. -This design doc discusses PaddlePaddle's new distributed training -architecture that addresses the above limitations. +This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations. ## Analysis -We will assume the user writes the trainer program by Python, the same -analysis holds if the trainer program is written in C++. +The assumption is that the user writes the trainer program in either Python or C++. ### Limitation 1 -If we look at the Python code that the user writes, there are two -kinds of functionalities: +There are two basic functionalities in the trainer program: -- The training logic such as load / save model and print log. -- The neural network definition such as the definition of the data - layer, the fully connected layer, the cost function and the +1. The training logic such as loading / saving the model and printing out the logs. +2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the optimizer. -When we training with PaddlePaddle v0.10.0 distributedly, multiple -replicated Python instances are running on different nodes: both the -training logic and the neural network computation is replicated. +When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the +training logic as well as the neural network computation logic, is replicated. -The tasks that should only run once all belong to the training logic, -if we only replicate the neural network computation, but do **not** -replicate the training logic, the limitation could be solved. +The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not** +replicate the training logic, the limitation mentioned above can be avoided. ### Limitation 2 -Model parallelism means running a single model on multiple nodes by -partitioning the model onto different nodes and managing the -inter-model-shard communications. +Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the +inter-model-shard communication between nodes. -PaddlePaddle should be able to modify the nerual network computation -definition to support model parallelism automatically. However, the -computation is only specified in Python code, and PaddlePaddle can not -modify Python code. +PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the +computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup. -Just like compiler uses a intermediate representation (IR) so that -programmer does not need to manually optimize their code in most of -the cases - the compiler will optimize the IR: +Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows: -We can have our own IR too: PaddlePaddle can support model parallel by -converting the IR so the user no longer need to manually do it in -Python: +PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component: -The IR for PaddlePaddle after refactor is called `Block`, it specifies -the computation dependency graph and the variables used in the -computation. +The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation. ### Limitation 3 -The user can not directly specify the parameter update rule for the -parameter server because the parameter server does not use the same -computation definition as the trainer. Instead, the update rule is -baked in the parameter server. The user can not specify the update -rule in the same way of specifying the trainer computation. +The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. -This could be fixed by making the parameter server run the same -computation definition as the trainer. For a detailed explanation, -please -see -[Design Doc: Operation Graph Based Parameter Server](./dist_train.md) +This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document - +[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md) ## Distributed Training Architecture -The new distributed training architecture can address the above -limitations. Below is the illustration: +The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so: -The architecture includes major components: *PaddlePaddle Python*, -*PaddlePaddle converter* and *PaddlePaddle runtime*: +The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*. ### PaddlePaddle Python -PaddlePaddle Python is the Python library that user's Python trainer -invoke to build the neural network topology, start training, etc. +PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc. ```Python paddle.init() @@ -117,102 +81,60 @@ for i in range(1000): print cost_val ``` -The code above is a typical Python trainer code, the neural network -topology is built using helper functions such as -`paddle.layer.fc`. The training is done by calling `session.eval` -iteratively. +The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively. #### session.eval -As shown in the graph, `session.eval` sends the IR and the evaluation -inputs/targets to the PaddlePaddle cluster for evaluation. The -targets can be any variable in the computation graph. When the target -is the `optimizer` variable, the neural network will be optimized -once. When the target is the `cost` variable, `session.eval` returns -the cost value. +As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation. +The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken. -The Python `session` is a wrapper of the C++ `Session` class. For more -information about `Session`, please -see [Design Doc: Session](./session.md). +The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md). ### PaddlePaddle Converter -PaddlePaddle converter automatically converts the IR in the request -(IR and evaluation inputs/targets) from PaddlePaddle Python to new -partitioned IRs and dispatch the new IRs and evaluation inputs/targets -to different PaddlePaddle runtimes. Below are the steps: +The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed : -1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that - fetches the eval targets to the IR. +1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR. -1. Extract a new computation (sub)graph with `feed` and `fetch` OP as - the boundary. The runtime does not need to run the OP that is not - dependent by the `fetch` OP. +2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP. -1. Optimizes the computation graph. +3. Optimize the computation graph. -1. Place the OPs in the graph onto different devices on different - PaddlePaddle runtime according to a placement algorithm and device - constraint specified by the user. +4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user. -1. Partition the graph according to runtime boundaries and add `send` / - `recv` OP pair on the runtime boundaries. +5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries. -1. Dispatch the partitioned graph to different PaddlePaddle runtimes. +6. Dispatch the partitioned graph to different PaddlePaddle runtimes. + +7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python. -1. PaddlePaddle runtimes with the `fetch` OP reports evaluation - results back to the converter, the convert reports the evaluation - results back to the PaddlePaddle Python. - The output IRs will be cached to optimize the conversion latency. #### Placement Algorithm -Our first implementation will only support "trainer-parameter server" -placement: the parameters, initializers, and optimizers are placed on -the PaddlePaddle runtimes with the parameter server role. And -everything else will be placed on the PaddlePaddle runtimes with the -trainer role. This has the same functionality of our -"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but -is more general and flexible. +Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible. -In the future, we will implement the general placement algorithm, -which makes placements according to the input IR, and a model of -device computation time and device communication time. Model -parallelism requires the general placement algorithm. +In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm. ### PaddlePaddle Runtime -The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and -runs the IR. The runtime does not need to do OP placement since it's -already done by the converter. +The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter. ### Local Training Architecture -The local training architecture will be the same as the distributed -training architecture, the differences are everything runs locally, -and there is just one PaddlePaddle runtime: +The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime: ### Training Data -In PaddlePaddle v0.10.0, training data is typically read -with [data reader](../reader/README.md) from Python. This approach is -no longer efficient when training distributedly since the Python -process no longer runs on the same node with the trainer processes, -the Python reader will need to read from the distributed filesystem -(assuming it has the access) and send to the trainers, doubling the -network traffic. - -When doing distributed training, the user can still use Python data -reader: the training data are sent with `session.eval`. However should -be used for debugging purpose only. The users are encouraged to use -the read data OPs. +In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic. + +When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs. ## References: diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md new file mode 100644 index 0000000000000000000000000000000000000000..1d9a26683c14f54e3b5fe41675cd03b5620646b8 --- /dev/null +++ b/doc/design/refactor/session.md @@ -0,0 +1,180 @@ +# Design Doc: Session + +## Abstract + +The *session* object encapsulates the environment in which the +computation graph is executed. + +We will have the *local* session and *remote* session, they offer the +same [interface](#interface). The local session encapsulates the local +runtime environment and the remote session encapsulates the cluster +runtime environment. + +The local runtime environment contains: + +1. computation devices (i.e., CPU, GPU) handles, and +1. the [scope](../scope.md) which holds all variables. + +The remote runtime environment contains: + +1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster, + and +1. the distributed [scope](../scope.md) in a cluster which holds all + variables. + +The user can create a remote session on Paddle Cloud and evaluate the +computation graph with it. In this way, the user can control the +remote computation resource in a cluster from his local computer. + + +## Background + +The current design has an implicit global session in which +`paddle.eval()` is executed. The pain point is: + +Since the user is not able to explicitly switch between runtime +environments, the user cannot run a topology in two independent +environments. + +For example, in reinforcement learning, the user may want to have a +stale model for inference and a fresh model for training, and only +replace the stale model with the fresh model periodically. + +Furthermore, we have no concept that encapsulates a remote environment +that executes a computation graph. + +We need the session object to address above issues. + + +## Session + +A session is an object that owns the runtime environment. All +computations are executed through `session.eval()`. + + +### Interface + +```python +eval( + targets, + feed_dict=None, +) +``` + +Evaluates the target Operations or Variables in `targets`. + +- *targets*: the evaluation targets. Can be a single Operation or + Variable, or a list with the Operations or Variables as + elements. The value returned by `eval()` has the same shape as the + `target` argument. + + The PaddlePaddle program is represented by + the [ProgramDesc](../design/program.md), `eval()` will infer the + ProgramDesc from the given targets and run the PaddlePaddle + program. Please + see + [this graph](./distributed_architecture.md#local-training-architecture) for + the detailed illustration for the local session + and + [this graph](./distributed_architecture.md#distributed-training-architecture) for + the detailed illustration for the remote session. + +- *feed_dict*: a dictionary that contains the tensors which override + the edges of the computation graph. + + feed_dict not only can provide the input data, it can override any + OP's input as well: + + ```python + a = pd.constant(2.0, name="a") + b = pd.variable(name="b") + c = pd.mul(a,b) + sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0 + ``` + +```python +close() +``` + +Closes the session and releases the scope that the session owns. + + +### Create a Local Session + +```python +session( + devices=None +) +``` + +Creates a new session. One session owns one global scope, so creating +multiple sessions will create different scopes. + +- *devices*: a single `string` or a list of `string` of device names, + the corresponding devices will be the computation devices for + `eval()`. If not specified, all available devices (e.g., all GPUs) + will be used. The user doesn't need to specify the CPU device since + it will be always used. Multiple sessions can use the same device. + + +#### Example + +```Python +a = paddle.constant(1.0) +b = paddle.constant(2.0) +c = a + b +sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"]) +sess.eval(c) +sess.close() +``` + +### Create a Remote Session + +```python +create_cloud_job( + name, + num_trainer, + mem_per_trainer, + gpu_per_trainer, + cpu_per_trainer, + num_ps, + mem_per_ps, + cpu_per_ps, +) +``` + +Creates a Paddle Cloud job. Fails if the job name exists. + +```python +get_cloud_job( + name +) +``` + +Gets a Paddle Cloud job. + +```python +remote_session( + job +) +``` + +- *job*: the Paddle Cloud job. + +#### Example + +```Python +reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud +image = reader.column(0) +label = reader.column(1) +fc1 = paddle.op.fc(image, size=256, act="sigmoid") +fc2 = paddle.op.fc(fc1, size=10, act="softmax") +cost = paddle.op.cross_entropy(fc2, label) +opt = paddle.optimizer.sgd(cost) + +job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1) +sess = paddle.remote_ession(job) +for i in range(1000): + sess.eval(opt) +sess.close() +``` diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md index 629422e7743af666b42fd69fbff442ce15bef596..f93d6155e1764386b01d2f0df3f141ab75cd55d4 100644 --- a/doc/design/refactorization.md +++ b/doc/design/refactorization.md @@ -17,22 +17,22 @@ The goals of refactoring include: 1. A graph is composed of *variables* and *operators*. -1. The description of graphs must be capable of being serialized/deserialized, so that: +1. The description of graphs must be serializable/deserializable, so that: - 1. It can to be sent to the cloud for distributed execution, and + 1. It can be sent to the cloud for distributed execution, and 1. It can be sent to clients for mobile or enterprise deployment. -1. The Python program does the following steps +1. The Python program does two things - 1. *compilation*: run a Python program to generate a protobuf message representation of the graph and send it to + 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to 1. the C++ library `libpaddle.so` for local execution, 1. the master process of a distributed training job for training, or 1. the server process of a Kubernetes serving job for distributed serving. - 1. *execution*: execute the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message. + 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message. ## Description and Realization of Computation Graph -At compile time, the Python program generates a protobuf message representation of the graph, or the description of the graph. +At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph. At runtime, the C++ program realizes the graph and runs it. @@ -42,11 +42,11 @@ At runtime, the C++ program realizes the graph and runs it. |Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)| |Block|BlockDesc|Block| -The word *graph* is interchangeable with *block* in this document. A graph represents computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). +The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). ## Compilation and Execution -1. Run an application Python program to describe the graph. In particular, the Python application program does the following: +1. Run a Python program to describe the graph. In particular, the Python application program does the following: 1. Create `VarDesc` to represent local/intermediate variables, 1. Create operators and set attributes, @@ -54,10 +54,10 @@ The word *graph* is interchangeable with *block* in this document. A graph repr 1. Infer the type and the shape of variables, 1. Plan memory-reuse for variables, 1. Generate the backward graph - 1. Optimize the computation graph. - 1. Potentially, split the graph for distributed training. + 1. Add optimization operators to the computation graph. + 1. Optionally, split the graph for distributed training. -1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the application Python program does the following: +1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following: 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block, 1. realize local variables defined in the BlockDesc message in the new scope, @@ -107,8 +107,8 @@ Compile Time -> IR -> Runtime ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot) * `Operator` is the fundamental building block of the user interface. - * Operator stores input/output variable names, and attributes. - * The `InferShape` interface is used to infer the shape of the output variable shapes based on the shapes of the input variables. + * Operator stores input/output variable names and attributes. + * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables. * Use `Run` to compute the `output` variables from the `input` variables. --- @@ -139,7 +139,7 @@ Compile Time -> IR -> Runtime * Limit the number of `tensor.device(dev) = ` in your code. * `thrust::transform` and `std::transform`. * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels. - * `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`. + * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`. * Hand-writing `GPUKernel` and `CPU` code * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) --- @@ -177,19 +177,14 @@ REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class) REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ``` -### USE Macros -Make sure the registration process is executed and linked. - --- # Registration Process 1. Write an Op class and its gradient Op class, if required. 2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. 3. Invoke the macro `REGISTER_OP`. This macro will - 1. Call maker class to complete the `proto` and the `checker` + 1. Call maker class to complete `proto` and `checker` 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` -4. Invoke the `USE` macro in which the Op is used, to make sure that it is linked. - --- # Backward Module (1/2) ### Create Backward Operator @@ -199,13 +194,14 @@ Make sure the registration process is executed and linked. --- # Backward Module (2/2) ### Build Backward Network -- **Input**: graph of forward operators -- **Output**: graph of backward operators +- **Input**: a graph of forward operators +- **Output**: a graph of backward operators - **Corner cases in construction** - Shared Variables => insert an `Add` operator to combine gradients - No Gradient => insert a `fill_zero_grad` operator - Recursive NetOp => call `Backward` recursively - RNN Op => recursively call `Backward` on stepnet + - RNN Op => recursively call `Backward` on stepnet --- @@ -215,10 +211,10 @@ Make sure the registration process is executed and linked. * Only dims and data pointers are stored in `Tensor`. * All operations on `Tensor` are written in `Operator` or global functions. * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) -* `Variable` instances are the inputs and the outputs of an operator. Not just `Tensor`. +* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`. * `step_scopes` in RNN is a variable and not a tensor. -* `Scope` is where variables are stores. - * map +* `Scope` is where variables are stored. + * map * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. --- @@ -246,7 +242,7 @@ Make sure the registration process is executed and linked. --- # Control the migration quality - Compare the performance of migrated models with old ones. -- Follow the google C++ style +- Follow the google C++ style guide. - Build the automatic workflow of generating Python/C++ documentations. - The documentation of layers and ops should be written inside the code. - Take the documentation quality into account when submitting pull requests. diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md index 12b04fb2713d8bb862c08ddb538ddf860bb2983c..8d973eb53178c3e889c845144553a453e11f067c 100644 --- a/doc/design/register_grad_op.md +++ b/doc/design/register_grad_op.md @@ -3,15 +3,17 @@ ## The Problem Posed -In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance. +Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance. -However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message, and returns a corresponding message. +However, we noticed two problems with the current design: -More than that, the new registration mechanism need to support the fact that an operators' gradient computation might be a composition of operators. +1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and inserts corresponding `OpDesc` messages into the `ProgramDesc` message. -## Current Implementation +1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation. -OpInfos store in a association map which key is the operator type. The `grad_op_type` indicate associated gradient operator type. Operator can create gradient operator by `OpInfo::creator_` of gradient. The pseudo code is +## The Current Implementation + +Instances of the C++ class `OpInfo` are stored an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows ```cpp struct OpInfo { @@ -29,27 +31,50 @@ OperatorBase* CreateGradientOperator(const OperatorBase& op) { ## Proposed Solution -The mapping relationship between an operator and its gradient operators is a function. The interface of that function is: +The mapping relationship between an operator and its gradient operators is a function. The interface of this function is: ```cpp // (OpDesc) --> vector -using GradOpDescMaker = std::function(const OpDesc&)>; +std::function(const OpDescBind&)>; ``` -The function take a `OpDesc` of the forward operator and return one or many gradient operator descriptions. +The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`. -The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be +The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like ```cpp struct OpInfo { - GradOpDescMaker grad_op_maker_; + std::function>(const OpDescBind&)> grad_op_maker_; ... }; ``` -The `grad_op_maker_ ` is `nullptr` if the operator does not have associated gradient operators. +The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators. + +We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is + +```cpp +class GradOpDescMakerBase { +public: + GradOpDescMakerBase(const OpDescBind& ); + virtual std::vector> operator()()const = 0; +}; +``` + +We can convert `GradOpDescMakerBase` to `std::function>(const OpDescBind&)>` by + +```cpp +using GradOpMaker = ...; +std::function(const OpDescBind&)> func; +func = [] (const OpDescBind& fwd_op) { + GradOpMaker maker(fwd_op); + return maker(); +}; +``` + +We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator. -We should chagne register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`. +We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`. The user interface should be diff --git a/doc/design/regularization.md b/doc/design/regularization.md new file mode 100644 index 0000000000000000000000000000000000000000..21280ac898feb4dd5e5a5d9e88d121e856850f0b --- /dev/null +++ b/doc/design/regularization.md @@ -0,0 +1,72 @@ +# Regularization in PaddlePaddle + +## Introduction to Regularization +A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore. + +### Parameter Norm Penalties +Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows: + +
+ +The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`. + +The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows: + +##### L2 Regularization: +
+ +##### L1 Regularization +
+ +A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html). + +## Regularization Survey + +A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). + +## Proposal for Regularization in PaddlePaddle + +### Low-Level implementation + +In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations: +- L2_regularization_op +- L1_regularization_op + +These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. + +The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. + +### Computation Graph + +Below is an example of a really simple feed forward neural network. + +
+ +The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows: + +
+    +### Python API implementation for Regularization + +Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. + +#### Creation of Regularization ops +There are two possibilities for creating the regularization ops: +1. We create these ops immediately while building the computation graph. +2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. + +The proposal is to add these ops in a lazy manner just before the backward pass. + +#### Storage of Regularization attributes + +Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers). + + + + + + diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 62ff8f3229bbbb5bc82e4da29259baffc30c2c87..14c081ea84282e52a2e36475c3c0ea755122d154 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Vers PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -3. 对这个版本的提交,做如下几个操作: +1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 +1. 对这个版本的提交,做如下几个操作: + * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 @@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本,遵循以下流程: pip install twine twine upload dist/[package to upload] ``` -4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 -5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 -6. 协同完成Release Note的书写 +1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 +1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 +1. 协同完成Release Note的书写 需要注意的是: @@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本,遵循以下流程: * `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 * 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 -# PaddlePaddle 分支规范 +## PaddlePaddle 分支规范 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 @@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 -# PaddlePaddle回归测试列表 +## PaddlePaddle回归测试列表 本列表说明PaddlePaddle发版之前需要测试的功能点。 -## PaddlePaddle Book中所有章节 +### PaddlePaddle Book中所有章节 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 diff --git a/doc/design/scope.md b/doc/design/scope.md index b1f9bb4378eb5ec6926f1e53f7c1f4fd5674064c..4da76eebb74abcd26ec2b8671399e6bc4fb58574 100644 --- a/doc/design/scope.md +++ b/doc/design/scope.md @@ -37,7 +37,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`. ```cpp class Scope { public: - Variable* NewVar(const std::string& name); + Variable* Var(const std::string& name); const Variable* FindVar(const std::string& name) const; private: @@ -98,7 +98,7 @@ class Scope { Variable* FindVar(const std::string& name) const; // return if already contains same name variable. - Variable* NewVar(const std::string& name); + Variable* Var(const std::string& name); private: std::shared_ptr parent_; @@ -107,7 +107,7 @@ class Scope { ``` ## Only scope can create a variable -To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`. +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`. ## When scope destroyed, all variables inside this scope should be destroyed together @@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar ## Orthogonal interface -`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily. +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily. diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md new file mode 100644 index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5 --- /dev/null +++ b/doc/design/selected_rows.md @@ -0,0 +1,74 @@ +# Design Doc: Selected Rows + +`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure: + +```cpp +class SelectedRows { + private: + vector rows_; + Tensor value_; + int height_; +}; +``` + +The field `height_` is the first dimension of `SelectedRows`. The `rows` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`. + +Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be: + +``` +x = SelectedRow { + rows = [73, 84], + value = [[1, 2], [3,4]] +} +``` + + +## SelectedRows in Protobuf + +`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. +So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description. + +```proto +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} + +message LodTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} + +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LodTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## InferShape for Selected Rows + +Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor. + +For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following + +```cpp +void TableLookupGrad::InferShape(context) { + ... + context.SetDataType("Embedding.Grad", kSelectedRows); +} +``` + + +## Sparse Operators + +There are several operators that need to be written to support `SelectedRows`. These are: + +1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`. +2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`. diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md index a0419ec002159893b035fae1300fce489e68936a..37e4f7b90f94fa3eb015e733999cd84c96b2239c 100644 --- a/doc/design/tensor_array.md +++ b/doc/design/tensor_array.md @@ -1,39 +1,250 @@ # Design for TensorArray +This design doc presents the necessity of a new C++ class `TensorArray`. +In addition to the very simple C++ implementation + +```c++ +class TensorArray { + public: + explicit TensorArray(const LoDTensor&); + explicit TensorArray(size_t size); + + private: + vector values_; +}; +``` + +We also need to expose it to PaddlePaddle's Python API, +because users would want to use it with our very flexible operators `WhileLoop`. +An example for a RNN based on dynamic operators is + +```python +input = pd.data(...) +num_steps = Var(12) + +TensorArray states(size=num_steps) +TensorArray step_inputs(unstack_from=input) +TensorArray step_outputs(size=num_steps) + +W = Tensor(...) +U = Tensor(...) +default_state = some_op() + +step = Var(1) + +wloop = paddle.create_whileloop(loop_vars=[step]) +with wloop.frame(): + wloop.break_if(pd.equal(step, num_steps) + pre_state = states.read(step-1, default_state) + step_input = step_inputs.read(step) + state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input)) + states.write(step, state) + step_outputs.write(step, state) # output state + step.update(state+1) + +output = step_outputs.stack() +``` + +## Background +Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step. + +An RNN can be implemented with the following pseudocode + +```c++ +Array states; +Array input_segments; +Array output_segments; +Parameter W, U; + +step = 1 +seq_len = 12 +while_loop { + if (step == seq_len) break; + states[step] = sigmoid(W * states[step-1] + U * input_segments[step]); + output_segments[step] = states[step] // take state as output + step++; +} +``` +According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support. + +Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`. + + +Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short). +Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support. + +As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences. + +The implementation is similar to `recurrent_op`. +The key difference is the way **the original input `LoDTensors` and outupts are split to get the `input_segments` and the `output_segments`.** + + +Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly, +the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same. + +## Why `TensorArray` +The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a seperate module. + +The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. + +So there should be an array-like container, which can store the segments of a tensor or LoD tensor. + +**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** . +This is where the notion of `TensorArray` comes from. + +## Introduce TensorArray to uniform all the three RNNs TensorArray as a new concept is borrowed from TensorFlow, it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`. This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, -such as `RecurrentGradientMachine`. +such as `recurrent_op`, `RecurrentGradientMachine`. In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), `TensorArray` is used to segment inputs and store states in all time steps. By providing some methods similar to a C++ array, -the definition of some state-based dynamic models such as RNN could be more natural and highly flexible. - -## Dynamic-Related Methods -Some basic methods should be proposed as follows: - -### stack() -Pack the values in a `TensorArray` into a tensor with rank one higher than each tensor in `values`. -### unstack(axis=0) -Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. -### concat() -Return the values in the `TensorArray` as a concatenated Tensor. -### write(index, value, data_shared=true) -Write value into index of the TensorArray. -### read(index) -Read the value at location `index` in the `TensorArray`. -### size() -Return the number of values. +the definition of some state-based dynamic models such as RNN can be more natural and highly flexible. + +## Dynamic-operations on TensorArray + +`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented + +```python +# several helper operators for TensorArray +def tensor_array_stack(ta, tensor): + ''' + get a tensor array `ta`, return a packed `tensor`. + ''' + pass + +def tensor_array_unstack(tensor, ta): + ''' + get a `tensor`, unstack it and get a tensor array `ta`. + ''' + pass + +def tensor_array_write(ta, index, tensor, data_shared): + ''' + get a `tensor` and a scalar tensor `index`, write `tensor` into index-th + value of the tensor array `ta`. + `data_shared` is an attribute that specifies whether to copy or reference the tensors. + ''' + pass + +def tensor_array_read(ta, index, tensor): + ''' + get a tensor array `ta`, a scalar tensor `index`, read the index-th value of + `ta` and return as the `tensor`. + ''' + pass + +def tensor_array_size(ta, tensor): + ''' + get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`. + ''' + pass +``` + +It is trivial for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use, +for example + +```python +class TensorArray: + def __init__(self, name): + self.name = name + self.desc = TensorArrayDesc() + + def stack(self, name=None): + ''' + Pack the values in a `TensorArray` into a tensor with rank one higher + than each tensor in `values`. + `stack` can be used to split tensor into time steps for RNN or whileloop. + + @name: str + the name of the variable to output. + ''' + tensor = Var(name) + tensor_array_stack(self.name, tensor) + return tensor + + def unstack(self, input): + ''' + Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. + `unstack` can be used to concatenate all the time steps for RNN or whileloop. + + @input: str + the name of input tensor + ''' + tensor_array_unstack(tensor, self.name) + + def write(self, index, value, data_shared=True): + ''' + Write value into index of the TensorArray. + If `data_shared` is set to True, than the index-th value in TensorArray will + be shared with the tensor passed in. + + @index: str + name of a scalar tensor + @value: str + name of a tensor + @data_shared: bool + ''' + tensor_array_write(self.name, index, value, data_shared) + + def read(self, index, output): + ''' + Read the value at location `index` in the `TensorArray`. + + @index: str + name of a scalar tensor + @output: + name of a output variable + ''' + tensor_array_read(self.name, index, output) + + + def size(self, output): + ''' + Return the number of values. + + @output: str + name of a scalar tensor + ''' + tensor_array_size(self.name, output) +``` ## LoDTensor-related Supports -The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variant length sequences as input, -because each step of RNN could only take a tensor-represented batch of data as input, +The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too. + +Since each step of RNN can only take a tensor-represented batch of data as input, some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches. -Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`. +Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`, +these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formated as a LoD tensor rather than a tensor. + +Some definitions are like + +```python +def unpack(level): + ''' + Split LodTensor in some `level` and generate batches, if set `sort_by_length`, + will sort by length. -With these two methods, a variant-sentence-RNN can be implemented like + Returns: + - a new `TensorArray`, whose values are LodTensors and represents batches + of data. + - an int32 Tensor, which stores the map from the new batch's indices to + original LoDTensor + ''' + pass + +def pack(level, indices_map): + ''' + Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` + and `level` and `indices_map`. + ''' + pass +``` + +With these two methods, a varience-length sentence supported RNN can be implemented like ```c++ // input is the varient-length data @@ -58,16 +269,3 @@ LoDTensor rnn_output = ta.pack(ta, indice_map); ``` the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`, the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend. - - -some details are as follows. - -### unpack(level, sort_by_length) -Split LodTensor in some `level` and generate batches, if set `sort_by_length`, will sort by length. - -Returns: - -- a new `TensorArray`, whose values are LodTensors and represents batches of data. -- an int32 Tensor, which stores the map from the new batch's indices to original LoDTensor -### pack(level, indices_map) -Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` and `level` and `indices_map`. diff --git a/doc/design/test.dot b/doc/design/test.dot new file mode 100644 index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a --- /dev/null +++ b/doc/design/test.dot @@ -0,0 +1,35 @@ + +digraph Test { + z -> generator -> G_img; + G_img -> discriminator -> D_f -> d_loss_f; + label0 -> d_loss_f -> d_loss; + + img -> discriminator -> D_t -> d_loss_t; + label1 -> d_loss_t -> d_loss; + + d_loss -> d_loss_t[color=red, style=dashed]; + d_loss -> d_loss_f[color=red, style=dashed]; + d_loss_t -> D_t[color=red, style=dashed]; + d_loss_f -> D_f[color=red, style=dashed]; + D_t -> discriminator[color=red, style=dashed]; + D_f -> discriminator[color=red, style=dashed]; + + D_f -> g_loss; + label2 -> g_loss; + + g_loss -> D_f[color=green, style=dashed]; + D_f -> discriminator[color=green, style=dashed]; + discriminator -> G_img[color=green, style=dashed]; + G_img -> generator[color=green, style=dashed]; + + discriminator [color=red, shape=box]; + generator [color=green, shape=box]; + z [shape=diamond]; + img [shape=diamond]; + label0 [shape=diamond]; + label1 [shape=diamond]; + label2 [shape=diamond]; + + d_loss [color=red]; + g_loss [color=green]; +} diff --git a/doc/design/test.dot.png b/doc/design/test.dot.png new file mode 100644 index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55 Binary files /dev/null and b/doc/design/test.dot.png differ diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md index bfbbdd0578ebc69ea4b49ade9b041573a9e9ad55..0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f 100644 --- a/doc/design/var_desc.md +++ b/doc/design/var_desc.md @@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep ## Definition of VarDesc -A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it. +A VarDesc should have a name, and value. The are two kinds of variable type in compile time, they are `LoDTensor` and `SelectedRows`. ```proto message VarDesc { required string name = 1; - optional LoDTensorDesc lod_tensor = 2; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LoDTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; } ``` -## Definition of LodTensorDesc +## Definition of TensorDesc ```proto enum DataType { @@ -38,87 +45,25 @@ enum DataType { FP64 = 6; } -message LoDTensorDesc { +message TensorDesc { required DataType data_type = 1; - repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - optional int32 lod_level = 3 [default=0]; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] } ``` -## Definition of Variable in Python - -In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable. - -```python -image = Variable(dims=[-1, 640, 480]) -# fc1 and fc2 are both Variable -fc1 = layer.fc(input=image, output_size=10) -fc2 = layer.fc(input=fc1, output_size=20) -``` -### what should class `Variable` Have -1. `name`.a name of string type is used to mark the value of the Variable. -1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator. -1. `operator`. Variable should record which operator produce itself. The reaon is: - - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable. - -In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph. - -```python -import VarDesc -import LoDTensorDesc -import framework - -def AddInitialOperator(variable, initializer): - # add an initialize Operator to block to init this Variable - -class Variable(object): - def __init__(self, name, dims, type, initializer): - self._block = get_default_block() - self._name = name - self.op = None - - tensor_desc = LoDTensorDesc(data_type=type, dims=dims) - _var_desc = VarDesc(name=name, lod_tensor=tensor_desc) - self._var = framework.CreateVar(_var_desc) - self._block.add_var(self) +A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md). - # add initial op according to initializer - if initializer is not None: - AddInitialOperator(self, initializer) - - def dims(self): - return self._var.dims() - - def data_type(self): - return self._var.data_type() +## Definition of LodTensorDesc - def to_proto(self): - pass +```proto +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} ``` -Then we can use this Variable to create a fc layer in Python. +A LoDTensorDesc contains a tensor and a lod_level. -```python -import paddle as pd - -def flatten_size(X, num_flatten_dims): - prod = 1 # of last num_flatten_dims - for i in xrange(num_flatten_dims): - prod = prod * X.dims[-i-1] - return prod - -def layer.fc(X, output_size, num_flatten_dims): - W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size]) - b = Variable(pd.random_uniform(), type=FP32, dims=[output_size]) - out = Variable(type=FP32) - y = operator.fc(X, W, b, output=out) # fc will put fc op input into out - pd.InferShape(y) - return out - -x = Variable(dims=[-1, 640, 480]) -y = layer.fc(x, output_size=100) -z = layer.fc(y, output_size=200) +## Definition of Variable in Python -paddle.eval(targets=[z], ...) -print(z) -``` +For Variable in Python, please reference [`Python API`](./python_api.md). diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst index 75c4ba028e497e29e9030a86514348726d9c0a80..b331d9d36e6a279881c3b1a5586835e7186957fb 100644 --- a/doc/faq/local/index_cn.rst +++ b/doc/faq/local/index_cn.rst @@ -99,7 +99,7 @@ PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`spa 利用更多的计算资源 ++++++++++++++++++ -利用更多的计算资源可以分为一下几个方式来进行\: +利用更多的计算资源可以分为以下几个方式来进行\: * 单机CPU训练 @@ -174,7 +174,7 @@ decoder_inputs = paddle.layer.fc( 1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用; 2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度; -除此之外,还可以通过减小学习律或者对数据进行归一化处理来解决这类问题。 +除此之外,还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。 5. 如何调用 infer 接口输出多个layer的预测结果 ----------------------------------------------- diff --git a/doc/faq/parameter/index_cn.rst b/doc/faq/parameter/index_cn.rst index c721b623183cc7d8d17e2c9fb1635ea07b8970cc..6fa0c64413be1616a435640b0347904a49873349 100644 --- a/doc/faq/parameter/index_cn.rst +++ b/doc/faq/parameter/index_cn.rst @@ -75,7 +75,7 @@ PaddlePaddle目前支持8种learning_rate_schedule,这8种learning_rate_schedu optimizer = paddle.optimizer.Adam( learning_rate=1e-3, - learning_rate_schedule="manual", + learning_rate_schedule="pass_manual", learning_rate_args="1:1.0,2:0.9,3:0.8",) 在该示例中,当已训练pass数小于等于1时,学习率为 :code:`1e-3 * 1.0`;当已训练pass数大于1小于等于2时,学习率为 :code:`1e-3 * 0.9`;当已训练pass数大于2时,学习率为 :code:`1e-3 * 0.8`。 diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst deleted file mode 100644 index b473944fc7fb89d3e0a0b330933f2226734bb5bd..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_cn.rst +++ /dev/null @@ -1,108 +0,0 @@ -经典的线性回归任务 -================== - -PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。 - -任务简介 --------- - -我们展示如何用PaddlePaddle解决 `单变量的线性回归 `_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。 - -一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。 - -准备数据 ------------ - -假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。 - -.. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # 定义输入数据的类型: 2个浮点数 - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -训练模型 ------------ - -为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。 - -在PaddlePaddle里,该模型的网络配置如下。 - -.. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. 定义数据来源,调用上面的process函数获得观测数据 - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. 学习算法。控制如何改变模型参数 w 和 b - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. 神经网络配置 - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - # 线性计算网络层: ȳ = wx + b - ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - # 计算误差函数,即 ȳ 和真实 y 之间的距离 - cost = square_error_cost(input= ȳ, label=y) - outputs(cost) - - -这段简短的配置展示了PaddlePaddle的基本用法: - -- 第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的 `process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。 - -- 第二部分主要是选择学习算法,它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法,这里使用一个基于momentum的随机梯度下降(SGD)算法,该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。 - -- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层,所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元: - - - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。 - - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。 - - **回归误差代价层**:回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 - -定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令: - -.. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - -PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 ` - -模型检验 ------------ - -训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。 - -PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。 - -.. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - -.. image:: ./parameters.png - :align: center - :scale: 80 % - -从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。 - -这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。 diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst deleted file mode 100644 index 2cc438ebbe0f97345d25354b93b4ebbd43502415..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_en.rst +++ /dev/null @@ -1,101 +0,0 @@ -Simple Linear Regression -======================== - -PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on. - -Problem Background ------------------- - -Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression `_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets. - -Prepare the Data ------------------ - -Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types. - - .. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # define data types of input: 2 real numbers - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -Train a NeuralNetwork ----------------------- - -To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle: - - .. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. read data. Suppose you saved above python code as dataprovider.py - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. learning algorithm - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. Network configuration - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - cost = square_error_cost(input=y_predict, label=y) - outputs(cost) - -Some of the most fundamental usages of PaddlePaddle are demonstrated: - -- The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly. - -- The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time. - -- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration: - - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``. - - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model. - - **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters. - -Now that everything is ready, you can train the network with a simple command line call: - - .. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - - -This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess. - - -Evaluate the Model -------------------- - -Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly. - -In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass. - - .. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - - .. image:: parameters.png - :align: center - -Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer. - -There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data. diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png deleted file mode 100644 index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..c875c807b8ab2e420dec189ef32d41533f58fa6d --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -0,0 +1,141 @@ +从源码编译 +====================== + +.. _build_step: + +编译方法 +---------------- + +PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。 +我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 +可以在 `这里 `_ 找到。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 + +编译PaddlePaddle,需要执行: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制 + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # 如果不使用Docker编译环境,执行下面的命令 + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install build/python/dist/*.whl + + +.. _run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + +如果不使用Docker,可以执行ctest命令即可: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + ctest + # 指定执行其中一个单元测试 test_mul_op + ctest -R test_mul_op + +.. _compile_deps: + +编译依赖 +---------------- + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", ">=3.5", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "可选" + + +.. _build_options: + +编译选项 +---------------- + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "ON" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md deleted file mode 100644 index 2f1461489495618718d5abaeab9cbeda9b93700f..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ /dev/null @@ -1,236 +0,0 @@ -Installing from Sources -========================== - -* [1. Download and Setup](#download) -* [2. Requirements](#requirements) -* [3. Build on Ubuntu](#ubuntu) -* [4. Build on Centos](#centos) - - -## Download and Setup -You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle). - -```bash -git clone https://github.com/PaddlePaddle/Paddle paddle -cd paddle -``` -## Requirements - -To compile the source code, your computer must be equipped with the following dependencies. - -- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler -- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X) -- **BLAS**: MKL, OpenBlas or ATLAS -- **Python**: only support Python 2.7 -- **Go** - -**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! -For CUDA 8.0, GCC versions later than 5.3 are not supported! - -### Options - -PaddlePaddle supports some build options. - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionalDescription
WITH_GPUCompile PaddlePaddle with NVIDIA GPU
WITH_AVXCompile PaddlePaddle with AVX intrinsics
WITH_DSOCompile PaddlePaddle with dynamic linked CUDA
WITH_TESTINGCompile PaddlePaddle with unit testing
WITH_SWIG_PYCompile PaddlePaddle with inference api
WITH_STYLE_CHECKCompile PaddlePaddle with style check
WITH_PYTHONCompile PaddlePaddle with python interpreter
WITH_DOUBLECompile PaddlePaddle with double precision
WITH_RDMACompile PaddlePaddle with RDMA support
WITH_TIMERCompile PaddlePaddle with stats timer
WITH_PROFILERCompile PaddlePaddle with GPU profiler
WITH_DOCCompile PaddlePaddle with documentation
WITH_COVERAGECompile PaddlePaddle with code coverage
COVERALLS_UPLOADPackage code coverage data to coveralls
ON_TRAVISExclude special unit test on Travis CI
- - -**Note:** - - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5. - - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported. - - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.** - -As a simple example, consider the following: - -1. **BLAS Dependencies(optional)** - - CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically. - To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. - - ```bash - # specify MKL - cmake .. -DMKL_ROOT= - # or specify OpenBLAS - cmake .. -DOPENBLAS_ROOT= - ``` - -2. **Doc Dependencies(optional)** - - To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows: - - ```bash - pip install 'sphinx>=1.4.0' - pip install sphinx_rtd_theme recommonmark - - # install doxygen on Ubuntu - sudo apt-get install doxygen - # install doxygen on Mac OS X - brew install doxygen - - # active docs in cmake - cmake .. -DWITH_DOC=ON` - ``` - -## Build on Ubuntu 14.04 - -### Install Dependencies - -- **Paddle Dependencies** - - ```bash - # necessary - sudo apt-get update - sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake - sudo apt-get install -y python python-pip python-numpy libpython-dev bison - sudo pip install 'protobuf==3.1.0.post1' - - # Install Go - # You can follow https://golang.org/doc/install for a detailed explanation. - wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C $HOME -xzf go.tgz && \ - mkdir $HOME/gopath && \ - rm go.tgz - - # Setup environment variables - export GOROOT=$HOME/go - export GOPATH=$HOME/gopath - export PATH=$PATH:$GOROOT/bin - - # install cmake 3.4 - curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \ - cd .. && rm -rf cmake-3.4.1 - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a GCC compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` - -## Build on Centos 7 - -### Install Dependencies - -- **CPU Dependencies** - - ```bash - # necessary - sudo yum update - sudo yum install -y epel-release - sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git - sudo pip install wheel numpy - sudo pip install 'protobuf>=3.0.0' - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a GCC compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake3 .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..f194f84ce7c961bb8644d7c077a7c71730220ea2 --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -0,0 +1,159 @@ +Build from Sources +========================== + +.. _build_step: + +How To Build +---------------- + +PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile +tools. We recommend you to use our pre-built Docker image to run the build +to avoid installing dependencies by yourself. We have several build environment +Docker images `here `_ . + +If you choose not to use Docker image for your build, you need to install the +below `Compile Dependencies`_ before run the build. + +Then run: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # run the following command to build a CPU-Only binaries if you are using docker + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # else run these commands + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +When the compile finishes, you can get the output whl package under +build/python/dist, then you can choose to install the whl on local +machine or copy it to the target machine. + +.. code-block:: bash + + pip install build/python/dist/*.whl + + +.. _run_test: + +Run Tests +---------------- + +If you wish to run the tests, you may follow the below steps: + +When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build. +Set :code:`WITH_GPU=ON` Can also run tests on GPU. + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh + +If you don't use Docker, just run ctest will start the tests: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=ON .. + make + ctest + # run a single test like test_mul_op + ctest -R test_mul_op + + +.. _compile_deps: + +Compile Dependencies +---------------- + +PaddlePaddle need the following dependencies when compiling, other dependencies +will be downloaded automatically. + +.. csv-table:: PaddlePaddle Compile Dependencies + :header: "Dependency", "Version", "Description" + :widths: 10, 15, 30 + + "CMake", ">=3.5", "" + "GCC", "4.8.2", "Recommend devtools2 for CentOS" + "Python", "2.7.x", "Need libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "Optional" + + +.. _build_options: + +Build Options +---------------- + +Build options include whether build binaries for CPU or GPU, which BLAS +library to use etc. You may pass these settings when running cmake. +For detailed cmake tutorial please refer to `here `_ 。 + +.. _build_options_bool: + +Bool Type Options +---------------- + +You can add :code:`-D` argument to pass such options, like: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: Bool Type Options + :header: "Option", "Description", "Default" + :widths: 1, 7, 2 + + "WITH_GPU", "Build with GPU support", "ON" + "WITH_C_API", "Build only CAPI", "OFF" + "WITH_DOUBLE", "Build with double precision", "OFF" + "WITH_DSO", "Dynamically load CUDA libraries", "ON" + "WITH_AVX", "Build with AVX support", "ON" + "WITH_PYTHON", "Build with integrated Python interpreter", "ON" + "WITH_STYLE_CHECK", "Check code style when building", "ON" + "WITH_TESTING", "Build unit tests", "ON" + "WITH_DOC", "Build documentations", "OFF" + "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" + "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" + "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" + + +BLAS ++++++ + +PaddlePaddle supports `MKL `_ and +`OpenBlAS `_ as BLAS library。By default it uses MKL. +If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded +and used, for more `details `_ . + +If you choose not to use MKL, then OpenBlAS will be used. + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle will automatically find CUDA and cuDNN when compiling and running. +parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture +automatically in order to speed up the build. + +PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to +keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN +you built. + +Pass Compile Options +++++++++++++++ + +You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. +When running cmake command, it will search system paths like +:code:`/usr/lib:/usr/local/lib` and then search paths that you +passed to cmake, i.e. + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png deleted file mode 100644 index a58cd09ad99cf27cc1ca5785fe54d726b83a82f6..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst deleted file mode 100644 index be0c1ffa451b2901ec06621dd4d886f800b4562e..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst +++ /dev/null @@ -1,43 +0,0 @@ -PaddlePaddle的编译选项 -====================== - -PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。 - -Bool型的编译选项 ----------------- -用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如 - -.. code-block:: bash - - cmake .. -DWITH_GPU=OFF - -.. csv-table:: Bool型的编译选项 - :widths: 1, 7, 2 - :file: compile_options.csv - -BLAS/CUDA/Cudnn的编译选项 --------------------------- -BLAS -+++++ - -PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBlAS `_ 和 `REFERENCE BLAS `_ 。 - -.. csv-table:: BLAS路径相关的编译选项 - :widths: 1, 2, 7 - :file: cblas_settings.csv - -CUDA/Cudnn -+++++++++++ - -PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。 - -编译选项的设置 -++++++++++++++ - -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 - -.. code-block:: bash - - cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 - -注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv deleted file mode 100644 index a6356baf16a0d3d2499e39d2055d8ee878dcaef2..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv +++ /dev/null @@ -1,5 +0,0 @@ -编译选项,描述,注意 -MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 -ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 -OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 -REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv deleted file mode 100644 index 463b825470579d0c3736a408b1e82dd33e6f8d42..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/compile_options.csv +++ /dev/null @@ -1,12 +0,0 @@ -选项,说明,默认值 -WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 -WITH_DOUBLE,是否使用双精度浮点数。,否 -WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 -WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 -WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 -WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 -WITH_RDMA,是否开启RDMA,否 -WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 -WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST -WITH_DOC,是否编译中英文文档,否 -WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 30b144d849bec367cd0197b6082889e011193a9a..f78b1fb0e11aa028a4b7abb5270740b97f8039e9 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -1,222 +1,139 @@ -PaddlePaddle的Docker容器使用方式 +使用Docker安装运行 ================================ -PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 +您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 -Docker使用入门 ------------------------------- - -几个基础的概念帮助理解和使用Docker: +如果您在使用Windows,可以参考 +`这篇 `_ +教程,完成在Windows上安装和使用Docker。 -- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行: +在了解Docker的基本使用方法之后,即可开始下面的步骤: - .. code-block:: bash +.. _docker_pull: - docker images +获取PaddlePaddle的Docker镜像 +------------------------------ - 来列出当前系统中的所有镜像,同样可以执行: +执行下面的命令获取最新的PaddlePaddle Docker镜像 .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0 - 来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用docker.paddlepaddle.org/paddle下载。 + docker pull paddlepaddle/paddle -- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。 - 实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。 - 可以执行: +对于国内用户,我们提供了加速访问的镜像源: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0 + docker pull docker.paddlepaddle.org/paddle - 来使用一个镜像启动一个容器。 - -- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中 - 访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到 - Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用 - debian镜像,并且启动后执行 :code:`ls /data`。 +下载GPU版本的Docker镜像: .. code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -PaddlePaddle发布的Docker镜像使用说明 ------------------------------- - -我们把PaddlePaddle的编译环境打包成一个镜像,称为开发镜像,里面涵盖了 -PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜 -像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 -PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 -行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ -和国内镜像`docker.paddlepaddle.org` 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。 - -**注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。** - -1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` - - 这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布, - 文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。 - 开发镜像包含了以下工具: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - 很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作, - 也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发: - - 以交互容器方式运行开发镜像: - - .. code-block:: bash - - docker run -it --rm -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /bin/bash - - 或者,可以以后台进程方式运行容器: - - .. code-block:: bash - - docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D - - 然后用密码 :code:`root` SSH进入容器: - - .. code-block:: bash - - ssh -p 2202 root@localhost - - SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 - -2. 生产镜像:根据CPU、GPU和非AVX区分了如下4个镜像: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - 纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - 如果输出是No,就需要选择使用no-AVX的镜像 - - **注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用** + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu - 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 - 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 +选择下载使用不同的BLAS库的Docker镜像: - .. code-block:: bash - - nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash + .. code-block:: bash - 注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas - .. code-block:: bash +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu + .. code-block:: bash -3. 运行以及发布您的AI程序 + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - 假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行: +.. _docker_run: - .. code-block:: bash +在Docker中执行PaddlePaddle训练程序 +------------------------------ - docker run -it -v $PWD:/work paddle /work/a.py +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: - 如果要使用GPU,请运行: + .. code-block:: bash - .. code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 - nvidia-docker run -it -v $PWD:/work paddle /work/a.py +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0` - 创建和发布自己的AI程序镜像。 +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** -运行PaddlePaddle Book ---------------------- +.. _docker_run_book: -Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 +使用Docker启动PaddlePaddle Book教程 +------------------------------ -PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。 +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 +PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book 然后在浏览器中输入以下网址: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ 就这么简单,享受您的旅程! -通过Docker容器开发PaddlePaddle ------------------------------- - -开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 +.. _docker_run_gpu: -1. 制作PaddlePaddle开发镜像 - - PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。 - 生成Docker镜像的方式有两个,一个是直接把一个容器转换成镜像,另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷,适合自己实验,可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚,其他人很容易看懂镜像生成过程,持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行: - - .. code-block:: bash - - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build -t paddle:dev . - - docker build这个命令的-t指定了生成的镜像的名字,这里我们用paddle:dev。到此,PaddlePaddle开发镜像就被构建完毕了。 +使用Docker执行GPU训练 +------------------------------ -2. 制作PaddlePaddle生产镜像 +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 - 生产镜像的生成分为两步,第一步是运行: + .. code-block:: bash - .. code-block:: bash - - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev + nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash - 以上命令会编译PaddlePaddle,生成运行程序,以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU,“WITH_AVX”控制生成的生产镜像是否支持AVX,”WITH_TEST“控制是否生成单元测试。 +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** - 第二步是运行: + .. code-block:: bash - .. code-block:: bash - - docker build -t paddle:prod -f build/Dockerfile ./build + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu - 以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置,最终生成名为paddle:prod的生产镜像。 +**关于AVX:** -3. 运行单元测试 +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。 - 运行以下指令: +以下指令能检查Linux电脑是否支持AVX: .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" - -文档 ----- - -Paddle的Docker开发镜像带有一个通过 `woboq code browser -`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。 -只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码: - -.. code-block:: bash - - docker run -d --name paddle-cpu-doc paddle:0.10.0-dev - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 94860240f6a4a9bed8a865684a8a79960489280e..d7acc7aeb744b19d83acb520d07c8551168dd096 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -1,270 +1,146 @@ -PaddlePaddle in Docker Containers +Run in Docker Containers ================================= -Docker container is currently the only officially-supported way to -running PaddlePaddle. This is reasonable as Docker now runs on all -major operating systems including Linux, Mac OS X, and Windows. -Please be aware that you will need to change `Dockers settings -`_ to make full use -of your hardware resource on Mac OS X and Windows. +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . -Working With Docker -------------------- +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. -Docker is simple as long as we understand a few basic concepts: +After you've read above tutorials you may proceed the following steps. -- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type +.. _docker_pull: - .. code-block:: bash - - docker images +Pull PaddlePaddle Docker Image +------------------------------ - to list all images in the system. We can also run +Run the following command to download the latest Docker images: .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0 - to download a Docker image, paddlepaddle/paddle in this example, - from Dockerhub.com. + docker pull paddlepaddle/paddle -- *container*: considering a Docker image a program, a container is a - "process" that runs the image. Indeed, a container is exactly an - operating system process, but with a virtualized filesystem, network - port space, and other virtualized environment. We can type +For users in China, we provide a faster mirror: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0 + docker pull docker.paddlepaddle.org/paddle - to start a container to run a Docker image, paddlepaddle/paddle in this example. - -- By default docker container have an isolated file system namespace, - we can not see the files in the host file system. By using *volume*, - mounted files in host will be visible inside docker container. - Following command will mount current dirctory into /data inside - docker container, run docker container from debian image with - command :code:`ls /data`. +Download GPU version images: .. code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -Usage of CPU-only and GPU Images ----------------------------------- - -We package PaddlePaddle's compile environment into a Docker image, -called the develop image, it contains all compiling tools that -PaddlePaddle needs. We package compiled PaddlePaddle program into a -Docker image as well, called the production image, it contains all -runtime environment that running PaddlePaddle needs. For each version -of PaddlePaddle, we release both of them. Production image includes -CPU-only version and a CUDA GPU version and their no-AVX versions. - -We put the docker images on `dockerhub.com -`_. You can find the -latest versions under "tags" tab at dockerhub.com. - -** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.** - - -1. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - -2. Production images, this image might have multiple variants: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - Please be aware that the CPU-only and the GPU images both use the - AVX instruction set, but old computers produced before 2008 do not - support AVX. The following command checks if your Linux computer - supports AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - **NOTE:versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.** - To run the CPU-only image as an interactive container: - - .. code-block:: bash - - docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash - - Above method work with the GPU image too -- the recommended way is - using `nvidia-docker `_. - - Please install nvidia-docker first following this `tutorial - `_. - - Now you can run a GPU image: - - .. code-block:: bash - - nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash - - -Train Model Using Python API ----------------------------- - -Our official docker image provides a runtime for PaddlePaddle -programs. The typical workflow will be as follows: - -Create a directory as workspace: - -.. code-block:: bash - - mkdir ~/workspace - -Edit a PaddlePaddle python program using your favourite editor - -.. code-block:: bash - - emacs ~/workspace/example.py - -Run the program using docker: - -.. code-block:: bash - - docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py - -Or if you are using GPU for training: + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu -.. code-block:: bash +Choose between different BLAS version: - nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py - -Above commands will start a docker container by running :code:`python -/workspace/example.py`. It will stop once :code:`python -/workspace/example.py` finishes. - -Another way is to tell docker to start a :code:`/bin/bash` session and -run PaddlePaddle program interactively: - -.. code-block:: bash - - docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash - # now we are inside docker container - cd /workspace - python example.py - -Running with GPU is identical: - -.. code-block:: bash - - nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash - # now we are inside docker container - cd /workspace - python example.py - - -Develop PaddlePaddle or Train Model Using C++ API ---------------------------------------------------- - -We will be using PaddlePaddle development image since it contains all -compiling tools and dependencies. + .. code-block:: bash -1. Build PaddlePaddle develop image + # image using MKL by default + docker pull paddlepaddle/paddle + # image using OpenBLAS + docker pull paddlepaddle/paddle:latest-openblas - Use following command to build PaddlePaddle develop image: - .. code-block:: bash +If you want to use legacy versions, choose a tag from +`DockerHub `_ +and run: - git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle - docker build -t paddle:dev . - -2. Build PaddlePaddle production image + .. code-block:: bash - There are two steps for building production image, the first step is to run: + docker pull paddlepaddle/paddle:[tag] + # i.e. + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - .. code-block:: bash +.. _docker_run: - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev +Launch your training program in Docker +------------------------------ - The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. +Assume that you have already written a PaddlePaddle program +named :code:`train.py` under directory :code:`/home/work` (refer to +`PaddlePaddleBook `_ +for more samples), then run the following command: - The second step is to run: + .. code-block:: bash - .. code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py - docker build -t paddle:prod -f build/Dockerfile ./build +In the above command, :code:`-it` means run the container interactively; +:code:`-v $PWD:/work` means mount the current directory ($PWD will expand +to current absolute path in Linux) under :code:`/work` in the container. +:code:`paddlepaddle/paddle` to specify image to use; finnally +:code:`/work/train.py` is the command to run inside docker. - The above command will generate the production image by copying the compiled PaddlePaddle program into the image. +Also, you can go into the container shell, run or debug your code +interactively: -3. Run unit test + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - Following command will run unit test: +**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.** - .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +.. _docker_run_book: PaddlePaddle Book ------------------ -The Jupyter Notebook is an open-source web application that allows -you to create and share documents that contain live code, equations, -visualizations and explanatory text in a single browser. - -PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. -We already exposed port 8888 for this book. If you want to +You can create a container serving PaddlePaddle Book using Jupyter Notebook in +one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook +for users and developers.If you want to dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. We provide a packaged book image, simply issue the command: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book Then, you would back and paste the address into the local browser: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ That's all. Enjoy your journey! +.. _docker_run_gpu: -Documentation -------------- +Train with Docker with GPU +------------------------------ -Paddle Docker images include an HTML version of C++ source code -generated using `woboq code browser -`_. This makes it easy -for users to browse and understand the C++ source code. +We recommend using +`nvidia-docker `_ +to run GPU training jobs. Please ensure you have latest +GPU driver installed before move on. -As long as we give the Paddle Docker container a name, we can run an -additional Nginx Docker container to serve the volume from the Paddle -container: + .. code-block:: bash -.. code-block:: bash + nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash - docker run -d --name paddle-cpu-doc paddle: - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx +**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** + .. code-block:: bash -Then we can direct our Web browser to the HTML version of source code -at http://localhost:8088/paddle/ + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu + +**About AVX:** + +AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations. +The latest PaddlePaddle Docker image turns AVX on by default, so, if your +computer doesn't support AVX, you'll probably need to +`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`. + +The following command will tell you whether your computer supports AVX. + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index dd9923697ab85825557aa89a08870bece7c76673..c9ba84c842b530162c92713046e64fdf82bd441b 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -6,22 +6,28 @@ 安装流程 ++++++++ -PaddlePaddle提供Docker镜像来部署环境。 +PaddlePaddle提供pip和Docker的安装方式: .. toctree:: :maxdepth: 1 - - docker_install_cn.rst + pip_install_cn.rst + docker_install_cn.rst + ../../howto/dev/build_cn.md 编译流程 ++++++++ .. warning:: - 编译流程主要推荐高级用户查看,普通用户请走安装流程。 + 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。 .. toctree:: :maxdepth: 1 - cmake/build_from_source_cn.rst + build_from_source_cn.rst + +常见问题解答 +++++++++++ + +`常见问题解答 `_ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index 8a53588e0439df8f4d5fd529b7a20262c67d4e58..32d66d63dd5b2a30d5de4a088dc80b680830cb84 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -1,22 +1,34 @@ Install and Build ================= -Install PaddlePaddle ----------------------- +.. _install_steps: -.. toctree:: - :maxdepth: 1 +Install Steps +++++++++ + +You can choose either pip or Docker to complete your install: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + docker_install_en.rst + ../../howto/dev/build_en.md - docker_install_en.rst Build from Source ----------------- .. warning:: - Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. + We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary. .. toctree:: :maxdepth: 1 build_from_source_en.md + +FAQ +++++++++++ + +`FAQ `_ diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b270e2c2f0b0cbfd6fb4b9b0750d207952f84d76 --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -0,0 +1,86 @@ +使用pip安装 +================================ + +PaddlePaddle可以使用常用的Python包管理工具 +`pip `_ +完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 + +.. _pip_install: + +使用pip安装 +------------------------------ + + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 + + .. code-block:: bash + + pip install paddlepaddle + + +如果需要安装支持GPU的版本,需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, +您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +运行环境依赖 +------------------------------ + +PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。 + +PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。 + +.. csv-table:: PaddlePaddle环境依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上" + "Python", "2.7.x", "暂时不支持Python3" + "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号" + "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号" + +.. _pip_faq: + +安装常见问题和解决方法 +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip: + + .. code-block:: bash + + pip install --upgrade pip + + 如果仍然存在问题,可以执行: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + 获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。 + + 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..70f601a11c610e0a2b5dcc8b73d2c3ea19e195e1 --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -0,0 +1,104 @@ +Install Using pip +================================ + +You can use current widely used Python package management +tool `pip `_ +to install PaddlePaddle. This method can be used in +most of current Linux systems or MacOS. + +.. _pip_install: + +Install Using pip +------------------------------ + +Run the following command to install PaddlePaddle on the current +machine, it will also download requirements. + + .. code-block:: bash + + pip install paddlepaddle + + +If you wish to install GPU version, just run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +If you wish to install the latest develop branch PaddlePaddle, +you can download the latest whl package from our CI system. Access +the below links, log in as guest, then click at the "Artifact" +tab, you'll find the download link of whl packages. + +If the links below shows up the login form, just click "Log in as guest" to start the download: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: whl package of each version + :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +Runtime Dependency +------------------------------ + +PaddlePaddle installation packages (whl) does not only contain .py files, +but also binaries built from C++ code. We ensure that PaddlePaddle can +run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04 +and MacOS 10.12. + +PaddlePaddle whl packages are trying to satisfy +`manylinux1 `_ +standard, which uses CentOS 5 as default build environment. But CUDA libraries +seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime, +so we use CentOS 6 as default build environment. + +.. csv-table:: PaddlePaddle Runtime Deps + :header: "Dependency", "version", "description" + :widths: 10, 15, 30 + + "OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later" + "Python", "2.7.x", "Currently Python3 is not supported" + "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols" + "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols" + +.. _pip_faq: + +FAQ +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + The main cause of this issue is that your current platform is + not supported. Please check that you are using Python 2.7 series. + Besides, pypi only supports manylinux1 standard, you'll need to + upgrade your pip to >9.0.0. Then run the below command: + + .. code-block:: bash + + pip install --upgrade pip + + If the problem still exists, run the following command: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + Then you'll get supported package suffixes, then check if it matches + the file name of the whl package. You can find default whl package at + `here `_ + + If your system supports linux_x86_64 but the whl package is manylinux1_x86_64, + you'll need to update pip to the latest version; If your system supports + manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the + file to manylinux1_x86_64 suffix and then install. diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index aa418c657a4ba16cce61c030066f4d3e14e891cc..a9087be6f350c5656cabb0c64ba0f200d1c666cc 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -1,10 +1,61 @@ 新手入门 ============ +.. _quick_install: + +快速安装 +++++++++ + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装: + + .. code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本,需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考: + .. toctree:: :maxdepth: 1 build_and_install/index_cn.rst - concepts/use_concepts_cn.rst -- `深度学习入门课程 `_ +.. _quick_start: + +快速开始 +++++++++ + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_cn.rst diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index be3253e3d41b99a2b696e2c5ef6463ed49680d69..d14e3f5c0cc90792fce9cb82e65da482c44dc433 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -1,9 +1,61 @@ GET STARTED ============ +.. _quick_install: + +Quick Install +---------------------- + +You can use pip to install PaddlePaddle with a single command, supports +CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. +Simply run the following command to install: + + .. code-block:: bash + + pip install paddlepaddle + +If you need to install GPU version, run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: + .. toctree:: :maxdepth: 1 build_and_install/index_en.rst -- `Deep Learning 101 `_ + +.. _quick_start: + +Quick Start +++++++++ + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data. diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md deleted file mode 100644 index 90dc84718c9ce1374cda6022de177afeeb60279d..0000000000000000000000000000000000000000 --- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md +++ /dev/null @@ -1,75 +0,0 @@ -# 构建Android平台上的PaddlePaddle库 - -用户可通过交叉编译的方式,在用户熟悉的开发平台(Linux,Mac OS X和Windows)上编译Android平台上适用的PaddlePaddle库。 -本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 - -## 准备交叉编译环境 - -从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn),用户可自行前往下载预编译好的版本,也可通过以下命令获取: - -```bash -wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip -unzip -q android-ndk-r14b-linux-x86_64.zip -``` - -Android NDK中包含了所有Android API级别、所有架构(arm/arm64/x86/mips)需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别,构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。 -比如: - -```bash -your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ - --arch=arm --platform=android-21 --install-dir=your/path/to/my_standalone_toolchain -``` - -此命令将在your/path/to/my_standalone_toolchain目录生成一套编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,使用的编译器为arm-linux-androideabi-gcc (GCC) 4.9。 - -注意:**PaddlePaddle要求使用的编译工具链所支持的Andoid API级别不小于21**。 - -## 配置交叉编译参数 - -CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 - -交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: -- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 -- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 -- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 - -Android平台可选配置参数: - -- `ANDROID_STANDALONE_TOOLCHAIN`,独立工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别;否则,用户需要在cmake时手动设置这些值。无默认值。 -- `ANDROID_ABI`,目标架构ABI。目前只支持`armeabi-v7a`,默认值为`armeabi-v7a`。 -- `ANDROID_NATIVE_API_LEVEL`,工具链的Android API级别。若没有显式设置,PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。 -- `ANROID_ARM_MODE`,是否使用ARM模式。可设置`ON/OFF`,默认值为`ON`。 -- `ANDROID_ARM_NEON`,是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 - -其他配置参数: - -- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 - -一种常用的cmake配置如下: - -```bash -cmake -DCMAKE_SYSTEM_NAME=Android \ - -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/my_standalone_toolchain \ - -DANDROID_ABI=armeabi-v7a \ - -DANDROID_ARM_NEON=ON \ - -DANDROID_ARM_MODE=ON \ - -DCMAKE_INSTALL_PREFIX=your/path/to/install \ - -DWITH_C_API=ON \ - -DWITH_SWIG_PY=OFF \ - .. -``` - -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 - -## 编译和安装 - -CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 - -```bash -make -make install -``` - -注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 - -执行完安装命令后,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Android版本的库。自此,PaddlePaddle的已经安装完成,用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中,调用方法见C-API文档。 diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md deleted file mode 100644 index 085b5dda1615a9af918b59870db460fcc5acdcca..0000000000000000000000000000000000000000 --- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md +++ /dev/null @@ -1,65 +0,0 @@ -# 构建Raspberry Pi平台上的PaddlePaddle库 - -对于Rasspberry Pi系统,用户可通过ssh等方式登录到Raspberry Pi系统上,按照[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档所述,直接编译Raspberry Pi平台上适用的PaddlePaddle库。 - -用户也可以在自己熟悉的开发平台上,通过交叉编译的方式来编译。这篇文档将以Linux x86-64平台为例,介绍交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 - -## 准备交叉编译环境 - -从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。用户可自行前往[github](https://github.com/raspberrypi/tools)下载Raspberry Pi平台使用的C/C++交叉编译工具链,也可通过以下命令获取: - -```bash -git clone https://github.com/raspberrypi/tools.git -``` - -该github仓库中包含若干个预编译好的、针对不同平台的编译工具。宿主机是Linux x86-64环境,则需选用`arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`下的作为编译工具,所使用的编译器为arm-linux-gnueabihf-gcc 4.8.3。 - -注意,该编译工具链需要系统glibc支持2.14以上。 - -## 配置交叉编译参数 - -CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake),以提供一些默认的编译器和编译参数相关配置。 - -交叉编译Raspberry Pi版本PaddlePaddle库时,有一些必须配置的参数: - -- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 - -Raspberry Pi平台可选配置参数: - -- `RPI_TOOLCHAIN`,编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 -- `RPI_ARM_NEON`,是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 - -其他配置参数: - -- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 - -cmake参数如下; - -``` -cmake -DCMAKE_SYSTEM_NAME=RPi \ - -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ - -DRPI_ARM_NEON=ON \ - -DCMAKE_INSTALL_PREFIX=your/path/to/install \ - -DWITH_GPU=OFF \ - -DWITH_C_API=ON \ - -DWITH_PYTHON=OFF \ - -DWITH_SWIG_PY=OFF \ - .. -``` - -用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 - -## 编译和安装 - -CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。 - -```bash -make -make install -``` - -注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 - -执行完安装命令后,由于上一步cmake配置中`WITH_C_API`设置为`ON`,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 - -更多的编译配置见[源码编译PaddlePaddle](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html)相关文档。 diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst index 4d684cf8ad5a8082cf31fb27027119b3d3e700b6..63fa161fafed0f3a8ec8799af21304cbec62d813 100644 --- a/doc/howto/deep_model/rnn/rnn_config_cn.rst +++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst @@ -21,7 +21,7 @@ wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py