diff --git a/.clang-format b/.clang-format index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644 --- a/.clang-format +++ b/.clang-format @@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false ... - diff --git a/.clang_format.hook b/.clang_format.hook new file mode 100755 index 0000000000000000000000000000000000000000..1d928216867c0ba3897d71542fea44debf8d72a0 --- /dev/null +++ b/.clang_format.hook @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +readonly VERSION="3.8" + +version=$(clang-format -version) + +if ! [[ $version == *"$VERSION"* ]]; then + echo "clang-format version check failed." + echo "a version contains '$VERSION' is needed, but get '$version'" + echo "you can install the right version, and make an soft-link to '\$PATH' env" + exit -1 +fi + +clang-format $@ diff --git a/.gitignore b/.gitignore index 2b30f7938c8a1672acd0a14b7051af12c37889fb..ac56a3320ec85769d2c87c072512f5217eca0c24 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,16 @@ third_party/ *~ bazel-* third_party/ + +# clion workspace. +cmake-build-* + +# generated while compiling +python/paddle/v2/fluid/core.so +paddle/pybind/pybind.h +CMakeFiles +cmake_install.cmake +paddle/.timestamp +python/paddlepaddle.egg-info/ +paddle/pybind/pybind.h +python/paddle/version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b138576fcc695408c4cc0a03d227da7d0c6f440..59661c9c1da53a2ddac0127ed1827fedde811a1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,8 +3,8 @@ hooks: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ -- repo: https://github.com/reyoung/mirrors-yapf.git - sha: v0.13.2 +- repo: https://github.com/PaddlePaddle/mirrors-yapf.git + sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ @@ -17,7 +17,17 @@ - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer -- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git - sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 +- repo: local hooks: - - id: clang-formater + - id: clang-format-with-version-check + name: clang-format + description: Format files with ClangFormat. + entry: bash ./.clang_format.hook -i + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ +- repo: https://github.com/PaddlePaddle/pre-commit-golang + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 + hooks: + - id: go-fmt + types: + - go diff --git a/.travis.yml b/.travis.yml index 865e21f046b7f3ac4bc3de09c1300a0a1d0337d4..e2d49daa1981396628efa5d16459eb70e9e76884 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,30 +1,28 @@ language: cpp cache: directories: - - $HOME/third_party - $HOME/.ccache - $HOME/.cache/pip + - $TRAVIS_BUILD_DIR/build/third_party sudo: required dist: trusty os: - linux env: - - JOB=DOCS - - JOB=BUILD_AND_TEST - - JOB=PRE_COMMIT + - JOB=build_doc + - JOB=check_style addons: apt: packages: - gcc-4.8 - g++-4.8 - - gfortran-4.8 - git - build-essential - python - python-pip - python2.7-dev - - python-numpy - python-wheel + - libboost-dev - curl - swig - graphviz @@ -32,28 +30,27 @@ addons: - automake - libtool - ccache + ssh_known_hosts: 52.76.173.135 before_install: - - | - if [ ${JOB} == "BUILD_AND_TEST" ]; then - local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE` - if [ $? -eq 0 ]; then # if git diff return no zero, then rerun unit test. - if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)' - then - echo "Only markdown docs were updated, stopping build process." - exit - fi - fi - fi - - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python + - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi + # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker + - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt + - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - - | - timeout 2580 paddle/scripts/travis/main.sh # 43min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; + - | + timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout + RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi; + - | + if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; + if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; + if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; + export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh + export DOCS_DIR=`pwd` + cd .. + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc notifications: email: on_success: change diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 0000000000000000000000000000000000000000..4db4a4a8e7441b07ce2db4adff043bb99a09014b --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,48 @@ +| Github account | name | +|---|---| +| backyes | Yan-Fei Wang | +| beckett1124 | Bin Qi | +| Canpio | Jia-Yi Feng | +| chengxiaohua1105 | Xiao-Hua Cheng | +| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | +| cxysteven | Xing-Yi Cheng | +| dzhwinter | Zhi-Hong Dong | +| emailweixu | Wei Xu | +| gangliao | Gang Liao | +| gongweibao | Wei-Bao Gong | +| Guo Sheng | Sheng Guo | +| Haichao-Zhang | Hai-Chao Zhang | +| hedaoyuan | Dao-Yuan He | +| helinwang | He-Lin Wang | +| jacquesqiao | Long-Fei Qiao | +| kuke | Yi-Bing Liu | +| lcy-seso | Ying Cao | +| lipeng-unisound | Peng Li | +| liuyuan | Yuan Liu | +| livc | Zhao Li | +| llxxxll | Yong-Feng Liu | +| luotao01 | Tao Luo | +| lzhao4ever | Liang Zhao | +| NHZlX | Zhao-Long Xing | +| pakchoi | Chuan-Jiang Song | +| pengli09 | Peng Li | +| pkuyym | Ya-Ming Yang | +| QiJune | Jun Qi | +| qingqing01 | Qing-Qing Dang | +| reyoung | Yang Yu | +| Superjom | Chun-Wei Yan | +| tianbingsz | Tian-Bing Xu | +| typhoonzero | Yi Wu | +| wanghaoshuang | Hao-Shuang Wang | +| wangyang59 | Yang Wang | +| wangzhen-nlp | Zhen Wang | +| wen-bo-yang | Wen-Bo Yang | +| wwhu | Wei-Wei Hu | +| xinghai-sun | Xing-Hai Sun | +| Xreki | Yi-Qun Liu | +| xujun05 | Jun Xu | +| xushaoyong | Shao-Yong Xu | +| Yancey1989 | Xu Yan | +| zhaopu7 | Pu Zhao | +| zhouxiao-coder | Xiao Zhou | +| Zrachel | Rui-Qing Zhang | diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a59db8c71bf3b1ea472c1ee56a1cd97de42dad8..b309ff37e52b4fd28b14925bdd7e3740e1e2fa47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,29 +1,29 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") -set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) +cmake_minimum_required(VERSION 3.0) +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") include(system) -if(ANDROID) - cmake_minimum_required(VERSION 3.7) -else() - cmake_minimum_required(VERSION 3.0) -endif() - -project(paddle CXX C) +project(paddle CXX C Go) +message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION}) +message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION}) find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) @@ -31,12 +31,16 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) +if(NOT ANDROID AND NOT IOS) + find_package(Boost QUIET) +endif() include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -50,6 +54,13 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) +option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) +option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) +option(GLIDE_INSTALL "Download and install go dependencies " ON) +option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) +option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) +option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -58,25 +69,61 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(ANDROID) - if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21") +if(ANDROID OR IOS) + if(ANDROID) + if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") + message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") + endif() endif() set(WITH_GPU OFF CACHE STRING - "Disable GPU when cross-compiling for Android" FORCE) + "Disable GPU when cross-compiling for Android and iOS" FORCE) set(WITH_AVX OFF CACHE STRING - "Disable AVX when cross-compiling for Android" FORCE) + "Disable AVX when cross-compiling for Android and iOS" FORCE) set(WITH_PYTHON OFF CACHE STRING - "Disable PYTHON when cross-compiling for Android" FORCE) + "Disable PYTHON when cross-compiling for Android and iOS" FORCE) set(WITH_RDMA OFF CACHE STRING - "Disable RDMA when cross-compiling for Android" FORCE) -endif(ANDROID) + "Disable RDMA when cross-compiling for Android and iOS" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_GOLANG OFF CACHE STRING + "Disable golang when cross-compiling for Android and iOS" FORCE) + + # Compile PaddlePaddle mobile inference library + if (NOT WITH_C_API) + set(WITH_C_API ON CACHE STRING + "Always compile the C_API when cross-compiling for Android and iOS" FORCE) + endif() + set(MOBILE_INFERENCE ON) + add_definitions(-DPADDLE_MOBILE_INFERENCE) +endif() -set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") + +if (WITH_C_API AND WITH_PYTHON) + message(WARNING "It is suggest not embedded a python interpreter in Paddle " + "when using C-API. It will give an unpredictable behavior when using a " + "different Python interpreter from compiling.") +endif() + +if(MOBILE_INFERENCE) + set(THIRD_PARTY_BUILD_TYPE MinSizeRel) +else() + set(THIRD_PARTY_BUILD_TYPE Release) +endif() + +set(WITH_MKLML ${WITH_MKL}) +if (WITH_MKL AND AVX2_FOUND) + set(WITH_MKLDNN ON) +else() + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + set(WITH_MKLDNN OFF) +endif() + ######################################################################################## +include(external/mklml) # download mklml package include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -84,24 +131,34 @@ include(external/gtest) # download, build, install gtest include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas +include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any +include(external/eigen) # download eigen3 +include(external/pybind11) # download pybind11 +include(external/nccl) +include(external/cares) +include(external/grpc) +include(cudnn) # set cudnn libraries, must before configure +include(configure) # add paddle env configuration +include(generic) # simplify cmake module include(package) # set paddle packages include(cpplint) # set paddle c++ style include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(rdma) # set rdma libraries include(flags) # set paddle compile flags -include(cudnn) # set cudnn libraries include(version) # set PADDLE_VERSION include(coveralls) # set code coverage -include(configure) # add paddle env configuration -include_directories("${PROJ_ROOT}") -include_directories("${PROJ_ROOT}/paddle/cuda/include") + +include_directories("${PADDLE_SOURCE_DIR}") +include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") +include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") +include_directories(${Boost_INCLUDE_DIRS}) set(EXTERNAL_LIBS ${GFLAGS_LIBRARIES} @@ -113,15 +170,41 @@ set(EXTERNAL_LIBS ) if(WITH_GPU) - list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) - if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) - endif(NOT WITH_DSO) + include(cuda) endif(WITH_GPU) +if(WITH_MKLML) + list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) +endif() + +if(WITH_MKLDNN) + list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) +endif() + +if(USE_NNPACK) + include(external/nnpack) + list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) +endif(USE_NNPACK) + add_subdirectory(proto) + +if(NOT MOBILE_INFERENCE) + # "add_subdirectory(go)" should be placed after the following loine, + # because it depends on paddle/optimizer. + add_subdirectory(paddle/optimizer) +endif() + +# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be +# placed after this block, because they depends on it. +if(WITH_GOLANG) + add_subdirectory(go) +endif(WITH_GOLANG) + +set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") add_subdirectory(paddle) -add_subdirectory(python) +if(WITH_PYTHON) + add_subdirectory(python) +endif() if(WITH_DOC) add_subdirectory(doc) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0d4bb973ae87bb45ef4386a63c26ed62602f2cee..a60453ff4e3bba6e6cb3b3de915dd69afd3a1ec3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1 +1,157 @@ -./doc/howto/dev/contribute_to_paddle_en.md +# Contribute Code + +We sincerely appreciate your contribution. This document explains our workflow and work style. + +## Workflow + +PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions. + +1. Fork + + Our development community has been growing fastly; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/). + +1. Clone + + To make a copy of your fork to your local computers, please run + + ```bash + git clone https://github.com/your-github-account/paddle + cd paddle + ``` + +1. Create the local feature branch + + For daily works like adding a new feature or fixing a bug, please open your feature branch before coding: + + ```bash + git checkout -b my-cool-stuff + ``` + +1. Commit + + Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands: + + ```bash + pip install pre-commit + pre-commit install + ``` + + Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + + Once installed, `pre-commit` checks the style of code and documentation in every commit. We will see something like the following when you run `git commit`: + + ``` + ➜ git commit + CRLF end-lines remover...............................(no files to check)Skipped + yapf.................................................(no files to check)Skipped + Check for added large files..............................................Passed + Check for merge conflicts................................................Passed + Check for broken symlinks................................................Passed + Detect Private Key...................................(no files to check)Skipped + Fix End of Files.....................................(no files to check)Skipped + clang-formater.......................................(no files to check)Skipped + [my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 + ``` + +1. Build and test + + Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). + +1. Keep pulling + + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts. + + ```bash + git remote add upstream https://github.com/PaddlePaddle/Paddle + git pull upstream develop + ``` + +1. Push and file a pull request + + You can "push" your local work into your forked repo: + + ```bash + git push origin my-cool-stuff + ``` + + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one. + + To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). + + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. Github would close the issue when the owners merge your pull request. + + Please remember to specify some reviewers for your pull request. If you don't know who are the right ones, please follow Github's recommendation. + + +1. Delete local and remote branches + + To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + git push origin :my-cool-stuff + git checkout develop + git pull upstream develop + git branch -d my-cool-stuff + ``` + +### Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce the unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + + +## Coding Standard + +### Code Style + +Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html). + +Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). + +Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default. This flag is on + +Please install pre-commit, which automatically reformat the changes to C/C++ and Python code whenever we run `git commit`. To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43). + +### Unit Tests + +Please remember to add related unit tests. + +- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + + +### Writing Logs + +We use [glog](https://github.com/google/glog) for logging in our C/C++ code. + +For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ). + +`VLOG` requires a *verbose level* parameter. For example: + +```c++ +VLOG(3) << "Operator FC is taking " << num_inputs << "inputs." +``` + +When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example: + +```bash +GLOG_vmodule=buddy_allocator=2 \ +GLOG_v=10 \ +python \ +../python/paddle/v2/framework/tests/test_recurrent_op.py +``` + +This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows: + +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) diff --git a/Dockerfile b/Dockerfile index f12be36ceb764a535e8a87b7071757f1ef3dada7..857d3f3e5f64791146741ffb29feabfcb2ecbb84 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04 +FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 MAINTAINER PaddlePaddle Authors ARG UBUNTU_MIRROR @@ -10,26 +10,40 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ARG WITH_GPU ARG WITH_AVX ARG WITH_DOC -ARG WITH_STYLE_CHECK ENV WOBOQ OFF -ENV WITH_GPU=${WITH_GPU:-OFF} +ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} ENV WITH_DOC=${WITH_DOC:-OFF} -ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} ENV HOME /root # Add bash enhancements COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ - apt-get install -y git python-pip python-dev openssh-server bison && \ - apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \ - apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \ - apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \ - apt-get install -y automake locales clang-format-3.8 swig doxygen && \ + apt-get install -y \ + git python-pip python-dev openssh-server bison libnccl-dev \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ + curl sed grep graphviz libjpeg-dev zlib1g-dev \ + python-matplotlib gcc-4.8 g++-4.8 \ + automake locales clang-format swig doxygen cmake \ + liblapack-dev liblapacke-dev libboost-dev \ + clang-3.8 llvm-3.8 libclang-3.8-dev \ + net-tools libtool && \ apt-get clean -y +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + # git credential to skip password typing RUN git config --global credential.helper store @@ -39,18 +53,29 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # version util jupyter fixes this issue. RUN pip install --upgrade pip && \ - pip install -U 'protobuf==3.1.0' && \ - pip install -U wheel pillow BeautifulSoup && \ + pip install -U wheel && \ pip install -U docopt PyYAML sphinx && \ - pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ - pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' + pip install -U sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip install pre-commit 'ipython==5.3.0' && \ + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python + +COPY ./python/requirements.txt /root/ +RUN pip install -r /root/requirements.txt + +# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +RUN apt-get install -y libssl-dev libffi-dev +RUN pip install certifi urllib3[secure] -RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \ - cd .. && rm -rf cmake-3.4.1 -VOLUME ["/woboq_out"] +# Install woboq_codebrowser to /woboq +RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + (cd /woboq \ + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release . \ + make) # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd diff --git a/Dockerfile.android b/Dockerfile.android new file mode 100644 index 0000000000000000000000000000000000000000..9d13a414f67be04e17b7d83403228d92bce0eda9 --- /dev/null +++ b/Dockerfile.android @@ -0,0 +1,54 @@ +FROM ubuntu:16.04 +MAINTAINER PaddlePaddle Authors + +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +# ENV variables +ARG ANDROID_ABI +ARG ANDROID_API + +ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} +ENV ANDROID_API=${ANDROID_API:-21} + +ENV HOME=/root \ + ANDROID_NDK_HOME=/opt/android-ndk-linux \ + ANDROID_TOOLCHAINS_DIR=/opt/toolchains + +RUN apt-get update && \ + apt-get install -y \ + git python-dev python-pip python-numpy \ + wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ + apt-get clean -y + +# Install Go and glide +RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN pip install --upgrade pip && \ + pip install -U 'protobuf==3.1.0' && \ + pip install -U wheel sphinx && \ + pip install pre-commit + +# Android NDK +RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \ + mkdir -p /opt/android-ndk-tmp && \ + cd /opt/android-ndk-tmp && \ + wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ + unzip -q android-ndk-r14b-linux-x86_64.zip && \ + mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ + rm -rf /opt/android-ndk-tmp + +CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"] diff --git a/README.md b/README.md index bcc24b84128df282a2e3f0bc62aafe1ffe172338..db0fbd88b250cdc2a3cc77521cc1c2cea77c6e87 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -51,45 +51,46 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl - **Connected to Products** In addition, PaddlePaddle is also designed to be easily deployable. At Baidu, - PaddlePaddle has been deployed into products or service with a vast number + PaddlePaddle has been deployed into products and services with a vast number of users, including ad click-through rate (CTR) prediction, large-scale image classification, optical character recognition(OCR), search ranking, computer virus detection, recommendation, etc. It is widely utilized in products at - Baidu and it has achieved a significant impact. We hope you can also exploit - the capability of PaddlePaddle to make a huge impact for your product. + Baidu and it has achieved a significant impact. We hope you can also explore + the capability of PaddlePaddle to make an impact on your product. ## Installation It is recommended to check out the -[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) +[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) before looking into the -[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html) +[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html). ## Documentation -We provide [English](http://www.paddlepaddle.org/develop/doc/) and -[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation. +We provide [English](http://doc.paddlepaddle.org/develop/doc/) and +[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation. -- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) +- [Deep Learning 101](http://book.paddlepaddle.org/index.html) - You might want to start from the this online interactive book that can run in Jupyter Notebook. + You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) +- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) You can run distributed training jobs on MPI clusters. -- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) +- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) You can also run distributed training jobs on Kubernetes clusters. -- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html) +- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html) Our new API enables much shorter programs. -- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) +- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) We appreciate your contributions! + ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). diff --git a/RELEASE.cn.md b/RELEASE.cn.md old mode 100755 new mode 100644 index 5deaf230a8f5dd3089993f0fc79b9460fd049750..494c59730dd3c2830514e8924aa3d59a34ac412e --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,3 +1,62 @@ +# v0.11.0版本 + +## PaddlePaddle Fluid + +- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 +源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 + + +## 新特点 + +* 发布 `PaddlePaddle Fluid`。 +* 增加了用于模型预测的C-API。 +* 用Fluid API实现了一个简单的GAN的例子。 +* 增加了关于性能调优的文档。 +* 为`paddle.v2.dataset`下载数据集提供了重试机制. +* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 +* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment). +* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。 +* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 + - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet + - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 +* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) +* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) +* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) +* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) +* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) +* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) +* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* 增加移动端友好的网页 + +## 改进 + +* 使用一个Python`whl`包即可安装. +* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 +* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 +* 删除了有一些bug的BarrierStat。 +* 清理和删除了paddle::Parameter中未使用的函数。 +* 删除了ProtoDataProvider。 +* Huber loss同时支持回归和分类。 +* 为sequence pooling 层增加`stride`参数。 +* v2 API自动使用cudnn batch normalization。 +* 可以使用一个固定的参数名共享BN层的参数。 +* 2D convolution operation支持variable-dimension input特性。 +* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 +* 优化网页导航。 + +## 错误修复 + +* 修复ROI pooling的Bug. cc9a761 +* 修复当label是dense vector是AUC变成0的问题. #5274 +* 修复WarpCTC 层的Bug. + + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 diff --git a/RELEASE.md b/RELEASE.md index 9a09644b681b2ae4e922d92ed29500205c2a6ca4..5a62c955131007c9f3329d162c20d1b462550019 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,8 +1,78 @@ +# Release v0.11.0 + +## PaddlePaddle Fluid + +- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is + designed to allow users to program like PyTorch and TensorFlow Eager Execution. + In these systems, there is no longer the concept *model* and applications + do not include a symbolic description of a graph of operators nor a sequence + of layers. Instead, applications look exactly like a usual program that + describes a process of training or inference. The difference between + Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's + control-flow, `if-then-else` nor `for`. Instead, Fluid provides its + C++ implementations and their Python binding using the `with` statement. For an example + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program. +Executor works like an interpreter. In future version, we will improve +`Executor` into a debugger like GDB, and we might provide some compilers, +which, for example, takes an application like the above one, and outputs +an equivalent C++ source program, which can be compiled using +[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) +to generate binaries that use CUDA, or using +[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries +that make full use of Intel CPUs. + +## New Features + +* Release `PaddlePaddle Fluid`. +* Add C-API for model inference +* Use fluid API to create a simple GAN demo. +* Add develop guide about performance tunning. +* Add retry when download `paddle.v2.dataset`. +* Linking protobuf-lite not protobuf in C++. Reduce the binary size. +* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. +* A new style cmake functions for Paddle. It is based on Bazel API. +* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. + - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet + - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. +* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). +* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). +* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). +* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). +* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). +* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). +* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* Add mobile friendly webpages. + +## Improvements + +* Build and install using a single `whl` package. +* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). +* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. +* Remove buggy BarrierStat. +* Clean and remove unused functions in paddle::Parameter. +* Remove ProtoDataProvider. +* Huber loss supports both regression and classification. +* Add the `stride` parameter for sequence pooling layers. +* Enable v2 API use cudnn batch normalization automatically. +* The BN layer's parameter can be shared by a fixed the parameter name. +* Support variable-dimension input feature for 2D convolution operation. +* Refine cmake about CUDA to automatically detect GPU architecture. +* Improved website navigation. + +## Bug Fixes + +* Fix bug in ROI pooling. cc9a761 +* Fix AUC is zero when label is dense vector. #5274 +* Fix bug in WarpCTC layer. + # Release v0.10.0 -We are glad to release version 0.10.0. In this version, we are happy to -release the -new +We are glad to release version 0.10.0. In this version, we are happy to release the new [Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/). - Our old Python API is kind of out of date. It's hard to learn and hard to diff --git a/authors b/authors deleted file mode 100644 index daac4ec5d8173cba95df9f9b3c69c02b5256f5b2..0000000000000000000000000000000000000000 --- a/authors +++ /dev/null @@ -1,56 +0,0 @@ -Cao, Ying -Cheng, Yujuan -Dang, Qingqing -Dong, Tengfei -Du, Dalong -Feng, Shouqiang -Gao, Haoyuan -Han, Baochang -Han, Jinchen -Hao, Nanyu -He, Daoyuan -He, Zhengyan -Hou, Jue -Huang, Chang -Huang, Zhiheng -Hu, Na -Kong, Qi -Liao, Gang -Li, Bo -Li, Jiajie -Li, Jing -Li, Lei -Li, Peng -Liu, Sheng -Liu, Yuan -Li, Yuze -Luo, Heng -Luo, Tao -Lyu, Qin -Mao, Hongyue -Qian, Xiaojun -Qiao, Longfei -Qi, Jun -Qin, Duohao -Shen, Guolong -Shi, Guangchuan -Song, Xiang -Wang, Helin -Wang, Jiang -Wang, Yanfei -Wang, Yi -Wang, Yong -Weng, Renliang -Xu, Tianbing -Xu, Wei -Xu, Xingyu -Yan, Chong -Yan, Chunwei -Yang, Yi -Yu, Yang -Yu, Yinan -Zhang, Jian -Zhang, Ruiqing -Zhang, Weide -Zhao, Liang -Zhou, Jie diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md new file mode 100644 index 0000000000000000000000000000000000000000..26930a76377397cc36af8cf411b49b02e4f67748 --- /dev/null +++ b/benchmark/IntelOptimizedPaddle.md @@ -0,0 +1,57 @@ +# Benchmark + +Machine: + +- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket +- Laptop: TBD + +System: CentOS release 6.3 (Final), Docker 1.12.1. + +PaddlePaddle: (TODO: will rerun after 0.11.0) +- paddlepaddle/paddle:latest (for MKLML and MKL-DNN) + - MKL-DNN tag v0.11 + - MKLML 2018.0.1.20171007 +- paddlepaddle/paddle:latest-openblas (for OpenBLAS) + - OpenBLAS v0.2.20 + +On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. + +## Benchmark Model + +### Server +Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz + +Input image size - 3 * 224 * 224, Time: images/second + +- VGG-19 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| -----| --------| +| OpenBLAS | 7.80 | 9.00 | 10.80 | +| MKLML | 12.12 | 13.70 | 16.18 | +| MKL-DNN | 28.46 | 29.83 | 30.44 | + + + + - ResNet-50 + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 25.22 | 25.68 | 27.12 | +| MKLML | 32.52 | 31.89 | 33.12 | +| MKL-DNN | 81.69 | 82.35 | 84.08 | + + + + - GoogLeNet + +| BatchSize | 64 | 128 | 256 | +|--------------|-------| ------| -------| +| OpenBLAS | 89.52 | 96.97 | 108.25 | +| MKLML | 128.46| 137.89| 158.63 | +| MKL-DNN     | 250.46| 264.83| 269.50 | + + + +### Laptop +TBD diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..c3f67faf096fe9b45dd815f294b41679dc7c9e54 Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..b96ecd5ff940c0d000613b1ed1f11fb16796cf47 Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..f830ca6a87d10b72a5113636dd5686ab25a2e864 Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index bc893bab98c4d2e07c62fbd012d51a0939db4766..7059c13bd2c2b98eb3fbcf633a6f7064e54d5402 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -5,10 +5,22 @@ height = 224 width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) - -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +use_gpu = get_config_arg('use_gpu', bool, True) +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -16,6 +28,8 @@ settings( learning_method=MomentumOptimizer(0.9), regularization=L2Regularization(0.0005 * batch_size)) +conv_projection = conv_projection if use_gpu else img_conv_layer + def inception2(name, input, channels, \ filter1, filter3R, filter3, @@ -138,12 +152,11 @@ def inception(name, input, channels, \ cat = concat_layer( name=name, input=[cov1, cov3, cov5, covprj], - bias_attr=True, + bias_attr=True if use_gpu else False, act=ReluActivation()) return cat -lab = data_layer(name="label", size=1000) data = data_layer(name="input", size=3 * height * width) # stage 1 @@ -221,6 +234,10 @@ pool5 = img_pool_layer( dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) out3 = fc_layer( name="output3", input=dropout, size=1000, act=SoftmaxActivation()) -loss3 = cross_entropy(name='loss3', input=out3, label=lab) -outputs(loss3) +if is_infer: + outputs(out3) +else: + lab = data_layer(name="label", size=num_class) + loss3 = cross_entropy(name='loss3', input=out3, label=lab) + outputs(loss3) diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 1ac47212b5a75667e8e9d4465b33f575516e2836..927b1759941f362ef4b5ffe84dd01332986d9306 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs): settings.data_size = settings.height * settings.width * 3 else: settings.data_size = settings.height * settings.width - - settings.slots = [dense_vector(settings.data_size), integer_value(1)] + settings.is_infer = kwargs.get('is_infer', False) + if settings.is_infer: + settings.slots = [dense_vector(settings.data_size)] + else: + settings.slots = [dense_vector(settings.data_size), integer_value(1)] @provider( init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(1024): + for i in xrange(2560 if settings.is_infer else 1024): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() - lab = random.randint(0, settings.num_class) - yield img.astype('float32'), int(lab) + if settings.is_infer: + yield img.astype('float32') + else: + lab = random.randint(0, settings.num_class - 1) + yield img.astype('float32'), int(lab) diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f --- /dev/null +++ b/benchmark/paddle/image/resnet.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python +from paddle.trainer_config_helpers import * + +height = 224 +width = 224 +num_class = 1000 +batch_size = get_config_arg('batch_size', int, 64) +layer_num = get_config_arg("layer_num", int, 50) +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} +define_py_data_sources2( + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) + +settings( + batch_size=batch_size, + learning_rate=0.01 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) + + +#######################Network Configuration ############# +def conv_bn_layer(name, + input, + filter_size, + num_filters, + stride, + padding, + channels=None, + active_type=ReluActivation()): + """ + A wrapper for conv layer with batch normalization layers. + Note: + conv layer has no activation. + """ + + tmp = img_conv_layer( + name=name + "_conv", + input=input, + filter_size=filter_size, + num_channels=channels, + num_filters=num_filters, + stride=stride, + padding=padding, + act=LinearActivation(), + bias_attr=False) + return batch_norm_layer( + name=name + "_bn", + input=tmp, + act=active_type, + use_global_stats=is_infer) + + +def bottleneck_block(name, input, num_filters1, num_filters2): + """ + A wrapper for bottlenect building block in ResNet. + Last conv_bn_layer has no activation. + Addto layer has activation of relu. + """ + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=1, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[input, last_name], act=ReluActivation()) + + +def mid_projection(name, input, num_filters1, num_filters2, stride=2): + """ + A wrapper for middile projection in ResNet. + projection shortcuts are used for increasing dimensions, + and other shortcuts are identity + branch1: projection shortcuts are used for increasing + dimensions, has no activation. + branch2x: bottleneck building block, shortcuts are identity. + """ + # stride = 2 + branch1 = conv_bn_layer( + name=name + '_branch1', + input=input, + filter_size=1, + num_filters=num_filters2, + stride=stride, + padding=0, + active_type=LinearActivation()) + + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=stride, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[branch1, last_name], act=ReluActivation()) + + +img = data_layer(name='image', size=height * width * 3) + + +def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): + """ + A wrapper for 50,101,152 layers of ResNet. + res2_num: number of blocks stacked in conv2_x + res3_num: number of blocks stacked in conv3_x + res4_num: number of blocks stacked in conv4_x + res5_num: number of blocks stacked in conv5_x + """ + # For ImageNet + # conv1: 112x112 + tmp = conv_bn_layer( + "conv1", + input=img, + filter_size=7, + channels=3, + num_filters=64, + stride=2, + padding=3) + tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2) + + # conv2_x: 56x56 + tmp = mid_projection( + name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1) + for i in xrange(2, res2_num + 1, 1): + tmp = bottleneck_block( + name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256) + + # conv3_x: 28x28 + tmp = mid_projection( + name="res3_1", input=tmp, num_filters1=128, num_filters2=512) + for i in xrange(2, res3_num + 1, 1): + tmp = bottleneck_block( + name="res3_" + str(i), + input=tmp, + num_filters1=128, + num_filters2=512) + + # conv4_x: 14x14 + tmp = mid_projection( + name="res4_1", input=tmp, num_filters1=256, num_filters2=1024) + for i in xrange(2, res4_num + 1, 1): + tmp = bottleneck_block( + name="res4_" + str(i), + input=tmp, + num_filters1=256, + num_filters2=1024) + + # conv5_x: 7x7 + tmp = mid_projection( + name="res5_1", input=tmp, num_filters1=512, num_filters2=2048) + for i in xrange(2, res5_num + 1, 1): + tmp = bottleneck_block( + name="res5_" + str(i), + input=tmp, + num_filters1=512, + num_filters2=2048) + + tmp = img_pool_layer( + name='avgpool', + input=tmp, + pool_size=7, + stride=1, + pool_type=AvgPooling()) + + return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation()) + + +if layer_num == 50: + resnet = deep_res_net(3, 4, 6, 3) +elif layer_num == 101: + resnet = deep_res_net(3, 4, 23, 3) +elif layer_num == 152: + resnet = deep_res_net(3, 8, 36, 3) +else: + print("Wrong layer number.") + +if is_infer: + outputs(resnet) +else: + lbl = data_layer(name="label", size=num_class) + loss = cross_entropy(name='loss', input=resnet, label=lbl) + outputs(loss) diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..d795bcab1b7d098295066f79189d17e8299d28fb --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn_infer.sh @@ -0,0 +1,86 @@ +set -e + +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'` +} + +function infer() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + if [ $thread -gt $bs ]; then + thread=$bs + fi + log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ ! -d $models_in ]; then + echo "Training model ${topology}_${layer_num}" + paddle train --job=train \ + --config="${topology}.py" \ + --use_mkldnn=True \ + --use_gpu=False \ + --trainer_count=1 \ + --num_passes=1 \ + --save_dir="models/${topology}-${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num}" \ + > /dev/null 2>&1 + echo "Done" + fi + log_period=$((256 / bs)) + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=$log_period \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} + + # calculate the last 5 logs period time of 1280 samples, + # the time before are burning time. + start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + start_sec=`clock_to_seconds $start` + end_sec=`clock_to_seconds $end` + fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` + echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi +if [ ! -d "models" ]; then + mkdir -p models +fi + +# inference benchmark +for use_mkldnn in True False; do + for batchsize in 1 2 4 8 16; do + infer googlenet v1 $batchsize $use_mkldnn + infer resnet 50 $batchsize $use_mkldnn + infer vgg 19 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkldnn_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..320206239ae960bd088b05d3b10934a98da741b1 --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn_train.sh @@ -0,0 +1,47 @@ +set -e + +function train() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + # each trainer_count use only 1 core to avoid conflict + log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + args="batch_size=${bs},layer_num=${layer_num}" + config="${topology}.py" + paddle train --job=time \ + --config=$config \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=10 \ + --test_period=100 \ + --config_args=$args \ + 2>&1 | tee ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi + +# training benchmark +for use_mkldnn in True False; do + for batchsize in 64 128 256; do + train vgg 19 $batchsize $use_mkldnn + train resnet 50 $batchsize $use_mkldnn + train googlenet v1 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..8d0a1e97a451cd52ef17e4e326673cc90059ef3c --- /dev/null +++ b/benchmark/paddle/image/vgg.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +from paddle.trainer_config_helpers import * + +height = 224 +width = 224 +num_class = 1000 +batch_size = get_config_arg('batch_size', int, 64) +layer_num = get_config_arg('layer_num', int, 19) +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} +define_py_data_sources2( + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) + +settings( + batch_size=batch_size, + learning_rate=0.001 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) + +img = data_layer(name='image', size=height * width * 3) + + +def vgg_network(vgg_num=3): + tmp = img_conv_group( + input=img, + num_channels=3, + conv_padding=1, + conv_num_filter=[64, 64], + conv_filter_size=3, + conv_act=ReluActivation(), + pool_size=2, + pool_stride=2, + pool_type=MaxPooling()) + + tmp = img_conv_group( + input=tmp, + conv_num_filter=[128, 128], + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + + channels = [] + for i in range(vgg_num): + channels.append(256) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + channels = [] + for i in range(vgg_num): + channels.append(512) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + tmp = img_conv_group( + input=tmp, + conv_num_filter=channels, + conv_padding=1, + conv_filter_size=3, + conv_act=ReluActivation(), + pool_stride=2, + pool_type=MaxPooling(), + pool_size=2) + + tmp = fc_layer( + input=tmp, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) + + tmp = fc_layer( + input=tmp, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) + + return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation()) + + +if layer_num == 16: + vgg = vgg_network(3) +elif layer_num == 19: + vgg = vgg_network(4) +else: + print("Wrong layer number.") + +if is_infer: + outputs(vgg) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=vgg, label=lab) + outputs(loss) diff --git a/cmake/CMakeDetermineGoCompiler.cmake b/cmake/CMakeDetermineGoCompiler.cmake new file mode 100644 index 0000000000000000000000000000000000000000..abf0a00c5e99e4201dede36f13200cfc9c151ad3 --- /dev/null +++ b/cmake/CMakeDetermineGoCompiler.cmake @@ -0,0 +1,46 @@ +if(NOT CMAKE_Go_COMPILER) + if(NOT $ENV{GO_COMPILER} STREQUAL "") + get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT) + + if(CMAKE_Go_FLAGS_ENV_INIT) + set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler") + endif() + + if(NOT EXISTS ${CMAKE_Go_COMPILER_INIT}) + message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.") + endif() + + endif() + + set(Go_BIN_PATH + $ENV{GOPATH} + $ENV{GOROOT} + $ENV{GOROOT}/bin + $ENV{GO_COMPILER} + /usr/bin + /usr/local/bin + ) + + if(CMAKE_Go_COMPILER_INIT) + set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler") + else() + find_program(CMAKE_Go_COMPILER + NAMES go + PATHS ${Go_BIN_PATH} + ) + if(CMAKE_Go_COMPILER) + EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION) + STRING(REGEX MATCH "go[0-9]+[.0-9]*[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}") + message("-- The Golang compiler identification is ${VERSION}") + message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}") + endif() + endif() + +endif() + +mark_as_advanced(CMAKE_Go_COMPILER) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/CMakeGoCompiler.cmake.in + ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY) + +set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER") diff --git a/cmake/CMakeGoCompiler.cmake.in b/cmake/CMakeGoCompiler.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..a71f08e064656fbaad8cfa77aea6f216515712ef --- /dev/null +++ b/cmake/CMakeGoCompiler.cmake.in @@ -0,0 +1,8 @@ +set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@") +set(CMAKE_Go_COMPILER_LOADED 1) + +set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go) +set(CMAKE_Go_LINKER_PREFERENCE 40) +set(CMAKE_Go_OUTPUT_EXTENSION .o) +set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1) +set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER") diff --git a/cmake/CMakeGoInformation.cmake b/cmake/CMakeGoInformation.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ba51ac93fcd429478f324b66bd5129d94ea2a8f4 --- /dev/null +++ b/cmake/CMakeGoInformation.cmake @@ -0,0 +1,7 @@ +if(NOT CMAKE_Go_COMPILE_OBJECT) + set(CMAKE_Go_COMPILE_OBJECT "go tool compile -l -N -o ") +endif() + +if(NOT CMAKE_Go_LINK_EXECUTABLE) + set(CMAKE_Go_LINK_EXECUTABLE "go tool link -o ") +endif() diff --git a/cmake/CMakeTestGoCompiler.cmake b/cmake/CMakeTestGoCompiler.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b9891b015baced05b51e34dba562fd98a84fe14c --- /dev/null +++ b/cmake/CMakeTestGoCompiler.cmake @@ -0,0 +1 @@ +set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index b8bf1bb07a1f779354b2c10071264bf41d279f6c..b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -1,53 +1,28 @@ # Find the CBlas and lapack libraries # -# It will search MKL, atlas, OpenBlas, reference-cblas in order. +# It will search MKLML, atlas, OpenBlas, reference-cblas in order. # # If any cblas implementation found, the following variable will be set. -# CBLAS_PROVIDER # one of MKL, ATLAS, OPENBLAS, REFERENCE +# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE # CBLAS_INC_DIR # the include directory for cblas. -# CBLAS_LIBS # a list of libraries should be linked by paddle. +# CBLAS_LIBS # a list of libraries should be linked by paddle. # # Each library should be full path to object file. -# -# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT -# during cmake. If none of them set, it will try to find cblas implementation in -# system paths. -# set(CBLAS_FOUND OFF) -## Find MKL First. -set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") - -find_path(MKL_INC_DIR mkl.h PATHS - ${MKL_ROOT}/include) -find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS - ${MKL_ROOT}/include) -find_library(MKL_CORE_LIB NAMES mkl_core PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) -find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) -find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS - ${MKL_ROOT}/lib - ${MKL_ROOT}/lib/intel64) - - -if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) - set(CBLAS_PROVIDER MKL) - set(CBLAS_INC_DIR ${MKL_INC_DIR}) - set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} - ${MKL_SEQUENTIAL_LIB} - ${MKL_CORE_LIB}) - add_definitions(-DPADDLE_USE_MKL) - message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") +## Find MKLML First. +if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) set(CBLAS_FOUND ON) - if(${MKL_LAPACK_INC_DIR}) - add_definitions(-DPADDLE_USE_LAPACK) - message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})") - endif() - return() # return file. + set(CBLAS_PROVIDER MKLML) + set(CBLAS_INC_DIR ${MKLML_INC_DIR}) + set(CBLAS_LIBRARIES ${MKLML_LIB}) + + add_definitions(-DPADDLE_USE_MKLML) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found cblas and lapack in MKLML " + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + return() endif() ## Then find atlas. @@ -63,26 +38,26 @@ set(ATLAS_LIB_SEARCH_PATHS /usr/lib/atlas /usr/lib/atlas-base # special for ubuntu 14.04. ) -find_path(ATLAS_INC_DIR NAMES cblas.h +find_path(ATLAS_INC_DIR NAMES cblas.h PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) -find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 +find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 PATHS ${ATLAS_LIB_SEARCH_PATHS}) -find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3 +find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3 PATHS ${ATLAS_LIB_SEARCH_PATHS}) -if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND) - set(CBLAS_PROVIDER ATLAS) - set(CBLAS_INC_DIR ${ATLAS_INC_DIR}) - set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB}) - add_definitions(-DPADDLE_USE_ATLAS) - message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") +if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB) set(CBLAS_FOUND ON) - if(ATLAS_CLAPACK_INC_DIR) - add_definitions(-DPADDLE_USE_LAPACK) - message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") - endif() + set(CBLAS_PROVIDER ATLAS) + set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) + set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB}) + + add_definitions(-DPADDLE_USE_ATLAS) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") return() endif() @@ -107,16 +82,17 @@ find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h find_library(OPENBLAS_LIB NAMES openblas PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) -if(OPENBLAS_INC_DIR AND OPENBLAS_LIB) +if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) + set(CBLAS_FOUND ON) set(CBLAS_PROVIDER OPENBLAS) - set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR}) + set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - set(CBLAS_FOUND ON) - if(OPENBLAS_LAPACKE_INC_DIR) - add_definitions(-DPADDLE_USE_LAPACK) - message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") - endif() + + add_definitions(-DPADDLE_USE_OPENBLAS) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") return() endif() @@ -124,7 +100,7 @@ endif() ## Then find the reference-cblas. www.netlib.org/blas/ -set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH +set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH "Folder contains reference-cblas") set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include @@ -145,9 +121,17 @@ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) + set(CBLAS_FOUND ON) set(CBLAS_PROVIDER REFERENCE) set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) - message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})") + add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) + message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") +endif() + +if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER vecLib) + set(CBLAS_INC_DIR ${VECLIB_INC_DIR}) + add_definitions(-DPADDLE_USE_VECLIB) endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5e507e78f74eee885922f502f35e3c15fafb622d..5c6bcfde76a1201f792d04766d698db8cd395a49 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,10 +24,23 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + +if(WITH_TESTING) + add_definitions(-DPADDLE_WITH_TESTING) +endif(WITH_TESTING) + if(NOT WITH_TIMER) add_definitions(-DPADDLE_DISABLE_TIMER) endif(NOT WITH_TIMER) +if(USE_EIGEN_FOR_BLAS) + add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS) +endif(USE_EIGEN_FOR_BLAS) + if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) @@ -40,20 +53,25 @@ if(NOT CMAKE_CROSSCOMPILING) endif() endif() +if(NOT WITH_GOLANG) + add_definitions(-DPADDLE_WITHOUT_GOLANG) +endif(NOT WITH_GOLANG) + if(NOT WITH_GPU) - add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() + add_definitions(-DPADDLE_WITH_CUDA) + FIND_PACKAGE(CUDA REQUIRED) if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) - message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile") + message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") endif() if(NOT CUDNN_FOUND) - message(FATAL_ERROR "Paddle need cudnn to compile") + message(FATAL_ERROR "Paddle needs cudnn to compile") endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") @@ -63,5 +81,63 @@ else() include_directories(${CUDA_TOOLKIT_INCLUDE}) endif(NOT WITH_GPU) +if (WITH_MKLML AND MKLML_IOMP_LIB) + message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") + set(OPENMP_FLAGS "-fopenmp") + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") +endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") + +if(WITH_GOLANG) + # we need to symlink Paddle directory into GOPATH. If we + # don't do it and we have code that depends on Paddle, go + # get ./... will download a new Paddle repo from Github, + # without the changes in our current Paddle repo that we + # want to build. + set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") + file(MAKE_DIRECTORY ${GOPATH}) + set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle") + file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}") + set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go") + + add_custom_target(go_path) + add_custom_command(TARGET go_path + # Symlink Paddle directory into GOPATH + COMMAND mkdir -p ${PADDLE_IN_GOPATH} + COMMAND rm -rf ${PADDLE_IN_GOPATH} + COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH} + # Automatically get all dependencies specified in the source code + # We can't run `go get -d ./...` for every target, because + # multiple `go get` can not run concurrently, but make need to be + # able to run with multiple jobs. + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + + if (GLIDE_INSTALL) + if(EXISTS $ENV{GOPATH}/bin/glide) + set(GLIDE "$ENV{GOPATH}/bin/glide") + else() + message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide") + endif() + + # this command will only run when the file it depends is missing + # or has changed, or the output is missing. + add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide + COMMAND env GOPATH=${GOPATH} ${GLIDE} install + COMMAND touch ${CMAKE_BINARY_DIR}/glide + DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go" + ) + + # depends on the custom command which outputs + # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to + # run every time this target is built. + add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path) + endif() + +endif(WITH_GOLANG) diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake index 38c636b30edc0af1c07255814e8bc2b1ad9514da..4823dc3e91390002aefac70f7931b4197db05789 100644 --- a/cmake/cpplint.cmake +++ b/cmake/cpplint.cmake @@ -25,8 +25,10 @@ set(STYLE_FILTER "${STYLE_FILTER}-readability/casting") set(IGNORE_PATTERN .*ImportanceSampler.* .*cblas\\.h.* - .*LtrDataProvider.* - .*MultiDataProvider.*) + .*\\.pb\\.txt + .*MultiDataProvider.* + .*pb.* + .*pybind.h) # add_style_check_target # @@ -34,29 +36,27 @@ set(IGNORE_PATTERN # # first argument: target name to attach # rest arguments: source list to check code style. -# +# # NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing. macro(add_style_check_target TARGET_NAME) if(WITH_STYLE_CHECK) set(SOURCES_LIST ${ARGN}) list(REMOVE_DUPLICATES SOURCES_LIST) - list(SORT SOURCES_LIST) - foreach(filename ${SOURCES_LIST}) - set(LINT ON) foreach(pattern ${IGNORE_PATTERN}) if(filename MATCHES ${pattern}) - message(STATUS "DROP LINT ${filename}") - set(LINT OFF) - endif() + list(REMOVE_ITEM SOURCES_LIST ${filename}) + endif() endforeach() - if(LINT MATCHES ON) - add_custom_command(TARGET ${TARGET_NAME} - PRE_BUILD - COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py" - "--filter=${STYLE_FILTER}" ${filename} - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) - endif() endforeach() + + if(SOURCES_LIST) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py" + "--filter=${STYLE_FILTER}" + ${SOURCES_LIST} + COMMENT "cpplint: Checking source code style" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() endif() endmacro() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake new file mode 100644 index 0000000000000000000000000000000000000000..84219cfa5587f5b765b2e8f35180797d7053169f --- /dev/null +++ b/cmake/cross_compiling/android.cmake @@ -0,0 +1,236 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a toolchain file for cross-compiling for Android, and the +# configuration refers to the open-source resposity: +# https://github.com/taka-no-me/android-cmake +# Most of the variables are compatible with that used in +# https://developer.android.com/ndk/guides/cmake.html +# The supported variables are listed belows: +# +# ANDROID_STANDALONE_TOOLCHAIN +# ANDROID_TOOLCHAIN +# ANDROID_ABI +# ANDROID_NATIVE_API_LEVEL +# ANDROID_ARM_MODE +# ANDROID_ARM_NEON +# +# For CMake >= 3.7.0, all the settings will be delivered to CMake system +# variables to let CMake do the cross-compiling configurations itself. +# More detail of cross-compiling settings +# https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html + +IF(NOT ANDROID) + return() +ENDIF() + +# check the exist of android standalone toolchain +IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN) + SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN} + CACHE PATH "Folder holds the standalone toolchain of Android NDK") +ENDIF() +IF(NOT ANDROID_STANDALONE_TOOLCHAIN) + MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to " + "use a standalone toolchain.\n" + "To cross-compile for Android, you need to:\n" + "1. Download an Android NDK from" + " https://developer.android.com/ndk/downloads/index.html\n" + "2. Setup a standalone toolchain" + "https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n") +ENDIF() + +IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) + IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$") + STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}") + ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$") + SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL}) + ENDIF() +ENDIF() + +IF(NOT DEFINED ANDROID_TOOLCHAIN) + SET(ANDROID_TOOLCHAIN clang) +ENDIF() + +IF(NOT DEFINED ANDROID_ABI) + SET(ANDROID_ABI "armeabi-v7a") +ENDIF() + +IF(NOT DEFINED ANDROID_ARM_MODE) + SET(ANDROID_ARM_MODE ON) +ENDIF() +IF(ANDROID_ARM_MODE) + SET(ANDROID_ARM_MODE_NAME "arm") +ELSE(ANDROID_ARM_MODE) + SET(ANDROID_ARM_MODE_NAME "thumb") +ENDIF(ANDROID_ARM_MODE) + +IF(NOT DEFINED ANDROID_ARM_NEON) + SET(ANDROID_ARM_NEON ON) +ENDIF() + +IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") + IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0") + SET(CMAKE_SYSTEM_NAME "Linux") + ENDIF() + MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: " + "${CMAKE_VERSION}), when cross-compiling for Android.") + + IF(ANDROID_STANDALONE_TOOLCHAIN) + # Use standalone toolchain + SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") + + IF(NOT CMAKE_SYSTEM_VERSION) + SET(ANDROID_STANDALONE_TOOLCHAIN_API "") + SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)") + FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" + ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}") + IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}") + SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}") + ENDIF() + SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API}) + ENDIF() + + # Toolchain + SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) + ELSE(ANDROID_NDK) + # TODO: use android ndk + ENDIF() + + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) + IF(ANDROID_ABI STREQUAL "armeabi") + SET(CMAKE_SYSTEM_PROCESSOR armv5te) + SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi) + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_SYSTEM_PROCESSOR armv7-a) + SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi) + ENDIF() + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) + SET(CMAKE_SYSTEM_PROCESSOR aarch64) + SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") + ENDIF() + SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") + + IF(ANDROID_TOOLCHAIN STREQUAL clang) + SET(ANDROID_C_COMPILER_NAME clang) + SET(ANDROID_CXX_COMPILER_NAME clang++) + SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) + ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc) + SET(ANDROID_C_COMPILER_NAME gcc) + SET(ANDROID_CXX_COMPILER_NAME g++) + ELSE() + MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}") + ENDIF() + + # C compiler + IF(NOT CMAKE_C_COMPILER) + SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}") + ELSE() + GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) + ENDIF() + IF(NOT EXISTS ${ANDROID_C_COMPILER}) + MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}") + ENDIF() + + # CXX compiler + IF(NOT CMAKE_CXX_COMPILER) + SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}") + ELSE() + GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) + ENDIF() + IF(NOT EXISTS ${ANDROID_CXX_COMPILER}) + MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}") + ENDIF() + + SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE) + SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) + + # Toolchain and ABI specific flags. + SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections") + SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") + + IF(ANDROID_ABI STREQUAL "armeabi") + LIST(APPEND ANDROID_COMPILER_FLAGS + -march=armv5te + -mtune=xscale + -msoft-float) + ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") + LIST(APPEND ANDROID_COMPILER_FLAGS + -march=armv7-a + -mfloat-abi=softfp) + IF(ANDROID_ARM_NEON) + LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon) + ELSE() + LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) + ENDIF() + LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) + ENDIF() + + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + IF(ANDROID_ARM_MODE) + LIST(APPEND ANDROID_COMPILER_FLAGS -marm) + ELSE() + LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) + ENDIF() + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # Disable integrated-as for better compatibility. + LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as) + ENDIF() + ENDIF() + + IF(ANDROID_TOOLCHAIN STREQUAL clang) + # CMake automatically forwards all compiler flags to the linker, + # and clang doesn't like having -Wa flags being used for linking. + # To prevent CMake from doing this would require meddling with + # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. + LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments) + ENDIF() + + STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") + STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") + + SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" + CACHE STRING "C flags") + SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}" + CACHE STRING "CXX flags") + SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" + CACHE STRING "shared linker flags") + + SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" + CACHE STRING "executable linker flags") + + MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' " + "with architecture '${ANDROID_ARM_MODE_NAME}', " + "ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'") + MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS}) + MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS}) +ELSE() + IF(ANDROID_STANDALONE_TOOLCHAIN) + SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) + ENDIF() + SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) + IF(ANDROID_ABI STREQUAL "armeabi-v7a") + SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) + ENDIF() + ENDIF() +ENDIF() diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake new file mode 100644 index 0000000000000000000000000000000000000000..14c35266ec60b439aaef30e5e4e0540c534160ae --- /dev/null +++ b/cmake/cross_compiling/host.cmake @@ -0,0 +1,49 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# find host C compiler +IF(HOST_C_COMPILER) + SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER}) +ELSEIF(NOT $ENV{CC} STREQUAL "") + SET(HOST_C_COMPILER_NAME $ENV{CC}) +ELSE() + SET(HOST_C_COMPILER_NAME cc) +ENDIF() + +GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM) +IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS ${HOST_C_COMPILER_PATH}) + MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n" + "\tcmake .. -DHOST_C_COMPILER=...") +ENDIF() + +# find host CXX compiler +IF(HOST_CXX_COMPILER) + SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER}) +ELSEIF(NOT $ENV{CXX} STREQUAL "") + SET(HOST_CXX_COMPILER_NAME $ENV{CXX}) +ELSE() + SET(HOST_CXX_COMPILER_NAME c++) +ENDIF() + +GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM) +IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS ${HOST_CXX_COMPILER_PATH}) + MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n" + "\tcmake .. -DHOST_CXX_COMPILER=...") +ENDIF() + +SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler") +SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler") + +MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER}) +MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER}) diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d3f5bf6852b3b295f3b5806b0577a880b0ce6ba6 --- /dev/null +++ b/cmake/cross_compiling/ios.cmake @@ -0,0 +1,347 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a toolchain file for cross-compiling for iOS, and the +# configuration largely refers to public toolchain file: +# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake +# and +# https://github.com/cristeab/ios-cmake +# +# Supports options: +# IOS_PLATFORM = OS (default) or SIMULATOR +# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders +# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. +# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. +# IOS_ARCH +# The archectures wanted to support, such "arm64", "armv7;arm64" +# IOS_DEPLOYMENT_TARGET +# The minimum iOS deployment version, such as "7.0" +# IOS_ENABLE_BITCODE = ON (default) or OFF +# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON +# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder +# By default this location is automatcially chosen based on the IOS_PLATFORM value above. +# If set manually, it will override the default location and force the user of a particular Developer Platform +# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder +# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value. +# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path. +# If set manually, this will force the use of a specific SDK version + +# Macros: +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) +# A convenience macro for setting xcode specific properties on targets +# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") +# find_host_package (PROGRAM ARGS) +# A macro used to find executable programs on the host system, not within the iOS environment. +# Thanks to the android-cmake project for providing the command + +if(NOT IOS) + return() +endif() + +set(CMAKE_SYSTEM_NAME Darwin) + +# Get the Xcode version being used. +execute_process(COMMAND xcodebuild -version + OUTPUT_VARIABLE XCODE_VERSION + RESULT_VARIABLE XCODE_VERSION_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT ${XCODE_VERSION_RESULT}) + string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") + string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") + message(STATUS "Building with Xcode version: ${XCODE_VERSION}") +else() + message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.") +endif() + +# Required as of cmake 2.8.10 +set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) + +# Setup iOS platform unless specified manually with IOS_PLATFORM +if(NOT DEFINED IOS_PLATFORM) + set(IOS_PLATFORM "OS") +endif() +set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") + +# Set the architecture for iOS +if(NOT DEFINED IOS_ARCH) + if(IOS_PLATFORM STREQUAL "OS") + set(IOS_ARCH "armv7;armv7s;arm64") + elseif(IOS_PLATFORM STREQUAL "SIMULATOR") + set(IOS_ARCH "i386;x86_64") + endif() +endif() +set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") + +# Specify minimum iOS deployment version +if(NOT DEFINED IOS_DEPLOYMENT_TARGET) + set(IOS_DEPLOYMENT_TARGET "7.0") +endif() +set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version") + +# Whether to enable bitcode +if(NOT DEFINED IOS_ENABLE_BITCODE) + set(IOS_ENABLE_BITCODE ON) +endif() +set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode") + +if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS) + set(IOS_USE_VECLIB_FOR_BLAS OFF) +endif() +set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib") + +# Check the platform selection and setup for developer root +if(${IOS_PLATFORM} STREQUAL "OS") + set(IOS_PLATFORM_LOCATION "iPhoneOS.platform") + set(XCODE_IOS_PLATFORM iphoneos) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") +elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR") + set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") + set(XCODE_IOS_PLATFORM iphonesimulator) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") +elseif(${IOS_PLATFORM} STREQUAL "WATCHOS") + set(IOS_PLATFORM_LOCATION "WatchOS.platform") + set(XCODE_IOS_PLATFORM watchos) + + # This causes the installers to properly locate the output libraries + set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos") +else(${IOS_PLATFORM} STREQUAL "OS") + message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n" + "\t OS, SIMULATOR, or WATCHOS.") +endif() + +# Check iOS developer toolchain +if(NOT DEFINED IOS_DEVELOPER_ROOT) + # Setup iOS developer location + execute_process(COMMAND xcode-select -print-path + OUTPUT_VARIABLE XCODE_DEVELOPER_DIR + RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + # Xcode 4.3 changed the installation location, choose the most recent one available + if(${XCODE_VERSION} VERSION_LESS "4.3.0") + set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") + else() + set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer") + endif() +endif() +if(EXISTS ${IOS_DEVELOPER_ROOT}) + set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") +else() + message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.") +endif() + +# Check iOS SDK +if(NOT DEFINED IOS_SDK_ROOT) + # Find and use the most recent iOS sdk + file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*") + if(IOS_SDK_LISTS) + list(SORT IOS_SDK_LISTS) + list(REVERSE IOS_SDK_LISTS) + list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT) + else(IOS_SDK_LISTS) + message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}." + " Please manually set IOS_SDK_ROOT or install the iOS SDK.") + endif(IOS_SDK_LISTS) +endif() +if(EXISTS ${IOS_SDK_ROOT}) + set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") + message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}") +else() + message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.") +endif() + +# Set the sysroot default to the most recent SDK +set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") + +# Get version of iOS SDK +execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion + OUTPUT_VARIABLE IOS_SDK_VERSION + RESULT_VARIABLE IOS_SDK_VERSION_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(${IOS_SDK_VERSION_RESULT}) + string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}") +endif() +if(NOT IOS_SDK_VERSION) + message(WARNING "Cannot get SDK's version.") + set(IOS_SDK_VERSION 1) +endif() +set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION}) + +# Find the C & C++ compilers for the specified SDK. +if(NOT CMAKE_C_COMPILER) + # Default to use clang + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang + OUTPUT_VARIABLE IOS_C_COMPILER + RESULT_VARIABLE IOS_C_COMPILER_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${IOS_C_COMPILER_RESULT}) + get_filename_component(IOS_C_COMPILER clang PROGRAM) + endif() +else(NOT CMAKE_C_COMPILER) + # User can set it in cmake command + get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) +endif(NOT CMAKE_C_COMPILER) +if(NOT EXISTS ${IOS_C_COMPILER}) + message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}") +endif() + +if(NOT CMAKE_CXX_COMPILER) + # Default to use clang++ + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ + OUTPUT_VARIABLE IOS_CXX_COMPILER + RESULT_VARIABLE IOS_CXX_COMPILER_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(${IOS_CXX_COMPILER_RESULT}) + get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM) + endif() +else(NOT CMAKE_CXX_COMPILER) + # User can set it in cmake command + get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) +endif(NOT CMAKE_CXX_COMPILER) +if(NOT EXISTS ${IOS_CXX_COMPILER}) + message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}") +endif() + +set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE) +set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) + +set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") +set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") + +# Set iOS specific C/C++ flags +if(IOS_PLATFORM STREQUAL "OS") + if(XCODE_VERSION VERSION_LESS "7.0") + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") + else() + # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. + set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") + endif() +else() + set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") +endif() + +if(IOS_ENABLE_BITCODE) + set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode") +else() + set(XCODE_IOS_BITCODE_FLAGS "") +endif() + +set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}") + +# Hidden visibilty is required for cxx on iOS +set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") +set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") + +set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") + +if(IOS_USE_VECLIB_FOR_BLAS) + # Find vecLib for iOS + set(VECLIB_SEARCH_DIRS + ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks + ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks + ) + find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR) + + if(VECLIB_FOUND) + if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") + set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib") + message(STATUS "Found standalone vecLib.framework") + else() + set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate") + message(STATUS "Found vecLib as part of Accelerate.framework") + endif() + + endif() +endif() + +set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}") +set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}") + +set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) +if(NOT IOS_ENABLE_BITCODE) + set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names") + set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names") +else() + set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib") + set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle") +endif() +set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") +set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") +set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") + +# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree +# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache +# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) +# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex +if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) + find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) +endif() + +# Set the find root to the iOS developer roots and to user defined paths +set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} + CACHE string "iOS find search path root") + +# default to searching for frameworks first +set(CMAKE_FIND_FRAMEWORK FIRST) + +# set up the default search directories for frameworks +set(CMAKE_SYSTEM_FRAMEWORK_PATH + ${IOS_SDK_ROOT}/System/Library/Frameworks + ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks + ${IOS_SDK_ROOT}/Developer/Library/Frameworks + ) + +# only search the iOS sdks, not the remainder of the host filesystem +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', " + "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'") +message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") +message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + +# Used in ExternalProject command +string(REPLACE ";" "\\$" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") +set(EXTERNAL_OPTIONAL_ARGS + -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} + -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES}) + +# This little macro lets you set any XCode specific property +macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE) + set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}) +endmacro(set_xcode_property) + +# This macro lets you find executable programs on the host system +macro(find_host_package) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) + set(IOS FALSE) + + find_package(${ARGN}) + + set(IOS TRUE) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +endmacro(find_host_package) diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake new file mode 100644 index 0000000000000000000000000000000000000000..817b39f6833e37c340d4ee465048480cfc3db151 --- /dev/null +++ b/cmake/cross_compiling/raspberry_pi.cmake @@ -0,0 +1,84 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a toolchain file for cross-compiling for Raspberry Pi. +# The supported variables are listed belows: +# +# RPI_TOOLCHAIN +# RPI_ARM_NEON +# +# Also you can set CMAKE_C/CXX_COMPILER yourself, through cmake arguments. + +IF(NOT RPI) + return() +ENDIF() + +SET(CMAKE_SYSTEM_NAME Linux) +SET(CMAKE_SYSTEM_VERSION 1) +SET(CMAKE_SYSTEM_PROCESSOR arm) + +# check the exist of raspberry pi toolchain +IF(NOT DEFINED RPI_TOOLCHAIN) + SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN} + CACHE PATH "Folder holds the toolchain of Raspberr Pi") +ENDIF() +IF(NOT RPI_TOOLCHAIN) + MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use toolchain.\n" + "To cross-compile for Raspberry Pi, you need to download the tools using:\n" + " git clone https://github.com/raspberrypi/tools\n") +ENDIF() + +IF(NOT DEFINED RPI_ARM_NEON) + SET(RPI_ARM_NEON ON) +ENDIF() + +IF(RPI_TOOLCHAIN) + SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN}) + IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$") + # gcc-linaro-arm-linux-gnueabihf-raspbian + # gcc-linaro-arm-linux-gnueabihf-raspbian-x64 + SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf) + ENDIF() + SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-") +ENDIF() + +# C compiler +IF(NOT CMAKE_C_COMPILER) + SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc") +ELSE() + GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) +ENDIF() +IF(NOT EXISTS ${RPI_C_COMPILER}) + MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}") +ENDIF() + +# CXX compiler +IF(NOT CMAKE_CXX_COMPILER) + SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++") +ELSE() + GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) +ENDIF() +IF(NOT EXISTS ${RPI_CXX_COMPILER}) + MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}") +ENDIF() + +SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE) +SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) + +IF(RPI_ARM_NEON) + SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon") +ENDIF() + +SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") +SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake new file mode 100644 index 0000000000000000000000000000000000000000..6bea7cf3022242ce48cc882915f7e71810937283 --- /dev/null +++ b/cmake/cuda.cmake @@ -0,0 +1,188 @@ +if(NOT WITH_GPU) + return() +endif() + +set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs7 "30 35 50 52") +set(paddle_known_gpu_archs8 "30 35 50 52 60 61") + +###################################################################################### +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# detect_installed_gpus(out_variable) +function(detect_installed_gpus out_variable) + if(NOT CUDA_gpu_detect_output) + set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${cufile} "" + "#include \n" + "int main() {\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device) {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + + execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" + "--run" "${cufile}" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(nvcc_res EQUAL 0) + # only keep the last line of nvcc_out + STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") + STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") + list(GET nvcc_out -1 nvcc_out) + string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") + set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(STATUS "Automatic GPU detection failed. Building for all known architectures.") + set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +######################################################################## +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# select_nvcc_arch_flags(out_variable) +function(select_nvcc_arch_flags out_variable) + # List of arch names + set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(archs_name_default "All") + if(NOT CMAKE_CROSSCOMPILING) + list(APPEND archs_names "Auto") + endif() + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " archs_names "${archs_names}") + message(FATAL_ERROR "Only ${archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(cuda_arch_bin ${paddle_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") + detect_installed_gpus(cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + list(REMOVE_DUPLICATES cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_ptx) + + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +if (${CUDA_VERSION} LESS 7.0) + set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) +elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + # CUDA 8 may complain that sm_20 is no longer supported. Suppress the + # warning for now. + list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") +endif() + +include_directories(${CUDA_INCLUDE_DIRS}) +list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) +if(NOT WITH_DSO) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) +endif(NOT WITH_DSO) + +# setting nvcc arch flags +select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. +# So, don't set these flags here. +list(APPEND CUDA_NVCC_FLAGS "-std=c++11") +list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +# Set :expt-relaxed-constexpr to suppress Eigen warnings +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) +endif() + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index af9be86961833dcd62371227165d411a3b61d79e..2c84061ff572de4687b4d496f8ded6deee8d1011 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -2,7 +2,7 @@ if(NOT WITH_GPU) return() endif() -set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT") +set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") find_path(CUDNN_INCLUDE_DIR cudnn.h PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} @@ -11,11 +11,16 @@ find_path(CUDNN_INCLUDE_DIR cudnn.h get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT} ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib - ${CUDNN_ROOT}/lib/x86_64-linux-gnu + ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake index 8116f235d535917c03deb646ff4ec083a0cdadc7..85cce80b70a1fcf57015ac7a264e4950616b2717 100644 --- a/cmake/external/any.cmake +++ b/cmake/external/any.cmake @@ -2,13 +2,13 @@ INCLUDE(ExternalProject) SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any) -INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any) +INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any) ExternalProject_Add( - linb_any + extern_lib_any ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/thelink2012/any.git" - GIT_TAG "8fef1e93710a0edf8d7658999e284a1142c4c020" + GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git" + GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d" PREFIX ${ANY_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -17,4 +17,15 @@ ExternalProject_Add( TEST_COMMAND "" ) +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";") + add_library(lib_any STATIC ${dummyfile}) +else() + add_library(lib_any INTERFACE) +endif() + +add_dependencies(lib_any extern_lib_any) + add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE) +LIST(APPEND external_project_dependencies lib_any) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake new file mode 100644 index 0000000000000000000000000000000000000000..aec51410b33669f8a549f2eca193cc6aa2d07a13 --- /dev/null +++ b/cmake/external/cares.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: c-ares is needed when linking with grpc. + +SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares) +SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares) +SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE) + +ExternalProject_Add( + extern_cares + GIT_REPOSITORY "https://github.com/c-ares/c-ares.git" + GIT_TAG "cares-1_13_0" + PREFIX ${CARES_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make -j8 + INSTALL_COMMAND make install +) + +ADD_LIBRARY(cares STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION + "${CARES_INSTALL_DIR}/lib/libcares.a") + +include_directories(${CARES_INCLUDE_DIR}) +ADD_DEPENDENCIES(cares extern_cares) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake new file mode 100644 index 0000000000000000000000000000000000000000..96fc886a342cae38d5b804266d3af7bc909a4da2 --- /dev/null +++ b/cmake/external/eigen.cmake @@ -0,0 +1,30 @@ +INCLUDE(ExternalProject) + +SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) + +INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) + +ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" + GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";") + add_library(eigen3 STATIC ${dummyfile}) +else() + add_library(eigen3 INTERFACE) +endif() + +add_dependencies(eigen3 extern_eigen3) + +LIST(APPEND external_project_dependencies eigen3) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 0afb3ab9af48046af01f03838eefa0bd2fcb2821..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,30 +18,45 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags) SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( - gflags + extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DBUILD_TESTING=OFF - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) +ADD_DEPENDENCIES(gflags extern_gflags) + LIST(APPEND external_project_dependencies gflags) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) + IF(ANDROID) + INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib) + ENDIF() +ENDIF() diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7..0c6b3aafcb4e990b9d4549820137474e5968a7aa 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -19,33 +19,60 @@ SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) IF(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) ELSE(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) +IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + # Using the unofficial glog for Android API < 21 + SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") + SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") +ELSE() + SET(GLOG_REPOSITORY "https://github.com/google/glog.git") + SET(GLOG_TAG "v0.3.5") +ENDIF() + ExternalProject_Add( - glog + extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags - GIT_REPOSITORY "https://github.com/google/glog.git" + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DWITH_GFLAGS=ON - CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags - CMAKE_ARGS -DBUILD_TESTING=OFF - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=ON + -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) +ADD_DEPENDENCIES(glog extern_glog gflags) +LINK_LIBRARIES(glog gflags) + LIST(APPEND external_project_dependencies glog) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) + IF(ANDROID) + INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib) + ENDIF() +ENDIF() diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..abee6698e30b7e76ca42825ed225876bf2ba5ec0 --- /dev/null +++ b/cmake/external/grpc.cmake @@ -0,0 +1,66 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) +SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) +SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) +SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) +IF(APPLE) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) +ELSE() + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) +ENDIF() + +ExternalProject_Add( + extern_grpc + DEPENDS protobuf zlib + GIT_REPOSITORY "https://github.com/grpc/grpc.git" + GIT_TAG "v1.7.x" + PREFIX ${GRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + # NOTE(yuyang18): + # Disable -Werror, otherwise the compile will fail in MacOS. + # It seems that we cannot configure that by make command. + # Just dry run make command and remove `-Werror`, then use a shell to run make commands + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install +) + +# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them. +ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") + +ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") +ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgpr.a") + +ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") + +include_directories(${GRPC_INCLUDE_DIR}) +ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 49c7d71443cda700a14af6be65ff6658eec7229f..5a4aa7a5b71a4fdfd556a46037e6d1846d668fc4 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,26 +34,42 @@ IF(WITH_TESTING) "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) ENDIF(WIN32) + IF(WITH_MKLML) + # wait for mklml downloading completed + SET(GTEST_DEPENDS ${MKLML_PROJECT}) + ENDIF() + ExternalProject_Add( - gtest + extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${GTEST_DEPENDS} GIT_REPOSITORY "https://github.com/google/googletest.git" GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DBUILD_GMOCK=ON - CMAKE_ARGS -Dgtest_disable_pthreads=ON - CMAKE_ARGS -Dgtest_force_shared_crt=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_GMOCK=ON + -Dgtest_disable_pthreads=ON + -Dgtest_force_shared_crt=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) - LIST(APPEND external_project_dependencies gtest) + + ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) + ADD_DEPENDENCIES(gtest extern_gtest) + + ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) + ADD_DEPENDENCIES(gtest_main extern_gtest) + + LIST(APPEND external_project_dependencies gtest gtest_main) ENDIF(WITH_TESTING) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fc52d339d7a336b44c97f2e0a9fc8d6604854365 --- /dev/null +++ b/cmake/external/mkldnn.cmake @@ -0,0 +1,71 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLDNN}) + return() +ENDIF(NOT ${WITH_MKLDNN}) + +INCLUDE(ExternalProject) + +SET(MKLDNN_PROJECT "extern_mkldnn") +SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLDNN in Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE) + return() +ENDIF() + +SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") + +INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) + +IF(${CBLAS_PROVIDER} STREQUAL "MKLML") + SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) + MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}") +ELSE() + MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") +ENDIF() + +SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow") +SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow") +ExternalProject_Add( + ${MKLDNN_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_DEPENDS} + GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" + GIT_TAG "v0.11" + PREFIX ${MKLDNN_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} + CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DMKLROOT:PATH=${MKLML_ROOT} +) + +ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) +MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") +add_definitions(-DPADDLE_USE_MKLDNN) +LIST(APPEND external_project_dependencies mkldnn) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake new file mode 100644 index 0000000000000000000000000000000000000000..20dbc32a738d982df2d3f035206279c82c8de264 --- /dev/null +++ b/cmake/external/mklml.cmake @@ -0,0 +1,68 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLML}) + return() +ENDIF(NOT ${WITH_MKLML}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with MKLML in Paddle yet." + "Force WITH_MKLML=OFF") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(MKLML_PROJECT "extern_mklml") +SET(MKLML_VER "mklml_lnx_2018.0.1.20171007") +SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") +SET(MKLML_DST_DIR "mklml") +SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) +SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) +SET(MKLML_INC_DIR ${MKLML_ROOT}/include) +SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) +SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) +SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) + +FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(MKLML)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${MKLML_VER}\n" + " DESTINATION ${MKLML_DST_DIR})\n") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz + && tar zxf ${MKLML_VER}.tgz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT} +) + +ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) +ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) +LIST(APPEND external_project_dependencies mklml) diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fc43766efafc3d3e16f2906ce7f9a3d692c8e4ff --- /dev/null +++ b/cmake/external/nccl.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_GPU) + return() +endif() + +include(ExternalProject) + +set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl) + +include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src) + +if(WITH_DSO) + # If we use DSO, we do not build nccl, just download the dependencies + set(NCCL_BUILD_COMMAND "") + set(NCCL_INSTALL_COMMAND "") + set(NCCL_INSTALL_DIR "") +else() + # otherwise, we build nccl and link it. + set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl) + # Note: cuda 8.0 is needed to make nccl + # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root + set(NCCL_BUILD_COMMAND "make -j 8") + set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}") +endif() + +ExternalProject_Add( + extern_nccl + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git" + GIT_TAG "v1.3.4-1" + PREFIX "${NCCL_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "${NCCL_BUILD_COMMAND}" + INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}" + INSTALL_DIR "${NCCL_INSTALL_DIR}" + TEST_COMMAND "" +) + +if(WITH_DSO) + if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";") + add_library(nccl STATIC ${dummyfile}) + else() + add_library(nccl INTERFACE) + endif() +else() + add_library(nccl STATIC IMPORTED GLOBAL) + set_property(TARGET nccl PROPERTY IMPORTED_LOCATION + ${NCCL_INSTALL_DIR}/lib/libnccl_static.a) +endif() + +add_dependencies(nccl extern_nccl) diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d42bcb0f329041462bd8b568052fbb8226d18e4e --- /dev/null +++ b/cmake/external/nnpack.cmake @@ -0,0 +1,30 @@ +# Find the NNPACK library +# NNPACK_ROOT - where to find NNPACK include and library. +# + +set(NNPACK_FOUND OFF) +set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") +find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) +find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) +find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) +find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) + +if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) + set(NNPACK_FOUND ON) + INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) + + set(NNPACK_LIBS) + list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) + if (NNPACK_UKERNELS_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) + endif() + if (NNPACK_CPUFEATURES_LIB) + list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) + endif() + if(NOT ANDROID) + list(APPEND NNPACK_LIBS "rt") + endif() +else() + message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") +endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 92ea23c7633e974fd09251f967965364b1928307..97857a686b38d935b19f510ecdcb66bcca91fe03 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -1,17 +1,21 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +IF(USE_EIGEN_FOR_BLAS) + return() +ENDIF(USE_EIGEN_FOR_BLAS) + INCLUDE(cblas) IF(NOT ${CBLAS_FOUND}) @@ -21,65 +25,103 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) - IF(WIN32) - SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas.lib" CACHE FILEPATH "openblas library." FORCE) - ELSE(WIN32) - SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE) - ENDIF(WIN32) + SET(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "openblas library." FORCE) - IF(CMAKE_COMPILER_IS_GNUCC) - ENABLE_LANGUAGE(Fortran) - if (NOT CMAKE_Fortran_COMPILER_VERSION) - # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly. - execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion - OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION) - endif() - string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION}) - list(GET Fortran_VERSION 0 Fortran_MAJOR) - list(GET Fortran_VERSION 1 Fortran_MINOR) - find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS - /lib - /usr/lib - /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/ - /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/) - if (NOT GFORTRAN_LIBRARY) - message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas") - endif() - find_package(Threads REQUIRED) - LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) - ENDIF(CMAKE_COMPILER_IS_GNUCC) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") - IF(NOT CMAKE_Fortran_COMPILER) - MESSAGE(FATAL_ERROR "To build lapack in libopenblas, " - "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...") - ENDIF(NOT CMAKE_Fortran_COMPILER) + IF(CMAKE_CROSSCOMPILING) + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) + GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) + SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) + IF(ANDROID) + # arm_soft_fp_abi branch of OpenBLAS to support softfp + # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) + ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) + ENDIF() + ELSEIF(IOS) + IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") + SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") + SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) + ELSE() + MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " + "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") + ENDIF() + ELSEIF(RPI) + # use hardfp + SET(OPENBLAS_COMMIT "v0.2.20") + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) + ENDIF() + ELSE() + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() + SET(OPENBLAS_COMMIT "v0.2.20") + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) + ENDIF() + ENDIF() - ADD_DEFINITIONS(-DPADDLE_USE_LAPACK) + SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) ExternalProject_Add( - openblas + extern_openblas ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git - GIT_TAG v0.2.19 + GIT_TAG ${OPENBLAS_COMMIT} PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX= + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) - - ExternalProject_Add_Step( - openblas lapacke_install - COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h" - COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h" - COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h" - COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h" - DEPENDEES install - ) - - LIST(APPEND external_project_dependencies openblas) + SET(CBLAS_PROVIDER openblas) + IF(WITH_C_API) + INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) + # Because libopenblas.a is a symbolic link of another library, thus need to + # install the whole directory. + IF(ANDROID) + SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI}) + ELSE() + SET(TMP_INSTALL_DIR third_party/openblas/lib) + ENDIF() + INSTALL(CODE "execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib + ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} + )" + ) + INSTALL(CODE "MESSAGE(STATUS \"Installing: \" + \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" + )" + ) + ENDIF() ENDIF(NOT ${CBLAS_FOUND}) +MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) + +# FIXME(gangliao): generate cblas target to track all high performance +# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(cblas STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + +IF(NOT ${CBLAS_FOUND}) + ADD_DEPENDENCIES(cblas extern_openblas) + LIST(APPEND external_project_dependencies cblas) +ELSE() + IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + ADD_DEPENDENCIES(cblas mklml) + ENDIF() +ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 2df042d226af8308d00f7870e7d2de0eacfdf07e..fab2af362bb070a54987b6499748056f3d12a56b 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,68 +13,257 @@ # limitations under the License. INCLUDE(ExternalProject) +# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp +FIND_PACKAGE(Protobuf QUIET) +macro(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +endmacro() +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) -set(PROTOBUF_VERSION 3.1) -FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION}) +if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. + function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() -IF(PROTOBUF_FOUND) + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + endfunction() +endif() + +# Print and set the protobuf library information, +# finish this cmake process and exit from this file. +macro(PROMPT_PROTOBUF_LIB) + SET(protobuf_DEPS ${ARGN}) + + MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") + MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") + MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}") + INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) + + # Assuming that all the protobuf libraries are of the same type. + IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE STATIC) + ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE SHARED) + ELSE() + MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + ENDIF() + + ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + + ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + + ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) + + ADD_EXECUTABLE(protoc IMPORTED GLOBAL) + SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE}) + # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. + # make `protobuf_generate_cpp` happy. + SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + FOREACH(dep ${protobuf_DEPS}) + ADD_DEPENDENCIES(protobuf ${dep}) + ADD_DEPENDENCIES(protobuf_lite ${dep}) + ADD_DEPENDENCIES(libprotoc ${dep}) + ADD_DEPENDENCIES(protoc ${dep}) + ENDFOREACH() + + LIST(APPEND external_project_dependencies protobuf) + RETURN() +endmacro() +macro(SET_PROTOBUF_VERSION) EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") - IF (${PROTOBUF_VERSION} VERSION_LESS "3.1.0") - SET(PROTOBUF_FOUND OFF) +endmacro() + +set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") +if (NOT "${PROTOBUF_ROOT}" STREQUAL "") + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) + if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) + message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") + SET_PROTOBUF_VERSION() + PROMPT_PROTOBUF_LIB() + else() + message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.") + endif() +endif() + +FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) + STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) + + SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) + SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) + SET(${TARGET_NAME}_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_PROTOC_EXECUTABLE + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" + PARENT_SCOPE) + + SET(OPTIONAL_CACHE_ARGS "") + SET(OPTIONAL_ARGS "") + IF(BUILD_FOR_HOST) + SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF") + ELSE() + SET(OPTIONAL_ARGS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-Dprotobuf_WITH_ZLIB=ON" + "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" + ${EXTERNAL_OPTIONAL_ARGS}) + SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() -ENDIF(PROTOBUF_FOUND) -IF(NOT PROTOBUF_FOUND) - SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) - SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) - SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) - - IF(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) - ELSE(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE) - ENDIF(WIN32) + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + IF(MOBILE_INFERENCE) + # The reason why the official version is not used is described in + # https://github.com/PaddlePaddle/Paddle/issues/6114 + SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") + SET(PROTOBUF_TAG "v3.2.0") + IF(NOT BUILD_FOR_HOST) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") + ENDIF() + ENDIF() ExternalProject_Add( - protobuf + ${TARGET_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PROTOBUF_SOURCES_DIR} UPDATE_COMMAND "" DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + GIT_REPOSITORY ${PROTOBUF_REPO} + GIT_TAG ${PROTOBUF_TAG} CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake + ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake + ${OPTIONAL_ARGS} -Dprotobuf_BUILD_TESTS=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DZLIB_ROOT:STRING=${ZLIB_ROOT} + ${OPTIONAL_CACHE_ARGS} ) +ENDFUNCTION() - LIST(APPEND external_project_dependencies protobuf) -ENDIF(NOT PROTOBUF_FOUND) +IF(NOT MOBILE_INFERENCE) + SET(PROTOBUF_VERSION 3.1) +ELSE() + SET(PROTOBUF_VERSION 3.2) +ENDIF() +IF(CMAKE_CROSSCOMPILING) + build_protobuf(protobuf_host TRUE) + LIST(APPEND external_project_dependencies protobuf_host) + + SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) +ENDIF() -INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) +IF(NOT PROTOBUF_FOUND) + build_protobuf(extern_protobuf FALSE) + + SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} + CACHE PATH "protobuf include directory." FORCE) + SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." FORCE) + + IF(WITH_C_API) + INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) + IF(ANDROID) + INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib) + ENDIF() + ENDIF() + + IF(CMAKE_CROSSCOMPILING) + PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) + ELSE() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) + ENDIF() +ENDIF(NOT PROTOBUF_FOUND) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4e87dc49d8956d1fa6dec777efc5a63c6b0f79a5 --- /dev/null +++ b/cmake/external/pybind11.cmake @@ -0,0 +1,46 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_PYTHON) + return() +endif() + +include(ExternalProject) + +set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind) + +include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include) + +ExternalProject_Add( + extern_pybind + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/pybind/pybind11.git" + GIT_TAG "v2.1.1" + PREFIX ${PYBIND_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";") + add_library(pybind STATIC ${dummyfile}) +else() + add_library(pybind INTERFACE) +endif() + +add_dependencies(pybind extern_pybind) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 9fd3afd0998b38c18b4490e6fb1c6fe0222ed142..46c68cce324f565ec9985ef1a280d6d933f88f1f 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -1,26 +1,31 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) +IF(NOT WITH_PYTHON) + return() +ENDIF() + INCLUDE(python_module) FIND_PACKAGE(PythonInterp 2.7) FIND_PACKAGE(PythonLibs 2.7) +# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. +ADD_LIBRARY(python SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") - -IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) +IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) find_python_module(numpy REQUIRED) find_python_module(wheel REQUIRED) @@ -30,198 +35,7 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() -ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) - MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.") - ##################################### PYTHON ######################################## - SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python) - SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python) - SET(_python_DIR ${PYTHON_INSTALL_DIR}) - - IF(UNIX) - SET(PYTHON_FOUND ON) - SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE) - SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE) - SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE) - SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE) - ELSEIF(WIN32) - SET(PYTHON_FOUND ON) - SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE) - SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE) - SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE) - SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE) - ELSE() - MESSAGE(FATAL_ERROR "Unknown system !") - ENDIF() - - IF(APPLE) - LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS - -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON - ) - ENDIF() - - SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS) - - # Force Python build to "Release". - IF(CMAKE_CONFIGURATION_TYPES) - SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR}) - SET(CMAKE_CFG_INTDIR "Release") - ELSE() - LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS - -DCMAKE_BUILD_TYPE:STRING=Release - ) - ENDIF() - - ExternalProject_Add(python - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git" - PREFIX ${PYTHON_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DPYTHON_VERSION=2.7.12 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR} - -DBUILD_LIBPYTHON_SHARED:BOOL=OFF - -DUSE_SYSTEM_LIBRARIES:BOOL=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR} - -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES} - -DDOWNLOAD_SOURCES:BOOL=ON - -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF - ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS} - ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS} - DEPENDS zlib - ) - - SET(py_env - PATH=${PYTHON_INSTALL_DIR}/bin - PYTHONHOME=${PYTHON_INSTALL_DIR} - PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH}) - #################################################################################### +ENDIF(PYTHONINTERP_FOUND) - ##################################### SETUPTOOLS ################################### - SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools) - ExternalProject_Add(setuptools - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SETUPTOOLS_SOURCES_DIR} - URL "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz" - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python zlib - ) - ##################################################################################### - - ##################################### SIX ########################################### - SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six) - ExternalProject_Add(six - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SIX_SOURCES_DIR} - URL https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python setuptools - ) - ##################################################################################### - - ##################################### CYTHON ######################################## - SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython) - ExternalProject_Add(cython - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${CYTHON_SOURCES_DIR} - URL https://github.com/cython/cython/archive/0.25.2.tar.gz - GIT_TAG 0.25.2 - BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND "" - PATCH_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python - ) - #################################################################################### - - ##################################### NUMPY ######################################## - SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy) - SET(NUMPY_TAG_VERSION "v1.11.3") - SET(NUMPY_VERSION "1.11.3") - - SET(EGG_NAME "") - SET(PYTHON_NUMPY_INCLUDE_DIR "") - IF(WIN32) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg") - ELSE(WIN32) - IF(APPLE) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}") - ELSE(APPLE) - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux") - SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux") - ENDIF(APPLE) - - FOREACH(suffix x86_64 intel fat64 fat32 universal) - LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include) - ENDFOREACH() - ENDIF(WIN32) - - ExternalProject_Add(numpy - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY https://github.com/numpy/numpy.git - GIT_TAG ${NUMPY_TAG_VERSION} - CONFIGURE_COMMAND "" - UPDATE_COMMAND "" - PREFIX ${NUMPY_SOURCES_DIR} - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py build - INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - BUILD_IN_SOURCE 1 - DEPENDS python setuptools cython - ) - #################################################################################### - - ##################################### WHEEL ######################################## - SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel) - ExternalProject_Add(wheel - ${EXTERNAL_PROJECT_LOG_ARGS} - URL https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz - PREFIX ${WHEEL_SOURCES_DIR} - CONFIGURE_COMMAND "" - UPDATE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - BUILD_IN_SOURCE 1 - DEPENDS python setuptools - ) - #################################################################################### - - ################################### PROTOBUF ####################################### - SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf) - ExternalProject_Add(python-protobuf - ${EXTERNAL_PROJECT_LOG_ARGS} - URL https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz - URL_MD5 38b5fb160c768d2f8444d0c6d637ff91 - PREFIX ${PY_PROTOBUF_SOURCES_DIR} - BUILD_IN_SOURCE 1 - PATCH_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py build - INSTALL_COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py install - DEPENDS python setuptools six - ) - #################################################################################### - - LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy) - -ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) - -IF(WITH_PYTHON) - INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) - INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) -ELSE() - SET(PYTHON_LIBRARIES "") -ENDIF() +INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake index 744c766ee7b067058b2cb4aa7f7b761cbb9778d4..9db457c7b2d61228e5d5af6827c4cda11a20a463 100644 --- a/cmake/external/swig.cmake +++ b/cmake/external/swig.cmake @@ -1,17 +1,21 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +IF(NOT WITH_SWIG_PY) + return() +ENDIF() + FIND_PACKAGE(SWIG) IF(NOT SWIG_FOUND) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 293070c3cfcc1196001f64469f3254289b0de792..a8e1aca49c97df256b1269c286b0bce7732fa932 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -1,40 +1,33 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +IF(MOBILE_INFERENCE) + return() +ENDIF() + INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) -SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) - -INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) - -SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -IF(WIN32) - SET(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE) -ELSE(WIN32) - IF(APPLE) - SET(_warpctc_SHARED_SUFFIX dylib) - ELSE(APPLE) - SET(_warpctc_SHARED_SUFFIX so) - ENDIF(APPLE) - - SET(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) -ENDIF(WIN32) +SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) +SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) SET(USE_OMP OFF) @@ -43,26 +36,35 @@ ELSE() ENDIF() ExternalProject_Add( - warpctc + extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git" + GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09 PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - CMAKE_ARGS -DWITH_GPU=${WITH_GPU} - CMAKE_ARGS -DWITH_OMP=${USE_OMP} - CMAKE_ARGS -DWITH_TORCH=OFF - CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - CMAKE_ARGS -DBUILD_SHARED=ON - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) + +ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) +ADD_DEPENDENCIES(warpctc extern_warpctc) + LIST(APPEND external_project_dependencies warpctc) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 45ca5542b7dc30216b45487782f849b93c5f8fca..1638cd8fdfc34575132462859e056a1907f0b2f1 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -1,11 +1,11 @@ # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,18 +34,30 @@ ExternalProject_Add( GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} - CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF - CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON - CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON - CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_MACOSX_RPATH=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) LIST(APPEND external_project_dependencies zlib) +ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) + IF(ANDROID) + INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib) + ENDIF() +ENDIF() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 7eb92efcb00fa18461e61e0508b485c13ef23a1f..1120677a37e0d44163816b66600121c8f0d545af 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -109,7 +109,11 @@ set(COMMON_FLAGS -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare - -Wno-error=unused-local-typedefs) + -Wno-error=unused-local-typedefs + -Wno-error=parentheses-equality # Warnings in pybind11 + -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 + -Wno-error=terminate # Warning in PADDLE_ENFORCE +) set(GPU_COMMON_FLAGS -fPIC @@ -122,11 +126,14 @@ set(GPU_COMMON_FLAGS -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. + -Wno-error=array-bounds # Warnings in Eigen::array ) if (APPLE) - # On Mac OS X build fat binaries with x86_64 architectures by default. - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + if(NOT CMAKE_CROSSCOMPILING) + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + endif() else() set(GPU_COMMON_FLAGS -Wall @@ -144,56 +151,3 @@ endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() - - -set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. -# So, don't set these flags here. -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) -LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) -endif() - -function(specify_cuda_arch cuda_version cuda_arch) - if(${cuda_version} VERSION_GREATER "8.0") - foreach(capability 61 62) - if(${cuda_arch} STREQUAL ${capability}) - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() - endforeach() - elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53") - list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}") - endif() -endfunction() - -# Common gpu architectures: Kepler, Maxwell -foreach(capability 30 35 50) - list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") -endforeach() - -if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0") - list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") -endif() - -# Modern gpu architectures: Pascal -if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") - list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") -endif() - -# Custom gpu architecture -set(CUDA_ARCH) - -if(CUDA_ARCH) - specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH}) -endif() - -set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) diff --git a/cmake/generic.cmake b/cmake/generic.cmake new file mode 100644 index 0000000000000000000000000000000000000000..66c8e3ad7ef7c80c1f388c25983425a0db5c0220 --- /dev/null +++ b/cmake/generic.cmake @@ -0,0 +1,516 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# generic.cmake defines CMakes functions that look like Bazel's +# building rules (https://bazel.build/). +# +# +# ------------------------------------------- +# C++ CUDA C++ Go +# ------------------------------------------- +# cc_library nv_library go_library +# cc_binary nv_binary go_binary +# cc_test nv_test go_test +# ------------------------------------------- +# +# To build a static library example.a from example.cc using the system +# compiler (like GCC): +# +# cc_library(example SRCS example.cc) +# +# To build a static library example.a from multiple source files +# example{1,2,3}.cc: +# +# cc_library(example SRCS example1.cc example2.cc example3.cc) +# +# To build a shared library example.so from example.cc: +# +# cc_library(example SHARED SRCS example.cc) +# +# To build a library using Nvidia's NVCC from .cu file(s), use the nv_ +# prefixed version: +# +# nv_library(example SRCS example.cu) +# +# To specify that a library new_example.a depends on other libraies: +# +# cc_library(new_example SRCS new_example.cc DEPS example) +# +# Static libraries can be composed of other static libraries: +# +# cc_library(composed DEPS dependent1 dependent2 dependent3) +# +# To build an executable binary file from some source files and +# dependent libraries: +# +# cc_binary(example SRCS main.cc something.cc DEPS example1 example2) +# +# To build an executable binary file using NVCC, use the nv_ prefixed +# version: +# +# nv_binary(example SRCS main.cc something.cu DEPS example1 example2) +# +# To build a unit test binary, which is an executable binary with +# GoogleTest linked: +# +# cc_test(example_test SRCS example_test.cc DEPS example) +# +# To build a unit test binary using NVCC, use the nv_ prefixed version: +# +# nv_test(example_test SRCS example_test.cu DEPS example) +# +# It is pretty often that executable and test binaries depend on +# pre-defined external libaries like glog and gflags defined in +# /cmake/external/*.cmake: +# +# cc_test(example_test SRCS example_test.cc DEPS example glog gflags) +# +# To build a go static library using Golang, use the go_ prefixed version: +# +# go_library(example STATIC) +# +# To build a go shared library using Golang, use the go_ prefixed version: +# +# go_library(example SHARED) +# + +# including binary directory for generated headers. +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +if(NOT APPLE AND NOT ANDROID) + find_package(Threads REQUIRED) + link_libraries(${CMAKE_THREAD_LIBS_INIT}) + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") +endif(NOT APPLE AND NOT ANDROID) + +function(merge_static_libs TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + + # Get all propagation dependencies from the merged libraries + foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + list(REMOVE_DUPLICATES libs_deps) + + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + if(APPLE) # Use OSX's libtool to merge archives + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + set(libfiles ${libfiles} $) + endforeach() + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" + COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles} + ) + else() # general UNIX: use "ar" to extract objects and re-add to a common lib + set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir) + + foreach(lib ${libs}) + set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library + set(objdir ${target_DIR}/${lib}.objdir) + + add_custom_command(OUTPUT ${objdir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) + + add_custom_command(OUTPUT ${objlistfile} + COMMAND ${CMAKE_AR} -x "$" + COMMAND ${CMAKE_AR} -t "$" > ${objlistfile} + DEPENDS ${lib} ${objdir} + WORKING_DIRECTORY ${objdir}) + + list(APPEND target_OBJS "${objlistfile}") + endforeach() + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs} ${target_OBJS}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + # Get the file name of the generated library + set(target_LIBNAME "$") + + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'` + COMMAND ${CMAKE_RANLIB} ${target_LIBNAME} + WORKING_DIRECTORY ${target_DIR}) + endif() +endfunction(merge_static_libs) + +function(cc_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if (cc_library_SRCS) + if (cc_library_SHARED OR cc_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + endif() + if (cc_library_DEPS) + add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + endif() + + # cpplint code style + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS}) + + else(cc_library_SRCS) + if(cc_library_DEPS) + merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) + else() + message(FATAL "Please specify source file or library in cc_library.") + endif() + endif(cc_library_SRCS) +endfunction(cc_library) + +function(cc_binary TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_binary_SRCS}) + if(cc_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) + endif() +endfunction(cc_binary) + +function(cc_test TARGET_NAME) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_SRCS}) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() +endfunction(cc_test) + +function(nv_library TARGET_NAME) + if (WITH_GPU) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(nv_library_SRCS) + if (nv_library_SHARED OR nv_library_shared) # build *.so + cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) + else() + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + endif() + if (nv_library_DEPS) + add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS}) + else(nv_library_SRCS) + if (nv_library_DEPS) + merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(nv_library_SRCS) + endif() +endfunction(nv_library) + +function(nv_binary TARGET_NAME) + if (WITH_GPU) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + if(nv_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + endif() + endif() +endfunction(nv_binary) + +function(nv_test TARGET_NAME) + if (WITH_GPU AND WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_test(${TARGET_NAME} ${TARGET_NAME}) + endif() +endfunction(nv_test) + +function(go_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs DEPS) + cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (go_library_SHARED OR go_library_shared) + set(BUILD_MODE "-buildmode=c-shared") + set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + else() + set(BUILD_MODE "-buildmode=c-archive") + set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + endif() + + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # This custom command will always run since it depends on a not + # existing file. + add_custom_command( + OUTPUT dummy_rebulid_${TARGET_NAME} + COMMAND cmake -E touch ${dummyfile} + ) + # Create a custom target that depends on the custom command output + # file, so the custom command can be referenced as a dependency by + # `add_dependencies`. + add_custom_target(rebuild_${TARGET_NAME} + DEPENDS dummy_rebulid_${TARGET_NAME} + ) + + # Add dummy code to support `make target_name` under Terminal Command + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + if (go_library_SHARED OR go_library_shared) + add_library(${TARGET_NAME} SHARED ${dummyfile}) + else() + add_library(${TARGET_NAME} STATIC ${dummyfile}) + endif() + if(go_library_DEPS) + add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + endif(go_library_DEPS) + + # The "source file" of the library is `${dummyfile}` which never + # change, so the target will never rebuild. Make the target depends + # on the custom command that touches the library "source file", so + # rebuild will always happen. + add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME}) + + set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}") + + file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${${TARGET_NAME}_LIB_PATH}" + # Golang build source code + COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + -o "${${TARGET_NAME}_LIB_PATH}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" + # must run under GOPATH + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_dependencies(${TARGET_NAME} go_vendor) +endfunction(go_library) + +function(go_binary TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + + add_custom_command(OUTPUT ${TARGET_NAME}_timestamp + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) + install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) +endfunction(go_binary) + +function(go_test TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs DEPS) + cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race + -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + ".${CMAKE_CURRENT_SOURCE_REL_DIR}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +endfunction(go_test) + +# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support +# Usage: +# paddle_protobuf_generate_cpp( ) + +function(paddle_protobuf_generate_cpp SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files") + return() + endif() + + set(${SRCS}) + set(${HDRS}) + + if (MOBILE_INFERENCE) + set(EXTRA_FLAG "lite:") + else() + set(EXTRA_FLAG "") + endif() + + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${_protobuf_protoc_src}") + list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") + + add_custom_command( + OUTPUT "${_protobuf_protoc_src}" + "${_protobuf_protoc_hdr}" + + COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + -I${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() + + +function(proto_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(proto_srcs) + set(proto_hdrs) + paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) + cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) +endfunction() + +function(py_proto_compile TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(py_srcs) + protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) + add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) +endfunction() + +function(py_test TARGET_NAME) + if(WITH_TESTING) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} + COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python + ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() +endfunction() + +# grpc_library generate grpc code using grpc_cpp_plugin and protoc +# then build the generated protobuf code and grpc code with your +# implementation source codes together. Use SRCS argument for your +# implementation source files and PROTO argument for your .proto +# files. +# +# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep) + +function(grpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating grpc ${grpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") + set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") + set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") + cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") + + add_custom_command( + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + + # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it + # as compiler warnings instead of error. Should try remove the warnings also. + set_source_files_properties( + ${grpc_grpc_srcs} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") + + set_source_files_properties( + ${grpc_library_SRCS} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") +endfunction() diff --git a/cmake/make_resource.py b/cmake/make_resource.py new file mode 100644 index 0000000000000000000000000000000000000000..a9241b0e3e36c2e79c79e46b4f9114b7f6947341 --- /dev/null +++ b/cmake/make_resource.py @@ -0,0 +1,11 @@ +import os +import re +import sys + +res = sys.argv[1] +out = sys.argv[2] +var = re.sub(r'[ .-]', '_', os.path.basename(res)) + +open(out, "w").write("const unsigned char " + var + "[] = {" + ",".join([ + "0x%02x" % ord(c) for c in open(res).read() +]) + ",0};\n" + "const unsigned " + var + "_size = sizeof(" + var + ");\n") diff --git a/cmake/package.cmake b/cmake/package.cmake index ff49a2d08e8f6004320acfce266339aa301eb9c4..79e02147f3f7cc19c1bf45d8a1d208a9a32416ff 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "") set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") set(CPACK_DEBIAN_PACKAGE_SECTION Devel) set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) -set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst") +set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") #set(CPACK_GENERATOR "DEB") # Start cpack include (CMakePackageConfigHelpers) diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake index 9ff1a77cac74fb1bdfe470a78d225ed1767bb1b5..b698f3bdc3ff586a72badee3e0109e29285b457f 100644 --- a/cmake/rdma.cmake +++ b/cmake/rdma.cmake @@ -10,7 +10,7 @@ if(WITH_RDMA) function(generate_rdma_links) #redirect to current DIR to isolate the pollution from system runtime environment - #it can benifits unified control for different gcc environment. + #it can benifits unified control for different gcc environment. #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version #runtime libraries that will crash process while loading it. That redirect trick #can fix it. @@ -19,7 +19,9 @@ if(WITH_RDMA) COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1 COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1 - COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so + COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so + COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1 + COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endfunction(generate_rdma_links) @@ -44,7 +46,7 @@ if(WITH_RDMA) RDMA_INC_XIO AND RDMA_INC_EVENT AND RDMA_INC_NUMA AND - RDMA_LIB_SXISOCK AND + RDMA_LIB_SXISOCK AND RDMA_LIB_XIO AND RDMA_LIB_EVENT AND RDMA_LIB_EVENT_CORE AND @@ -53,19 +55,19 @@ if(WITH_RDMA) RDMA_LIB_NUMA ) - set(RDMA_INC_DIR - ${RDMA_INC_SXISOCK} + set(RDMA_INC_DIR + ${RDMA_INC_SXISOCK} ${RDMA_INC_XIO} ${RDMA_INC_EVENT} ${RDMA_INC_NUMA}) - set(RDMA_LIBS - ${RDMA_LIB_SXISOCK} - ${RDMA_LIB_XIO} - ${RDMA_LIB_EVENT} - ${RDMA_LIB_EVENT_CORE} - ${RDMA_LIB_EVENT_EXTRA} - ${RDMA_LIB_EVENT_PTHREADS} - ${RDMA_LIB_NUMA} + set(RDMA_LIBS + ${RDMA_LIB_SXISOCK} + ${RDMA_LIB_XIO} + ${RDMA_LIB_EVENT} + ${RDMA_LIB_EVENT_CORE} + ${RDMA_LIB_EVENT_EXTRA} + ${RDMA_LIB_EVENT_PTHREADS} + ${RDMA_LIB_NUMA} ) set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma") include_directories("${RDMA_INC_DIR}") diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 46035a908b588861607a25d3a21cf34b7b6fd4b8..53c2de332ea74b06d1bd6e5bb119cad6af27ed01 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -1,27 +1,28 @@ # This file is use to check all support level of AVX on your machine # so that PaddlePaddle can unleash the vectorization power of muticore. -INCLUDE(CheckCXXSourceRuns) -INCLUDE(CheckCXXSourceCompiles) +include(CheckCXXSourceRuns) +include(CheckCXXSourceCompiles) -IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(MMX_FLAG "-mmmx") set(SSE2_FLAG "-msse2") set(SSE3_FLAG "-msse3") - SET(AVX_FLAG "-mavx") - SET(AVX2_FLAG "-mavx2") -ELSEIF(MSVC) + set(AVX_FLAG "-mavx") + set(AVX2_FLAG "-mavx2") +elseif(MSVC) set(MMX_FLAG "/arch:MMX") set(SSE2_FLAG "/arch:SSE2") set(SSE3_FLAG "/arch:SSE3") SET(AVX_FLAG "/arch:AVX") SET(AVX2_FLAG "/arch:AVX2") -ENDIF() +endif() set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) # Check MMX set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) +set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -32,6 +33,7 @@ int main() # Check SSE2 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG}) +set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -42,6 +44,7 @@ int main() # Check SSE3 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG}) +set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -55,6 +58,7 @@ int main() # Check AVX set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() @@ -67,6 +71,7 @@ int main() # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) CHECK_CXX_SOURCE_RUNS(" #include int main() diff --git a/cmake/system.cmake b/cmake/system.cmake index 3ca06665ab2385e34302a6bcce7ada549ea1e247..396bd1a0797edea0522bb1f02349373563b7726a 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -13,9 +13,9 @@ # limitations under the License. # Detects the OS and sets appropriate variables. -# CMAKE_SYSTEM_NAME only give us a coarse-grained name, -# but the name like centos is necessary in some scenes -# to distinguish system for customization. +# CMAKE_SYSTEM_NAME only give us a coarse-grained name of the OS CMake is +# building for, but the host processor name like centos is necessary +# in some scenes to distinguish system for customization. # # for instance, protobuf libs path is /lib64 # on CentOS, but /lib on other systems. @@ -24,10 +24,15 @@ IF(WIN32) SET(HOST_SYSTEM "win32") ELSE(WIN32) IF(APPLE) - EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}") - SET(MACOS_VERSION ${VERSION}) SET(HOST_SYSTEM "macosx") + EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") + IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) + # Set cache variable - end user may change this during ccmake or cmake-gui configure. + SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING + "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.") + ENDIF() + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") ELSE(APPLE) IF(EXISTS "/etc/issue") @@ -43,6 +48,8 @@ ELSE(WIN32) ELSEIF(LINUX_ISSUE MATCHES "Fedora") SET(HOST_SYSTEM "fedora") ENDIF() + + STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") ENDIF(EXISTS "/etc/issue") IF(EXISTS "/etc/redhat-release") @@ -64,12 +71,21 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) -MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}") +MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") +# configuration for cross-compiling IF(DEFINED CMAKE_SYSTEM_NAME) + INCLUDE(cross_compiling/host) IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") SET(ANDROID TRUE) + INCLUDE(cross_compiling/android) + ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi") + SET(RPI TRUE) + INCLUDE(cross_compiling/raspberry_pi) + ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + SET(IOS TRUE) + INCLUDE(cross_compiling/ios) ENDIF() ENDIF() diff --git a/cmake/util.cmake b/cmake/util.cmake index 099a85809d93e01772bf8c5c329fd9055ee4f054..0dc33ce385175d1e2dc454d41db467d4b9d9cf9a 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -25,7 +25,9 @@ function(target_circle_link_libraries TARGET_NAME) endif() endforeach() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - list(APPEND LIBS "-undefined dynamic_lookup") + if(NOT IOS_ENABLE_BITCODE) + list(APPEND LIBS "-undefined dynamic_lookup") + endif() endif() list(REVERSE libsInArgn) target_link_libraries(${TARGET_NAME} @@ -71,29 +73,52 @@ function(link_paddle_exe TARGET_NAME) generate_rdma_links() endif() - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_pserver - paddle_trainer_lib - paddle_network - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) + if(MOBILE_INFERENCE) + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + else() + target_circle_link_libraries(${TARGET_NAME} + ARCHIVE_START + paddle_gserver + paddle_function + ARCHIVE_END + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_math + paddle_utils + paddle_parameter + paddle_proto + paddle_cuda + paddle_optimizer + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${CMAKE_DL_LIBS} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + endif() if(ANDROID) target_link_libraries(${TARGET_NAME} log) endif(ANDROID) + if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif() + add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() @@ -117,7 +142,6 @@ endfunction() macro(add_unittest_without_exec TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) link_paddle_test(${TARGET_NAME}) - add_style_check_target(${TARGET_NAME} ${ARGN}) endmacro() # add_unittest @@ -138,17 +162,9 @@ macro(add_simple_unittest TARGET_NAME) endmacro() # Creates C resources file from files in given resource file -function(create_resources res_file output) - # Create empty output file - file(WRITE ${output} "") - # Get short filename - string(REGEX MATCH "([^/]+)$" filename ${res_file}) - # Replace filename spaces & extension separator for C compatibility - string(REGEX REPLACE "\\.| |-" "_" filename ${filename}) - # Read hex data from file - file(READ ${res_file} filedata HEX) - # Convert hex data for C compatibility - string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata}) - # Append data to output file - file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}0};\nconst unsigned ${filename}_size = sizeof(${filename});\n") +function(create_resources res_file output_file) + add_custom_command( + OUTPUT ${output_file} + COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} + DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake index ac1583a24c828629c46cb9cf4e965f8da2273732..cde650128a068faf32f4abfff5cdfdeb656d8577 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -4,7 +4,7 @@ set(tmp_version "HEAD") while ("${PADDLE_VERSION}" STREQUAL "") execute_process( COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version} - WORKING_DIRECTORY ${PROJ_ROOT} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE GIT_TAG_NAME RESULT_VARIABLE GIT_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/demo/image_classification/.gitignore b/demo/image_classification/.gitignore deleted file mode 100644 index 6a05b8f6632db0977fceade8b48a89b9f7f6e6cc..0000000000000000000000000000000000000000 --- a/demo/image_classification/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -data/cifar-10-batches-py -data/cifar-out -cifar_vgg_model/* -plot.png -train.log -image_provider_copy_1.py -*pyc -train.list -test.list diff --git a/demo/image_classification/api_v2_resnet.py b/demo/image_classification/api_v2_resnet.py deleted file mode 100644 index 19d20540780becf504973a23b50445d4b65dc2ef..0000000000000000000000000000000000000000 --- a/demo/image_classification/api_v2_resnet.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.v2 as paddle - -__all__ = ['resnet_cifar10'] - - -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - active_type=paddle.activation.Relu(), - ch_in=None): - tmp = paddle.layer.img_conv( - input=input, - filter_size=filter_size, - num_channels=ch_in, - num_filters=ch_out, - stride=stride, - padding=padding, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.batch_norm(input=tmp, act=active_type) - - -def shortcut(ipt, n_in, n_out, stride): - if n_in != n_out: - return conv_bn_layer(ipt, n_out, 1, stride, 0, - paddle.activation.Linear()) - else: - return ipt - - -def basicblock(ipt, ch_out, stride): - ch_in = ch_out * 2 - tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear()) - short = shortcut(ipt, ch_in, ch_out, stride) - return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu()) - - -def layer_warp(block_func, ipt, features, count, stride): - tmp = block_func(ipt, features, stride) - for i in range(1, count): - tmp = block_func(tmp, features, 1) - return tmp - - -def resnet_cifar10(ipt, depth=32): - # depth should be one of 20, 32, 44, 56, 110, 1202 - assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 - nStages = {16, 64, 128} - conv1 = conv_bn_layer( - ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = paddle.layer.img_pool( - input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg()) - return pool diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py deleted file mode 100644 index 53cffa6fb4e8b2e19725f4f44bf7b9ffffb25232..0000000000000000000000000000000000000000 --- a/demo/image_classification/api_v2_train.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License - -import sys - -import paddle.v2 as paddle - -from api_v2_vgg import vgg_bn_drop - - -def main(): - datadim = 3 * 32 * 32 - classdim = 10 - - # PaddlePaddle init - paddle.init(use_gpu=False, trainer_count=1) - - image = paddle.layer.data( - name="image", type=paddle.data_type.dense_vector(datadim)) - - # Add neural network config - # option 1. resnet - # net = resnet_cifar10(image, depth=32) - # option 2. vgg - net = vgg_bn_drop(image) - - out = paddle.layer.fc(input=net, - size=classdim, - act=paddle.activation.Softmax()) - - lbl = paddle.layer.data( - name="label", type=paddle.data_type.integer_value(classdim)) - cost = paddle.layer.classification_cost(input=out, label=lbl) - - # Create parameters - parameters = paddle.parameters.create(cost) - - # Create optimizer - momentum_optimizer = paddle.optimizer.Momentum( - momentum=0.9, - regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128), - learning_rate=0.1 / 128.0, - learning_rate_decay_a=0.1, - learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - batch_size=128) - - # End batch and end pass event handler - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.EndPass): - result = trainer.test( - reader=paddle.batch( - paddle.dataset.cifar.test10(), batch_size=128), - feeding={'image': 0, - 'label': 1}) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - - # Create trainer - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=momentum_optimizer) - trainer.train( - reader=paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=50000), - batch_size=128), - num_passes=5, - event_handler=event_handler, - feeding={'image': 0, - 'label': 1}) - - -if __name__ == '__main__': - main() diff --git a/demo/image_classification/api_v2_vgg.py b/demo/image_classification/api_v2_vgg.py deleted file mode 100644 index 1e0e6b93adde30425f17aa9cd07542275f4fec37..0000000000000000000000000000000000000000 --- a/demo/image_classification/api_v2_vgg.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.v2 as paddle - -__all__ = ['vgg_bn_drop'] - - -def vgg_bn_drop(input): - def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): - return paddle.networks.img_conv_group( - input=ipt, - num_channels=num_channels, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=paddle.activation.Relu(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=paddle.pooling.Max()) - - conv1 = conv_block(input, 64, 2, [0.3, 0], 3) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5) - fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear()) - bn = paddle.layer.batch_norm( - input=fc1, - act=paddle.activation.Relu(), - layer_attr=paddle.attr.Extra(drop_rate=0.5)) - fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear()) - return fc2 diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh deleted file mode 100755 index 532178d627fe19ab8ea79ecae73e5328b5294bea..0000000000000000000000000000000000000000 --- a/demo/image_classification/data/download_cifar.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -tar zxf cifar-10-python.tar.gz -rm cifar-10-python.tar.gz -rm -rf cifar-out/* -echo Converting CIFAR data to images..... -python process_cifar.py ./cifar-10-batches-py ./cifar-out diff --git a/demo/image_classification/data/process_cifar.py b/demo/image_classification/data/process_cifar.py deleted file mode 100644 index db6666189e5b8008a6b66fb64afcdf98980e72bb..0000000000000000000000000000000000000000 --- a/demo/image_classification/data/process_cifar.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import sys -import os -import PIL.Image as Image -""" - Usage: python process_cifar input_dir output_dir -""" - - -def mkdir_not_exist(path): - """ - Make dir if the path does not exist. - path: the path to be created. - """ - if not os.path.exists(path): - os.mkdir(path) - - -def create_dir_structure(output_dir): - """ - Create the directory structure for the directory. - output_dir: the direcotry structure path. - """ - mkdir_not_exist(os.path.join(output_dir)) - mkdir_not_exist(os.path.join(output_dir, "train")) - mkdir_not_exist(os.path.join(output_dir, "test")) - - -def convert_batch(batch_path, label_set, label_map, output_dir, data_split): - """ - Convert CIFAR batch to the structure of Paddle format. - batch_path: the batch to be converted. - label_set: the set of labels. - output_dir: the output path. - data_split: whether it is training or testing data. - """ - data = np.load(batch_path) - for data, label, filename in zip(data['data'], data['labels'], - data['filenames']): - data = data.reshape((3, 32, 32)) - data = np.transpose(data, (1, 2, 0)) - label = label_map[label] - output_dir_this = os.path.join(output_dir, data_split, str(label)) - output_filename = os.path.join(output_dir_this, filename) - if not label in label_set: - label_set[label] = True - mkdir_not_exist(output_dir_this) - Image.fromarray(data).save(output_filename) - - -if __name__ == '__main__': - input_dir = sys.argv[1] - output_dir = sys.argv[2] - num_batch = 5 - create_dir_structure(output_dir) - label_map = { - 0: "airplane", - 1: "automobile", - 2: "bird", - 3: "cat", - 4: "deer", - 5: "dog", - 6: "frog", - 7: "horse", - 8: "ship", - 9: "truck" - } - labels = {} - for i in range(1, num_batch + 1): - convert_batch( - os.path.join(input_dir, "data_batch_%d" % i), labels, label_map, - output_dir, "train") - convert_batch( - os.path.join(input_dir, "test_batch"), {}, label_map, output_dir, - "test") diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py deleted file mode 100644 index 6a315ff094c1af5f8250d8a22ff96740dddd9808..0000000000000000000000000000000000000000 --- a/demo/image_classification/image_provider.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -import random - -import paddle.utils.image_util as image_util -from paddle.trainer.PyDataProvider2 import * - - -# -# {'img_size': 32, -# 'settings': a global object, -# 'color': True, -# 'mean_img_size': 32, -# 'meta': './data/cifar-out/batches/batches.meta', -# 'num_classes': 10, -# 'file_list': ('./data/cifar-out/batches/train_batch_000',), -# 'use_jpeg': True} -def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, - is_train, **kwargs): - settings.mean_img_size = mean_img_size - settings.img_size = img_size - settings.num_classes = num_classes - settings.color = color - settings.is_train = is_train - - if settings.color: - settings.img_raw_size = settings.img_size * settings.img_size * 3 - else: - settings.img_raw_size = settings.img_size * settings.img_size - - settings.meta_path = meta - settings.use_jpeg = use_jpeg - - settings.img_mean = image_util.load_meta(settings.meta_path, - settings.mean_img_size, - settings.img_size, settings.color) - - settings.logger.info('Image size: %s', settings.img_size) - settings.logger.info('Meta path: %s', settings.meta_path) - settings.input_types = { - 'image': dense_vector(settings.img_raw_size), - 'label': integer_value(settings.num_classes) - } - - settings.logger.info('DataProvider Initialization finished') - - -@provider(init_hook=hook, min_pool_size=0) -def processData(settings, file_list): - """ - The main function for loading data. - Load the batch, iterate all the images and labels in this batch. - file_list: the batch file list. - """ - with open(file_list, 'r') as fdata: - lines = [line.strip() for line in fdata] - random.shuffle(lines) - for file_name in lines: - with io.open(file_name.strip(), 'rb') as file: - data = cPickle.load(file) - indexes = list(range(len(data['images']))) - if settings.is_train: - random.shuffle(indexes) - for i in indexes: - if settings.use_jpeg == 1: - img = image_util.decode_jpeg(data['images'][i]) - else: - img = data['images'][i] - img_feat = image_util.preprocess_img( - img, settings.img_mean, settings.img_size, - settings.is_train, settings.color) - label = data['labels'][i] - yield { - 'image': img_feat.astype('float32'), - 'label': int(label) - } diff --git a/demo/image_classification/image_util.py b/demo/image_classification/image_util.py deleted file mode 100644 index f09605394a19e09d92e555eeefb0b5646625b618..0000000000000000000000000000000000000000 --- a/demo/image_classification/image_util.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -from PIL import Image -from cStringIO import StringIO - - -def resize_image(img, target_size): - """ - Resize an image so that the shorter edge has length target_size. - img: the input image to be resized. - target_size: the target resized image size. - """ - percent = (target_size / float(min(img.size[0], img.size[1]))) - resized_size = int(round(img.size[0] * percent)), int( - round(img.size[1] * percent)) - img = img.resize(resized_size, Image.ANTIALIAS) - return img - - -def flip(im): - """ - Return the flipped image. - Flip an image along the horizontal direction. - im: input image, (H x W x K) ndarrays - """ - if len(im.shape) == 3: - return im[:, :, ::-1] - else: - return im[:, ::-1] - - -def crop_img(im, inner_size, color=True, test=True): - """ - Return cropped image. - The size of the cropped image is inner_size * inner_size. - im: (K x H x W) ndarrays - inner_size: the cropped image size. - color: whether it is color image. - test: whether in test mode. - If False, does random cropping and flipping. - If True, crop the center of images. - """ - if color: - height, width = max(inner_size, im.shape[1]), max(inner_size, - im.shape[2]) - padded_im = np.zeros((3, height, width)) - startY = (height - im.shape[1]) / 2 - startX = (width - im.shape[2]) / 2 - endY, endX = startY + im.shape[1], startX + im.shape[2] - padded_im[:, startY:endY, startX:endX] = im - else: - im = im.astype('float32') - height, width = max(inner_size, im.shape[0]), max(inner_size, - im.shape[1]) - padded_im = np.zeros((height, width)) - startY = (height - im.shape[0]) / 2 - startX = (width - im.shape[1]) / 2 - endY, endX = startY + im.shape[0], startX + im.shape[1] - padded_im[startY:endY, startX:endX] = im - if test: - startY = (height - inner_size) / 2 - startX = (width - inner_size) / 2 - else: - startY = np.random.randint(0, height - inner_size + 1) - startX = np.random.randint(0, width - inner_size + 1) - endY, endX = startY + inner_size, startX + inner_size - if color: - pic = padded_im[:, startY:endY, startX:endX] - else: - pic = padded_im[startY:endY, startX:endX] - if (not test) and (np.random.randint(2) == 0): - pic = flip(pic) - return pic - - -def decode_jpeg(jpeg_string): - np_array = np.array(Image.open(StringIO(jpeg_string))) - if len(np_array.shape) == 3: - np_array = np.transpose(np_array, (2, 0, 1)) - return np_array - - -def preprocess_img(im, img_mean, crop_size, is_train, color=True): - """ - Does data augmentation for images. - If is_train is false, cropping the center region from the image. - If is_train is true, randomly crop a region from the image, - and randomy does flipping. - im: (K x H x W) ndarrays - """ - im = im.astype('float32') - test = not is_train - pic = crop_img(im, crop_size, color, test) - pic -= img_mean - return pic.flatten() - - -def load_meta(meta_path, mean_img_size, crop_size, color=True): - """ - Return the loaded meta file. - Load the meta image, which is the mean of the images in the dataset. - The mean image is subtracted from every input image so that the expected mean - of each input image is zero. - """ - mean = np.load(meta_path)['data_mean'] - border = (mean_img_size - crop_size) / 2 - if color: - assert (mean_img_size * mean_img_size * 3 == mean.shape[0]) - mean = mean.reshape(3, mean_img_size, mean_img_size) - mean = mean[:, border:border + crop_size, border:border + - crop_size].astype('float32') - else: - assert (mean_img_size * mean_img_size == mean.shape[0]) - mean = mean.reshape(mean_img_size, mean_img_size) - mean = mean[border:border + crop_size, border:border + - crop_size].astype('float32') - return mean - - -def load_image(img_path, is_color=True): - """ - Load image and return. - img_path: image path. - is_color: is color image or not. - """ - img = Image.open(img_path) - img.load() - return img - - -def oversample(img, crop_dims): - """ - image : iterable of (H x W x K) ndarrays - crop_dims: (height, width) tuple for the crops. - Returned data contains ten crops of input image, namely, - four corner patches and the center patch as well as their - horizontal reflections. - """ - # Dimensions and center. - im_shape = np.array(img[0].shape) - crop_dims = np.array(crop_dims) - im_center = im_shape[:2] / 2.0 - - # Make crop coordinates - h_indices = (0, im_shape[0] - crop_dims[0]) - w_indices = (0, im_shape[1] - crop_dims[1]) - crops_ix = np.empty((5, 4), dtype=int) - curr = 0 - for i in h_indices: - for j in w_indices: - crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1]) - curr += 1 - crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate( - [-crop_dims / 2.0, crop_dims / 2.0]) - crops_ix = np.tile(crops_ix, (2, 1)) - - # Extract crops - crops = np.empty( - (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]), - dtype=np.float32) - ix = 0 - for im in img: - for crop in crops_ix: - crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :] - ix += 1 - crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors - return crops - - -class ImageTransformer: - def __init__(self, - transpose=None, - channel_swap=None, - mean=None, - is_color=True): - self.transpose = transpose - self.channel_swap = None - self.mean = None - self.is_color = is_color - - def set_transpose(self, order): - if self.is_color: - assert 3 == len(order) - self.transpose = order - - def set_channel_swap(self, order): - if self.is_color: - assert 3 == len(order) - self.channel_swap = order - - def set_mean(self, mean): - # mean value, may be one value per channel - if mean.ndim == 1: - mean = mean[:, np.newaxis, np.newaxis] - else: - # elementwise mean - if self.is_color: - assert len(mean.shape) == 3 - self.mean = mean - - def transformer(self, data): - if self.transpose is not None: - data = data.transpose(self.transpose) - if self.channel_swap is not None: - data = data[self.channel_swap, :, :] - if self.mean is not None: - data -= self.mean - return data diff --git a/demo/image_classification/predict.sh b/demo/image_classification/predict.sh deleted file mode 100755 index 9d5785c9a1a4dac12f7940fa708b1a79c6ec8a93..0000000000000000000000000000000000000000 --- a/demo/image_classification/predict.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -model=cifar_vgg_model/pass-00299/ -image=data/cifar-out/test/airplane/seaplane_s_000978.png -use_gpu=1 -python prediction.py $model $image $use_gpu diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py deleted file mode 100755 index 49c0ff600c40e0222093ff0a8a2f7e8e38ccba29..0000000000000000000000000000000000000000 --- a/demo/image_classification/prediction.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, sys -import numpy as np -import logging -from PIL import Image -from optparse import OptionParser - -import paddle.utils.image_util as image_util - -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import dense_vector -from paddle.trainer.config_parser import parse_config - -logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') -logging.getLogger().setLevel(logging.INFO) - - -class ImageClassifier(): - def __init__(self, - train_conf, - use_gpu=True, - model_dir=None, - resize_dim=None, - crop_dim=None, - mean_file=None, - oversample=False, - is_color=True): - """ - train_conf: network configure. - model_dir: string, directory of model. - resize_dim: int, resized image size. - crop_dim: int, crop size. - mean_file: string, image mean file. - oversample: bool, oversample means multiple crops, namely five - patches (the four corner patches and the center - patch) as well as their horizontal reflections, - ten crops in all. - """ - self.train_conf = train_conf - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.resize_dim = resize_dim - self.crop_dims = [crop_dim, crop_dim] - self.oversample = oversample - self.is_color = is_color - - self.transformer = image_util.ImageTransformer(is_color=is_color) - self.transformer.set_transpose((2, 0, 1)) - - self.mean_file = mean_file - mean = np.load(self.mean_file)['data_mean'] - mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) - self.transformer.set_mean(mean) # mean pixel - gpu = 1 if use_gpu else 0 - conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu) - conf = parse_config(train_conf, conf_args) - swig_paddle.initPaddle("--use_gpu=%d" % (gpu)) - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(self.network, swig_paddle.GradientMachine) - self.network.loadParameters(self.model_dir) - - data_size = 3 * self.crop_dims[0] * self.crop_dims[1] - slots = [dense_vector(data_size)] - self.converter = DataProviderConverter(slots) - - def get_data(self, img_path): - """ - 1. load image from img_path. - 2. resize or oversampling. - 3. transformer data: transpose, sub mean. - return K x H x W ndarray. - img_path: image path. - """ - image = image_util.load_image(img_path, self.is_color) - if self.oversample: - # image_util.resize_image: short side is self.resize_dim - image = image_util.resize_image(image, self.resize_dim) - image = np.array(image) - input = np.zeros( - (1, image.shape[0], image.shape[1], 3), dtype=np.float32) - input[0] = image.astype(np.float32) - input = image_util.oversample(input, self.crop_dims) - else: - image = image.resize(self.crop_dims, Image.ANTIALIAS) - input = np.zeros( - (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32) - input[0] = np.array(image).astype(np.float32) - - data_in = [] - for img in input: - img = self.transformer.transformer(img).flatten() - data_in.append([img.tolist()]) - return data_in - - def forward(self, input_data): - in_arg = self.converter(input_data) - return self.network.forwardTest(in_arg) - - def forward(self, data, output_layer): - """ - input_data: py_paddle input data. - output_layer: specify the name of probability, namely the layer with - softmax activation. - return: the predicting probability of each label. - """ - input = self.converter(data) - self.network.forwardTest(input) - output = self.network.getLayerOutputs(output_layer) - # For oversampling, average predictions across crops. - # If not, the shape of output[name]: (1, class_number), - # the mean is also applicable. - return output[output_layer]['value'].mean(0) - - def predict(self, image=None, output_layer=None): - assert isinstance(image, basestring) - assert isinstance(output_layer, basestring) - data = self.get_data(image) - prob = self.forward(data, output_layer) - lab = np.argsort(-prob) - logging.info("Label of %s is: %d", image, lab[0]) - - -if __name__ == '__main__': - image_size = 32 - crop_size = 32 - multi_crop = True - config = "vgg_16_cifar.py" - output_layer = "__fc_layer_1__" - mean_path = "data/cifar-out/batches/batches.meta" - model_path = sys.argv[1] - image = sys.argv[2] - use_gpu = bool(int(sys.argv[3])) - - obj = ImageClassifier( - train_conf=config, - model_dir=model_path, - resize_dim=image_size, - crop_dim=crop_size, - mean_file=mean_path, - use_gpu=use_gpu, - oversample=multi_crop) - obj.predict(image, output_layer) diff --git a/demo/image_classification/preprocess.py b/demo/image_classification/preprocess.py deleted file mode 100755 index 2947ad239c36f9a02ed67ccf5906380cb70e37ce..0000000000000000000000000000000000000000 --- a/demo/image_classification/preprocess.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.utils.preprocess_img import ImageClassificationDatasetCreater -from optparse import OptionParser - - -def option_parser(): - parser = OptionParser(usage="usage: python preprcoess.py "\ - "-i data_dir [options]") - parser.add_option( - "-i", - "--input", - action="store", - dest="input", - help="Input data directory.") - parser.add_option( - "-s", - "--size", - action="store", - dest="size", - help="Processed image size.") - parser.add_option( - "-c", - "--color", - action="store", - dest="color", - help="whether to use color images.") - return parser.parse_args() - - -if __name__ == '__main__': - options, args = option_parser() - data_dir = options.input - processed_image_size = int(options.size) - color = options.color == "1" - data_creator = ImageClassificationDatasetCreater( - data_dir, processed_image_size, color) - data_creator.train_list_name = "train.txt" - data_creator.test_list_name = "test.txt" - data_creator.num_per_batch = 1000 - data_creator.overwrite = True - data_creator.create_batches() diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh deleted file mode 100755 index c7396c6393599ef3f2c55089eb05f2435b2b4b82..0000000000000000000000000000000000000000 --- a/demo/image_classification/preprocess.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -data_dir=./data/cifar-out - -python preprocess.py -i $data_dir -s 32 -c 1 - -echo "data/cifar-out/batches/train.txt" > train.list -echo "data/cifar-out/batches/test.txt" > test.list diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh deleted file mode 100755 index e45bd47ad5925c6674d628a70a7ad7c4d5d5c173..0000000000000000000000000000000000000000 --- a/demo/image_classification/train.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -config=vgg_16_cifar.py -output=./cifar_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=1 \ ---trainer_count=1 \ ---num_passes=300 \ ---save_dir=$output \ -2>&1 | tee $log -paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1 - -python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/demo/image_classification/vgg_16_cifar.py b/demo/image_classification/vgg_16_cifar.py deleted file mode 100755 index 8ee4a64c15f885023a6e19812885b4f76bb12af9..0000000000000000000000000000000000000000 --- a/demo/image_classification/vgg_16_cifar.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) - -####################Data Configuration ################## -if not is_predict: - data_dir = 'data/cifar-out/batches/' - meta_path = data_dir + 'batches.meta' - - args = { - 'meta': meta_path, - 'mean_img_size': 32, - 'img_size': 32, - 'num_classes': 10, - 'use_jpeg': 1, - 'color': "color" - } - - define_py_data_sources2( - train_list="train.list", - test_list="train.list", - module='image_provider', - obj='processData', - args=args) - -######################Algorithm Configuration ############# -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128)) - -#######################Network Configuration ############# -data_size = 3 * 32 * 32 -label_size = 10 -img = data_layer(name='image', size=data_size) -# small_vgg is predefined in trainer_config_helpers.networks -predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size) - -if not is_predict: - lbl = data_layer(name="label", size=label_size) - outputs(classification_cost(input=predict, label=lbl)) -else: - outputs(predict) diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore deleted file mode 100644 index c54f3f9480ce4ceefda98f77a812ec2d6cd4a5e3..0000000000000000000000000000000000000000 --- a/demo/introduction/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -dataprovider.pyc -empty.list -train.log -output -train.list diff --git a/demo/introduction/README.md b/demo/introduction/README.md deleted file mode 100644 index 0614a7afe645677ef0b65a17ea05f1dcfa45214f..0000000000000000000000000000000000000000 --- a/demo/introduction/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This folder contains scripts used in PaddlePaddle introduction. -- use `bash train.sh` to train a simple linear regression model -- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3]. diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py deleted file mode 100644 index 1ba971b3688ce3dec078998df2c0b183a4e449f8..0000000000000000000000000000000000000000 --- a/demo/introduction/api_train_v2.py +++ /dev/null @@ -1,58 +0,0 @@ -import paddle.v2 as paddle -import paddle.v2.dataset.uci_housing as uci_housing - - -def main(): - # init - paddle.init(use_gpu=False, trainer_count=1) - - # network config - x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) - y_predict = paddle.layer.fc(input=x, - param_attr=paddle.attr.Param(name='w'), - size=1, - act=paddle.activation.Linear(), - bias_attr=paddle.attr.Param(name='b')) - y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) - cost = paddle.layer.mse_cost(input=y_predict, label=y) - - # create parameters - parameters = paddle.parameters.create(cost) - - # create optimizer - optimizer = paddle.optimizer.Momentum(momentum=0) - - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer) - - # event_handler to print training and testing info - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) - - if isinstance(event, paddle.event.EndPass): - if (event.pass_id + 1) % 10 == 0: - result = trainer.test( - reader=paddle.batch( - uci_housing.test(), batch_size=2), - feeding={'x': 0, - 'y': 1}) - print "Test %d, %.2f" % (event.pass_id, result.cost) - - # training - trainer.train( - reader=paddle.batch( - paddle.reader.shuffle( - uci_housing.train(), buf_size=500), - batch_size=2), - feeding={'x': 0, - 'y': 1}, - event_handler=event_handler, - num_passes=30) - - -if __name__ == '__main__': - main() diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py deleted file mode 100644 index 5b48aad0408800676ae7da16eba2dcbb2124f25f..0000000000000000000000000000000000000000 --- a/demo/introduction/dataprovider.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import random - - -# define data types of input: 2 real numbers -@provider( - input_types={'x': dense_vector(1), - 'y': dense_vector(1)}, use_seq=False) -def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield {'x': [x], 'y': [2 * x + 0.3]} diff --git a/demo/introduction/evaluate_model.py b/demo/introduction/evaluate_model.py deleted file mode 100755 index eeda43c5c86f3e49f758bf55b16a68387e64238c..0000000000000000000000000000000000000000 --- a/demo/introduction/evaluate_model.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Print model parameters in last model - -Usage: - python evaluate_model.py -""" -import numpy as np -import os - - -def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - -def main(): - print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'), - load('output/pass-00029/b')) - - -if __name__ == '__main__': - main() diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh deleted file mode 100755 index 2ce6446d7c943ffc9bea8da43d153539f6f9f15f..0000000000000000000000000000000000000000 --- a/demo/introduction/train.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -paddle train \ - --config=trainer_config.py \ - --save_dir=./output \ - --num_passes=30 \ - 2>&1 |tee 'train.log' -paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1 diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py deleted file mode 100644 index 651dfaa4b7b4873810a0b393655541a62d1a311b..0000000000000000000000000000000000000000 --- a/demo/introduction/trainer_config.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -# 1. read data. Suppose you saved above python code as dataprovider.py -define_py_data_sources2( - train_list=['no_matter.txt'], - test_list=None, - module='dataprovider', - obj='process', - args={}) - -# 2. learning algorithm -settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - -# 3. Network configuration -x = data_layer(name='x', size=1) -y = data_layer(name='y', size=1) -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) -cost = mse_cost(input=y_predict, label=y) -outputs(cost) diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py deleted file mode 100644 index 6b95a88042a13a280bcb80f753b3887fcef37296..0000000000000000000000000000000000000000 --- a/demo/mnist/api_train_v2.py +++ /dev/null @@ -1,137 +0,0 @@ -import paddle.v2 as paddle -import gzip - - -def softmax_regression(img): - predict = paddle.layer.fc(input=img, - size=10, - act=paddle.activation.Softmax()) - return predict - - -def multilayer_perceptron(img): - # The first fully-connected layer - hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) - # The second fully-connected layer and the according activation function - hidden2 = paddle.layer.fc(input=hidden1, - size=64, - act=paddle.activation.Relu()) - # The thrid fully-connected layer, note that the hidden size should be 10, - # which is the number of unique digits - predict = paddle.layer.fc(input=hidden2, - size=10, - act=paddle.activation.Softmax()) - return predict - - -def convolutional_neural_network(img): - # first conv layer - conv_pool_1 = paddle.networks.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - num_channel=1, - pool_size=2, - pool_stride=2, - act=paddle.activation.Tanh()) - # second conv layer - conv_pool_2 = paddle.networks.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - num_channel=20, - pool_size=2, - pool_stride=2, - act=paddle.activation.Tanh()) - # The first fully-connected layer - fc1 = paddle.layer.fc(input=conv_pool_2, - size=128, - act=paddle.activation.Tanh()) - # The softmax layer, note that the hidden size should be 10, - # which is the number of unique digits - predict = paddle.layer.fc(input=fc1, - size=10, - act=paddle.activation.Softmax()) - return predict - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - - # define network topology - images = paddle.layer.data( - name='pixel', type=paddle.data_type.dense_vector(784)) - label = paddle.layer.data( - name='label', type=paddle.data_type.integer_value(10)) - - # Here we can build the prediction network in different ways. Please - # choose one by uncomment corresponding line. - predict = softmax_regression(images) - #predict = multilayer_perceptron(images) - #predict = convolutional_neural_network(images) - - cost = paddle.layer.classification_cost(input=predict, label=label) - - try: - with gzip.open('params.tar.gz', 'r') as f: - parameters = paddle.parameters.Parameters.from_tar(f) - except IOError: - parameters = paddle.parameters.create(cost) - - optimizer = paddle.optimizer.Momentum( - learning_rate=0.1 / 128.0, - momentum=0.9, - regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) - - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer) - - lists = [] - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 1000 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - - with gzip.open('params.tar.gz', 'w') as f: - parameters.to_tar(f) - - elif isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=paddle.batch( - paddle.dataset.mnist.test(), batch_size=128)) - print "Test with Pass %d, Cost %f, %s\n" % ( - event.pass_id, result.cost, result.metrics) - lists.append((event.pass_id, result.cost, - result.metrics['classification_error_evaluator'])) - - trainer.train( - reader=paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=128), - event_handler=event_handler, - num_passes=100) - - # find the best pass - best = sorted(lists, key=lambda list: float(list[1]))[0] - print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) - print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) - - test_creator = paddle.dataset.mnist.test() - test_data = [] - for item in test_creator(): - test_data.append((item[0], )) - if len(test_data) == 100: - break - - # output is a softmax layer. It returns probabilities. - # Shape should be (100, 10) - probs = paddle.infer( - output_layer=predict, parameters=parameters, input=test_data) - print probs.shape - - -if __name__ == '__main__': - main() diff --git a/demo/recommendation/.gitignore b/demo/recommendation/.gitignore deleted file mode 100644 index fd27ef62a87cae51f2392c0eba50a44490d029af..0000000000000000000000000000000000000000 --- a/demo/recommendation/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -log.txt -data/meta.bin -data/ml-1m -data/ratings.dat.train -data/ratings.dat.test -data/train.list -data/test.list -dataprovider_copy_1.py -*.pyc -output diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py deleted file mode 100644 index f6a061799e3ac50236a68beedaf700dd6c698a05..0000000000000000000000000000000000000000 --- a/demo/recommendation/api_train_v2.py +++ /dev/null @@ -1,125 +0,0 @@ -import paddle.v2 as paddle -import cPickle -import copy - - -def main(): - paddle.init(use_gpu=False) - movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() - uid = paddle.layer.data( - name='user_id', - type=paddle.data_type.integer_value( - paddle.dataset.movielens.max_user_id() + 1)) - usr_emb = paddle.layer.embedding(input=uid, size=32) - - usr_gender_id = paddle.layer.data( - name='gender_id', type=paddle.data_type.integer_value(2)) - usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) - - usr_age_id = paddle.layer.data( - name='age_id', - type=paddle.data_type.integer_value( - len(paddle.dataset.movielens.age_table))) - usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) - - usr_job_id = paddle.layer.data( - name='job_id', - type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id( - ) + 1)) - - usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) - - usr_combined_features = paddle.layer.fc( - input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb], - size=200, - act=paddle.activation.Tanh()) - - mov_id = paddle.layer.data( - name='movie_id', - type=paddle.data_type.integer_value( - paddle.dataset.movielens.max_movie_id() + 1)) - mov_emb = paddle.layer.embedding(input=mov_id, size=32) - - mov_categories = paddle.layer.data( - name='category_id', - type=paddle.data_type.sparse_binary_vector( - len(paddle.dataset.movielens.movie_categories()))) - - mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) - - mov_title_id = paddle.layer.data( - name='movie_title', - type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) - mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) - mov_title_conv = paddle.networks.sequence_conv_pool( - input=mov_title_emb, hidden_size=32, context_len=3) - - mov_combined_features = paddle.layer.fc( - input=[mov_emb, mov_categories_hidden, mov_title_conv], - size=200, - act=paddle.activation.Tanh()) - - inference = paddle.layer.cos_sim( - a=usr_combined_features, b=mov_combined_features, size=1, scale=5) - cost = paddle.layer.mse_cost( - input=inference, - label=paddle.layer.data( - name='score', type=paddle.data_type.dense_vector(1))) - - parameters = paddle.parameters.create(cost) - - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=paddle.optimizer.Adam( - learning_rate=1e-4)) - feeding = { - 'user_id': 0, - 'gender_id': 1, - 'age_id': 2, - 'job_id': 3, - 'movie_id': 4, - 'category_id': 5, - 'movie_title': 6, - 'score': 7 - } - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d Batch %d Cost %.2f" % ( - event.pass_id, event.batch_id, event.cost) - - trainer.train( - reader=paddle.batch( - paddle.reader.shuffle( - paddle.dataset.movielens.train(), buf_size=8192), - batch_size=256), - event_handler=event_handler, - feeding=feeding, - num_passes=1) - - user_id = 234 - movie_id = 345 - - user = paddle.dataset.movielens.user_info()[user_id] - movie = paddle.dataset.movielens.movie_info()[movie_id] - - feature = user.value() + movie.value() - - def reader(): - yield feature - - infer_dict = copy.copy(feeding) - del infer_dict['score'] - - prediction = paddle.infer( - output=inference, - parameters=parameters, - reader=paddle.batch( - reader, batch_size=32), - feeding=infer_dict) - print(prediction + 5) / 2 - - -if __name__ == '__main__': - main() diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py deleted file mode 100755 index c20c65286621d701ad58409b539bbe9c813d453a..0000000000000000000000000000000000000000 --- a/demo/recommendation/common_utils.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer.PyDataProvider2 import * - - -def meta_to_header(meta, name): - metas = meta[name]['__meta__']['raw_meta'] - for each_meta in metas: - slot_name = each_meta.get('name', '%s_id' % name) - if each_meta['type'] == 'id': - yield slot_name, integer_value(each_meta['max']) - elif each_meta['type'] == 'embedding': - is_seq = each_meta['seq'] == 'sequence' - yield slot_name, integer_value( - len(each_meta['dict']), - seq_type=SequenceType.SEQUENCE - if is_seq else SequenceType.NO_SEQUENCE) - elif each_meta['type'] == 'one_hot_dense': - yield slot_name, dense_vector(len(each_meta['dict'])) diff --git a/demo/recommendation/data/config.json b/demo/recommendation/data/config.json deleted file mode 100644 index f26e74ce47bb7843a571e6033f051c046b31f054..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/config.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "user": { - "file": { - "name": "users.dat", - "delimiter": "::" - }, - "fields": ["id", "gender", "age", "occupation"] - }, - "movie": { - "file": { - "name": "movies.dat", - "delimiter": "::" - }, - "fields": ["id", "title", "genres"] - } -} diff --git a/demo/recommendation/data/config_generator.py b/demo/recommendation/data/config_generator.py deleted file mode 100644 index 4ca496a252dffc62ed62bb8f2a5ee1661a940580..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/config_generator.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -config_generator.py - -Usage: - ./config_generator.py [--output_format=] - ./config_generator.py -h | --help - -Options: - -h --help Show this screen. - --output_format= Output Config format(json or yaml) [default: json]. -""" - -import json -import docopt -import copy - -DEFAULT_FILE = {"type": "split", "delimiter": ","} - -DEFAULT_FIELD = { - "id": { - "type": "id" - }, - "gender": { - "name": "gender", - "type": "embedding", - "dict": { - "type": "char_based" - } - }, - "age": { - "name": "age", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": True - } - }, - "occupation": { - "name": "occupation", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": "true" - } - }, - "title": { - "regex": { - "pattern": r"^(.*)\((\d+)\)$", - "group_id": 1, - "strip": True - }, - "name": "title", - "type": { - "name": "embedding", - "seq_type": "sequence", - }, - "dict": { - "type": "char_based" - } - }, - "genres": { - "type": "one_hot_dense", - "dict": { - "type": "split", - "delimiter": "|" - }, - "name": "genres" - } -} - - -def merge_dict(master_dict, slave_dict): - return dict(((k, master_dict.get(k) or slave_dict.get(k)) - for k in set(slave_dict) | set(master_dict))) - - -def main(filename, fmt): - with open(filename, 'r') as f: - conf = json.load(f) - obj = dict() - for k in conf: - val = conf[k] - file_dict = val['file'] - file_dict = merge_dict(file_dict, DEFAULT_FILE) - - fields = [] - for pos, field_key in enumerate(val['fields']): - assert isinstance(field_key, basestring) - field = copy.deepcopy(DEFAULT_FIELD[field_key]) - field['pos'] = pos - fields.append(field) - obj[k] = {"file": file_dict, "fields": fields} - meta = {"meta": obj} - # print meta - if fmt == 'json': - - def formatter(x): - import json - return json.dumps(x, indent=2) - elif fmt == 'yaml': - - def formatter(x): - import yaml - return yaml.safe_dump(x, default_flow_style=False) - else: - raise NotImplementedError("Dump format %s is not implemented" % fmt) - - print formatter(meta) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version="0.1.0") - main(args[""], args["--output_format"]) diff --git a/demo/recommendation/data/meta_config.json b/demo/recommendation/data/meta_config.json deleted file mode 100644 index cc6a046e271dd0faaa47eeb5a5bef6d3604113fe..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/meta_config.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "meta": { - "movie": { - "fields": [ - { - "type": "id", - "pos": 0 - }, - { - "regex": { - "pattern": "^(.*)\\((\\d+)\\)$", - "group_id": 1, - "strip": true - }, - "type": { - "seq_type": "sequence", - "name": "embedding" - }, - "dict": { - "type": "char_based" - }, - "name": "title", - "pos": 1 - }, - { - "type": "one_hot_dense", - "dict": { - "delimiter": "|", - "type": "split" - }, - "name": "genres", - "pos": 2 - } - ], - "file": { - "delimiter": "::", - "type": "split", - "name": "movies.dat" - } - }, - "user": { - "fields": [ - { - "type": "id", - "pos": 0 - }, - { - "type": "embedding", - "dict": { - "type": "char_based" - }, - "name": "gender", - "pos": 1 - }, - { - "type": "embedding", - "dict": { - "sort": true, - "type": "whole_content" - }, - "name": "age", - "pos": 2 - }, - { - "type": "embedding", - "dict": { - "sort": "true", - "type": "whole_content" - }, - "name": "occupation", - "pos": 3 - } - ], - "file": { - "delimiter": "::", - "type": "split", - "name": "users.dat" - } - } - } -} diff --git a/demo/recommendation/data/meta_generator.py b/demo/recommendation/data/meta_generator.py deleted file mode 100644 index 38e4679d266c331a751114cd13f0e3453016cf26..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/meta_generator.py +++ /dev/null @@ -1,430 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Preprocess Movielens dataset, to get movie/user object. - -Usage: - ./preprocess.py [--config=] - ./preprocess.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --config= Get MetaData config file [default: config.json]. -""" -import docopt -import os -import sys -import re -import collections - -try: - import cPickle as pickle -except ImportError: - import pickle - - -class UniqueIDGenerator(object): - def __init__(self): - self.pool = collections.defaultdict(self.__next_id__) - self.next_id = 0 - - def __next_id__(self): - tmp = self.next_id - self.next_id += 1 - return tmp - - def __call__(self, k): - return self.pool[k] - - def to_list(self): - ret_val = [None] * len(self.pool) - for k in self.pool.keys(): - ret_val[self.pool[k]] = k - return ret_val - - -class SortedIDGenerator(object): - def __init__(self): - self.__key_set__ = set() - self.dict = None - - def scan(self, key): - self.__key_set__.add(key) - - def finish_scan(self, compare=None, key=None, reverse=False): - self.__key_set__ = sorted( - list(self.__key_set__), cmp=compare, key=key, reverse=reverse) - self.dict = dict() - for idx, each_key in enumerate(self.__key_set__): - self.dict[each_key] = idx - - def __call__(self, key): - return self.dict[key] - - def to_list(self): - return self.__key_set__ - - -class SplitFileReader(object): - def __init__(self, work_dir, config): - assert isinstance(config, dict) - self.filename = config['name'] - self.delimiter = config.get('delimiter', ',') - self.work_dir = work_dir - - def read(self): - with open(os.path.join(self.work_dir, self.filename), 'r') as f: - for line in f: - line = line.strip() - if isinstance(self.delimiter, unicode): - self.delimiter = str(self.delimiter) - yield line.split(self.delimiter) - - @staticmethod - def create(work_dir, config): - assert isinstance(config, dict) - if config['type'] == 'split': - return SplitFileReader(work_dir, config) - - -class IFileReader(object): - READERS = [SplitFileReader] - - def read(self): - raise NotImplementedError() - - @staticmethod - def create(work_dir, config): - for reader_cls in IFileReader.READERS: - val = reader_cls.create(work_dir, config) - if val is not None: - return val - - -class IDFieldParser(object): - TYPE = 'id' - - def __init__(self, config): - self.__max_id__ = -sys.maxint - 1 - self.__min_id__ = sys.maxint - self.__id_count__ = 0 - - def scan(self, line): - idx = int(line) - self.__max_id__ = max(self.__max_id__, idx) - self.__min_id__ = min(self.__min_id__, idx) - self.__id_count__ += 1 - - def parse(self, line): - return int(line) - - def meta_field(self): - return { - "is_key": True, - 'max': self.__max_id__, - 'min': self.__min_id__, - 'count': self.__id_count__, - 'type': 'id' - } - - -class SplitEmbeddingDict(object): - def __init__(self, delimiter): - self.__id__ = UniqueIDGenerator() - self.delimiter = delimiter - - def scan(self, multi): - for val in multi.split(self.delimiter): - self.__id__(val) - - def parse(self, multi): - return map(self.__id__, multi.split(self.delimiter)) - - def meta_field(self): - return self.__id__.to_list() - - -class EmbeddingFieldParser(object): - TYPE = 'embedding' - - NO_SEQUENCE = "no_sequence" - SEQUENCE = "sequence" - - class CharBasedEmbeddingDict(object): - def __init__(self, is_seq=True): - self.__id__ = UniqueIDGenerator() - self.is_seq = is_seq - - def scan(self, s): - for ch in s: - self.__id__(ch) - - def parse(self, s): - return map(self.__id__, s) if self.is_seq else self.__id__(s[0]) - - def meta_field(self): - return self.__id__.to_list() - - class WholeContentDict(object): - def __init__(self, need_sort=True): - assert need_sort - self.__id__ = SortedIDGenerator() - self.__has_finished__ = False - - def scan(self, txt): - self.__id__.scan(txt) - - def meta_field(self): - if not self.__has_finished__: - self.__id__.finish_scan() - self.__has_finished__ = True - return self.__id__.to_list() - - def parse(self, txt): - return self.__id__(txt) - - def __init__(self, config): - try: - self.seq_type = config['type']['seq_type'] - except TypeError: - self.seq_type = EmbeddingFieldParser.NO_SEQUENCE - - if config['dict']['type'] == 'char_based': - self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict( - self.seq_type == EmbeddingFieldParser.SEQUENCE) - elif config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ',')) - elif config['dict']['type'] == 'whole_content': - self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][ - 'sort']) - else: - print config - assert False - - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - return { - 'name': self.name, - 'dict': self.dict.meta_field(), - 'type': 'embedding', - 'seq': self.seq_type - } - - def parse(self, s): - return self.dict.parse(s) - - -class OneHotDenseFieldParser(object): - TYPE = 'one_hot_dense' - - def __init__(self, config): - if config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict']['delimiter']) - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - # print self.dict.meta_field() - return { - 'dict': self.dict.meta_field(), - 'name': self.name, - 'type': 'one_hot_dense' - } - - def parse(self, s): - ids = self.dict.parse(s) - retv = [0.0] * len(self.dict.meta_field()) - for idx in ids: - retv[idx] = 1.0 - # print retv - return retv - - -class FieldParserFactory(object): - PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser] - - @staticmethod - def create(config): - if isinstance(config['type'], basestring): - config_type = config['type'] - elif isinstance(config['type'], dict): - config_type = config['type']['name'] - - assert config_type is not None - - for each_parser_cls in FieldParserFactory.PARSERS: - if config_type == each_parser_cls.TYPE: - return each_parser_cls(config) - print config - - -class CompositeFieldParser(object): - def __init__(self, parser, extractor): - self.extractor = extractor - self.parser = parser - - def scan(self, *args, **kwargs): - self.parser.scan(self.extractor.extract(*args, **kwargs)) - - def parse(self, *args, **kwargs): - return self.parser.parse(self.extractor.extract(*args, **kwargs)) - - def meta_field(self): - return self.parser.meta_field() - - -class PositionContentExtractor(object): - def __init__(self, pos): - self.pos = pos - - def extract(self, line): - assert isinstance(line, list) - return line[self.pos] - - -class RegexPositionContentExtractor(PositionContentExtractor): - def __init__(self, pos, pattern, group_id, strip=True): - PositionContentExtractor.__init__(self, pos) - pattern = pattern.strip() - self.pattern = re.compile(pattern) - self.group_id = group_id - self.strip = strip - - def extract(self, line): - line = PositionContentExtractor.extract(self, line) - match = self.pattern.match(line) - # print line, self.pattern.pattern, match - assert match is not None - txt = match.group(self.group_id) - if self.strip: - txt.strip() - return txt - - -class ContentExtractorFactory(object): - def extract(self, line): - pass - - @staticmethod - def create(config): - if 'pos' in config: - if 'regex' not in config: - return PositionContentExtractor(config['pos']) - else: - extra_args = config['regex'] - return RegexPositionContentExtractor( - pos=config['pos'], **extra_args) - - -class MetaFile(object): - def __init__(self, work_dir): - self.work_dir = work_dir - self.obj = dict() - - def parse(self, config): - config = config['meta'] - - ret_obj = dict() - for key in config.keys(): - val = config[key] - assert 'file' in val - reader = IFileReader.create(self.work_dir, val['file']) - assert reader is not None - assert 'fields' in val and isinstance(val['fields'], list) - fields_config = val['fields'] - field_parsers = map(MetaFile.__field_config_mapper__, fields_config) - - for each_parser in field_parsers: - assert each_parser is not None - - for each_block in reader.read(): - for each_parser in field_parsers: - each_parser.scan(each_block) - - metas = map(lambda x: x.meta_field(), field_parsers) - # print metas - key_index = filter( - lambda x: x is not None, - map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None, - enumerate(metas)))[0] - - key_map = [] - for i in range(min(key_index, len(metas))): - key_map.append(i) - for i in range(key_index + 1, len(metas)): - key_map.append(i) - - obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}} - - for each_block in reader.read(): - idx = field_parsers[key_index].parse(each_block) - val = [] - for i, each_parser in enumerate(field_parsers): - if i != key_index: - val.append(each_parser.parse(each_block)) - obj[idx] = val - ret_obj[key] = obj - self.obj = ret_obj - return ret_obj - - @staticmethod - def __field_config_mapper__(conf): - assert isinstance(conf, dict) - extrator = ContentExtractorFactory.create(conf) - field_parser = FieldParserFactory.create(conf) - assert extrator is not None - assert field_parser is not None - return CompositeFieldParser(field_parser, extrator) - - def dump(self, fp): - pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL) - - -def preprocess(binary_filename, dataset_dir, config, **kwargs): - assert isinstance(config, str) - with open(config, 'r') as config_file: - file_loader = None - if config.lower().endswith('.yaml'): - import yaml - file_loader = yaml - elif config.lower().endswith('.json'): - import json - file_loader = json - config = file_loader.load(config_file) - meta = MetaFile(dataset_dir) - meta.parse(config) - with open(binary_filename, 'wb') as outf: - meta.dump(outf) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - preprocess(**kwargs) diff --git a/demo/recommendation/data/ml_data.sh b/demo/recommendation/data/ml_data.sh deleted file mode 100755 index 2268d876389e0bdf5ead405e74d278d276626f82..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/ml_data.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -ex -cd "$(dirname "$0")" -# download the dataset -wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -# unzip the dataset -unzip ml-1m.zip -# remove the unused zip file -rm ml-1m.zip diff --git a/demo/recommendation/data/split.py b/demo/recommendation/data/split.py deleted file mode 100644 index be6869c22f04be1db0f8e9c35c73c851e4c490b0..0000000000000000000000000000000000000000 --- a/demo/recommendation/data/split.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Separate movielens 1m dataset to train/test file. - -Usage: - ./separate.py [--test_ratio=] [--delimiter=] - ./separate.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --test_ratio= Test ratio for separate [default: 0.1]. - --delimiter= File delimiter [default: ,]. -""" -import docopt -import collections -import random - - -def process(test_ratio, input_file, delimiter, **kwargs): - test_ratio = float(test_ratio) - rating_dict = collections.defaultdict(list) - with open(input_file, 'r') as f: - for line in f: - user_id = int(line.split(delimiter)[0]) - rating_dict[user_id].append(line.strip()) - - with open(input_file + ".train", 'w') as train_file: - with open(input_file + ".test", 'w') as test_file: - for k in rating_dict.keys(): - lines = rating_dict[k] - assert isinstance(lines, list) - random.shuffle(lines) - test_len = int(len(lines) * test_ratio) - for line in lines[:test_len]: - print >> test_file, line - - for line in lines[test_len:]: - print >> train_file, line - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - process(**kwargs) diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py deleted file mode 100755 index c4ff96d80e81926049c9a71d6d9d991c0b568c25..0000000000000000000000000000000000000000 --- a/demo/recommendation/dataprovider.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import common_utils # parse - - -def __list_to_map__(lst): - ret_val = dict() - for each in lst: - k, v = each - ret_val[k] = v - return ret_val - - -def hook(settings, meta, **kwargs): - """ - Init hook is invoked before process data. It will set obj.slots and store - data meta. - - :param obj: global object. It will passed to process routine. - :type obj: object - :param meta: the meta file object, which passed from trainer_config. Meta - file record movie/user features. - :param kwargs: unused other arguments. - """ - del kwargs # unused kwargs - - # Header define slots that used for paddle. - # first part is movie features. - # second part is user features. - # final part is rating score. - # header is a list of [USE_SEQ_OR_NOT?, SlotType] - movie_headers = list(common_utils.meta_to_header(meta, 'movie')) - settings.movie_names = [h[0] for h in movie_headers] - headers = movie_headers - user_headers = list(common_utils.meta_to_header(meta, 'user')) - settings.user_names = [h[0] for h in user_headers] - headers.extend(user_headers) - headers.append(("rating", dense_vector(1))) # Score - - # slot types. - settings.input_types = __list_to_map__(headers) - settings.meta = meta - - -@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, filename): - with open(filename, 'r') as f: - for line in f: - # Get a rating from file. - user_id, movie_id, score = map(int, line.split('::')[:-1]) - - # Scale score to [-5, +5] - score = float(score) * 2 - 5.0 - - # Get movie/user features by movie_id, user_id - movie_meta = settings.meta['movie'][movie_id] - user_meta = settings.meta['user'][user_id] - - outputs = [('movie_id', movie_id - 1)] - - # Then add movie features - for i, each_meta in enumerate(movie_meta): - outputs.append((settings.movie_names[i + 1], each_meta)) - - # Then add user id. - outputs.append(('user_id', user_id - 1)) - - # Then add user features. - for i, each_meta in enumerate(user_meta): - outputs.append((settings.user_names[i + 1], each_meta)) - - # Finally, add score - outputs.append(('rating', [score])) - # Return data to paddle - yield __list_to_map__(outputs) diff --git a/demo/recommendation/evaluate.py b/demo/recommendation/evaluate.py deleted file mode 100755 index 3afa7a1e9db5fefb1bbf5aaa174b8168afae4058..0000000000000000000000000000000000000000 --- a/demo/recommendation/evaluate.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import re -import math - - -def get_best_pass(log_filename): - with open(log_filename, 'r') as f: - text = f.read() - pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)', - re.S) - results = re.findall(pattern, text) - sorted_results = sorted(results, key=lambda result: float(result[0])) - return sorted_results[0] - - -log_filename = sys.argv[1] -log = get_best_pass(log_filename) -predict_error = math.sqrt(float(log[0])) / 2 -print 'Best pass is %s, error is %s, which means predict get error as %f' % ( - log[1], log[0], predict_error) - -evaluate_pass = "output/pass-%s" % log[1] -print "evaluating from pass %s" % evaluate_pass diff --git a/demo/recommendation/evaluate.sh b/demo/recommendation/evaluate.sh deleted file mode 100755 index 02b2857de028bc9c05d7ddd67012043b671b2764..0000000000000000000000000000000000000000 --- a/demo/recommendation/evaluate.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | sort | head -n 1 -} - -LOG=`get_best_pass log.txt` -LOG=(${LOG}) -echo 'Best pass is '${LOG[1]}, ' error is '${LOG[0]}, 'which means predict get error as '`echo ${LOG[0]} | python -c 'import math; print math.sqrt(float(raw_input()))/2'` - -evaluate_pass="output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py deleted file mode 100755 index 8ad993eab3a9f637cfff752bfedbbc62eaf3c8d5..0000000000000000000000000000000000000000 --- a/demo/recommendation/prediction.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from py_paddle import swig_paddle, DataProviderConverter - -from common_utils import * -from paddle.trainer.config_parser import parse_config - -try: - import cPickle as pickle -except ImportError: - import pickle -import sys - -if __name__ == '__main__': - model_path = sys.argv[1] - swig_paddle.initPaddle('--use_gpu=0') - conf = parse_config("trainer_config.py", "is_predict=1") - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(network, swig_paddle.GradientMachine) - network.loadParameters(model_path) - with open('./data/meta.bin', 'rb') as f: - meta = pickle.load(f) - headers = [h[1] for h in meta_to_header(meta, 'movie')] - headers.extend([h[1] for h in meta_to_header(meta, 'user')]) - cvt = DataProviderConverter(headers) - while True: - movie_id = int(raw_input("Input movie_id: ")) - user_id = int(raw_input("Input user_id: ")) - movie_meta = meta['movie'][movie_id] # Query Data From Meta. - user_meta = meta['user'][user_id] - data = [movie_id - 1] - data.extend(movie_meta) - data.append(user_id - 1) - data.extend(user_meta) - print "Prediction Score is %.2f" % ( - (network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5) - / 2) diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh deleted file mode 100755 index eeb81ce3cb47e65c0aeb303e7571024ba82dad65..0000000000000000000000000000000000000000 --- a/demo/recommendation/preprocess.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -UNAME_STR=`uname` - -if [[ ${UNAME_STR} == 'Linux' ]]; then - SHUF_PROG='shuf' -else - SHUF_PROG='gshuf' -fi - - -cd "$(dirname "$0")" -delimiter='::' -dir=ml-1m -cd data -echo 'generate meta config file' -python config_generator.py config.json > meta_config.json -echo 'generate meta file' -python meta_generator.py $dir meta.bin --config=meta_config.json -echo 'split train/test file' -python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1 -echo 'shuffle train file' -${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train -cp $dir/ratings.dat.test . -echo "./data/ratings.dat.train" > train.list -echo "./data/ratings.dat.test" > test.list diff --git a/demo/recommendation/requirements.txt b/demo/recommendation/requirements.txt deleted file mode 100644 index 1ea154584a428b6a389309f1f8def502e0aadfce..0000000000000000000000000000000000000000 --- a/demo/recommendation/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -PyYAML -docopt diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh deleted file mode 100755 index 22aef556082ba429e9ca7c6dd3ec72699b9dbcf4..0000000000000000000000000000000000000000 --- a/demo/recommendation/run.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -paddle train \ - --config=trainer_config.py \ - --save_dir=./output \ - --use_gpu=false \ - --trainer_count=4\ - --test_all_data_in_one_period=true \ - --log_period=100 \ - --dot_period=1 \ - --num_passes=50 2>&1 | tee 'log.txt' -paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1 diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py deleted file mode 100755 index 25f529d7d7c430f179107fb189ade34760ab309d..0000000000000000000000000000000000000000 --- a/demo/recommendation/trainer_config.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -try: - import cPickle as pickle -except ImportError: - import pickle - -is_predict = get_config_arg('is_predict', bool, False) - -META_FILE = 'data/meta.bin' - -with open(META_FILE, 'rb') as f: - # load meta file - meta = pickle.load(f) - -settings( - batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer()) - - -def construct_feature(name): - """ - Construct movie/user features. - - This method read from meta data. Then convert feature to neural network due - to feature type. The map relation as follow. - - * id: embedding => fc - * embedding: - is_sequence: embedding => context_projection => fc => pool - not sequence: embedding => fc - * one_hot_dense: fc => fc - - Then gather all features vector, and use a fc layer to combined them as - return. - - :param name: 'movie' or 'user' - :type name: basestring - :return: combined feature output - :rtype: LayerOutput - """ - __meta__ = meta[name]['__meta__']['raw_meta'] - fusion = [] - for each_meta in __meta__: - type_name = each_meta['type'] - slot_name = each_meta.get('name', '%s_id' % name) - if type_name == 'id': - slot_dim = each_meta['max'] - embedding = embedding_layer( - input=data_layer( - slot_name, size=slot_dim), size=256) - fusion.append(fc_layer(input=embedding, size=256)) - elif type_name == 'embedding': - is_seq = each_meta['seq'] == 'sequence' - slot_dim = len(each_meta['dict']) - din = data_layer(slot_name, slot_dim) - embedding = embedding_layer(input=din, size=256) - if is_seq: - fusion.append( - text_conv_pool( - input=embedding, context_len=5, hidden_size=256)) - else: - fusion.append(fc_layer(input=embedding, size=256)) - elif type_name == 'one_hot_dense': - slot_dim = len(each_meta['dict']) - hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256) - fusion.append(fc_layer(input=hidden, size=256)) - - return fc_layer(name="%s_fusion" % name, input=fusion, size=256) - - -movie_feature = construct_feature("movie") -user_feature = construct_feature("user") -similarity = cos_sim(a=movie_feature, b=user_feature) -if not is_predict: - outputs(mse_cost(input=similarity, label=data_layer('rating', size=1))) - - define_py_data_sources2( - 'data/train.list', - 'data/test.list', - module='dataprovider', - obj='process', - args={'meta': meta}) -else: - outputs(similarity) diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore deleted file mode 100644 index 65c9b674c7d1dad53b7d1c6ee1dcbdb72553888d..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/.gitignore +++ /dev/null @@ -1,14 +0,0 @@ -*.pyc -train.log -data/feature -data/conll05st-release/ -data/src.dict -data/test.wsj.props -data/test.wsj.seq_pair -data/test.wsj.words -data/tgt.dict -output -data/emb -data/targetDict.txt -data/verbDict.txt -data/wordDict.txt diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py deleted file mode 100644 index 036cad4b0a32357bb42580ef577a1eba558be8fe..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/api_train_v2.py +++ /dev/null @@ -1,190 +0,0 @@ -import sys -import math -import numpy as np -import paddle.v2 as paddle -import paddle.v2.dataset.conll05 as conll05 - - -def db_lstm(): - word_dict, verb_dict, label_dict = conll05.get_dict() - word_dict_len = len(word_dict) - label_dict_len = len(label_dict) - pred_len = len(verb_dict) - - mark_dict_len = 2 - word_dim = 32 - mark_dim = 5 - hidden_dim = 512 - depth = 8 - - #8 features - def d_type(size): - return paddle.data_type.integer_value_sequence(size) - - word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) - predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) - - ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) - ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) - mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - - default_std = 1 / math.sqrt(hidden_dim) / 3.0 - - emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) - std_0 = paddle.attr.Param(initial_std=0.) - std_default = paddle.attr.Param(initial_std=default_std) - - predicate_embedding = paddle.layer.embedding( - size=word_dim, - input=predicate, - param_attr=paddle.attr.Param( - name='vemb', initial_std=default_std)) - mark_embedding = paddle.layer.embedding( - size=mark_dim, input=mark, param_attr=std_0) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - paddle.layer.embedding( - size=word_dim, input=x, param_attr=emb_para) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - - hidden_0 = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - - mix_hidden_lr = 1e-3 - lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) - hidden_para_attr = paddle.attr.Param( - initial_std=default_std, learning_rate=mix_hidden_lr) - - lstm_0 = paddle.layer.lstmemory( - input=hidden_0, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - bias_attr=std_0, - param_attr=lstm_para_attr) - - #stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = paddle.layer.lstmemory( - input=mix_hidden, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - - feature_out = paddle.layer.mixed( - size=label_dict_len, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - - crf_cost = paddle.layer.crf(size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', - initial_std=default_std, - learning_rate=mix_hidden_lr)) - - crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - - return crf_cost, crf_dec - - -def load_parameter(file_name, h, w): - with open(file_name, 'rb') as f: - f.read(16) # skip header. - return np.fromfile(f, dtype=np.float32).reshape(h, w) - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - - # define network topology - crf_cost, crf_dec = db_lstm() - - # create parameters - parameters = paddle.parameters.create([crf_cost, crf_dec]) - - # create optimizer - optimizer = paddle.optimizer.Momentum( - momentum=0, - learning_rate=2e-2, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage( - average_window=0.5, max_average_window=10000), ) - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - - trainer = paddle.trainer.SGD(cost=crf_cost, - parameters=parameters, - update_equation=optimizer) - parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) - - trn_reader = paddle.batch( - paddle.reader.shuffle( - conll05.test(), buf_size=8192), batch_size=10) - - feeding = { - 'word_data': 0, - 'ctx_n2_data': 1, - 'ctx_n1_data': 2, - 'ctx_0_data': 3, - 'ctx_p1_data': 4, - 'ctx_p2_data': 5, - 'verb_data': 6, - 'mark_data': 7, - 'target': 8 - } - - trainer.train( - reader=trn_reader, - event_handler=event_handler, - num_passes=10000, - feeding=feeding) - - -if __name__ == '__main__': - main() diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py deleted file mode 100644 index da44111976a0dec68345fc139d0aa459ca9211c2..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/extract_dict_feature.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -from optparse import OptionParser - - -def extract_dict_features(pair_file, feature_file): - - with open(pair_file) as fin, open(feature_file, 'w') as feature_out: - for line in fin: - sentence, predicate, labels = line.strip().split('\t') - sentence_list = sentence.split() - labels_list = labels.split() - - verb_index = labels_list.index('B-V') - - mark = [0] * len(labels_list) - if verb_index > 0: - mark[verb_index - 1] = 1 - ctx_n1 = sentence_list[verb_index - 1] - else: - ctx_n1 = 'bos' - - if verb_index > 1: - mark[verb_index - 2] = 1 - ctx_n2 = sentence_list[verb_index - 2] - else: - ctx_n2 = 'bos' - - mark[verb_index] = 1 - ctx_0 = sentence_list[verb_index] - - if verb_index < len(labels_list) - 1: - mark[verb_index + 1] = 1 - ctx_p1 = sentence_list[verb_index + 1] - else: - ctx_p1 = 'eos' - - if verb_index < len(labels_list) - 2: - mark[verb_index + 2] = 1 - ctx_p2 = sentence_list[verb_index + 2] - else: - ctx_p2 = 'eos' - - - feature_str = sentence + '\t' \ - + predicate + '\t' \ - + ctx_n2 + '\t' \ - + ctx_n1 + '\t' \ - + ctx_0 + '\t' \ - + ctx_p1 + '\t' \ - + ctx_p2 + '\t' \ - + ' '.join([str(i) for i in mark]) + '\t' \ - + labels - - feature_out.write(feature_str + '\n') - - -if __name__ == '__main__': - - usage = '-p pair_file -f feature_file' - parser = OptionParser(usage) - parser.add_option('-p', dest='pair_file', help='the pair file') - parser.add_option('-f', dest='feature_file', help='the feature file') - - (options, args) = parser.parse_args() - - extract_dict_features(options.pair_file, options.feature_file) diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py deleted file mode 100644 index 94a8488c16734eb1882d54f7ec36f4b9308c09d4..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/extract_pairs.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -from optparse import OptionParser - - -def read_labels(props_file): - ''' - a sentence maybe has more than one verb, each verb has its label sequence - label[], is a 3-dimension list. - the first dim is to store all sentence's label seqs, len is the sentence number - the second dim is to store all label sequences for one sentences - the third dim is to store each label for one word - ''' - labels = [] - with open(props_file) as fin: - label_seqs_for_one_sentences = [] - one_seg_in_file = [] - for line in fin: - line = line.strip() - if line == '': - for i in xrange(len(one_seg_in_file[0])): - a_kind_lable = [x[i] for x in one_seg_in_file] - label_seqs_for_one_sentences.append(a_kind_lable) - labels.append(label_seqs_for_one_sentences) - one_seg_in_file = [] - label_seqs_for_one_sentences = [] - else: - part = line.split() - one_seg_in_file.append(part) - return labels - - -def read_sentences(words_file): - sentences = [] - with open(words_file) as fin: - s = '' - for line in fin: - line = line.strip() - if line == '': - sentences.append(s) - s = '' - else: - s += line + ' ' - return sentences - - -def transform_labels(sentences, labels): - sen_lab_pair = [] - for i in xrange(len(sentences)): - if len(labels[i]) == 1: - continue - else: - verb_list = [] - for x in labels[i][0]: - if x != '-': - verb_list.append(x) - - for j in xrange(1, len(labels[i])): - label_list = labels[i][j] - current_tag = 'O' - is_in_bracket = False - label_seq = [] - verb_word = '' - for ll in label_list: - if ll == '*' and is_in_bracket == False: - label_seq.append('O') - elif ll == '*' and is_in_bracket == True: - label_seq.append('I-' + current_tag) - elif ll == '*)': - label_seq.append('I-' + current_tag) - is_in_bracket = False - elif ll.find('(') != -1 and ll.find(')') != -1: - current_tag = ll[1:ll.find('*')] - label_seq.append('B-' + current_tag) - is_in_bracket = False - elif ll.find('(') != -1 and ll.find(')') == -1: - current_tag = ll[1:ll.find('*')] - label_seq.append('B-' + current_tag) - is_in_bracket = True - else: - print 'error:', ll - sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq)) - return sen_lab_pair - - -def write_file(sen_lab_pair, output_file): - with open(output_file, 'w') as fout: - for x in sen_lab_pair: - sentence = x[0] - label_seq = ' '.join(x[2]) - assert len(sentence.split()) == len(x[2]) - fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n') - - -if __name__ == '__main__': - - usage = '-w words_file -p props_file -o output_file' - parser = OptionParser(usage) - parser.add_option('-w', dest='words_file', help='the words file') - parser.add_option('-p', dest='props_file', help='the props file') - parser.add_option('-o', dest='output_file', help='the output_file') - (options, args) = parser.parse_args() - - sentences = read_sentences(options.words_file) - labels = read_labels(options.props_file) - sen_lab_pair = transform_labels(sentences, labels) - - write_file(sen_lab_pair, options.output_file) diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh deleted file mode 100755 index a0ef26a13b9a03392cb8b6207d6d21b7761e38e8..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/get_data.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb -tar -xzvf conll05st-tests.tar.gz -rm conll05st-tests.tar.gz -cp ./conll05st-release/test.wsj/words/test.wsj.words.gz . -cp ./conll05st-release/test.wsj/props/test.wsj.props.gz . -gunzip test.wsj.words.gz -gunzip test.wsj.props.gz - -python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair -python extract_dict_feature.py -p test.wsj.seq_pair -f feature diff --git a/demo/semantic_role_labeling/data/test.list b/demo/semantic_role_labeling/data/test.list deleted file mode 100644 index ec370e897a7811b572613150ccb6f665c3adb974..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/test.list +++ /dev/null @@ -1 +0,0 @@ -./data/feature diff --git a/demo/semantic_role_labeling/data/train.list b/demo/semantic_role_labeling/data/train.list deleted file mode 100644 index ec370e897a7811b572613150ccb6f665c3adb974..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/data/train.list +++ /dev/null @@ -1 +0,0 @@ -./data/feature diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py deleted file mode 100644 index 360c57ea6283ca43986610abf1831742bfc0c3ef..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/dataprovider.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 0 - - -def hook(settings, word_dict, label_dict, predicate_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - settings.predicate_dict = predicate_dict - - #all inputs are integral and sequential type - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(predicate_dict)), integer_value_sequence(2), - integer_value_sequence(len(label_dict)) - ] - - -def get_batch_size(yeild_data): - return len(yeild_data[0]) - - -@provider( - init_hook=hook, - should_shuffle=True, - calc_batch_size=get_batch_size, - can_over_batch_size=True, - cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield word_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py deleted file mode 100644 index 04e2a559b19bd4b9aec0242eb43edf6ab1e7624e..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/db_lstm.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os -import sys -from paddle.trainer_config_helpers import * - -#file paths -word_dict_file = './data/wordDict.txt' -label_dict_file = './data/targetDict.txt' -predicate_file = './data/verbDict.txt' -train_list_file = './data/train.list' -test_list_file = './data/test.list' - -is_test = get_config_arg('is_test', bool, False) -is_predict = get_config_arg('is_predict', bool, False) - -if not is_predict: - #load dictionaries - word_dict = dict() - label_dict = dict() - predicate_dict = dict() - with open(word_dict_file, 'r') as f_word, \ - open(label_dict_file, 'r') as f_label, \ - open(predicate_file, 'r') as f_pre: - for i, line in enumerate(f_word): - w = line.strip() - word_dict[w] = i - - for i, line in enumerate(f_label): - w = line.strip() - label_dict[w] = i - - for i, line in enumerate(f_pre): - w = line.strip() - predicate_dict[w] = i - - if is_test: - train_list_file = None - - #define data provider - define_py_data_sources2( - train_list=train_list_file, - test_list=test_list_file, - module='dataprovider', - obj='process', - args={ - 'word_dict': word_dict, - 'label_dict': label_dict, - 'predicate_dict': predicate_dict - }) - - word_dict_len = len(word_dict) - label_dict_len = len(label_dict) - pred_len = len(predicate_dict) - -else: - word_dict_len = get_config_arg('dict_len', int) - label_dict_len = get_config_arg('label_len', int) - pred_len = get_config_arg('pred_len', int) - -############################## Hyper-parameters ################################## -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 - -########################### Optimizer ####################################### - -settings( - batch_size=150, - learning_method=MomentumOptimizer(momentum=0), - learning_rate=2e-2, - regularization=L2Regularization(8e-4), - is_async=False, - model_average=ModelAverage( - average_window=0.5, max_average_window=10000), ) - -####################################### network ############################## -#8 features and 1 target -word = data_layer(name='word_data', size=word_dict_len) -predicate = data_layer(name='verb_data', size=pred_len) - -ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len) -ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len) -ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len) -ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len) -ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len) -mark = data_layer(name='mark_data', size=mark_dict_len) - -if not is_predict: - target = data_layer(name='target', size=label_dict_len) - -default_std = 1 / math.sqrt(hidden_dim) / 3.0 - -emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.) -std_0 = ParameterAttribute(initial_std=0.) -std_default = ParameterAttribute(initial_std=default_std) - -predicate_embedding = embedding_layer( - size=word_dim, - input=predicate, - param_attr=ParameterAttribute( - name='vemb', initial_std=default_std)) -mark_embedding = embedding_layer( - name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0) - -word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] -emb_layers = [ - embedding_layer( - size=word_dim, input=x, param_attr=emb_para) for x in word_input -] -emb_layers.append(predicate_embedding) -emb_layers.append(mark_embedding) - -hidden_0 = mixed_layer( - name='hidden0', - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - -mix_hidden_lr = 1e-3 -lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0) -hidden_para_attr = ParameterAttribute( - initial_std=default_std, learning_rate=mix_hidden_lr) - -lstm_0 = lstmemory( - name='lstm0', - input=hidden_0, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - bias_attr=std_0, - param_attr=lstm_para_attr) - -#stack L-LSTM and R-LSTM with direct edges -input_tmp = [hidden_0, lstm_0] - -for i in range(1, depth): - - mix_hidden = mixed_layer( - name='hidden' + str(i), - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = lstmemory( - name='lstm' + str(i), - input=mix_hidden, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - -feature_out = mixed_layer( - name='output', - size=label_dict_len, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - -if not is_predict: - crf_l = crf_layer( - name='crf', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute( - name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) - - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute(name='crfw')) - - eval = sum_evaluator(input=crf_dec_l) - - outputs(crf_l) - -else: - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - param_attr=ParameterAttribute(name='crfw')) - - outputs(crf_dec_l) diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py deleted file mode 100644 index 372fd090b6e8f08f5bb34697772c2e4976810595..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/predict.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import integer_value_sequence -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. - python predict.py -h -""" -UNK_IDX = 0 - - -class Prediction(): - def __init__(self, train_conf, dict_file, model_dir, label_file, - predicate_dict_file): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - - self.dict = {} - self.labels = {} - self.predicate_dict = {} - self.labels_reverse = {} - self.load_dict_label(dict_file, label_file, predicate_dict_file) - - len_dict = len(self.dict) - len_label = len(self.labels) - len_pred = len(self.predicate_dict) - - conf = parse_config( - train_conf, 'dict_len=' + str(len_dict) + ',label_len=' + - str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True') - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(model_dir) - - slots = [ - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_pred), integer_value_sequence(2) - ] - self.converter = DataProviderConverter(slots) - - def load_dict_label(self, dict_file, label_file, predicate_dict_file): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(dict_file, 'r')): - self.dict[line.strip()] = line_count - - for line_count, line in enumerate(open(label_file, 'r')): - self.labels[line.strip()] = line_count - self.labels_reverse[line_count] = line.strip() - - for line_count, line in enumerate(open(predicate_dict_file, 'r')): - self.predicate_dict[line.strip()] = line_count - - def get_data(self, data_file): - """ - Get input data of paddle format. - """ - with open(data_file, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip( - ).split('\t') - words = sentence.split() - sen_len = len(words) - - word_slot = [self.dict.get(w, UNK_IDX) for w in words] - predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX) - ] * sen_len - ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - yield word_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot - - def predict(self, data_file, output_file): - """ - data_file: file name of input data. - """ - input = self.converter(self.get_data(data_file)) - output = self.network.forwardTest(input) - lab = output[0]["id"].tolist() - - with open(data_file, 'r') as fin, open(output_file, 'w') as fout: - index = 0 - for line in fin: - sen = line.split('\t')[0] - len_sen = len(sen.split()) - line_labels = lab[index:index + len_sen] - index += len_sen - fout.write(sen + '\t' + ' '.join( - [self.labels_reverse[i] for i in line_labels]) + '\n') - - -def option_parser(): - usage = ( - "python predict.py -c config -w model_dir " - "-d word dictionary -l label_file -i input_file -p pred_dict_file") - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-c", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-l", - "--label", - action="store", - dest="label_file", - default=None, - help="label file") - parser.add_option( - "-p", - "--predict_dict_file", - action="store", - dest="predict_dict_file", - default=None, - help="predict_dict_file") - parser.add_option( - "-i", - "--data", - action="store", - dest="data_file", - help="data file to predict") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - - parser.add_option( - "-o", - "--output_file", - action="store", - dest="output_file", - default=None, - help="output file") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - data_file = options.data_file - dict_file = options.dict_file - model_path = options.model_path - label_file = options.label_file - predict_dict_file = options.predict_dict_file - output_file = options.output_file - - swig_paddle.initPaddle("--use_gpu=0") - predict = Prediction(train_conf, dict_file, model_path, label_file, - predict_dict_file) - predict.predict(data_file, output_file) - - -if __name__ == '__main__': - main() diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh deleted file mode 100755 index 873aad670d16803ce321ab60baabe9fe29ea64bf..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/predict.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -best_model_path="output/pass-${LOG[1]}" - -config_file=db_lstm.py -dict_file=./data/wordDict.txt -label_file=./data/targetDict.txt -predicate_dict_file=./data/verbDict.txt -input_file=./data/feature -output_file=predict.res - -python predict.py \ - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh deleted file mode 100755 index 095bbff2ea42627a13d8ebab436f5a05abc09743..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/test.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list - -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --config_args=is_test=1 \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'test.log' -paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1 diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh deleted file mode 100755 index eee14010d7b04a1b824f39090fa82fc532085e0d..0000000000000000000000000000000000000000 --- a/demo/semantic_role_labeling/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=5000 \ - --save_dir=./output \ - --num_passes=10000 \ - --average_test_period=10000000 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1 diff --git a/demo/sentiment/.gitignore b/demo/sentiment/.gitignore deleted file mode 100644 index bf2a9ab1ce3c937bf06179074cd952dc53591dfd..0000000000000000000000000000000000000000 --- a/demo/sentiment/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -data/aclImdb -data/imdb -data/pre-imdb -data/mosesdecoder-master -logs/ -model_output -dataprovider_copy_1.py -model.list -test.log -train.log -*.pyc diff --git a/demo/sentiment/data/get_imdb.sh b/demo/sentiment/data/get_imdb.sh deleted file mode 100755 index 7600af6fbb900ee845702f1297779c1f0ed9bf84..0000000000000000000000000000000000000000 --- a/demo/sentiment/data/get_imdb.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e -set -x - -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -cd $DIR - -#download the dataset -echo "Downloading aclImdb..." -#http://ai.stanford.edu/%7Eamaas/data/sentiment/ -wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz - -echo "Downloading mosesdecoder..." -#https://github.com/moses-smt/mosesdecoder -wget https://github.com/moses-smt/mosesdecoder/archive/master.zip - -#extract package -echo "Unzipping..." -tar -zxvf aclImdb_v1.tar.gz -unzip master.zip - -#move train and test set to imdb_data directory -#in order to process when traing -mkdir -p imdb/train -mkdir -p imdb/test - -cp -r aclImdb/train/pos/ imdb/train/pos -cp -r aclImdb/train/neg/ imdb/train/neg - -cp -r aclImdb/test/pos/ imdb/test/pos -cp -r aclImdb/test/neg/ imdb/test/neg - -#remove compressed package -rm aclImdb_v1.tar.gz -rm master.zip - -echo "Done." diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py deleted file mode 100755 index 4b7f5d0e504aef3884a04cbed8c16503a4079772..0000000000000000000000000000000000000000 --- a/demo/sentiment/dataprovider.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer.PyDataProvider2 import * - - -def hook(settings, dictionary, **kwargs): - settings.word_dict = dictionary - settings.input_types = [ - integer_value_sequence(len(settings.word_dict)), integer_value(2) - ] - settings.logger.info('dict len : %d' % (len(settings.word_dict))) - - -@provider(init_hook=hook) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line_count, line in enumerate(fdata): - label, comment = line.strip().split('\t\t') - label = int(label) - words = comment.split() - word_slot = [ - settings.word_dict[w] for w in words if w in settings.word_dict - ] - if not word_slot: - continue - yield word_slot, label diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py deleted file mode 100755 index 64c78e0d6b9297e7a321a4f070517593b0bfe332..0000000000000000000000000000000000000000 --- a/demo/sentiment/predict.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, sys -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import integer_value_sequence -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. - python predict.py -h -""" - - -class SentimentPrediction(): - def __init__(self, train_conf, dict_file, model_dir=None, label_file=None): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - self.train_conf = train_conf - self.dict_file = dict_file - self.word_dict = {} - self.dict_dim = self.load_dict() - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.label = None - if label_file is not None: - self.load_label(label_file) - - conf = parse_config(train_conf, "is_predict=1") - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(self.model_dir) - input_types = [integer_value_sequence(self.dict_dim)] - self.converter = DataProviderConverter(input_types) - - def load_dict(self): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(self.dict_file, 'r')): - self.word_dict[line.strip().split('\t')[0]] = line_count - return len(self.word_dict) - - def load_label(self, label_file): - """ - Load label. - """ - self.label = {} - for v in open(label_file, 'r'): - self.label[int(v.split('\t')[1])] = v.split('\t')[0] - - def get_index(self, data): - """ - transform word into integer index according to the dictionary. - """ - words = data.strip().split() - word_slot = [self.word_dict[w] for w in words if w in self.word_dict] - return word_slot - - def batch_predict(self, data_batch): - input = self.converter(data_batch) - output = self.network.forwardTest(input) - prob = output[0]["value"] - labs = np.argsort(-prob) - for idx, lab in enumerate(labs): - if self.label is None: - print("predicting label is %d" % (lab[0])) - else: - print("predicting label is %s" % (self.label[lab[0]])) - - -def option_parser(): - usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-n", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-b", - "--label", - action="store", - dest="label", - default=None, - help="dictionary file") - parser.add_option( - "-c", - "--batch_size", - type="int", - action="store", - dest="batch_size", - default=1, - help="the batch size for prediction") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - batch_size = options.batch_size - dict_file = options.dict_file - model_path = options.model_path - label = options.label - swig_paddle.initPaddle("--use_gpu=0") - predict = SentimentPrediction(train_conf, dict_file, model_path, label) - - batch = [] - for line in sys.stdin: - words = predict.get_index(line) - if words: - batch.append([words]) - else: - print('All the words in [%s] are not in the dictionary.' % line) - if len(batch) == batch_size: - predict.batch_predict(batch) - batch = [] - if len(batch) > 0: - predict.batch_predict(batch) - - -if __name__ == '__main__': - main() diff --git a/demo/sentiment/predict.sh b/demo/sentiment/predict.sh deleted file mode 100755 index c72a8e8641516543ef267fcb4b448630246d1e8d..0000000000000000000000000000000000000000 --- a/demo/sentiment/predict.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 diff --git a/demo/sentiment/preprocess.py b/demo/sentiment/preprocess.py deleted file mode 100755 index 29b3682b747c66574590de5ea70574981cc536bb..0000000000000000000000000000000000000000 --- a/demo/sentiment/preprocess.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import random -import operator -import numpy as np -from subprocess import Popen, PIPE -from os.path import join as join_path -from optparse import OptionParser - -from paddle.utils.preprocess_util import * -""" -Usage: run following command to show help message. - python preprocess.py -h -""" - - -def save_dict(dict, filename, is_reverse=True): - """ - Save dictionary into file. - dict: input dictionary. - filename: output file name, string. - is_reverse: True, descending order by value. - False, ascending order by value. - """ - f = open(filename, 'w') - for k, v in sorted(dict.items(), key=operator.itemgetter(1),\ - reverse=is_reverse): - f.write('%s\t%s\n' % (k, v)) - f.close() - - -def tokenize(sentences): - """ - Use tokenizer.perl to tokenize input sentences. - tokenizer.perl is tool of Moses. - sentences : a list of input sentences. - return: a list of processed text. - """ - dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl' - tokenizer_cmd = [dir, '-l', 'en', '-q', '-'] - assert isinstance(sentences, list) - text = "\n".join(sentences) - tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) - tok_text, _ = tokenizer.communicate(text) - toks = tok_text.split('\n')[:-1] - return toks - - -def read_lines(path): - """ - path: String, file path. - return a list of sequence. - """ - seqs = [] - with open(path, 'r') as f: - for line in f.readlines(): - line = line.strip() - if len(line): - seqs.append(line) - return seqs - - -class SentimentDataSetCreate(): - """ - A class to process data for sentiment analysis task. - """ - - def __init__(self, - data_path, - output_path, - use_okenizer=True, - multi_lines=False): - """ - data_path: string, traing and testing dataset path - output_path: string, output path, store processed dataset - multi_lines: whether a file has multi lines. - In order to shuffle fully, it needs to read all files into - memory, then shuffle them if one file has multi lines. - """ - self.output_path = output_path - self.data_path = data_path - - self.train_dir = 'train' - self.test_dir = 'test' - - self.train_list = "train.list" - self.test_list = "test.list" - - self.label_list = "labels.list" - self.classes_num = 0 - - self.batch_size = 50000 - self.batch_dir = 'batches' - - self.dict_file = "dict.txt" - self.dict_with_test = False - self.dict_size = 0 - self.word_count = {} - - self.tokenizer = use_okenizer - self.overwrite = False - - self.multi_lines = multi_lines - - self.train_dir = join_path(data_path, self.train_dir) - self.test_dir = join_path(data_path, self.test_dir) - self.train_list = join_path(output_path, self.train_list) - self.test_list = join_path(output_path, self.test_list) - self.label_list = join_path(output_path, self.label_list) - self.dict_file = join_path(output_path, self.dict_file) - - def data_list(self, path): - """ - create dataset from path - path: data path - return: data list - """ - label_set = get_label_set_from_dir(path) - data = [] - for lab_name in label_set.keys(): - file_paths = list_files(join_path(path, lab_name)) - for p in file_paths: - data.append({"label" : label_set[lab_name],\ - "seq_path": p}) - return data, label_set - - def create_dict(self, data): - """ - create dict for input data. - data: list, [sequence, sequnce, ...] - """ - for seq in data: - for w in seq.strip().lower().split(): - if w not in self.word_count: - self.word_count[w] = 1 - else: - self.word_count[w] += 1 - - def create_dataset(self): - """ - create file batches and dictionary of train data set. - If the self.overwrite is false and train.list already exists in - self.output_path, this function will not create and save file - batches from the data set path. - return: dictionary size, class number. - """ - out_path = self.output_path - if out_path and not os.path.exists(out_path): - os.makedirs(out_path) - - # If self.overwrite is false or self.train_list has existed, - # it will not process dataset. - if not (self.overwrite or not os.path.exists(self.train_list)): - print "%s already exists." % self.train_list - return - - # Preprocess train data. - train_data, train_lab_set = self.data_list(self.train_dir) - print "processing train set..." - file_lists = self.save_data(train_data, "train", self.batch_size, True, - True) - save_list(file_lists, self.train_list) - - # If have test data path, preprocess test data. - if os.path.exists(self.test_dir): - test_data, test_lab_set = self.data_list(self.test_dir) - assert (train_lab_set == test_lab_set) - print "processing test set..." - file_lists = self.save_data(test_data, "test", self.batch_size, - False, self.dict_with_test) - save_list(file_lists, self.test_list) - - # save labels set. - save_dict(train_lab_set, self.label_list, False) - self.classes_num = len(train_lab_set.keys()) - - # save dictionary. - save_dict(self.word_count, self.dict_file, True) - self.dict_size = len(self.word_count) - - def save_data(self, - data, - prefix="", - batch_size=50000, - is_shuffle=False, - build_dict=False): - """ - Create batches for a Dataset object. - data: the Dataset object to process. - prefix: the prefix of each batch. - batch_size: number of data in each batch. - build_dict: whether to build dictionary for data - - return: list of batch names - """ - if is_shuffle and self.multi_lines: - return self.save_data_multi_lines(data, prefix, batch_size, - build_dict) - - if is_shuffle: - random.shuffle(data) - num_batches = int(math.ceil(len(data) / float(batch_size))) - batch_names = [] - for i in range(num_batches): - batch_name = join_path(self.output_path, - "%s_part_%03d" % (prefix, i)) - begin = i * batch_size - end = min((i + 1) * batch_size, len(data)) - # read a batch of data - label_list, data_list = self.get_data_list(begin, end, data) - if build_dict: - self.create_dict(data_list) - self.save_file(label_list, data_list, batch_name) - batch_names.append(batch_name) - - return batch_names - - def get_data_list(self, begin, end, data): - """ - begin: int, begining index of data. - end: int, ending index of data. - data: a list of {"seq_path": seqquence path, "label": label index} - - return a list of label and a list of sequence. - """ - label_list = [] - data_list = [] - for j in range(begin, end): - seqs = read_lines(data[j]["seq_path"]) - lab = int(data[j]["label"]) - #File may have multiple lines. - for seq in seqs: - data_list.append(seq) - label_list.append(lab) - if self.tokenizer: - data_list = tokenize(data_list) - return label_list, data_list - - def save_data_multi_lines(self, - data, - prefix="", - batch_size=50000, - build_dict=False): - """ - In order to shuffle fully, there is no need to load all data if - each file only contains one sample, it only needs to shuffle list - of file name. But one file contains multi lines, each line is one - sample. It needs to read all data into memory to shuffle fully. - This interface is mainly for data containning multi lines in each - file, which consumes more memory if there is a great mount of data. - - data: the Dataset object to process. - prefix: the prefix of each batch. - batch_size: number of data in each batch. - build_dict: whether to build dictionary for data - - return: list of batch names - """ - assert self.multi_lines - label_list = [] - data_list = [] - - # read all data - label_list, data_list = self.get_data_list(0, len(data), data) - if build_dict: - self.create_dict(data_list) - - length = len(label_list) - perm_list = np.array([i for i in xrange(length)]) - random.shuffle(perm_list) - - num_batches = int(math.ceil(length / float(batch_size))) - batch_names = [] - for i in range(num_batches): - batch_name = join_path(self.output_path, - "%s_part_%03d" % (prefix, i)) - begin = i * batch_size - end = min((i + 1) * batch_size, length) - sub_label = [label_list[perm_list[i]] for i in range(begin, end)] - sub_data = [data_list[perm_list[i]] for i in range(begin, end)] - self.save_file(sub_label, sub_data, batch_name) - batch_names.append(batch_name) - - return batch_names - - def save_file(self, label_list, data_list, filename): - """ - Save data into file. - label_list: a list of int value. - data_list: a list of sequnece. - filename: output file name. - """ - f = open(filename, 'w') - print "saving file: %s" % filename - for lab, seq in zip(label_list, data_list): - f.write('%s\t\t%s\n' % (lab, seq)) - f.close() - - -def option_parser(): - parser = OptionParser(usage="usage: python preprcoess.py "\ - "-i data_dir [options]") - parser.add_option( - "-i", - "--data", - action="store", - dest="input", - help="Input data directory.") - parser.add_option( - "-o", - "--output", - action="store", - dest="output", - default=None, - help="Output directory.") - parser.add_option( - "-t", - "--tokenizer", - action="store", - dest="use_tokenizer", - default=True, - help="Whether to use tokenizer.") - parser.add_option("-m", "--multi_lines", action="store", - dest="multi_lines", default=False, - help="If input text files have multi lines and they "\ - "need to be shuffled, you should set -m True,") - return parser.parse_args() - - -def main(): - options, args = option_parser() - data_dir = options.input - output_dir = options.output - use_tokenizer = options.use_tokenizer - multi_lines = options.multi_lines - if output_dir is None: - outname = os.path.basename(options.input) - output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname) - data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer, - multi_lines) - data_creator.create_dataset() - - -if __name__ == '__main__': - main() diff --git a/demo/sentiment/preprocess.sh b/demo/sentiment/preprocess.sh deleted file mode 100755 index 19ec34d4f016365d18db01ddec559d26202b19c6..0000000000000000000000000000000000000000 --- a/demo/sentiment/preprocess.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -echo "Start to preprcess..." - -data_dir="./data/imdb" -python preprocess.py -i $data_dir - -echo "Done." diff --git a/demo/sentiment/sentiment_net.py b/demo/sentiment/sentiment_net.py deleted file mode 100644 index a01577ca5ae025b7bec67c6d54c7dbd931dbee74..0000000000000000000000000000000000000000 --- a/demo/sentiment/sentiment_net.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from os.path import join as join_path - -from paddle.trainer_config_helpers import * - - -def sentiment_data(data_dir=None, - is_test=False, - is_predict=False, - train_list="train.list", - test_list="test.list", - dict_file="dict.txt"): - """ - Predefined data provider for sentiment analysis. - is_test: whether this config is used for test. - is_predict: whether this config is used for prediction. - train_list: text file name, containing a list of training set. - test_list: text file name, containing a list of testing set. - dict_file: text file name, containing dictionary. - """ - dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines()) - class_dim = len(open(join_path(data_dir, 'labels.list')).readlines()) - if is_predict: - return dict_dim, class_dim - - if data_dir is not None: - train_list = join_path(data_dir, train_list) - test_list = join_path(data_dir, test_list) - dict_file = join_path(data_dir, dict_file) - - train_list = train_list if not is_test else None - word_dict = dict() - with open(dict_file, 'r') as f: - for i, line in enumerate(open(dict_file, 'r')): - word_dict[line.split('\t')[0]] = i - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={'dictionary': word_dict}) - - return dict_dim, class_dim - - -def bidirectional_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - lstm_dim=128, - is_predict=False): - data = data_layer("word", input_dim) - emb = embedding_layer(input=data, size=emb_dim) - bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim) - dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) - output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation()) - - if not is_predict: - lbl = data_layer("label", 1) - outputs(classification_cost(input=output, label=lbl)) - else: - outputs(output) - - -def stacked_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3, - is_predict=False): - """ - A Wrapper for sentiment classification task. - This network uses bi-directional recurrent network, - consisting three LSTM layers. This configure is referred to - the paper as following url, but use fewer layrs. - http://www.aclweb.org/anthology/P15-1109 - - input_dim: here is word dictionary dimension. - class_dim: number of categories. - emb_dim: dimension of word embedding. - hid_dim: dimension of hidden layer. - stacked_num: number of stacked lstm-hidden layer. - is_predict: is predicting or not. - Some layers is not needed in network when predicting. - """ - hid_lr = 1e-3 - assert stacked_num % 2 == 1 - - layer_attr = ExtraLayerAttribute(drop_rate=0.5) - fc_para_attr = ParameterAttribute(learning_rate=hid_lr) - lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.) - para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.) - relu = ReluActivation() - linear = LinearActivation() - - data = data_layer("word", input_dim) - emb = embedding_layer(input=data, size=emb_dim) - - fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr) - lstm1 = lstmemory( - input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) - - inputs = [fc1, lstm1] - for i in range(2, stacked_num + 1): - fc = fc_layer( - input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) - lstm = lstmemory( - input=fc, - reverse=(i % 2) == 0, - act=relu, - bias_attr=bias_attr, - layer_attr=layer_attr) - inputs = [fc, lstm] - - fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling()) - lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling()) - output = fc_layer( - input=[fc_last, lstm_last], - size=class_dim, - act=SoftmaxActivation(), - bias_attr=bias_attr, - param_attr=para_attr) - - if is_predict: - outputs(output) - else: - outputs(classification_cost(input=output, label=data_layer('label', 1))) diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh deleted file mode 100755 index 85c4f3ccfc3ede23fcf701769b9701ecbf57c789..0000000000000000000000000000000000000000 --- a/demo/sentiment/test.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' -paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1 diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh deleted file mode 100755 index 14620f733bf03444e5ba3b3b792dfbed6146ecde..0000000000000000000000000000000000000000 --- a/demo/sentiment/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -config=trainer_config.py -output=./model_output -paddle train --config=$config \ - --save_dir=$output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=10 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1 diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py deleted file mode 100644 index 1c856556bd0cb32f60eba322469b3621c37e1349..0000000000000000000000000000000000000000 --- a/demo/sentiment/train_v2.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import paddle.v2 as paddle - - -def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - conv_3 = paddle.networks.sequence_conv_pool( - input=emb, context_len=3, hidden_size=hid_dim) - conv_4 = paddle.networks.sequence_conv_pool( - input=emb, context_len=4, hidden_size=hid_dim) - output = paddle.layer.fc(input=[conv_3, conv_4], - size=class_dim, - act=paddle.activation.Softmax()) - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost - - -def stacked_lstm_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=512, - stacked_num=3): - """ - A Wrapper for sentiment classification task. - This network uses bi-directional recurrent network, - consisting three LSTM layers. This configure is referred to - the paper as following url, but use fewer layrs. - http://www.aclweb.org/anthology/P15-1109 - - input_dim: here is word dictionary dimension. - class_dim: number of categories. - emb_dim: dimension of word embedding. - hid_dim: dimension of hidden layer. - stacked_num: number of stacked lstm-hidden layer. - """ - assert stacked_num % 2 == 1 - - layer_attr = paddle.attr.Extra(drop_rate=0.5) - fc_para_attr = paddle.attr.Param(learning_rate=1e-3) - lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) - para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) - relu = paddle.activation.Relu() - linear = paddle.activation.Linear() - - data = paddle.layer.data("word", - paddle.data_type.integer_value_sequence(input_dim)) - emb = paddle.layer.embedding(input=data, size=emb_dim) - - fc1 = paddle.layer.fc(input=emb, - size=hid_dim, - act=linear, - bias_attr=bias_attr) - lstm1 = paddle.layer.lstmemory( - input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) - - inputs = [fc1, lstm1] - for i in range(2, stacked_num + 1): - fc = paddle.layer.fc(input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) - lstm = paddle.layer.lstmemory( - input=fc, - reverse=(i % 2) == 0, - act=relu, - bias_attr=bias_attr, - layer_attr=layer_attr) - inputs = [fc, lstm] - - fc_last = paddle.layer.pooling( - input=inputs[0], pooling_type=paddle.pooling.Max()) - lstm_last = paddle.layer.pooling( - input=inputs[1], pooling_type=paddle.pooling.Max()) - output = paddle.layer.fc(input=[fc_last, lstm_last], - size=class_dim, - act=paddle.activation.Softmax(), - bias_attr=bias_attr, - param_attr=para_attr) - - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost - - -if __name__ == '__main__': - # init - paddle.init(use_gpu=False) - - #data - print 'load dictionary...' - word_dict = paddle.dataset.imdb.word_dict() - dict_dim = len(word_dict) - class_dim = 2 - train_reader = paddle.batch( - paddle.reader.shuffle( - lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), - batch_size=100) - test_reader = paddle.batch( - lambda: paddle.dataset.imdb.test(word_dict), batch_size=100) - - feeding = {'word': 0, 'label': 1} - - # network config - # Please choose the way to build the network - # by uncommenting the corresponding line. - cost = convolution_net(dict_dim, class_dim=class_dim) - # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) - - # create parameters - parameters = paddle.parameters.create(cost) - - # create optimizer - adam_optimizer = paddle.optimizer.Adam( - learning_rate=2e-3, - regularization=paddle.optimizer.L2Regularization(rate=8e-4), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) - - # End batch and end pass event handler - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader, feeding=feeding) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - - # create trainer - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=adam_optimizer) - - trainer.train( - reader=train_reader, - event_handler=event_handler, - feeding=feeding, - num_passes=2) diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py deleted file mode 100644 index f1cadaa728ac58107e15f77b5994d31da088caf7..0000000000000000000000000000000000000000 --- a/demo/sentiment/trainer_config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from sentiment_net import * -from paddle.trainer_config_helpers import * - -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) - -data_dir = "./data/pre-imdb" -dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - model_average=ModelAverage(0.5), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -#################### Network Config ###################### -stacked_lstm_net( - dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict) -# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) diff --git a/demo/seqToseq/.gitignore b/demo/seqToseq/.gitignore deleted file mode 100644 index 21cec2c2c1f3422cbb0ad133281dc1ecdd076a96..0000000000000000000000000000000000000000 --- a/demo/seqToseq/.gitignore +++ /dev/null @@ -1,17 +0,0 @@ -data/wmt14 -data/pre-wmt14 -data/wmt14_model -data/paraphrase -data/pre-paraphrase -data/paraphrase_model -translation/gen.log -translation/gen_result -translation/train.log -paraphrase/train.log -dataprovider_copy_1.py -translation/thirdparty.tgz -translation/thirdparty/train.conf -translation/thirdparty/dataprovider.py -translation/thirdparty/seqToseq_net.py -translation/thirdparty/*.dict -*.pyc diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py deleted file mode 100644 index 3072c375123a2713c655b09fb28001960c9ab64d..0000000000000000000000000000000000000000 --- a/demo/seqToseq/api_train_v2.py +++ /dev/null @@ -1,214 +0,0 @@ -import sys - -import paddle.v2 as paddle - - -def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): - ### Network Architecture - word_vector_dim = 512 # dimension of word vector - decoder_size = 512 # dimension of hidden unit in GRU Decoder network - encoder_size = 512 # dimension of hidden unit in GRU Encoder network - - beam_size = 3 - max_length = 250 - - #### Encoder - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) - src_embedding = paddle.layer.embedding( - input=src_word_id, - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) - src_forward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size) - src_backward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size, reverse=True) - encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) - - #### Decoder - with paddle.layer.mixed(size=decoder_size) as encoded_proj: - encoded_proj += paddle.layer.full_matrix_projection( - input=encoded_vector) - - backward_first = paddle.layer.first_seq(input=src_backward) - - with paddle.layer.mixed( - size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: - decoder_boot += paddle.layer.full_matrix_projection( - input=backward_first) - - def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - - decoder_mem = paddle.layer.memory( - name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - - context = paddle.networks.simple_attention( - encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) - - with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += paddle.layer.full_matrix_projection(input=context) - decoder_inputs += paddle.layer.full_matrix_projection( - input=current_word) - - gru_step = paddle.layer.gru_step( - name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) - - with paddle.layer.mixed( - size=target_dict_dim, - bias_attr=True, - act=paddle.activation.Softmax()) as out: - out += paddle.layer.full_matrix_projection(input=gru_step) - return out - - decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True) - group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) - group_inputs = [group_input1, group_input2] - - if not is_generating: - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. - # Here, the StaticInput defines a read-only memory - # for the recurrent_group. - decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost - else: - # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. - - # The encoded source sequence (encoder's output) must be specified by - # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. - - trg_embedding = paddle.layer.GeneratedInputV2( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = paddle.layer.beam_search( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - return beam_gen - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - is_generating = False - - # source and target dict dim. - dict_size = 30000 - source_dict_dim = target_dict_dim = dict_size - - # train the network - if not is_generating: - cost = seqToseq_net(source_dict_dim, target_dict_dim) - parameters = paddle.parameters.create(cost) - - # define optimize method and trainer - optimizer = paddle.optimizer.Adam( - learning_rate=5e-5, - regularization=paddle.optimizer.L2Regularization(rate=8e-4)) - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer) - # define data reader - wmt14_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=8192), - batch_size=5) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, - event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - - # start to train - trainer.train( - reader=wmt14_reader, event_handler=event_handler, num_passes=2) - - # generate a english sequence to french - else: - # use the first 3 samples for generation - gen_creator = paddle.dataset.wmt14.gen(dict_size) - gen_data = [] - gen_num = 3 - for item in gen_creator(): - gen_data.append((item[0], )) - if len(gen_data) == gen_num: - break - - beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating) - # get the pretrained model, whose bleu = 26.92 - parameters = paddle.dataset.wmt14.model() - # prob is the prediction probabilities, and id is the prediction word. - beam_result = paddle.infer( - output_layer=beam_gen, - parameters=parameters, - input=gen_data, - field=['prob', 'id']) - - # get the dictionary - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - - # the delimited element of generated sequences is -1, - # the first element of each generated sequence is the sequence length - seq_list = [] - seq = [] - for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - - prob = beam_result[0] - beam_size = 3 - for i in xrange(gen_num): - print "\n*******************************************************\n" - print "src:", ' '.join( - [src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] - - -if __name__ == '__main__': - main() diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh deleted file mode 100755 index e6497c91286d44b5ef3b66c5f824e36a09728720..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/paraphrase_data.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x - -# download the in-house paraphrase dataset -wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz - -# untar the dataset -tar -zxvf paraphrase.tar.gz -rm paraphrase.tar.gz diff --git a/demo/seqToseq/data/paraphrase_model.sh b/demo/seqToseq/data/paraphrase_model.sh deleted file mode 100755 index d0e7f214a38c4dad0fdf7c10ba3b76eb0ab40f06..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/paraphrase_model.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x - -dim=32 -pretrained_dir='../../model_zoo/embedding/' -preModel=$pretrained_dir'model_'$dim'.emb' -preDict=$pretrained_dir'baidu.dict' - -usrDict_dir='pre-paraphrase/' -srcDict=$usrDict_dir'src.dict' -trgDict=$usrDict_dir'trg.dict' - -usrModel_dir='paraphrase_model/' -mkdir $usrModel_dir -srcModel=$usrModel_dir'_source_language_embedding' -trgModel=$usrModel_dir'_target_language_embedding' - -echo 'extract desired parameters based on user dictionary' -script=$pretrained_dir'extract_para.py' -python $script --preModel $preModel --preDict $preDict \ - --usrModel $srcModel --usrDict $srcDict -d $dim -python $script --preModel $preModel --preDict $preDict \ - --usrModel $trgModel --usrDict $trgDict -d $dim diff --git a/demo/seqToseq/data/wmt14_data.sh b/demo/seqToseq/data/wmt14_data.sh deleted file mode 100755 index 43f67168d2a876ba5401e0f8490a88adac9c5551..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/wmt14_data.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x -mkdir wmt14 -cd wmt14 - -# download the dataset -wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz -wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz - -# untar the dataset -tar -zxvf bitexts.tgz -tar -zxvf dev+test.tgz -gunzip bitexts.selected/* -mv bitexts.selected train -rm bitexts.tgz -rm dev+test.tgz - -# separate the dev and test dataset -mkdir test gen -mv dev/ntst1213.* test -mv dev/ntst14.* gen -rm -rf dev - -set +x -# rename the suffix, .fr->.src, .en->.trg -for dir in train test gen -do - filelist=`ls $dir` - cd $dir - for file in $filelist - do - if [ ${file##*.} = "fr" ]; then - mv $file ${file/%fr/src} - elif [ ${file##*.} = 'en' ]; then - mv $file ${file/%en/trg} - fi - done - cd .. -done diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh deleted file mode 100755 index c4b55b90a3eb98f94e0eb3be028c6de1ef57326b..0000000000000000000000000000000000000000 --- a/demo/seqToseq/data/wmt14_model.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x - -# download the pretrained model -wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz - -# untar the model -tar -zxvf wmt14_model.tar.gz -rm wmt14_model.tar.gz diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py deleted file mode 100755 index c2b49804be582d7d0bc3ef6332741be03936eb24..0000000000000000000000000000000000000000 --- a/demo/seqToseq/dataprovider.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 2 -START = "" -END = "" - - -def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list, - **kwargs): - # job_mode = 1: training mode - # job_mode = 0: generating mode - settings.job_mode = not is_generating - - def fun(dict_path): - out_dict = dict() - with open(dict_path, "r") as fin: - out_dict = { - line.strip(): line_count - for line_count, line in enumerate(fin) - } - return out_dict - - settings.src_dict = fun(src_dict_path) - settings.trg_dict = fun(trg_dict_path) - - settings.logger.info("src dict len : %d" % (len(settings.src_dict))) - - if settings.job_mode: - settings.slots = { - 'source_language_word': - integer_value_sequence(len(settings.src_dict)), - 'target_language_word': - integer_value_sequence(len(settings.trg_dict)), - 'target_language_next_word': - integer_value_sequence(len(settings.trg_dict)) - } - settings.logger.info("trg dict len : %d" % (len(settings.trg_dict))) - else: - settings.slots = { - 'source_language_word': - integer_value_sequence(len(settings.src_dict)), - 'sent_id': - integer_value_sequence(len(open(file_list[0], "r").readlines())) - } - - -def _get_ids(s, dictionary): - words = s.strip().split() - return [dictionary[START]] + \ - [dictionary.get(w, UNK_IDX) for w in words] + \ - [dictionary[END]] - - -@provider(init_hook=hook, pool_size=50000) -def process(settings, file_name): - with open(file_name, 'r') as f: - for line_count, line in enumerate(f): - line_split = line.strip().split('\t') - if settings.job_mode and len(line_split) != 2: - continue - src_seq = line_split[0] # one source sequence - src_ids = _get_ids(src_seq, settings.src_dict) - - if settings.job_mode: - trg_seq = line_split[1] # one target sequence - trg_words = trg_seq.split() - trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words] - - # remove sequence whose length > 80 in training mode - if len(src_ids) > 80 or len(trg_ids) > 80: - continue - trg_ids_next = trg_ids + [settings.trg_dict[END]] - trg_ids = [settings.trg_dict[START]] + trg_ids - yield { - 'source_language_word': src_ids, - 'target_language_word': trg_ids, - 'target_language_next_word': trg_ids_next - } - else: - yield {'source_language_word': src_ids, 'sent_id': [line_count]} diff --git a/demo/seqToseq/paraphrase/train.conf b/demo/seqToseq/paraphrase/train.conf deleted file mode 100644 index be79c5e771c0e864fd1776cedb3ef37c997b6df6..0000000000000000000000000000000000000000 --- a/demo/seqToseq/paraphrase/train.conf +++ /dev/null @@ -1,33 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -sys.path.append("..") - -from seqToseq_net import * - -is_generating = False -### Data Definiation -train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase", - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32) diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh deleted file mode 100755 index 9bb6dbdb1d4c5e35bfb31855e0331f0250a69a20..0000000000000000000000000000000000000000 --- a/demo/seqToseq/paraphrase/train.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -cd .. - -paddle train \ - --config='paraphrase/train.conf' \ - --save_dir='paraphrase/model' \ - --init_model_path='data/paraphrase_model' \ - --load_missing_parameter_strategy=rand \ - --use_gpu=false \ - --num_passes=16 \ - --show_parameter_stats_period=100 \ - --trainer_count=4 \ - --log_period=10 \ - --dot_period=5 \ - 2>&1 | tee 'paraphrase/train.log' -paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1 diff --git a/demo/seqToseq/preprocess.py b/demo/seqToseq/preprocess.py deleted file mode 100755 index 03f371331a0755e5939e457f4bdfb1770b8dad88..0000000000000000000000000000000000000000 --- a/demo/seqToseq/preprocess.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Example: - python preprocess.py -i INPUT [-d DICTSIZE] [-m] - -Options: - -h, --help show this help message and exit - -i INPUT input original dataset path - -d DICTSIZE specified word count of dictionary - -m --mergeDict merge source and target dictionary -""" -import os -import sys - -import string -from optparse import OptionParser -from paddle.utils.preprocess_util import save_list, DatasetCreater - - -class SeqToSeqDatasetCreater(DatasetCreater): - """ - A class to process data for sequence to sequence application. - """ - - def __init__(self, data_path, output_path): - """ - data_path: the path to store the train data, test data and gen data - output_path: the path to store the processed dataset - """ - DatasetCreater.__init__(self, data_path) - self.gen_dir_name = 'gen' - self.gen_list_name = 'gen.list' - self.output_path = output_path - - def concat_file(self, file_path, file1, file2, output_path, output): - """ - Concat file1 and file2 to be one output file - The i-th line of output = i-th line of file1 + '\t' + i-th line of file2 - file_path: the path to store file1 and file2 - output_path: the path to store output file - """ - file1 = os.path.join(file_path, file1) - file2 = os.path.join(file_path, file2) - output = os.path.join(output_path, output) - if not os.path.exists(output): - os.system('paste ' + file1 + ' ' + file2 + ' > ' + output) - - def cat_file(self, dir_path, suffix, output_path, output): - """ - Cat all the files in dir_path with suffix to be one output file - dir_path: the base directory to store input file - suffix: suffix of file name - output_path: the path to store output file - """ - cmd = 'cat ' - file_list = os.listdir(dir_path) - file_list.sort() - for file in file_list: - if file.endswith(suffix): - cmd += os.path.join(dir_path, file) + ' ' - output = os.path.join(output_path, output) - if not os.path.exists(output): - os.system(cmd + '> ' + output) - - def build_dict(self, file_path, dict_path, dict_size=-1): - """ - Create the dictionary for the file, Note that - 1. Valid characters include all printable characters - 2. There is distinction between uppercase and lowercase letters - 3. There is 3 special token: - : the start of a sequence - : the end of a sequence - : a word not included in dictionary - file_path: the path to store file - dict_path: the path to store dictionary - dict_size: word count of dictionary - if is -1, dictionary will contains all the words in file - """ - if not os.path.exists(dict_path): - dictory = dict() - with open(file_path, "r") as fdata: - for line in fdata: - line = line.split('\t') - for line_split in line: - words = line_split.strip().split() - for word in words: - if word not in dictory: - dictory[word] = 1 - else: - dictory[word] += 1 - output = open(dict_path, "w+") - output.write('\n\n\n') - count = 3 - for key, value in sorted( - dictory.items(), key=lambda d: d[1], reverse=True): - output.write(key + "\n") - count += 1 - if count == dict_size: - break - self.dict_size = count - - def create_dataset(self, - dict_size=-1, - mergeDict=False, - suffixes=['.src', '.trg']): - """ - Create seqToseq dataset - """ - # dataset_list and dir_list has one-to-one relationship - train_dataset = os.path.join(self.data_path, self.train_dir_name) - test_dataset = os.path.join(self.data_path, self.test_dir_name) - gen_dataset = os.path.join(self.data_path, self.gen_dir_name) - dataset_list = [train_dataset, test_dataset, gen_dataset] - - train_dir = os.path.join(self.output_path, self.train_dir_name) - test_dir = os.path.join(self.output_path, self.test_dir_name) - gen_dir = os.path.join(self.output_path, self.gen_dir_name) - dir_list = [train_dir, test_dir, gen_dir] - - # create directory - for dir in dir_list: - if not os.path.exists(dir): - os.mkdir(dir) - - # checkout dataset should be parallel corpora - suffix_len = len(suffixes[0]) - for dataset in dataset_list: - file_list = os.listdir(dataset) - if len(file_list) % 2 == 1: - raise RuntimeError("dataset should be parallel corpora") - file_list.sort() - for i in range(0, len(file_list), 2): - if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]: - raise RuntimeError( - "source and target file name should be equal") - - # cat all the files with the same suffix in dataset - for suffix in suffixes: - for dataset in dataset_list: - outname = os.path.basename(dataset) + suffix - self.cat_file(dataset, suffix, dataset, outname) - - # concat parallel corpora and create file.list - print 'concat parallel corpora for dataset' - id = 0 - list = ['train.list', 'test.list', 'gen.list'] - for dataset in dataset_list: - outname = os.path.basename(dataset) - self.concat_file(dataset, outname + suffixes[0], - outname + suffixes[1], dir_list[id], outname) - save_list([os.path.join(dir_list[id], outname)], - os.path.join(self.output_path, list[id])) - id += 1 - - # build dictionary for train data - dict = ['src.dict', 'trg.dict'] - dict_path = [ - os.path.join(self.output_path, dict[0]), - os.path.join(self.output_path, dict[1]) - ] - if mergeDict: - outname = os.path.join(train_dir, train_dataset.split('/')[-1]) - print 'build src dictionary for train data' - self.build_dict(outname, dict_path[0], dict_size) - print 'build trg dictionary for train data' - os.system('cp ' + dict_path[0] + ' ' + dict_path[1]) - else: - outname = os.path.join(train_dataset, self.train_dir_name) - for id in range(0, 2): - suffix = suffixes[id] - print 'build ' + suffix[1:] + ' dictionary for train data' - self.build_dict(outname + suffix, dict_path[id], dict_size) - print 'dictionary size is', self.dict_size - - -def main(): - usage = "usage: \n" \ - "python %prog -i INPUT [-d DICTSIZE] [-m]" - parser = OptionParser(usage) - parser.add_option( - "-i", action="store", dest="input", help="input original dataset path") - parser.add_option( - "-d", - action="store", - dest="dictsize", - help="specified word count of dictionary") - parser.add_option( - "-m", - "--mergeDict", - action="store_true", - dest="mergeDict", - help="merge source and target dictionary") - (options, args) = parser.parse_args() - if options.input[-1] == os.path.sep: - options.input = options.input[:-1] - outname = os.path.basename(options.input) - output_path = os.path.join(os.path.dirname(options.input), 'pre-' + outname) - dictsize = int(options.dictsize) if options.dictsize else -1 - if not os.path.exists(output_path): - os.mkdir(output_path) - data_creator = SeqToSeqDatasetCreater(options.input, output_path) - data_creator.create_dataset(dictsize, options.mergeDict) - - -if __name__ == "__main__": - main() diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py deleted file mode 100644 index 3d1f86ec3b7eda4fceaf3a1e406e3d0a1a4a2f60..0000000000000000000000000000000000000000 --- a/demo/seqToseq/seqToseq_net.py +++ /dev/null @@ -1,204 +0,0 @@ -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -from paddle.trainer_config_helpers import * - - -def seq_to_seq_data(data_dir, - is_generating, - dict_size=30000, - train_list='train.list', - test_list='test.list', - gen_list='gen.list', - gen_result='gen_result'): - """ - Predefined seqToseq train data provider for application - is_generating: whether this config is used for generating - dict_size: word count of dictionary - train_list: a text file containing a list of training data - test_list: a text file containing a list of testing data - gen_list: a text file containing a list of generating data - gen_result: a text file containing generating result - """ - src_lang_dict = os.path.join(data_dir, 'src.dict') - trg_lang_dict = os.path.join(data_dir, 'trg.dict') - - if is_generating: - train_list = None - test_list = os.path.join(data_dir, gen_list) - else: - train_list = os.path.join(data_dir, train_list) - test_list = os.path.join(data_dir, test_list) - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={ - "src_dict_path": src_lang_dict, - "trg_dict_path": trg_lang_dict, - "is_generating": is_generating - }) - - return { - "src_dict_path": src_lang_dict, - "trg_dict_path": trg_lang_dict, - "gen_result": gen_result - } - - -def gru_encoder_decoder(data_conf, - is_generating, - word_vector_dim=512, - encoder_size=512, - decoder_size=512, - beam_size=3, - max_length=250, - error_clipping=50): - """ - A wrapper for an attention version of GRU Encoder-Decoder network - is_generating: whether this config is used for generating - encoder_size: dimension of hidden unit in GRU Encoder network - decoder_size: dimension of hidden unit in GRU Decoder network - word_vector_dim: dimension of word vector - beam_size: expand width in beam search - max_length: a stop condition of sequence generation - """ - for k, v in data_conf.iteritems(): - globals()[k] = v - source_dict_dim = len(open(src_dict_path, "r").readlines()) - target_dict_dim = len(open(trg_dict_path, "r").readlines()) - gen_trans_file = gen_result - - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) - src_embedding = embedding_layer( - input=src_word_id, - size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) - src_forward = simple_gru( - input=src_embedding, - size=encoder_size, - naive=True, - gru_layer_attr=ExtraLayerAttribute( - error_clipping_threshold=error_clipping)) - src_backward = simple_gru( - input=src_embedding, - size=encoder_size, - reverse=True, - naive=True, - gru_layer_attr=ExtraLayerAttribute( - error_clipping_threshold=error_clipping)) - encoded_vector = concat_layer(input=[src_forward, src_backward]) - - with mixed_layer(size=decoder_size) as encoded_proj: - encoded_proj += full_matrix_projection(input=encoded_vector) - - backward_first = first_seq(input=src_backward) - with mixed_layer( - size=decoder_size, - act=TanhActivation(), ) as decoder_boot: - decoder_boot += full_matrix_projection(input=backward_first) - - def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - decoder_mem = memory( - name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - - context = simple_attention( - encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem, ) - - with mixed_layer(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += full_matrix_projection(input=context) - decoder_inputs += full_matrix_projection(input=current_word) - - gru_step = gru_step_naive_layer( - name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size, - layer_attr=ExtraLayerAttribute( - error_clipping_threshold=error_clipping)) - - with mixed_layer( - size=target_dict_dim, bias_attr=True, - act=SoftmaxActivation()) as out: - out += full_matrix_projection(input=gru_step) - return out - - decoder_group_name = "decoder_group" - group_inputs = [ - StaticInput( - input=encoded_vector, is_seq=True), StaticInput( - input=encoded_proj, is_seq=True) - ] - - if not is_generating: - trg_embedding = embedding_layer( - input=data_layer( - name='target_language_word', size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. - # Here, the StaticInput defines a read-only memory - # for the recurrent_group. - decoder = recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = data_layer(name='target_language_next_word', size=target_dict_dim) - cost = classification_cost(input=decoder, label=lbl) - outputs(cost) - else: - # In generation, the decoder predicts a next target word based on - # the encoded source sequence and the last generated target word. - - # The encoded source sequence (encoder's output) must be specified by - # StaticInput, which is a read-only memory. - # Embedding of the last generated word is automatically gotten by - # GeneratedInputs, which is initialized by a start mark, such as , - # and must be included in generation. - - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = beam_search( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator( - input=beam_gen, - id_input=data_layer( - name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) diff --git a/demo/seqToseq/translation/eval_bleu.sh b/demo/seqToseq/translation/eval_bleu.sh deleted file mode 100755 index 54c2ed237e93adb3456dbe62f75626d36c2d90bc..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/eval_bleu.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -gen_file=$1 -beam_size=$2 - -# find top1 generating result -top1=$(printf '%s_top1.txt' `basename $gen_file .txt`) -if [ $beam_size -eq 1 ]; then - awk -F "\t" '{sub(" ","",$2);sub(" ","",$2);print $2}' $gen_file >$top1 -else - awk 'BEGIN{ - FS="\t"; - OFS="\t"; - read_pos = 2} { - if (NR == read_pos){ - sub(" ","",$3); - sub(" ","",$3); - print $3; - read_pos += (2 + res_num); - }}' res_num=$beam_size $gen_file >$top1 -fi - -# evalute bleu value -bleu_script=multi-bleu.perl -standard_res=../data/wmt14/gen/ntst14.trg -bleu_res=`perl $bleu_script $standard_res <$top1` - -echo $bleu_res -rm $top1 diff --git a/demo/seqToseq/translation/gen.conf b/demo/seqToseq/translation/gen.conf deleted file mode 100644 index e9bea4e4559ff31ad83c4474e91de7e7acc77e9f..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/gen.conf +++ /dev/null @@ -1,36 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -sys.path.append("..") - -from seqToseq_net import * - -# whether this config is used for generating -is_generating = True - -### Data Definiation -gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating, - gen_result = "./translation/gen_result") - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 1, - learning_rate = 0) - -### Network Architecture -gru_encoder_decoder(gen_conf, is_generating) diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh deleted file mode 100755 index 64b78f5e9654e7b206740f92e224e0164108c9f1..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/gen.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -cd .. - -paddle train \ - --job=test \ - --config='translation/gen.conf' \ - --save_dir='data/wmt14_model' \ - --use_gpu=false \ - --num_passes=13 \ - --test_pass=12 \ - --trainer_count=1 \ - 2>&1 | tee 'translation/gen.log' -paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1 diff --git a/demo/seqToseq/translation/moses_bleu.sh b/demo/seqToseq/translation/moses_bleu.sh deleted file mode 100755 index 2f230d7f4c736da003966fbdb277f6b8b1ec952c..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/moses_bleu.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x -echo "Downloading multi-bleu.perl" -wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf deleted file mode 100644 index 72b7ccdbb95dbda8f06674079db9a3257bb31622..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/train.conf +++ /dev/null @@ -1,36 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -sys.path.append("..") - -from seqToseq_net import * - -# whether this config is used for generating -is_generating = False - -### Data Definiation -data_dir = "./data/pre-wmt14" -train_conf = seq_to_seq_data(data_dir = data_dir, - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating) diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh deleted file mode 100755 index b0ec9854b118cbb9ed39d6bed0cdd845403926a4..0000000000000000000000000000000000000000 --- a/demo/seqToseq/translation/train.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -cd .. - -paddle train \ ---config='translation/train.conf' \ ---save_dir='translation/model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'translation/train.log' -paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1 diff --git a/demo/word2vec/train_v2.py b/demo/word2vec/train_v2.py deleted file mode 100644 index 7d952b446f9db432062fc3305a6b65b0ad66dd47..0000000000000000000000000000000000000000 --- a/demo/word2vec/train_v2.py +++ /dev/null @@ -1,80 +0,0 @@ -import math - -import paddle.v2 as paddle - -dictsize = 1953 -embsize = 32 -hiddensize = 256 -N = 5 - - -def wordemb(inlayer): - wordemb = paddle.layer.table_projection( - input=inlayer, - size=embsize, - param_attr=paddle.attr.Param( - name="_proj", - initial_std=0.001, - learning_rate=1, - l2_rate=0, )) - return wordemb - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - firstword = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(dict_size)) - secondword = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(dict_size)) - thirdword = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(dict_size)) - fourthword = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(dict_size)) - nextword = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(dict_size)) - - Efirst = wordemb(firstword) - Esecond = wordemb(secondword) - Ethird = wordemb(thirdword) - Efourth = wordemb(fourthword) - - contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) - hidden1 = paddle.layer.fc(input=contextemb, - size=hiddensize, - act=paddle.activation.Sigmoid(), - layer_attr=paddle.attr.Extra(drop_rate=0.5), - bias_attr=paddle.attr.Param(learning_rate=2), - param_attr=paddle.attr.Param( - initial_std=1. / math.sqrt(embsize * 8), - learning_rate=1)) - predictword = paddle.layer.fc(input=hidden1, - size=dict_size, - bias_attr=paddle.attr.Param(learning_rate=2), - act=paddle.activation.Softmax()) - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - result = trainer.test( - paddle.batch( - paddle.dataset.imikolov.test(word_dict, N), 32)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - - cost = paddle.layer.classification_cost(input=predictword, label=nextword) - parameters = paddle.parameters.create(cost) - adam_optimizer = paddle.optimizer.Adam( - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) - trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer) - trainer.train( - paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), - num_passes=30, - event_handler=event_handler) - - -if __name__ == '__main__': - main() diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 6fa42fd0c71e78cc2fa6b0fe2cb970baf4ac89ed..94dd3457fb5b513441c4c8e339e1862de9092517 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -27,10 +27,6 @@ sphinx_add_target(paddle_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) -add_dependencies(paddle_docs - gen_proto_py) - - # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -51,6 +47,3 @@ sphinx_add_target(paddle_docs_cn ${SPHINX_CACHE_DIR_CN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) - -add_dependencies(paddle_docs_cn - gen_proto_py) diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md deleted file mode 100644 index 3bf030004d4de8c6f3cb773c6e78c09f40878c5f..0000000000000000000000000000000000000000 --- a/doc/about/index_cn.md +++ /dev/null @@ -1,11 +0,0 @@ -关于PaddlePaddle -================ - -PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台,兼备易用性、高效性、灵活性和可扩展性,目前已被百度内部多个产品线广泛使用。 -PaddlePaddle目前已经开放源码, 但是远未完善,我们希望能在这个基础上不断的改进、扩展和延伸。 -同时我们希望广大开发者积极提供反馈和贡献源代码,建立一个活跃的开源社区。 - -致谢 --------- - -在此,特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。 diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst deleted file mode 100644 index 065c430cdea802ed3c9f487cd00255b85a5598a5..0000000000000000000000000000000000000000 --- a/doc/about/index_en.rst +++ /dev/null @@ -1,14 +0,0 @@ -ABOUT -======= - -PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform, -which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu. - -PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended. -We hope to build an active open source community both by providing feedback and by actively contributing to the source code. - - -Credits --------- - -We owe many thanks to `all contributors and developers `_ of PaddlePaddle! diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index 25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332..e6f632e1a5b9c4b50b7c6aa96a120030bd6ce338 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -7,3 +7,4 @@ API v2/model_configs.rst v2/data.rst v2/run_logic.rst + v2/fluid.rst diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..cf146dc088e3905a751ff55c26fd82ef0ba02c89 100644 --- a/doc/api/v1/index_cn.rst +++ b/doc/api/v1/index_cn.rst @@ -21,7 +21,7 @@ Model Config API trainer_config_helpers/optimizers.rst trainer_config_helpers/data_sources.rst trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst + trainer_config_helpers/activations.rst trainer_config_helpers/poolings.rst trainer_config_helpers/networks.rst trainer_config_helpers/evaluators.rst diff --git a/doc/api/v1/trainer_config_helpers/activations.rst b/doc/api/v1/trainer_config_helpers/activations.rst deleted file mode 100644 index 269e6491e7ebe3899c3fb24fca756a393043473b..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/activations.rst +++ /dev/null @@ -1,108 +0,0 @@ -=========== -Activations -=========== - -BaseActivation -============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: BaseActivation - :noindex: - -AbsActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: AbsActivation - :noindex: - -ExpActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: ExpActivation - :noindex: - -IdentityActivation -================== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: IdentityActivation - :noindex: - -LinearActivation -================== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: LinearActivation - :noindex: - -LogActivation -================== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: LogActivation - :noindex: - -SquareActivation -================ - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SquareActivation - :noindex: - -SigmoidActivation -================= - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SigmoidActivation - :noindex: - -SoftmaxActivation -================= - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SoftmaxActivation - :noindex: - -SequenceSoftmaxActivation -========================= - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SequenceSoftmaxActivation - :noindex: - -ReluActivation -============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: ReluActivation - :noindex: - -BReluActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: BReluActivation - :noindex: - -SoftReluActivation -================== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SoftReluActivation - :noindex: - -TanhActivation -============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: TanhActivation - :noindex: - -STanhActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: STanhActivation - :noindex: diff --git a/doc/api/v1/trainer_config_helpers/attrs.rst b/doc/api/v1/trainer_config_helpers/attrs.rst deleted file mode 100644 index ac63127bf7d9db6351365ab7b58f43db12347a8e..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/attrs.rst +++ /dev/null @@ -1,5 +0,0 @@ -Parameter Attributes -======================= - -.. automodule:: paddle.trainer_config_helpers.attrs - :members: diff --git a/doc/api/v1/trainer_config_helpers/data_sources.rst b/doc/api/v1/trainer_config_helpers/data_sources.rst deleted file mode 100644 index b9dd4dda01ae59d1260356aff50ddf298d02c87f..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/data_sources.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _api_trainer_config_helpers_data_sources: - -DataSources -=========== - -.. automodule:: paddle.trainer_config_helpers.data_sources - :members: diff --git a/doc/api/v1/trainer_config_helpers/evaluators.rst b/doc/api/v1/trainer_config_helpers/evaluators.rst deleted file mode 100644 index 11dc735164284d6ed1d661fab1e7690d263b3a7c..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/evaluators.rst +++ /dev/null @@ -1,108 +0,0 @@ -.. _api_trainer_config_helpers_evaluators: - -========== -Evaluators -========== - -Base -==== -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: evaluator_base - :noindex: - -Classification -============== - -classification_error_evaluator ------------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: classification_error_evaluator - :noindex: - -auc_evaluator -------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: auc_evaluator - :noindex: - -ctc_error_evaluator -------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: ctc_error_evaluator - :noindex: - -chunk_evaluator ---------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: chunk_evaluator - :noindex: - -precision_recall_evaluator --------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: precision_recall_evaluator - :noindex: - -Rank -==== - -pnpair_evaluator ----------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: pnpair_evaluator - :noindex: - -Utils -===== - -sum_evaluator -------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: sum_evaluator - :noindex: - -column_sum_evaluator --------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: column_sum_evaluator - :noindex: - -Print -===== - -classification_error_printer_evaluator --------------------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: classification_error_printer_evaluator - :noindex: - -gradient_printer_evaluator --------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: gradient_printer_evaluator - :noindex: - -maxid_printer_evaluator ------------------------ -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: maxid_printer_evaluator - :noindex: - -maxframe_printer_evaluator ---------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: maxframe_printer_evaluator - :noindex: - -seqtext_printer_evaluator -------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: seqtext_printer_evaluator - :noindex: - -value_printer_evaluator ------------------------ -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: value_printer_evaluator - :noindex: - diff --git a/doc/api/v1/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst deleted file mode 100644 index 24389c2d8574dfda4bec9298776aa6b1aee51535..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/layers.rst +++ /dev/null @@ -1,508 +0,0 @@ -.. _api_trainer_config_helpers_layers: - -====== -Layers -====== - -Base -====== - -LayerType ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: LayerType - :noindex: - -LayerOutput ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: LayerOutput - :noindex: - -Data layer -=========== - -.. _api_trainer_config_helpers_layers_data_layer: - -data_layer ----------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: data_layer - :noindex: - -Fully Connected Layers -====================== - -.. _api_trainer_config_helpers_layers_fc_layer: - -fc_layer --------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: fc_layer - :noindex: - -selective_fc_layer ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: selective_fc_layer - :noindex: - -Conv Layers -=========== - -conv_operator -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: conv_operator - :noindex: - -conv_projection ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: conv_projection - :noindex: - -conv_shift_layer ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: conv_shift_layer - :noindex: - -img_conv_layer --------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: img_conv_layer - :noindex: - -.. _api_trainer_config_helpers_layers_context_projection: - -context_projection ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: context_projection - :noindex: - -Image Pooling Layer -=================== - -img_pool_layer --------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: img_pool_layer - :noindex: - -spp_layer --------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: spp_layer - :noindex: - -maxout_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: maxout_layer - :noindex: - -Norm Layer -========== - -img_cmrnorm_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: img_cmrnorm_layer - :noindex: - -batch_norm_layer ---------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: batch_norm_layer - :noindex: - -sum_to_one_norm_layer ---------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: sum_to_one_norm_layer - :noindex: - -Recurrent Layers -================ - -recurrent_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: recurrent_layer - :noindex: - -lstmemory ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: lstmemory - :noindex: - -grumemory ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: grumemory - :noindex: - -Recurrent Layer Group -===================== - -memory ------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: memory - :noindex: - -recurrent_group ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: recurrent_group - :noindex: - -lstm_step_layer ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: lstm_step_layer - :noindex: - -gru_step_layer ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: gru_step_layer - :noindex: - -beam_search ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: beam_search - :noindex: - -get_output_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: get_output_layer - :noindex: - -Mixed Layer -=========== - -.. _api_trainer_config_helpers_layers_mixed_layer: - -mixed_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: mixed_layer - :noindex: - -.. _api_trainer_config_helpers_layers_embedding_layer: - -embedding_layer ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: embedding_layer - :noindex: - -scaling_projection ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: scaling_projection - :noindex: - -dotmul_projection ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: dotmul_projection - :noindex: - -dotmul_operator ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: dotmul_operator - :noindex: - -full_matrix_projection ----------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: full_matrix_projection - :noindex: - -identity_projection -------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: identity_projection - :noindex: - - -table_projection ----------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: table_projection - :noindex: - -trans_full_matrix_projection ----------------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: trans_full_matrix_projection - :noindex: - -Aggregate Layers -================ - -.. _api_trainer_config_helpers_layers_pooling_layer: - -pooling_layer -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: pooling_layer - :noindex: - -.. _api_trainer_config_helpers_layers_last_seq: - -last_seq --------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: last_seq - :noindex: - -.. _api_trainer_config_helpers_layers_first_seq: - -first_seq ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: first_seq - :noindex: - -concat_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: concat_layer - :noindex: - -seq_concat_layer ----------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: seq_concat_layer - :noindex: - -Reshaping Layers -================ - -block_expand_layer ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: block_expand_layer - :noindex: - -.. _api_trainer_config_helpers_layers_expand_layer: - -expand_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: expand_layer - :noindex: - -repeat_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: repeat_layer - :noindex: - -rotate_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: rotate_layer - :noindex: - -seq_reshape_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: seq_reshape_layer - :noindex: - -Math Layers -=========== - -addto_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: addto_layer - :noindex: - -linear_comb_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: linear_comb_layer - :noindex: - -interpolation_layer -------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: interpolation_layer - :noindex: - -bilinear_interp_layer ----------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: bilinear_interp_layer - :noindex: - -power_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: power_layer - :noindex: - -scaling_layer -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: scaling_layer - :noindex: - -slope_intercept_layer ----------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: slope_intercept_layer - :noindex: - -tensor_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: tensor_layer - :noindex: - -.. _api_trainer_config_helpers_layers_cos_sim: - -cos_sim -------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cos_sim - :noindex: - -trans_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: trans_layer - :noindex: - -Sampling Layers -=============== - -maxid_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: maxid_layer - :noindex: - -sampling_id_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: sampling_id_layer - :noindex: - -Slicing and Joining Layers -========================== - -pad_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: pad_layer - :noindex: - -.. _api_trainer_config_helpers_layers_cost_layers: - -Cost Layers -=========== - -cross_entropy -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cross_entropy - :noindex: - -cross_entropy_with_selfnorm ---------------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cross_entropy_with_selfnorm - :noindex: - -multi_binary_label_cross_entropy --------------------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: multi_binary_label_cross_entropy - :noindex: - -mse_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: mse_cost - :noindex: - -huber_cost ----------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: huber_cost - :noindex: - -lambda_cost ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: lambda_cost - :noindex: - -rank_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: rank_cost - :noindex: - -sum_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: sum_cost - :noindex: - -crf_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: crf_layer - :noindex: - -crf_decoding_layer -------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: crf_decoding_layer - :noindex: - -ctc_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: ctc_layer - :noindex: - -warp_ctc_layer --------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: warp_ctc_layer - :noindex: - -nce_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: nce_layer - :noindex: - -hsigmoid ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: hsigmoid - :noindex: - -Check Layer -============ - -eos_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: eos_layer - :noindex: diff --git a/doc/api/v1/trainer_config_helpers/networks.rst b/doc/api/v1/trainer_config_helpers/networks.rst deleted file mode 100644 index edb53acbf0c31532aa34bda044066fed72eaa426..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/networks.rst +++ /dev/null @@ -1,123 +0,0 @@ -======== -Networks -======== - -The networks module contains pieces of neural network that combine multiple layers. - -NLP -=== - -sequence_conv_pool ------------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: sequence_conv_pool - :noindex: - -.. _api_trainer_config_helpers_network_text_conv_pool: - -text_conv_pool --------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: text_conv_pool - :noindex: - -Images -====== - -img_conv_bn_pool ----------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: img_conv_bn_pool - :noindex: - -img_conv_group --------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: img_conv_group - :noindex: - -.. _api_trainer_config_helpers_network_simple_img_conv_pool: - -simple_img_conv_pool --------------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_img_conv_pool - :noindex: - -vgg_16_network ---------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: vgg_16_network - :noindex: - -Recurrent -========= - -LSTM ----- - -lstmemory_unit -`````````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: lstmemory_unit - :noindex: - -lstmemory_group -``````````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: lstmemory_group - :noindex: - -simple_lstm -``````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_lstm - :noindex: - -bidirectional_lstm -`````````````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: bidirectional_lstm - :noindex: - -GRU ---- - -gru_unit -```````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: gru_unit - :noindex: - -gru_group -````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: gru_group - :noindex: - -simple_gru -`````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_gru - :noindex: - -simple_attention ----------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_attention - :noindex: - -Miscs -===== - -dropout_layer --------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: dropout_layer - :noindex: - -outputs -------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: outputs - :noindex: diff --git a/doc/api/v1/trainer_config_helpers/optimizers.rst b/doc/api/v1/trainer_config_helpers/optimizers.rst deleted file mode 100644 index d2f4958c92b8e3b7426945f1af07112ab4071136..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/optimizers.rst +++ /dev/null @@ -1,61 +0,0 @@ -.. _api_trainer_config_helpers_optimizers: - -========== -Optimizers -========== - -BaseSGDOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: BaseSGDOptimizer - :noindex: - -MomentumOptimizer -================= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: MomentumOptimizer - :noindex: - -AdamOptimizer -============= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdamOptimizer - :noindex: - -AdamaxOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdamaxOptimizer - :noindex: - -AdaGradOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdaGradOptimizer - :noindex: - -DecayedAdaGradOptimizer -======================= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: DecayedAdaGradOptimizer - :noindex: - -AdaDeltaOptimizer -================= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdaDeltaOptimizer - :noindex: - -RMSPropOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: RMSPropOptimizer - :noindex: - -.. _api_trainer_config_helpers_optimizers_settings: - -settings -======== -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: settings - :noindex: diff --git a/doc/api/v1/trainer_config_helpers/poolings.rst b/doc/api/v1/trainer_config_helpers/poolings.rst deleted file mode 100644 index 66566809d26f59263597b5286c5b27e0bbc9415a..0000000000000000000000000000000000000000 --- a/doc/api/v1/trainer_config_helpers/poolings.rst +++ /dev/null @@ -1,33 +0,0 @@ -======== -Poolings -======== - -BasePoolingType -=============== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: BasePoolingType - :noindex: - -AvgPooling -========== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: AvgPooling - :noindex: - -MaxPooling -========== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: MaxPooling - :noindex: - -SumPooling -========== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: SumPooling - :noindex: - -SquareRootNPooling -================== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: SquareRootNPooling - :noindex: diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst index eca3ce03bcdc599edca802d8dfca48d4f28275a2..5317e66b64bbd85c61f19700a9d2c1d239dee573 100644 --- a/doc/api/v2/config/activation.rst +++ b/doc/api/v2/config/activation.rst @@ -99,3 +99,10 @@ STanh .. automodule:: paddle.v2.activation :members: STanh :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ac972fb193a2fb525edc507f7ba1303d2c8eabe --- /dev/null +++ b/doc/api/v2/config/evaluators.rst @@ -0,0 +1,110 @@ +.. _api_v2: + +========== +Evaluators +========== + +Classification +============== + +classification_error +-------------------- +.. automodule:: paddle.v2.evaluator + :members: classification_error + :noindex: + +auc +--- +.. automodule:: paddle.v2.evaluator + :members: auc + :noindex: + +ctc_error +--------- +.. automodule:: paddle.v2.evaluator + :members: ctc_error + :noindex: + +chunk +----- +.. automodule:: paddle.v2.evaluator + :members: chunk + :noindex: + +precision_recall +---------------- +.. automodule:: paddle.v2.evaluator + :members: precision_recall + :noindex: + +Rank +==== + +pnpair +------ +.. automodule:: paddle.v2.evaluator + :members: pnpair + :noindex: + +Utils +===== + +sum +--- +.. automodule:: paddle.v2.evaluator + :members: sum + :noindex: + +column_sum +---------- +.. automodule:: paddle.v2.evaluator + :members: column_sum + :noindex: + +Print +===== + +classification_error_printer +---------------------------- +.. automodule:: paddle.v2.evaluator + :members: classification_error_printer + :noindex: + +gradient_printer +---------------- +.. automodule:: paddle.v2.evaluator + :members: gradient_printer + :noindex: + +maxid_printer +------------- +.. automodule:: paddle.v2.evaluator + :members: maxid_printer + :noindex: + +maxframe_printer +---------------- +.. automodule:: paddle.v2.evaluator + :members: maxframe_printer + :noindex: + +seqtext_printer +--------------- +.. automodule:: paddle.v2.evaluator + :members: seqtext_printer + :noindex: + +value_printer +------------- +.. automodule:: paddle.v2.evaluator + :members: value_printer + :noindex: + +Detection +===== + +detection_map +------------- +.. automodule:: paddle.v2.evaluator + :members: detection_map + :noindex: diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 2a02baf17ba0d1119a8d222024616ef8ae33f8d5..c3f9c18d0663a7a24880b441981875c1e4f015aa 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -54,18 +54,23 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. autoclass:: paddle.v2.layer.context_projection :noindex: +row_conv +-------- +.. autoclass:: paddle.v2.layer.row_conv + :noindex: + Image Pooling Layer =================== img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -77,6 +82,11 @@ maxout .. autoclass:: paddle.v2.layer.maxout :noindex: +roi_pool +-------- +.. autoclass:: paddle.v2.layer.roi_pool + :noindex: + Norm Layer ========== @@ -94,12 +104,17 @@ sum_to_one_norm --------------- .. autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm :noindex: - + +row_l2_norm +----------- +.. autoclass:: paddle.v2.layer.row_l2_norm + :noindex: + Recurrent Layers ================ @@ -130,7 +145,7 @@ recurrent_group --------------- .. autoclass:: paddle.v2.layer.recurrent_group :noindex: - + lstm_step --------- .. autoclass:: paddle.v2.layer.lstm_step @@ -145,12 +160,12 @@ beam_search ------------ .. autoclass:: paddle.v2.layer.beam_search :noindex: - + get_output ---------- .. autoclass:: paddle.v2.layer.get_output :noindex: - + Mixed Layer =========== @@ -193,6 +208,10 @@ identity_projection .. autoclass:: paddle.v2.layer.identity_projection :noindex: +slice_projection +------------------- +.. autoclass:: paddle.v2.layer.slice_projection + :noindex: table_projection ---------------- @@ -203,10 +222,15 @@ trans_full_matrix_projection ---------------------------- .. autoclass:: paddle.v2.layer.trans_full_matrix_projection :noindex: - + Aggregate Layers ================ +AggregateLevel +-------------- +.. autoclass:: paddle.v2.layer.AggregateLevel + :noindex: + .. _api_v2.layer_pooling: pooling @@ -238,6 +262,21 @@ seq_concat .. autoclass:: paddle.v2.layer.seq_concat :noindex: +seq_slice +--------- +.. autoclass:: paddle.v2.layer.seq_slice + :noindex: + +kmax_sequence_score +------------------- +.. autoclass:: paddle.v2.layer.kmax_sequence_score + :noindex: + +sub_nested_seq +-------------- +.. autoclass:: paddle.v2.layer.sub_nested_seq + :noindex: + Reshaping Layers ================ @@ -248,6 +287,11 @@ block_expand .. _api_v2.layer_expand: +ExpandLevel +----------- +.. autoclass:: paddle.v2.layer.ExpandLevel + :noindex: + expand ------ .. autoclass:: paddle.v2.layer.expand @@ -291,6 +335,16 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dot_prod +--------- +.. autoclass:: paddle.v2.layer.dot_prod + :noindex: + +out_prod +-------- +.. autoclass:: paddle.v2.layer.out_prod + :noindex: + power ----- .. autoclass:: paddle.v2.layer.power @@ -301,6 +355,16 @@ scaling .. autoclass:: paddle.v2.layer.scaling :noindex: +clip +---- +.. autoclass:: paddle.v2.layer.clip + :noindex: + +resize +------ +.. autoclass:: paddle.v2.layer.resize + :noindex: + slope_intercept --------------- .. autoclass:: paddle.v2.layer.slope_intercept @@ -318,11 +382,21 @@ cos_sim .. autoclass:: paddle.v2.layer.cos_sim :noindex: +l2_distance +----------- +.. autoclass:: paddle.v2.layer.l2_distance + :noindex: + trans ----- .. autoclass:: paddle.v2.layer.trans :noindex: +scale_shift +----------- +.. autoclass:: paddle.v2.layer.scale_shift + :noindex: + Sampling Layers =============== @@ -336,6 +410,19 @@ sampling_id .. autoclass:: paddle.v2.layer.sampling_id :noindex: +multiplex +--------- +.. autoclass:: paddle.v2.layer.multiplex + :noindex: + +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: + Slicing and Joining Layers ========================== @@ -364,9 +451,14 @@ multi_binary_label_cross_entropy_cost .. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: -huber_cost ----------- -.. autoclass:: paddle.v2.layer.huber_cost +huber_regression_cost +------------------------- +.. autoclass:: paddle.v2.layer.huber_regression_cost + :noindex: + +huber_classification_cost +------------------------- +.. autoclass:: paddle.v2.layer.huber_classification_cost :noindex: lambda_cost @@ -374,9 +466,9 @@ lambda_cost .. autoclass:: paddle.v2.layer.lambda_cost :noindex: -mse_cost +square_error_cost -------- -.. autoclass:: paddle.v2.layer.mse_cost +.. autoclass:: paddle.v2.layer.square_error_cost :noindex: rank_cost @@ -419,10 +511,49 @@ hsigmoid .. autoclass:: paddle.v2.layer.hsigmoid :noindex: -Check Layer +smooth_l1_cost +-------------- +.. autoclass:: paddle.v2.layer.smooth_l1_cost + :noindex: + +multibox_loss +-------------- +.. autoclass:: paddle.v2.layer.multibox_loss + :noindex: + +Check Layer ============ eos --- .. autoclass:: paddle.v2.layer.eos :noindex: + +Miscs +===== + +dropout +-------------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + +Activation with learnable parameter +=================================== + +prelu +-------- +.. autoclass:: paddle.v2.layer.prelu + :noindex: + +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + +Detection output Layer +====================== + +detection_output +---------------- +.. autoclass:: paddle.v2.layer.detection_output + :noindex: diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst index 6f209bc95bec7279051118bb857a96515e0371a9..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644 --- a/doc/api/v2/config/networks.rst +++ b/doc/api/v2/config/networks.rst @@ -44,6 +44,12 @@ simple_img_conv_pool :members: simple_img_conv_pool :noindex: +small_vgg +--------- +.. automodule:: paddle.v2.networks + :members: small_vgg + :noindex: + vgg_16_network --------------- .. automodule:: paddle.v2.networks @@ -101,17 +107,26 @@ simple_gru :members: simple_gru :noindex: +simple_gru2 +``````````` +.. automodule:: paddle.v2.networks + :members: simple_gru2 + :noindex: + +bidirectional_gru +`````````````````` +.. automodule:: paddle.v2.networks + :members: bidirectional_gru + :noindex: + simple_attention ---------------- .. automodule:: paddle.v2.networks :members: simple_attention :noindex: -Miscs -===== - -dropout_layer --------------- +dot_product_attention +--------------------- .. automodule:: paddle.v2.networks - :members: dropout_layer + :members: dot_product_attention :noindex: diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644 --- a/doc/api/v2/data.rst +++ b/doc/api/v2/data.rst @@ -2,112 +2,9 @@ Data Reader Interface and DataSets ================================== +.. toctree:: + :maxdepth: 1 -DataTypes -========= - -.. automodule:: paddle.v2.data_type - :members: - :noindex: - -DataFeeder -========== - -.. automodule:: paddle.v2.data_feeder - :members: - :noindex: - -Reader -====== - -.. automodule:: paddle.v2.reader - :members: - :noindex: - -.. automodule:: paddle.v2.reader.creator - :members: - :noindex: - -minibatch -========= - -.. automodule:: paddle.v2.minibatch - :members: - :noindex: - -Dataset -======= - -.. automodule:: paddle.v2.dataset - :members: - :noindex: - -mnist -+++++ - -.. automodule:: paddle.v2.dataset.mnist - :members: - :noindex: - -cifar -+++++ - -.. automodule:: paddle.v2.dataset.cifar - :members: - :noindex: - -conll05 -+++++++ - -.. automodule:: paddle.v2.dataset.conll05 - :members: get_dict,get_embedding,test - :noindex: - -imdb -++++ - -.. automodule:: paddle.v2.dataset.imdb - :members: - :noindex: - -imikolov -++++++++ - -.. automodule:: paddle.v2.dataset.imikolov - :members: - :noindex: - -movielens -+++++++++ - -.. automodule:: paddle.v2.dataset.movielens - :members: - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.MovieInfo - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.UserInfo - :noindex: - -sentiment -+++++++++ - -.. automodule:: paddle.v2.dataset.sentiment - :members: - :noindex: - -uci_housing -+++++++++++ - -.. automodule:: paddle.v2.dataset.uci_housing - :members: - :noindex: - -wmt14 -+++++ - -.. automodule:: paddle.v2.dataset.wmt14 - :members: - :noindex: - + data/data_reader.rst + data/image.rst + data/dataset.rst diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst new file mode 100644 index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e --- /dev/null +++ b/doc/api/v2/data/data_reader.rst @@ -0,0 +1,36 @@ +===================== +Data Reader Interface +===================== + + +DataTypes +========= + +.. automodule:: paddle.v2.data_type + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.v2.reader + :members: + :noindex: + +.. automodule:: paddle.v2.reader.creator + :members: + :noindex: + +minibatch +========= + +.. automodule:: paddle.v2.minibatch + :members: + :noindex: diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst new file mode 100644 index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365 --- /dev/null +++ b/doc/api/v2/data/dataset.rst @@ -0,0 +1,75 @@ +Dataset +======= + +.. automodule:: paddle.v2.dataset + :members: + :noindex: + +mnist ++++++ + +.. automodule:: paddle.v2.dataset.mnist + :members: + :noindex: + +cifar ++++++ + +.. automodule:: paddle.v2.dataset.cifar + :members: + :noindex: + +conll05 ++++++++ + +.. automodule:: paddle.v2.dataset.conll05 + :members: get_dict,get_embedding,test + :noindex: + +imdb +++++ + +.. automodule:: paddle.v2.dataset.imdb + :members: + :noindex: + +imikolov +++++++++ + +.. automodule:: paddle.v2.dataset.imikolov + :members: + :noindex: + +movielens ++++++++++ + +.. automodule:: paddle.v2.dataset.movielens + :members: + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.UserInfo + :noindex: + +sentiment ++++++++++ + +.. automodule:: paddle.v2.dataset.sentiment + :members: + :noindex: + +uci_housing ++++++++++++ + +.. automodule:: paddle.v2.dataset.uci_housing + :members: + :noindex: + +wmt14 ++++++ + +.. automodule:: paddle.v2.dataset.wmt14 + :members: + :noindex: diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst new file mode 100644 index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49 --- /dev/null +++ b/doc/api/v2/data/image.rst @@ -0,0 +1,5 @@ +Image Interface +=============== + +.. automodule:: paddle.v2.image + :members: diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..43fc19dc492bbc119f2356034b81c65e443db2fa --- /dev/null +++ b/doc/api/v2/fluid.rst @@ -0,0 +1,18 @@ +====================== +Fluid +====================== + +.. toctree:: + :maxdepth: 1 + + fluid/layers.rst + fluid/data_feeder.rst + fluid/executor.rst + fluid/initializer.rst + fluid/evaluator.rst + fluid/nets.rst + fluid/optimizer.rst + fluid/param_attr.rst + fluid/profiler.rst + fluid/regularizer.rst + diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst new file mode 100644 index 0000000000000000000000000000000000000000..0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa --- /dev/null +++ b/doc/api/v2/fluid/data_feeder.rst @@ -0,0 +1,9 @@ +=========== +DataFeeder +=========== + +DataFeeder +----------- +.. automodule:: paddle.v2.fluid.data_feeder + :members: DataFeeder + :noindex: diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst new file mode 100644 index 0000000000000000000000000000000000000000..a23f3301d0331e0ea3733f06444515eb4680cd31 --- /dev/null +++ b/doc/api/v2/fluid/evaluator.rst @@ -0,0 +1,9 @@ +=========== +Evaluator +=========== + +Evaluator +----------- +.. automodule:: paddle.v2.fluid.evaluator + :members: Evaluator + :noindex: diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst new file mode 100644 index 0000000000000000000000000000000000000000..3a283538c120cfa1ef646c390bb71c6251c23675 --- /dev/null +++ b/doc/api/v2/fluid/executor.rst @@ -0,0 +1,9 @@ +=========== +Executor +=========== + +Executor +----------- +.. automodule:: paddle.v2.fluid.executor + :members: Executor + :noindex: diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst new file mode 100644 index 0000000000000000000000000000000000000000..8f587837e9873370722062404f511654a9460587 --- /dev/null +++ b/doc/api/v2/fluid/initializer.rst @@ -0,0 +1,50 @@ +=========== +Initializer +=========== + + + +Initializer +----------- +.. automodule:: paddle.v2.fluid.initializer + :members: Initializer + :noindex: + + + +ConstantInitializer +------------------- +.. automodule:: paddle.v2.fluid.initializer + :members: ConstantInitializer + :noindex: + + + +UniformInitializer +------------------ +.. automodule:: paddle.v2.fluid.initializer + :members: UniformInitializer + :noindex: + + + +NormalInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: NormalInitializer + :noindex: + + +XavierInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: XavierInitializer + :noindex: + + +MSRAInitializer +--------------- +.. automodule:: paddle.v2.fluid.initializer + :members: MSRAInitializer + :noindex: + diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst new file mode 100644 index 0000000000000000000000000000000000000000..89e5fec13bf9062dc7a7187b1334c8f5486a980b --- /dev/null +++ b/doc/api/v2/fluid/layers.rst @@ -0,0 +1,302 @@ +========== +Layers +========== + + +fc +--- +.. autofunction:: paddle.v2.fluid.layers.fc + :noindex: + +embedding +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + +dynamic_lstm +------------ +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm + :noindex: + +data +--------- +.. autofunction:: paddle.v2.fluid.layers.data + :noindex: + +mean +--------- +.. autofunction:: paddle.v2.fluid.layers.mean + :noindex: + +mul +--------- +.. autofunction:: paddle.v2.fluid.layers.mul + :noindex: + +elementwise_add +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_add + :noindex: + +elementwise_div +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_div + :noindex: + + +dropout +--------- +.. autofunction:: paddle.v2.fluid.layers.dropout + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +sigmoid +--------- +.. autofunction:: paddle.v2.fluid.layers.sigmoid + :noindex: + + +scale +--------- +.. autofunction:: paddle.v2.fluid.layers.scale + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.transpose + :noindex: + + +sigmoid_cross_entropy_with_logits +--------- +.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits + :noindex: + + +cast +--------- +.. autofunction:: paddle.v2.fluid.layers.cast + :noindex: + + +concat +--------- +.. autofunction:: paddle.v2.fluid.layers.concat + :noindex: + + +sums +--------- +.. autofunction:: paddle.v2.fluid.layers.sums + :noindex: + + +linear_chain_crf +--------- +.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf + :noindex: + + +assign +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + + +split_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor + :noindex: + + +merge_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor + :noindex: + +cos_sim +--------- +.. autofunction:: paddle.v2.fluid.layers.cos_sim + :noindex: + + +cross_entropy +--------- +.. autofunction:: paddle.v2.fluid.layers.cross_entropy + :noindex: + + + +square_error_cost +--------- +.. autofunction:: paddle.v2.fluid.layers.square_error_cost + :noindex: + + +accuracy +--------- +.. autofunction:: paddle.v2.fluid.layers.accuracy + :noindex: + + +sequence_conv +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_conv + :noindex: + + +conv2d +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d + :noindex: + + +sequence_pool +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_pool + :noindex: + + +pool2d +--------- +.. autofunction:: paddle.v2.fluid.layers.pool2d + :noindex: + + +batch_norm +--------- +.. autofunction:: paddle.v2.fluid.layers.batch_norm + :noindex: + + +beam_search_decode +--------- +.. autofunction:: paddle.v2.fluid.layers.beam_search_decode + :noindex: + + +lstm +--------- +.. autofunction:: paddle.v2.fluid.layers.lstm + :noindex: + + +lod_rank_table +--------- +.. autofunction:: paddle.v2.fluid.layers.lod_rank_table + :noindex: + + +max_sequence_len +--------- +.. autofunction:: paddle.v2.fluid.layers.max_sequence_len + :noindex: + + +topk +--------- +.. autofunction:: paddle.v2.fluid.layers.topk + :noindex: + + +lod_tensor_to_array +--------- +.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array + :noindex: + + + +array_to_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor + :noindex: + + + + +fill_constant +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant + :noindex: + + + +fill_constant_batch_size_like +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like + :noindex: + + +ones +--------- +.. autofunction:: paddle.v2.fluid.layers.ones + :noindex: + + +zeros +--------- +.. autofunction:: paddle.v2.fluid.layers.zeros + :noindex: + + +increment +--------- +.. autofunction:: paddle.v2.fluid.layers.increment + :noindex: + + +array_write +--------- +.. autofunction:: paddle.v2.fluid.layers.array_write + :noindex: + + + +create_array +--------- +.. autofunction:: paddle.v2.fluid.layers.create_array + :noindex: + + +less_than +--------- +.. autofunction:: paddle.v2.fluid.layers.less_than + :noindex: + + +array_read +--------- +.. autofunction:: paddle.v2.fluid.layers.array_read + :noindex: + + +shrink_memory +--------- +.. autofunction:: paddle.v2.fluid.layers.shrink_memory + :noindex: + + +array_length +--------- +.. autofunction:: paddle.v2.fluid.layers.array_length + :noindex: + + +conv2d_transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose + :noindex: + diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst new file mode 100644 index 0000000000000000000000000000000000000000..2c3d075422de29c96e25458e831133a30270dd39 --- /dev/null +++ b/doc/api/v2/fluid/nets.rst @@ -0,0 +1,22 @@ +=========== +Nets +=========== + +simple_img_conv_pool +----------- +.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool + :noindex: + + +img_conv_group +----------- +.. autofunction:: paddle.v2.fluid.nets.img_conv_group + :noindex: + + +sequence_conv_pool +----------- +.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool + :noindex: + + diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..233762fcdfb39e592740adef6721a556fae3feef --- /dev/null +++ b/doc/api/v2/fluid/optimizer.rst @@ -0,0 +1,54 @@ +=========== +Optimizer +=========== + +Optimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: Optimizer + :noindex: + + +SGDOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: SGDOptimizer + :noindex: + + + +MomentumOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: MomentumOptimizer + :noindex: + + + +AdagradOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdagradOptimizer + :noindex: + + +AdamOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamOptimizer + :noindex: + + +AdamaxOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamaxOptimizer + :noindex: + + +DecayedAdagradOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: DecayedAdagradOptimizer + :noindex: + diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50 --- /dev/null +++ b/doc/api/v2/fluid/param_attr.rst @@ -0,0 +1,11 @@ +=========== +ParamAttr +=========== + + + +ParamAttr +----------- +.. automodule:: paddle.v2.fluid.param_attr + :members: ParamAttr + :noindex: diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..7d4042d1f41c12c4a551ba6576559d612116872a --- /dev/null +++ b/doc/api/v2/fluid/profiler.rst @@ -0,0 +1,10 @@ +=========== +Profiler +=========== + + + +Profiler +----------- +.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler + :noindex: diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..3af2b07d2ae55d99df705fbf1ad2402eee05c435 --- /dev/null +++ b/doc/api/v2/fluid/regularizer.rst @@ -0,0 +1,25 @@ +=========== +Regularizer +=========== + +WeightDecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: WeightDecayRegularizer + :noindex: + + +L2DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L2DecayRegularizer + :noindex: + + + +L1DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L1DecayRegularizer + + diff --git a/doc/api/v2/model_configs.rst b/doc/api/v2/model_configs.rst index a5fae7e29e56d9e236d489353a5c8967d1954641..992b559cbd87244612521d4c96f84f997d6c4196 100644 --- a/doc/api/v2/model_configs.rst +++ b/doc/api/v2/model_configs.rst @@ -6,6 +6,7 @@ Model Configuration config/activation.rst config/layer.rst + config/evaluators.rst config/optimizer.rst config/pooling.rst config/networks.rst diff --git a/doc/design/api.md b/doc/design/api.md index 8185d2af0ea264a2e7b4e28b9ed05279e4a22014..e6a4638d9100d9b07c3ee6b92b530a17eae1c162 100644 --- a/doc/design/api.md +++ b/doc/design/api.md @@ -3,7 +3,7 @@ ## Ingredients As our design principle is starting from the essence: how could we -allow users to express and solve their problems at neural networks. +allow users to express and solve their problems as neural networks. Some essential concepts that our API have to provide include: 1. A *topology* is an expression of *layers*. @@ -233,7 +233,7 @@ paddle.dist_train(model, num_parameter_servers=15) ``` -The pseudo code if `paddle.dist_train` is as follows: +The pseudo code of `paddle.dist_train` is as follows: ```python def dist_train(topology, parameters, trainer, reader, ...): diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md new file mode 100644 index 0000000000000000000000000000000000000000..f9991541bc51c6e13ffce4e9cec60f73dc800121 --- /dev/null +++ b/doc/design/auto_gradient_check.md @@ -0,0 +1,146 @@ +## Auto Gradient Checker Design + +## Backgraound: +- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right: + 1. you should get the right backpropagation formula according to the forward computation. + 2. you should implement it right in CPP. + 3. it's difficult to prepare test data. + +- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages: + 1. numerical gradient checker only need forward operator. + 2. user only need to prepare the input data for forward Operator. + +## Mathematical Theory +The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful. + +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) + + +## Numeric Gradient Implementation +### Python Interface +```python +def get_numerical_gradient(op, + input_values, + output_name, + input_to_check, + delta=0.005, + local_scope=None): + """ + Get Numeric Gradient for an operator's input. + + :param op: C++ operator instance, could be an network + :param input_values: The input variables. Should be an dictionary, whose key is + variable name, and value is numpy array. + :param output_name: The final output variable name. + :param input_to_check: The input variable with respect to which to compute the gradient. + :param delta: The perturbation value for numeric gradient method. The + smaller delta is, the more accurate result will get. But if that delta is + too small, it will suffer from numerical stability problem. + :param local_scope: The local scope used for get_numeric_gradient. + :return: The gradient array in numpy format. + """ +``` + +### Explaination: + +- Why need `output_name` + - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable. + +- Why need `input_to_check` + - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times. + + +### Core Algorithm Implementation + + +```python + # we only compute gradient of one element a time. + # we use a for loop to compute the gradient of each element. + for i in xrange(tensor_size): + # get one input element by its index i. + origin = tensor_to_check.get_float_element(i) + + # add delta to it, run op and then get the new value of the result tensor. + x_pos = origin + delta + tensor_to_check.set_float_element(i, x_pos) + y_pos = get_output() + + # plus delta to this element, run op and get the new value of the result tensor. + x_neg = origin - delta + tensor_to_check.set_float_element(i, x_neg) + y_neg = get_output() + + # restore old value + tensor_to_check.set_float_element(i, origin) + + # compute the gradient of this element and store it into a numpy array. + gradient_flat[i] = (y_pos - y_neg) / delta / 2 + + # reshape the gradient result to the shape of the source tensor. + return gradient_flat.reshape(tensor_to_check.get_dims()) +``` + +## Auto Graident Checker Framework + +Each Operator Kernel has three kinds of Gradient: + +1. Numerical gradient +2. CPU kernel gradient +3. GPU kernel gradient (if supported) + +The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps: + +1. calculate the numerical gradient +2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient +3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported) + +#### Python Interface + +```python + def check_grad(self, + forward_op, + input_vars, + inputs_to_check, + output_name, + no_grad_set=None, + only_cpu=False, + max_relative_error=0.005): + """ + :param forward_op: used to create backward_op + :param input_vars: numpy value of input variable. The following + computation will use these variables. + :param inputs_to_check: the input variable with respect to which to compute the gradient. + :param output_name: The final output variable name. + :param max_relative_error: The relative tolerance parameter. + :param no_grad_set: used when create backward ops + :param only_cpu: only compute and check gradient on cpu kernel. + :return: + """ +``` + +### How to check if two numpy array is close enough? +if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad + +```python +numerical_grad = ... +operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) + +abs_numerical_grad = numpy.abs(numerical_grad) +# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative +# error. +abs_numerical_grad[abs_numerical_grad < 1e-3] = 1 + +diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad +max_diff = numpy.max(diff_mat) +``` + + +#### Notes: +The Input data for auto gradient checker should be reasonable to avoid numerical stability problem. + + +#### Refs: + +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) diff --git a/doc/design/block.md b/doc/design/block.md new file mode 100644 index 0000000000000000000000000000000000000000..4066122c0e8dfa33776796c3d205ba5aec9e0f52 --- /dev/null +++ b/doc/design/block.md @@ -0,0 +1,336 @@ +# Design Doc: Block and Scope + +## The Representation of Computation + +Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation: + +- Caffe, Torch, and Paddle: sequences of layers. +- TensorFlow, Caffe2, Mxnet: graph of operators. +- PaddlePaddle: nested blocks, like C++ and Java programs. + +## Block in Programming Languages and Deep Learning + +In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators. + +Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: + +| programming languages | PaddlePaddle | +|-----------------------|-----------------------| +| for, while loop | RNN, WhileOp | +| if, if-else, switch | IfElseOp, SwitchOp | +| sequential execution | a sequence of layers | + +A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes. + +## Stack Frames and the Scope Hierarchy + +The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: + +| programming languages | PaddlePaddle | +|-----------------------|---------------------------------| +| stack | scope hierarchy | +| stack frame | scope | +| push at entering block| push at entering block | +| pop at leaving block | destroy when minibatch completes| + +1. In traditional programs: + + - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables. + - After the execution leaves the right curly brace, the runtime pops the frame. + - The maximum number of frames in the stack is the maximum depth of nested blocks. + +1. In PaddlePaddle + + - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables. + - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*. + - The height of the highest tree is the maximum depth of nested blocks. + - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy. + +## Use Blocks in C++ and PaddlePaddle Programs + +Let us consolidate the discussion by presenting some examples. + +### Blocks with `if-else` and `IfElseOp` + +The following C++ programs shows how blocks are used with the `if-else` structure: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int z = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} + +``` + +An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows: + +```python +import paddle as pd + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` . + +The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. + + +### Blocks with `for` and `RNNOp` + +The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) : + +```python +x = sequence([10, 20, 30]) # shape=[None, 1] +m = var(0) # shape=[1] +W = var(0.314, param=true) # shape=[1] +U = var(0.375, param=true) # shape=[1] + +rnn = pd.rnn() +with rnn.step(): + h = rnn.memory(init = m) + h_prev = rnn.previous_memory(h) + a = layer.fc(W, x) + b = layer.fc(U, h_prev) + s = pd.add(a, b) + act = pd.sigmoid(s) + rnn.update_memory(h, act) + rnn.output(a, b) +o1, o2 = rnn() +``` +has its equivalent C++ program as follows + +```c++ +int* x = {10, 20, 30}; +int* m = {0}; +int* W = {0.314}; +int* U = {0.375}; + +int mem[sizeof(x) / sizeof(x[0]) + 1]; +int o1[sizeof(x) / sizeof(x[0]) + 1]; +int o2[sizeof(x) / sizeof(x[0]) + 1]; +for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) { + int x = x[i-1]; + if (i == 1) mem[0] = m; + int a = W * x; + int b = Y * mem[i-1]; + int s = fc_out + hidden_out; + int act = sigmoid(sum); + mem[i] = act; + o1[i] = act; + o2[i] = hidden_out; +} +``` + +## Compilation and Execution + +Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference. + +The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file. + +## The "Binary Executable File Format" + +The definition of the protobuf message is as follows: + +```protobuf +message BlockDesc { + repeated VarDesc vars = 1; + repeated OpDesc ops = 2; +} +``` + +The step net in above RNN example would look like + +``` +BlockDesc { + vars = { + VarDesc {...} // x + VarDesc {...} // h + VarDesc {...} // fc_out + VarDesc {...} // hidden_out + VarDesc {...} // sum + VarDesc {...} // act + } + ops = { + OpDesc {...} // matmul + OpDesc {...} // add_two + OpDesc {...} // sigmoid + } +}; +``` + +Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like: + +``` +OpDesc { + inputs = {0} // the index of x in vars of BlockDesc above + outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above + attrs { + "states" : {1} // the index of h + "step_net" : + } +}; +``` + +This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block. + + +## The Compilation of Blocks + +During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). + +VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope. +Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example: + +```python +a = pd.Variable(shape=[20, 20]) +b = pd.fc(a, params=["fc.w", "fc.b"]) + +rnn = pd.create_rnn() +with rnn.stepnet(): + x = a.as_step_input() + # reuse fc's parameter + fc_without_b = pd.get_variable("fc.w") + rnn.output(fc_without_b) + +out = rnn() +``` +The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance. + +In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc. + +To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers. + +`SymbolTable` can do the following: + +- store the definitions (some names and attributes) of variables and operators, +- verify if a variable was declared, +- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). + + +```c++ +// Information in SymbolTable is enough to trace the dependency graph. So maybe +// the Eval() interface takes a SymbolTable is enough. +class SymbolTable { + public: + SymbolTable(SymbolTable* parent) : parent_(parent) {} + + OpDesc* NewOp(const string& name=""); + + // TODO determine whether name is generated by python or C++. + // Currently assume that a unique name will be generated by C++ if the + // argument name is left default. + VarDesc* Var(const string& name=""); + + // find a VarDesc by name, if recursive is true, find parent's SymbolTable + // recursively. + // this interface is introduced to support InferShape, find protobuf messages + // of variables and operators, pass pointers into InferShape. + // + // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should + // be proposed and embedded into pybind to enable python operation on C++ pointers. + VarDesc* FindVar(const string& name, bool recursive=true); + + OpDesc* FindOp(const string& name); + + BlockDesc Compile() const; + + private: + SymbolTable* parent_; + + map ops_; + map vars_; +}; +``` + +After all the description of variables and operators is added into SymbolTable, +the block has enough information to run. + +The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions. + + +```c++ +namespace { + +class Block : OperatorBase { +public: + Block(const BlockDesc& desc) desc_(desc) {} + + void InferShape(const framework::Scope& scope) const override { + if (!symbols_ready_) { + CreateVariables(scope); + CreateOperators(); + } + // should run InferShape first. + for (auto& op : runtime_table_.ops()) { + op->InferShape(scope); + } + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first."); + for (auto& op : runtime_table_.ops()) { + op->Run(scope, dev_ctx); + } + } + + void CreateVariables(const framework::Scope& scope); + void CreateOperators(); + + // some other necessary interfaces of NetOp are listed below + // ... + +private: + BlockDesc desc_; + bool symbols_ready_{false}; +}; +``` + +## The Execution of Blocks + +Block inherits from OperatorBase, which has a Run method. +Block's Run method will run its operators sequentially. + +There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets. + +The definition of Eval is as follows: + +```c++ +// clean a block description by targets using the corresponding dependency graph. +// return a new BlockDesc with minimal number of operators. +// NOTE: The return type is not a Block but the block's description so that this can be distributed +// to a cluster. +BlockDesc Prune(const BlockDesc& desc, vector targets); + +void Block::Eval(const vector& targets, + const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) { + BlockDesc min_desc = Prune(desc_, targets); + Block min_block(min_desc); + min_block.Run(scope, dev_ctx); +} +``` diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bf0e4dddc1b640ecbce489f65820aaf8a4b3b1e7 --- /dev/null +++ b/doc/design/build_system/README.md @@ -0,0 +1,152 @@ +A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of new the parameter server using Go and C++. + +Here are some initial thoughts. Your comments are welcome! + +### Required CMake Function + +I think we need only the following few CMake functions to make a project description mean and clean: + +| C++ | CUDA C++ | Go | +|---|---|---| +| cc_library | nv_library | go_library | +| cc_binary | nv_binary | go_binary | +| cc_test | nv_test | go_test | + +- The `_library` functions generate .a files from source code. +- The `_binary` functions generate executable binary files. +- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`. + +The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler. + +Both `nv_` and `cc_` functions enables C++11 (-std=c++11). + +Also, + +- to describe external dependencies, we need `external_library`. +- to build shared libraries, we need `shared_library`. + +### An Example Project + +Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files: + +- tensor.h +- tensor.cc +- tensor_test.cc +- ops.h +- ops.cu +- ops_test.cu +- api.go +- api_test.go + +Suppose that ops.cu depends on CUDNN. + +```cmake +# cc_binary parses tensor.cc and figures out that target also depend +# on tensor.h. +cc_binary(tensor + SRCS + tensor.cc) + +# The dependency to target tensor implies that if any of +# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built. +cc_test(tensor_test + SRCS + tensor_test.cc + DEPS + tensor) + +# I don't have a clear idea what parameters external_library need to +# have. @gangliao as a CMake expert would have better ideas. +external_library(cudnn + ....) + +# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu +# include global functions that take Tensor as their parameters, so +# ops depend on tensor. This implies that if any of tensor.{h.cc}, +# ops.{h,cu} is changed, ops need to be re-built. +nv_library(ops + SRCS + ops.cu + DEPS + tensor + cudnn) # cudnn is defined later. + +nv_test(ops_test + SRCS + ops_test.cu + DEPS + ops) + +# Because api.go defines a GO wrapper to ops and tensor, it depends on +# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or +# api.go is changed, api need to be re-built. +go_library(api + SRCS + api.go + DEPS + tensor # Because ops depend on tensor, this line is optional. + ops) + +go_test(api_test + SRCS + api_test.go + DEPS + api) + + +# This builds libapi.so. shared_library might use CMake target +# api_shared so to distinguish it from above target api. +shared_library(api + DEPS + api) + +``` + +### Implementation + +As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. + +### Using Package Manager For Go + +Building Go binaries and libraries need to satisfy their dependencies, generally +we can do `go get ./...` to download and compile all external dependencies. The +problems are: + +1. `go get` will always get the latest code from the default branch of the + remote repo, so changes of dependents might break the build. This is very + different with what we already have in `cmake/external` which download a + specific version or commit id of the dependency. +1. Some locations can not access external dependencies through the internet, as mentioned + in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management + tools can package the dependencies as a "vendor" package, which can be mirrored + at many cloud file hosting, so users what to compile paddle by themselves can + download this "vendor" package from a mirror site. + +#### Choose A Suitable Tool + +As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools) +list dozens of Go package managers. We choose the tool using following principles: + +- Most "active" projects with more stars, more pull requests or commits +- Widely used project + +After comparing all these projects, we shall choose between the most popular +tools: Godep and Glide. + +Here's a brief comparison between Godep and Glide +: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are +also many complaints about using `Godep`. There's also a new "official" pakcage +management tool has been started at: https://github.com/golang/dep to resolve +such problems, but it's currently at Alpha stage. So the best choice now is +glide obviously. + +#### Manage Go Packages + +- Dependencies: `go/glide.yaml` will store the dependencies and their versions which + is directly imported by paddle. `go/glide.lock` will store all dependencies recursively + with their commit id. Builds will "lock" to these packages if we don't `glide up` + them +- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake` + will download the code corresponding to `go/glide.lock`. If we put a vendor folder + under `go/`, cmake will just check the commit id to the packages under the folder, + if commit id matches, there will be no download at all. diff --git a/doc/design/cluster_train/README.md b/doc/design/cluster_train/README.md new file mode 100644 index 0000000000000000000000000000000000000000..177a5f5d54bd924fab34795219ce1f7b270c8e25 --- /dev/null +++ b/doc/design/cluster_train/README.md @@ -0,0 +1,182 @@ +# Design Doc: Distributed Training + +## Objective + +In [this slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle running on general-purpose clusters like those managed by Kubernetes, so to address demands for AI from both Internet and non-Internet industries. + +This poses technical challenges to PaddlePaddle: + +1. Support fault-recovery. +1. Support both offline and online training. +1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training. + + +## Training Job + +A training job will be created once user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes: + +1. the *master server process*, which dispatches tasks to +1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via +1. one or more *parameter server processes*, where each holds a shard of the global model, and receive the uploaded gradients from every *trainer process*, so they can run the optimize functions to update their parameters. + +Their relation is illustrated in the following graph: + + + +By coordinating these processes, PaddlePaddle supports use both Synchronize Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) to train user-defined neural network topologies. + +When training with sync SGD, parameter servers wait for all trainers to finish gradients update and then send the updated parameters to trainers, training can not proceed until the trainer received the updated parameters. This creates a synchronization point between trainers. When training with async SGD, each trainer upload gradient and download new parameters individually, without the synchronization with other trainers. Using asyc SGD will be faster in terms of time per pass, but have more noise in gradient since trainers are likely to have a stale model. + +### Master Server Process + +The master server process will: + +- Partition a dataset into [tasks](#task) and dispatch tasks to trainers. +- Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into next pass. + + +#### Task + +A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size. + +#### Task Queue + +The master server has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master server. Each master server process has three task queues. + + + +- The todo queue holds tasks to be dispatched. When a job starts, the master server fills in the todo queue with all tasks. +- The pending queue holds tasks that are currently training by trainers. +- the done queue holds tasks that are already trained. + +The life cycle of a single task is illustrated below: + + + +1. When a new pass of training starts, all tasks will be placed in the todo queue. +1. Upon trainer requests for new task, the master server will dispatch a task from todo queue to it, put the task in the pending queue and wait for completion. +1. The trainer will work on its task and tell the master server once the task is completed and ask for new task. The master server will dispatch a new task to that trainer. +1. If a task fails for any reason in trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, then it will be discarded. +1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero. + +### Trainer Process + +The trainer process will: + +- Request tasks from the master. +- Work on the tasks +- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers. + +### Parameter Server Process + +Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers. + +The parameter server will: + +- Receive gradient from the trainers, update its parameters, and give the trainers the latest parameters. +- Periodically save its parameters to distributed file system by overriding the previous save. + +### Optimization Algorithms + +The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm: + +- Synchronous Stochastic Gradient Descent (sync-SGD) + + Parameter server will wait for all trainer finish n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting n+1-th mini-batch. + +- Asynchronous Stochastic Gradient Descent (async-SGD) + + There will no synchronization between different trainers, and parameter server updates its parameter as soon as it receives new gradient: + + - Each trainer uploads its accumulated gradient every n mini-batches. + - Every m mini-batches, the trainer downloads new parameters from parameter server. + - n and m do not have to be equal. + +## Fault Tolerant + +The training job will pause if the master server processes is dead, or any of the parameter server process is dead. They will be started by [Kubernetes](https://kubernetes.io/) and recover in few minutes. Please refer to [fault recovery](#fault-recovery). + +The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm: + +- sync-SGD + + TODO + +- async-SGD + + Since async-SGD does not require synchronization between mini-batches, the system will by definition make process if at least one trainer is running. + +## Fault Recovery + +PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into distributed file system, so a restarted parameter server can recover its parameters from the saved file. + +Now we will introduce how each process recovers from a failure, the graph below shows how etcd is used: + + + +### Master Server Process + +When the master is started by the Kubernetes, it executes the following steps at startup: + +1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations. +1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them. +1. Write its ip address to */master/addr* so that trainers can discover it. +1. Listens to trainers' request of task, dispatch one upon request, and updates task queue using an etcd transaction to ensure lock is held during the update. + +When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes. + +### Trainer Process + +When the trainer is started by the Kubernetes, it executes the following steps at startup: + +1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*. +1. Finds and watches */master/addr* to get master's address. +1. Requests for tasks from the master to start training. + +When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from master and go on training. + +### Parameter Server Process + +When the parameter server is started by Kubernetes, it executes the following steps at startup: + +1. Read desired total number of parameter servers from etcd `/ps_desired` +1. Search through etcd keys `/ps/` (`/ps/0`, `/ps/1`, ...) to find the first non-existant key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name. + + The desired number of parameter servers is 3: + + + + The third parameter server joined: + + + +1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index). +1. Now the parameter server is ready for the trainers' requests. + +If the parameter server's etcd lease expires, the parameter server will kill itself. + + +## Parameter Server Checkpointing +See [here](./checkpointing.md) + +## Store and dispatching trainning data +See [here](./data_dispatch.md) + + +## Dynamic Scaling + +### Trainer Scaling + +TODO + +### Parameter Server Scaling + +Not planned for v1. + +## Training Dataset Format + +TODO + +## User Interface + +TODO diff --git a/doc/design/cluster_train/checkpointing.md b/doc/design/cluster_train/checkpointing.md new file mode 100644 index 0000000000000000000000000000000000000000..c87ef2c7d2636208866d05456d5d44316d0bb200 --- /dev/null +++ b/doc/design/cluster_train/checkpointing.md @@ -0,0 +1,44 @@ +## 模型参数检查点(Checkpointing) +模型数据检查点的实现,可以有效的避免parameter server的单点或多点同时故障。模型参数检查点通过定期向磁盘上保存一份存储在parameter server内存中的模型数据的完整镜像,来保证训练过程可以从中间状态重新启动。在一个不可中断并缺少备份的训练任务中,可以通过阶段性的保存每个parameter server的数据快照(snapshot)到 ***分布式存储服务*** 达到容灾的目的,比如每隔10分钟最新的快照,并删除更早的快照。在出现单点故障时,只需要恢复这台节点,或者将这台节点迁移到另一个节点并启动即可恢复训练任务。 + + + +### 快照保存的设计如下: + +说明: + +* parameter server在集群中启动后,自动挂载分布式存储目录,并把快照保存到这个目录下。 +* ***注:每个parameter server的检查点各自独立保存,暂时不考虑多个parameter server同步的保存一个特定时间点的全局检查点,因为这样做也没法保证消除随机性。*** + +检查点保存程序流程: + +1. 如果满足条件"每隔10分钟"时,parameter server会获取parameters内存的`read_lock`,启动一个新的线程开始保存检查点。如果已经正在执行保存检查点的线程,则忽略。由于对parameters的更新需要获取parameters内存的`write_lock`,所以在写入快照的过程中,parameter server会暂停参数更新并等待。 +2. parameter server生成一个UUID,向指定的目录中一个新的文件(文件名为此UUID)写入快照数据。在快照写入完成后,计算这个文件的MD5 sum。然后在etcd的`/checkpoints/[pserver_id]`中写入json内容:`{"uuid": [UUID], "md5", "MD5 sum", "timestamp": xxxx}`。 +3. 删除磁盘目录中不是当前uuid的快照文件。 +4. 释放对paramters内存的锁定,停止保存检查点的线程。 + +这里需要用户额外注意,在您的实际环境中,训练任务的运行可能会占满trainer和parameter server之间的网络带宽,如果parameter server此时还需要通过网络访问分布式存储以保存快照,可能会造成网络拥塞,而出现阶段性的运行停滞。 + +### 从快照恢复 + +在parameter server第一次启动或任意时间parameter server故障后被Kubernetes重新启动,则需要回滚到上一个检查点: + + 1. 从etcd中读取节点:`/checkpoints/[pserver_id]`获取最新的检查点的文件uuid + 1. 从磁盘文件中加载uuid文件名的检查点快照文件,并加载其中的参数 + 1. 如果上面两步出现错误,则使用启动参数定义的初始化方法初始化参数 + 1. 开始提供服务 + +## TODO List +### 推测执行/加速执行(TODO) +在异构集群中,如果存在某些trainer执行速度过慢会影响整体集群的速度(如图中Trainer 1),此时master将负责启动一个新的Trainer(Accelerate Trainer 2),使用同样的训练数据block。哪个trainer先完成block的训练,则把另一个慢速的kill掉。 + +### 动态扩容/缩容 +目前只考虑动态扩容trainer数量,可以减小系统复杂性。 + +## 术语 +* model: 指深度学习训练之后得到的所有参数,使用这个神经网络可以完成对新数据的预测 +* parameters: 神经网络中的参数,包括权重w和偏置b。一个神经网络的模型由大量的参数组成 +* shard: 分片,通常指将一个整体拆分成多份的其中的一份。 +* model shard: 将一个神经网络参数拆分成多份,每个shard分别存储在其中一台parameter server之上 +* parameter block: 多个parameter block构成一个model shard +* 单点故障: 任意时刻只可能同时有一台服务器故障。由于集群中同时存在两台机器故障的概率极低((平均故障率*平均故障修复时间)^2)只对特殊在线系统考虑两台以上同时故障的容灾。 diff --git a/doc/design/cluster_train/data_dispatch.md b/doc/design/cluster_train/data_dispatch.md new file mode 100644 index 0000000000000000000000000000000000000000..1f5d22ff5e6abcb576d16cbe7391da1967a1ab8e --- /dev/null +++ b/doc/design/cluster_train/data_dispatch.md @@ -0,0 +1,160 @@ +## 训练数据的存储和分发 + +### 概念解释 + +### 流程介绍 +生产环境中的训练数据集通常体积很大,并被存储在诸如Hadoop HDFS,Ceph,AWS S3之类的分布式存储之上。这些分布式存储服务通常会把数据切割成多个分片分布式的存储在多个节点之上。这样就可以在云端执行多种数据类计算任务,包括: + +* 数据预处理任务 +* Paddle训练任务 +* 在线模型预测服务 +
+ +
+ +在上图中显示了在一个实际生产环境中的应用(人脸识别)的数据流图。生产环境的日志数据会通过实时流的方式(Kafka)和离线数据的方式(HDFS)存储,并在集群中运行多个分布式数据处理任务,比如流式数据处理(online data process),离线批处理(offline data process)完成数据的预处理,提供给paddle作为训练数据。用户也可以上传labeled data到分布式存储补充训练数据。在paddle之上运行的深度学习训练输出的模型会提供给在线人脸识别的应用使用。 + +### 训练数据存储 +我们选择[CephFS](http://docs.ceph.com/docs/master/cephfs/)作为存储系统。 + +- 无论是从[PFSClient](../file_manager/README.md)的角度,还是从[Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/)中运行任务的角度,统一用`/pfs/$DATACENTER/home/$USER`来访问用户自己的数据。 +- `/pfs/$DATACENTER/common`下存放公共数据集合 + - 做只读挂载 + +
+ +
+ +### 文件预处理 + + +在开始训练之前, 数据集需要预先被转换成PaddlePaddle分布式训练使用的存储格[RecordIO](https://github.com/PaddlePaddle/Paddle/issues/1947)。我们提供两个转换方式: + +1. 用户在本地转换好再上传 +1. 用户上传数据后,在机群上运行转换程序 + +转换生成的文件名会是以下格式: + +```text +name_prefix-aaaaa-of-bbbbb +``` + +"aaaaa"和"bbbbb"都是五位的数字,每一个文件是数据集的一个shard,"aaaaa"代表shard的index,"bbbbb"代表这个shard的最大index。 + +比如ImageNet这个数据集可能被分成1000个shard,它们的文件名是: +```text +imagenet-00000-of-00999 +imagenet-00001-of-00999 +... +imagenet-00999-of-00999 +``` + +#### 转换库 + +无论是在本地或是云端转换,我们都提供Python的转换库,接口是: +```python +def convert(output_path, reader, num_shards, name_prefix) +``` + +- `output_path`: directory in which output files will be saved. +- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances. +- `num_shards`: the number of shards that the dataset will be partitioned into. +- `name_prefix`: the name prefix of generated files. + +`reader`每次输出一个data instance,这个instance可以是单个值,或者用tuple表示的多个值: + +```python +yield 1 # 单个值 +yield numpy.random.uniform(-1, 1, size=28*28) # 单个值 +yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值 +``` + +每个值的类型可以是整形、浮点型数据、字符串,或者由它们组成的list,以及numpy.ndarray。如果是其它类型,会被Pickle序列化成字符串。 + +### 示例程序 + +#### 使用转换库 + +以下`reader_creator`生成的`reader`每次输出一个data instance,每个data instance包涵两个值:numpy.ndarray类型的值和整型的值: +```python +def reader_creator(): + def reader(): + for i in range(1000): + yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值 + return reader +``` + +把`reader_creator`生成的`reader`传入`convert`函数即可完成转换: +```python +convert("./", reader_creator(), 100, random_images) +``` + +以上命令会在当前目录下生成100个文件: +```text +random_images-00000-of-00099 +random_images-00001-of-00099 +... +random_images-00099-of-00099 +``` + +#### 进行训练 + + +PaddlePaddle提供专用的[data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc),生成给定`RecordIO`文件对应的data reader。**无论在本地还是在云端,reader的使用方式都是一致的**: + +```python +# ... +reader = paddle.reader.creator.RecordIO("/pfs/datacenter_name/home/user_name/random_images-*-of-*") +batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128) +trainer.train(batch_reader, ...) +``` + +以上代码的reader输出的data instance与生成数据集时,reader输出的data instance是一模一样的。 + +### 上传训练文件 + +使用下面命令,可以把本地的数据上传到存储集群中。 + +```bash +paddle pfs cp filename /pfs/$DATACENTER/home/$USER/folder/ +``` + +比如,把之前示例中转换完毕的random_images数据集上传到云端的`/home/`可以用以下指令: + +```bash +paddle pfs cp random_images-*-of-* /pfs/$DATACENTER/home/$USER/folder/ +``` + +需要`$DATACENTER`的配置写到配置文件中,例如 + +``` +# config file +[datacenter_1] +username=user +usercert=user.pem +userkey=user-key.pem +endpoint=datacenter1.paddlepaddle.org + +[datacenter_2] +username=user +usercert=user.pem +userkey=user-key.pem +endpoint=datacenter2.paddlepaddle.org +``` +## TODO +### 文件访问的权限 +控制用户权限 + +- 用户可以把自己的数据分享给别人 + +### 文件访问方式 +不用mount的方式来访问数据,而是直接用API的接口远程访问 + +例如: + +``` +f = open('/pfs/datacenter_name/home/user_name/test1.dat') +``` + + +### 支持用户自定义的数据预处理job diff --git a/doc/design/cluster_train/large_model_dist_train.md b/doc/design/cluster_train/large_model_dist_train.md new file mode 100644 index 0000000000000000000000000000000000000000..0c4b5bc24c854b7062d509249bea9c50d42bd5f1 --- /dev/null +++ b/doc/design/cluster_train/large_model_dist_train.md @@ -0,0 +1,101 @@ +# Alalysis of large model distributed training in Paddle + +***NOTE: This is only some note for how we implemeted this scheme in V1, not a new design.*** + +## What is it + +We often encounter cases that the embedding layer parameters(sparse) are so large that we can not store it in the trainer's memory when training. So we need to put them to several servers, and fetch them row by row instead of fetch all of the parameters. + +## How to use + +Specify command-line argument like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer. And also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting pserver processes. + +Accrodingly, configure your embedding layers like: + +```python +SPARSE_REMOTE=True + +w1 = data_layer(name="w1", size=dict_size) +emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +w2 = data_layer(name="w2", size=dict_size) +emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +... +``` + +## Implementation details + +```c++ +enum MatType { + MAT_NORMAL, + MAT_NORMAL_SHARED, + MAT_VALUE_SHARED, + MAT_SPARSE_ROW_IDS, + MAT_SPARSE_ROW_AUTO_GROW, + MAT_CACHE_ROW, + MAT_SPARSE_ROW, + MAT_SPARSE_ROW_PREFETCH, + MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, +}; +``` + +`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only row of matrix when training. + +In `trainer_internal.cpp:L93 trainOneBatch`: + +```c++ + if (config_->getOptConfig().use_sparse_remote_updater()) { + REGISTER_TIMER("prefetch"); + gradientMachine_->prefetch(inArgs); + parameterUpdater_->getParametersRemote(); + } +``` + +When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver. + +In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`: + +```c++ +if (fullSize) { + ... +} else { +getParams = [&] { + parameterClient_->getParameterSparse( + /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); +}; +applyL1 = [](Parameter& para, real decayRate) { + para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); +}; +} +``` + +Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`: + +```c++ +void ParameterServer2::getParameterSparse(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + auto& buffer = *readWriteBuffer_; + size_t numReals = 0; + for (const auto& block : request.blocks()) { + numReals += getParameterConfig(block).dims(1); + } + buffer.resize(numReals); + + VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; + + ReadLockGuard guard(parameterMutex_); + size_t offset = 0; + for (const auto& block : request.blocks()) { + size_t width = getParameterConfig(block).dims(1); + Buffer buf = {buffer.data() + offset, width}; + int type = request.send_back_parameter_type(); + sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); + offset += width; + } +} +``` + +`getParameterConfig(block).dims(1)` returns the width of the current "parameter block"(a shard of parameter object), +then `getParameterSparse` remote call returns only one row of data to the client. diff --git a/doc/design/cluster_train/master_server.md b/doc/design/cluster_train/master_server.md new file mode 100644 index 0000000000000000000000000000000000000000..4bf3c506f101361875043f8bfd97972b8c981a22 --- /dev/null +++ b/doc/design/cluster_train/master_server.md @@ -0,0 +1,91 @@ +# Design Doc: Master Server + +For an overview of master server's role, please refer to [distributed training design doc](./README.md). In this design doc we will discuss the master server in more details. The master will be implemented in [Go](https://golang.org/). + +## Dataset + + + +A dataset is a list of files in *RecordIO* format. A RecordIO file consists of chunks, whereas each chunk consists some records. + +## Task Queue + +As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress. + +### Task Queue Creation + +1. Each trainer will make an RPC call (using Go's [rpc](https://golang.org/pkg/net/rpc/) package) to the master server, telling it the RecordIO files representing the dataset specified by the user. Since every trainer will tell the master server the same dataset, only the first RPC call will be honored. + + The RPC interface is: + ```go + func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error { + } + ``` +1. The master server will scan through each RecordIO file to generate the *chunk index* and know how many chunks does each file have. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is in memory data structure that enables fast access to each chunk, and the index of the chunk with the file is an integer start from 0, representing the n-th chunk within the file. + + The definition of the chunk is: + ```go + type Chunk struct { + Idx int // index of the chunk within the file + Path string + Index recordio.Index // chunk index + } + ``` +1. Chunks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element. + + The definition of the task is: + ```go + type Task struct { + Index int + Chunks []Chunk + } + ``` + + The elements in the tasks queues is of type `TaskEntry`, containing a timeout counter (described in [task retry logic](#task-retry-logic)), and a task: + ```go + type TaskEntry struct { + NumTimeout int + Task Task + } + ``` + + The definition of task queues is: + ```go + type TaskQueues struct { + Todo []TaskEntry + Pending map[int]TaskEntry // map from task index to task entry + Done []TaskEntry + } + ``` + +### Task Queue Persistence + +The task queues need to be persisted on [etcd](https://github.com/coreos/etcd) for fault recovery. Since the task queues only change once a task is completed or timed out, which is not very frequent, we can afford to synchronize with etcd every time the task queues change. + +We will serialize the task queues data structure with [gob encoding](https://golang.org/pkg/encoding/gob/), compress with gzip, and save into etcd synchronously under key `/task_queues`. + +### Task Dispatch + +The trainer will make an RPC call to master to get a new task when: + +- the trainer first started, or +- the trainer finishes a task. + +The RPC interface is: +```go +func (m *RPCServer) GetTask(finished *Task, result *Task) error { +} +``` +Argument `finished` will be `nil` when the trainer is just started. + +During the RPC call the master will do the following: + +- Make a copy of the task queues, and update the copy reflecting the finished tasks and the new pending tasks. +- Synchronize the copy of task queues with etcd using a transaction conditioned on holding the master lock. +- Replace the task queues with the copy and report to the trainer with the new tasks if succeeded, or discard the copy and report the error to the trainer if failed. + +### Task Retry Logic + +When a task is dispatched to the trainer, the master will schedule a function for execution after the timeout duration (based on the moving average of task completion time). If the task entry in still in the pending queue, its timeout counter will increase by one, and the task will be moved to todo queue. If the timeout counter is above the threshold, the master will log the error and discard the task. + +Please note that since a timed out task could be completed after it has been dispatched for retry, so it is possible for a task to be processed multiple times. We do not try to prevent it from happening since it's fine to train on the same task multiple times due to the stochastic nature of the stochastic gradient decent algorithm. diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md new file mode 100644 index 0000000000000000000000000000000000000000..474b8c572cd92fc87e9f7f3f2b19d12cccd158de --- /dev/null +++ b/doc/design/cluster_train/pserver_client.md @@ -0,0 +1,171 @@ +# Design Doc: The Client Library of Parameter Server + +For an overview of trainer's role, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file. + +## Parameter Partition + +Each parameter will be partitioned into parameter blocks to make the parameters evenly distributed on parameter servers. The partition is done automatically by the client library. The *sparse parameter* require a little different treatment: + +### Sparse Parameter + +The sparse parameter is a parameter that is updated sparsely. The name is somewhat misleading, it does not have a sparse representation, it has the same representation as a dense vector. + +Because a sparse parameter is updated sparsely, the trainer will have to partition the sparse parameter. Because the parameter server will merge all sparse parameter shard into the same file when saving the parameter. It needs special naming convention: + +If a sparse parameter is partitioned into n shards, they should be named as: + +```text +name:sparse-0 +name:sparse-1 +... +name:sparse-n-1 +``` + +The library is unaware of the partition, and treat each parameter independently. Only when saving parameters, the parameter servers will merge the sparse parameters according to the naming convention. + +## Model Optimization Using Gradients + +There are two ways to perform model optimization using gradients: + +- On Client + + The client does multiple steps of forward and backward update. In each step, the gradients are calculated and a new model is generated. After some steps, the client will calculate the difference between the newest model and the old model at step 0. The difference will be updated to parameter servers. Parameter servers will just update parameters using the difference without any optimization using gradients (such as Adam and L1 regularization). + +- On Parameter Server + + The client will send accumulated gradients to parameter servers, the parameter server will do the optimization using gradients. + +## L1 and L2 Regularization + +PaddlePaddle allows L1 or L2 regularizations to be specified per parameter, so when the trainer initializes the parameter it needs include a parameter configuration when L1 or L2 regularization is necessary. + +## Parameter Initialization + +The parameters on parameter servers need to be initialized. To provide maximum flexibility, the trainer will initialize the parameters. Only one trainer will do the initialization, the other trainers will wait for the completion of initialization and get the parameters from the parameter servers. + +### Trainer Selection + +To select the trainer for initialization, every trainer will try to get a distributed lock, whoever owns the lock will do the initialization. As illustrated below: + + + +### Trainer Selection Process + +The trainer select process is encapsulated in the C API function: +```c +int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto); +``` +The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will return 0. `paddle_get_params` will be blocked until initialization is completed. As illustrated below: + + + +## C Interface + +```c +typedef enum { + PADDLE_ELEMENT_TYPE_INT32 = 0, + PADDLE_ELEMENT_TYPE_UINT32 = 1, + PADDLE_ELEMENT_TYPE_INT64 = 2, + PADDLE_ELEMENT_TYPE_UINT64 = 3, + PADDLE_ELEMENT_TYPE_FLOAT32 = 4, + PADDLE_ELEMENT_TYPE_FLOAT64 = 5, +} paddle_element_type; + +typedef struct { + char* name; + paddle_element_type element_type; + unsigned char* content; + int content_len; +} paddle_parameter, paddle_gradient; + +typedef int paddle_pserver_client; + +/** + * @brief creates a pserver client that talks to etcd for coordination. + */ +paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr); + +/** + * @brief creates a pserver client given pserver addresses. + * + * @param pserver_addrs comma-separated pserver addresses. + * @param selected if current pserver client is selected to initialize all parameter servers. + */ +paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected); +void paddle_pserver_client_release(paddle_pserver_client c); + +/** + * @brief paddle_begin_init_params begins to initialize parameters on + * parameter servers. + * + * paddle_begin_init_params will be called from multiple trainers, + * only one trainer will be selected to initialize the parameters on + * parameter servers. Other trainers need to get the initialized + * parameters from parameter servers using @paddle_get_params. + * + * @return 1 if the trainer is selected to initialize parameter + * servers, otherwise 0. + */ +int paddle_begin_init_params(paddle_pserver_client client); + +/** + * @brief paddle_init_param initializes the parameter on parameter + * servers. + * + * @param param the parameter to initialize. + * @param param_config_proto the configuration for the parameter. + * @param config_len the length of param_config_proto + * @return 0 if successful, otherwise -1. On failure, the trainer + * needs to restart the entire initialization process (starting from + * @paddle_begin_init_param). Or simply exit the program and wait for + * the cluster management system to restart the trainer. + */ +int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); + +/** + * @brief paddle_finish_init_params tells parameter servers client has + * sent all parameters to parameter servers as initialization. + * + * @return 0 if successful, otherwise -1. On failure, the trainer + * needs to restart the entire initialization process (starting from + * @paddle_begin_init_param). Or simply exit the program and wait for + * the cluster management system to restart the trainer. + */ +int paddle_finish_init_params(paddle_pserver_client client); + +/** + * @brief paddle_send_grads sends gradients to parameter servers for + * updating parameters. + * + * @param grads the array of gradients to send. + * @param len the length of the gradient array. + * @param learning_rate the learning rate for the gradients. + * @return 0 if successful, otherwise -1. + */ +int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len); + +/** + * @brief paddle_get_params gets parameters from parameter servers. + * + * paddle_get_params will block until parameters are initialized on + * the parameter servers. + * + * @param dst the destination array of parameter pointers to save to. + * The parameter pointer must be pre-popullated with required parameter name, + * and the content of parameter must be pre-allocated of the size of required + * parameter on pserver. + * @param len the length of the names array and the paddle_parameter + * array. + * @return 0 if successful, otherwise -1. + */ +int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len); + +/** + * @brief paddle_save_model indicates parameters to save the parameter + * to the given path + * + * @param path the path to save parameters. + * @return 0 if successful, otherwise -1. + */ +int paddle_save_model(paddle_pserver_client client, const char* path); +``` diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/design/cluster_train/remote_parameter_updater.md new file mode 100644 index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77 --- /dev/null +++ b/doc/design/cluster_train/remote_parameter_updater.md @@ -0,0 +1,21 @@ +# Design Doc: Remote Parameter Updater for Cluster Train + +For an overview of distribute training, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that will use parameter server cclient [The Client Library of Parameter Server Design Doc](pserver_client.md) to manage and update parameters. + +## Parameter Updater + +Parameter Updater is used by trainer to manage and update parameter, there are mainly two kind of parameter updater: local and remote, since this design is for cluster train, we will only discuss remote parameter updater here. + +### Remote Parameter Updater + +Remote Parameter Updater manage parameters through remote parameter server with the client that communicate with pserver([The Client Library of Parameter Server Design Doc](pserver_client.md)) + +In PaddlePaddle Python V2 API, trainer is implemented in python, and the trainer will hold a instance of parameter updater and call it's functions directly. In this design, we will also expose the api of RemoteParameterUpdater to python with swig. + +#### Sparse Remote Parameter Updater + +Since we will only implement dense parameter management new, the mechanism for sparse parameter will be discussed in next stage. + +### Interface Design + +TBD diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md new file mode 100644 index 0000000000000000000000000000000000000000..b755185c81ad617b9c85c47de0f5f65d2201c658 --- /dev/null +++ b/doc/design/cluster_train/save_model.md @@ -0,0 +1,111 @@ +# Design Doc: Save Model + +## Overview + +The model is the output of the training process. There are two +ways from which user can obtain a model: + +- Save model triggered by user code: user code asks PaddlePaddle to + save a model. +- Convert model from the checkpoint: model being converted from + pservers' periodic checkpoint. In this way, the user can cancel a + job at any time, and still have a relatively fresh model (we + checkpoint around every 5 minutes). + +### Trainer Saving Model vs. Pservers Saving Model + +Both trainers and pservers have access to the model. So the model can +be saved from a trainer or pservers. We need to decide where the model +is saved from. + +#### Dense Update vs. Sparse Update + +There are two types of model update methods: dense update and sparse +update (when the model parameter is configured to be sparse). + +- Dense update + + Every trainer has it's own full copy of the model. Every model + update will update the entire model. + +- Sparse update + + The training input is sparse, and the trainer does not have the + entire model. It will only download the sub-model necessary related + to the input. When updating the model, only the sub-model related to + the training input is updated. + + +#### Pservers Saving Model + +The benefit of letting pservers save model is they have the entire +model all the time. However, since pservers are on different nodes, it +requires a merging process to merge model shards into the same +model. Thus requires the pservers to write models to a distributed +filesystem, making the checkpoint shards visible to the merge program. + +#### Trainer Saving Model + +The benefit of letting one trainer to save the model is it does not +require a distributed filesystem. And it's reusing the same save model +logic when training locally - except when doing sparse update, the +trainer needs to download the entire model during the saving process. + +#### Conclusion + +Given trainer saving model does not require a distributed filesystem, +and is an intuitive extension to trainer saving model when training +locally, we decide to let the trainer save the model when doing +distributed training. + + +### Convert Model from Checkpoint + +TODO + + +## Timeline + +We first implement trainer save the model. Converting the latest +snapshot to a model will be a TODO for future. + + +## Trainer Save Model + +### Trainer Election + +One trainer will be elected as the one to save the model. When using +etcd, trainer ID is a randomly generated UUID, the trainer will +contact the master server requesting to save the model, and find out +if itself is elected. When the master server is not used, unique +trainer IDs will be given by the administrator, the trainer whose ID +is "0" is elected to save the model. + +### Model Save Path + +Each trainer will be given the directory to save the model. The +elected trainer will save the model to +`given-directory/trainerID`. Since the trainer ID is unique, this +would prevent concurrent save to the same file when multiple trainers +are elected to save the model when split-brain problem happens. + +### What Happens When Model Is Saving + +It takes some time to save model, we need to define what will happen +when save model is taking place. + +When doing dense update, the trainer uses the local model. Pservers +does not need to pause model update. + +When doing sparse update. The trainer needs to download the entire +model while saving. To get the most accurate model, the model update +needs to be paused before the download starts and resumed after the +download finishes. Otherwise, the trainer gets a model that is +"polluted": some part of the model is old, some part of the model is +new. + +It's unclear that the "polluted" model will be inferior due to the +stochastic nature of deep learning, and pausing the model update will +add more complexity to the system. Since supporting sparse update is a +TODO item. We defer the evaluation of pause the model update or not +during saving model to the future. diff --git a/doc/design/cluster_train/src/checkpointing.png b/doc/design/cluster_train/src/checkpointing.png new file mode 100644 index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c Binary files /dev/null and b/doc/design/cluster_train/src/checkpointing.png differ diff --git a/doc/design/cluster_train/src/data_dispatch.png b/doc/design/cluster_train/src/data_dispatch.png new file mode 100644 index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54 Binary files /dev/null and b/doc/design/cluster_train/src/data_dispatch.png differ diff --git a/doc/design/cluster_train/src/dataset.graffle b/doc/design/cluster_train/src/dataset.graffle new file mode 100644 index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8 Binary files /dev/null and b/doc/design/cluster_train/src/dataset.graffle differ diff --git a/doc/design/cluster_train/src/dataset.png b/doc/design/cluster_train/src/dataset.png new file mode 100644 index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34 Binary files /dev/null and b/doc/design/cluster_train/src/dataset.png differ diff --git a/doc/design/cluster_train/src/file_storage.graffle b/doc/design/cluster_train/src/file_storage.graffle new file mode 100644 index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be Binary files /dev/null and b/doc/design/cluster_train/src/file_storage.graffle differ diff --git a/doc/design/cluster_train/src/file_storage.png b/doc/design/cluster_train/src/file_storage.png new file mode 100644 index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa Binary files /dev/null and b/doc/design/cluster_train/src/file_storage.png differ diff --git a/doc/design/cluster_train/src/init_lock.graffle b/doc/design/cluster_train/src/init_lock.graffle new file mode 100644 index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94 Binary files /dev/null and b/doc/design/cluster_train/src/init_lock.graffle differ diff --git a/doc/design/cluster_train/src/init_lock.png b/doc/design/cluster_train/src/init_lock.png new file mode 100644 index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f Binary files /dev/null and b/doc/design/cluster_train/src/init_lock.png differ diff --git a/doc/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png new file mode 100644 index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508 Binary files /dev/null and b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png differ diff --git a/doc/design/cluster_train/src/paddle-etcd.graffle b/doc/design/cluster_train/src/paddle-etcd.graffle new file mode 100644 index 0000000000000000000000000000000000000000..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9 Binary files /dev/null and b/doc/design/cluster_train/src/paddle-etcd.graffle differ diff --git a/doc/design/cluster_train/src/paddle-etcd.png b/doc/design/cluster_train/src/paddle-etcd.png new file mode 100644 index 0000000000000000000000000000000000000000..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31 Binary files /dev/null and b/doc/design/cluster_train/src/paddle-etcd.png differ diff --git a/doc/design/dist/src/paddle-model-sharding.graffle b/doc/design/cluster_train/src/paddle-model-sharding.graffle similarity index 100% rename from doc/design/dist/src/paddle-model-sharding.graffle rename to doc/design/cluster_train/src/paddle-model-sharding.graffle diff --git a/doc/design/dist/src/paddle-model-sharding.png b/doc/design/cluster_train/src/paddle-model-sharding.png similarity index 100% rename from doc/design/dist/src/paddle-model-sharding.png rename to doc/design/cluster_train/src/paddle-model-sharding.png diff --git a/doc/design/dist/src/paddle-ps-0.png b/doc/design/cluster_train/src/paddle-ps-0.png similarity index 100% rename from doc/design/dist/src/paddle-ps-0.png rename to doc/design/cluster_train/src/paddle-ps-0.png diff --git a/doc/design/dist/src/paddle-ps-1.png b/doc/design/cluster_train/src/paddle-ps-1.png similarity index 100% rename from doc/design/dist/src/paddle-ps-1.png rename to doc/design/cluster_train/src/paddle-ps-1.png diff --git a/doc/design/dist/src/paddle-ps.graffle b/doc/design/cluster_train/src/paddle-ps.graffle similarity index 100% rename from doc/design/dist/src/paddle-ps.graffle rename to doc/design/cluster_train/src/paddle-ps.graffle diff --git a/doc/design/dist/src/paddle-task-queues.graffle b/doc/design/cluster_train/src/paddle-task-queues.graffle similarity index 100% rename from doc/design/dist/src/paddle-task-queues.graffle rename to doc/design/cluster_train/src/paddle-task-queues.graffle diff --git a/doc/design/dist/src/paddle-task-queues.png b/doc/design/cluster_train/src/paddle-task-queues.png similarity index 100% rename from doc/design/dist/src/paddle-task-queues.png rename to doc/design/cluster_train/src/paddle-task-queues.png diff --git a/doc/design/dist/src/paddle-task-states.graffle b/doc/design/cluster_train/src/paddle-task-states.graffle similarity index 100% rename from doc/design/dist/src/paddle-task-states.graffle rename to doc/design/cluster_train/src/paddle-task-states.graffle diff --git a/doc/design/dist/src/paddle-task-states.png b/doc/design/cluster_train/src/paddle-task-states.png similarity index 100% rename from doc/design/dist/src/paddle-task-states.png rename to doc/design/cluster_train/src/paddle-task-states.png diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/design/cluster_train/src/pserver_init.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676 Binary files /dev/null and b/doc/design/cluster_train/src/pserver_init.graffle differ diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/design/cluster_train/src/pserver_init.png new file mode 100644 index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90 Binary files /dev/null and b/doc/design/cluster_train/src/pserver_init.png differ diff --git a/doc/design/cluster_train/src/submit-job.graffle b/doc/design/cluster_train/src/submit-job.graffle new file mode 100644 index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6 Binary files /dev/null and b/doc/design/cluster_train/src/submit-job.graffle differ diff --git a/doc/design/cluster_train/src/submit-job.png b/doc/design/cluster_train/src/submit-job.png new file mode 100644 index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680 Binary files /dev/null and b/doc/design/cluster_train/src/submit-job.png differ diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle new file mode 100644 index 0000000000000000000000000000000000000000..43415ed8cf61a5acfa34f8e56b9577f338dbf254 Binary files /dev/null and b/doc/design/cluster_train/src/trainer.graffle differ diff --git a/doc/design/cluster_train/src/trainer.png b/doc/design/cluster_train/src/trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/design/cluster_train/src/trainer.png differ diff --git a/doc/design/cluster_train/submit-job.md b/doc/design/cluster_train/submit-job.md new file mode 100644 index 0000000000000000000000000000000000000000..8377d5489dc64bd2fdc5bb4f7bc737e7b489000d --- /dev/null +++ b/doc/design/cluster_train/submit-job.md @@ -0,0 +1,127 @@ +# Submit a Distributed Training Job + +The user can submit a distributed training job with Python code, rather than with a command-line interface. + +## Runtime Environment On Kubernetes + +For a distributed training job, there is two Docker image called *runtime Docker image* and *base Docker image*. The runtime Docker image is the Docker image that gets scheduled by Kubernetes to run during training. The base Docker image is for building the runtime Docker image. + +### Base Docker Image + +Usually, the base Docker image is PaddlePaddle product Docker image including paddle binary files and python package. And of course, users can specify any image name hosted on any docker registry which users have the access right. + +### Runtime Docker Image + +The trainer package which user upload and some Python dependencies are packaged into a runtime Docker image based on base Docker image. + +- Handle Python Dependencies + + You need to provide requirements.txt file in your `trainer-package` folder. Example: + + ```txt + pillow + protobuf==3.1.0 + ``` + More [details](https://pip.readthedocs.io/en/1.1/requirements.html) about requirements, an example project looks like: + ```bash + paddle_example + |-quick_start + |-trainer.py + |-dataset.py + |-requirements.txt + ``` + +## Submit Distributed Training Job With Python Code + + +- `paddle.job.dist_train()` will call the Job Server API `/v1/packages` to upload the trainer package and save them on CephFS, and then call `/v1/trainer/job` to submit the PaddlePaddle distributed job. +- `/v1/trainer/job` will start a building job for preparing the runtime Docker image. When the building job is finished, Job Server will submit the PaddlePaddle distributed job to Kubernetes. +- *NOTE*: For the first version, we will not prepare the runtime Docker image, instead, the package is uploaded to Paddle Cloud, and Paddle Cloud will mount the package in a temporary folder into the base Docker image. We will not support custom Python dependencies in the first version as well. + +You can call `paddle.job.dist_train` and provide distributed training configuration as the parameters: +```python +paddle.job.dist_train( + trainer=dist_trainer(), + paddle_job=PaddleJob( + job_name = "paddle-cloud", + entry_point = "python %s"%__file__, + trainer_package = "/example/word2vec", + image = "yancey1989/paddle-job", + trainers = 10, + pservers = 3, + trainer_cpu = 1, + trainer_gpu = 1, + trainer_mem = "10G", + pserver_cpu = 1, + pserver_mem = "2G" + )) +``` + +The parameter `trainer` of `paddle.job.dist_train` is a function and you can implement it as follows: +```python +def dist_trainer(): + def trainer_creator(): + trainer = paddle.v2.trainer.SGD(...) + trainer.train(...) + return trainer_creator +``` + +The pseudo code of `paddle.job.dist_train` is as follows: +```python +def dist_train(trainer, paddle_job): + # if the code is running on cloud, set PADDLE_ON_CLOUD=YES + if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO": + #submit the paddle job + paddle_job.submit() + else: + #start the training + trainer() +``` +### PaddleJob Parameters +parameter | type | explanation + --- | --- | --- +job_name | str | the unique name for the training job +entry_point | str | entry point for startup trainer process +trainer_package | str | trainer package file path which user have the access right +image|str|the [base image](#base-docker-image) for building the [runtime image](#runtime-docker-image) +pservers|int| Parameter Server process count +trainers|int| Trainer process count +pserver_cpu|int| CPU count for each Parameter Server process +pserver_mem|str| memory allocated for each Parameter Server process, a plain integer using one of these suffixes: E, P, T, G, M, K +trainer_cpu|int| CPU count for each Trainer process +trainer_mem|str| memory allocated for each Trainer process, a plain integer using one of these suffixes: E, P, T, G, M, K +trainer_gpu|int| GPU count for each Trainer process, if you only want CPU, do not set this parameter + +### Deploy Parameter Server, Trainer and Master Process + - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet. + - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job. + - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet. + +## Job Server + +- RESTful API + + Job server provides RESTful HTTP API for receiving the trainer package and displaying + PaddlePaddle job related informations. + - `POST /v1/package` receive the trainer package and save them on CephFS + - `POST /v1/trainer/job` submit a trainer job + - `GET /v1/jobs/` list all jobs + - `GET /v1/jobs/` the status of a job + - `DELETE /v1/jobs/` delete a job + - `GET /v1/version` job server version + +- Build Runtime Docker Image on Kubernetes + + `paddle.job.dist_train` will upload the trainer package to Job Server, save them on the distributed filesystem, and then start up a job for building the runtime Docker image that gets scheduled by Kubernetes to run during training. + + There are some benefits for building runtime Docker image on JobServer: + - On Paddle Cloud, users will run the trainer code in a Jupyter Notebook which is a Kubernetes Pod, if we want to execute `docker build` in the Pod, we should mount the host's `docker.sock` to the Pod, user's code will connect the host's Docker Engine directly, it's not safe. + - Users only need to upload the training package files, does not need to install docker engine, docker registry as dependencies. + - If we want to change another image type, such as RKT, users do not need to care about it. + +- Deploy Parameter Server, Trainer and Master Processes + + `POST /v1/trainer/job` receives the distributed training parameters, and deploy the job as follows: + - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet. + - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job. + - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet. diff --git a/doc/design/dcgan.png b/doc/design/dcgan.png new file mode 100644 index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28 Binary files /dev/null and b/doc/design/dcgan.png differ diff --git a/doc/design/dist/README.md b/doc/design/dist/README.md deleted file mode 100644 index 1788208bcabca30f66cb1c80e80f6b824c0d9579..0000000000000000000000000000000000000000 --- a/doc/design/dist/README.md +++ /dev/null @@ -1,172 +0,0 @@ -# Design Doc: Distributed Training - -## Objective - -In [this slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle running on general-purpose clusters like those managed by Kubernetes, so to address demands for AI from both Internet and non-Internet industries. - -This poses technical challenges to PaddlePaddle: - -1. Support fault-recovery. -1. Support both offline and online training. -1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training. - - -## Training Job - -A training job will be created once user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes: - -1. the *master process*, which dispatches tasks to -1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via -1. one or more *parameter server processes*, where each holds a shard of the global model. - -Their relation is illustrated in the following graph: - - - -### Master Process - -The master process will: - -- Partition a dataset into [tasks](#task) and dispatch tasks to trainers. -- Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into next pass. - - -#### Task - -A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size. - -#### Task Queue - -The master process has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master process. Each master process has three task queues. - - - -- The todo queue holds tasks to be dispatched. When a job starts, the master process fills in the todo queue with all tasks. -- The pending queue holds tasks that are currently training by trainers. -- the done queue holds tasks that are already trained. - -The life cycle of a single task is illustrated below: - - - -1. When a new pass of training starts, all tasks will be placed in the todo queue. -1. The master process will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion. -1. The trainer will work on its tasks and tell the master process once a task is completed. The master process will dispatch a new task to that trainer. -1. If a task timeout. the master process will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded. -1. The master process will move completed task to the done queue. When the todo queue is empty, the master process will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero. - -### Trainer Process - -The trainer process will: - -- Receive tasks from the master. -- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers. - -### Parameter Server Process - -Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers. - -The parameter server will: - -- Receive gradient from the trainers, update its parameters, and give the trainers the latest parameters. -- Periodically save its parameters to distributed file system by overriding the previous save. - -### Optimization Algorithms - -The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm: - -- Synchronous Stochastic Gradient Descent (sync-SGD) - - Parameter server will wait for all trainer finish n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting n+1-th mini-batch. - -- Asynchronous Stochastic Gradient Descent (async-SGD) - - There will no synchronization between different trainers, and parameter server updates its parameter as soon as it receives new gradient: - - - Each trainer uploads its accumulated gradient every n mini-batches. - - Every m mini-batches, the trainer downloads new parameters from parameter server. - - n and m do not have to be equal. - -## Fault Tolerant - -The training job will pause if the master processes is dead, or any of the parameter server process is dead. They will be started by [Kubernetes](https://kubernetes.io/) and recover in few minutes. Please refer to [fault recovery](#fault-recovery). - -The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm: - -- sync-SGD - - TODO - -- async-SGD - - Since async-SGD does not require synchronization between mini-batches, the system will by definition make process if at least one trainer is running. - -## Fault Recovery - -PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into distributed file system, so a restarted parameter server can recover its parameters from the saved file. - -Now we will introduce how each process recovers from a failure, the graph below shows how etcd is used: - - - -### Master Process - -When the master is started by the Kubernetes, it executes the following steps at startup: - -1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations. -1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them. -1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers. -1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update. - -The master process will kill itself if its etcd lease expires. - -When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes. - -### Trainer Process - -When the trainer is started by the Kubernetes, it executes the following steps at startup: - -1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count. -1. Generates a unique ID, and sets key `/trainer/` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline. -1. Waits for tasks from the master to start training. - -If trainer's etcd lease expires, it will try set key `/trainer/` again so that the master process can discover the trainer again. - -### Parameter Server Process - -When the parameter server is started by Kubernetes, it executes the following steps at startup: - -1. Read desired total number of parameter servers from etcd `/ps_desired` -1. Search through etcd keys `/ps/` (`/ps/0`, `/ps/1`, ...) to find the first non-existant key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name. - - The desired number of parameter servers is 3: - - - - The third parameter server joined: - - - -1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index). -1. Now the parameter server is ready for the trainers' requests. - -If the parameter server's etcd lease expires, the parameter server will kill itself. - - -## Dynamic Scaling - -### Trainer Scaling - -TODO - -### Parameter Server Scaling - -Not planned for v1. - -## Training Dataset Format - -TODO - -## User Interface - -TODO diff --git a/doc/design/dist/src/paddle-etcd.graffle b/doc/design/dist/src/paddle-etcd.graffle deleted file mode 100644 index 56681ae5bbe11849116d621b066a6317e003e4ca..0000000000000000000000000000000000000000 Binary files a/doc/design/dist/src/paddle-etcd.graffle and /dev/null differ diff --git a/doc/design/dist/src/paddle-etcd.png b/doc/design/dist/src/paddle-etcd.png deleted file mode 100644 index 4f9c9762b3a8c089dd5e9b2c07cb9dfc78296a21..0000000000000000000000000000000000000000 Binary files a/doc/design/dist/src/paddle-etcd.png and /dev/null differ diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md new file mode 100644 index 0000000000000000000000000000000000000000..11cc129d56905a9ee666da92fbe6f8559c6d325a --- /dev/null +++ b/doc/design/evaluator.md @@ -0,0 +1,58 @@ +## Evaluator Design + +### Problem Statement + +During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants. + +### Evaluator Design +Currently, every operation is expressed in the graph. We divide the evaluator process into three steps. + +1. Initialize the metric state and add it into the block. + +2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once. + + +3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices. + +### Implementation +This design is shown in the Python API. +Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. + + +```python +class Evaluator(object): + """ + Evaluator Base class. + """ + def __init__(self, name, **kwargs): + """ + Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts. + Auc need four variables, `true_positives`, + `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program + + The initialization of Evaluator should be responsible for: + create metric states and append to the main_program + """ + pass + + def _update_ops(self, input, label, **kwargs) + """ + Add mini-batch evaluator caculate operators to the main_program. + Add increment operator to accumulate the metric states. + """ + + + def reset(self, executor, reset_program=None): + """ + Reset metric states at the begin of each pass/user specified batch number. + Execute the reset_program to reset the states. + """ + + + def eval(self, executor, eval_program=None): + """ + Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. + Execute the eval_program and return the result. + """ + return eval_result +``` diff --git a/doc/design/executor.md b/doc/design/executor.md new file mode 100644 index 0000000000000000000000000000000000000000..b5fb6c5c3c1da3c112ce63878322083dd5c42b70 --- /dev/null +++ b/doc/design/executor.md @@ -0,0 +1,23 @@ +# Executor Design Doc + +## Motivation + +We use executor to do the runtime evaluation of a `ProgramDesc`. + +## Overview + +An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs. + +### What does executor do? + +It evaluates all the operators in the `block_id`th block of a `ProgramDesc`. + +### What does executor NOT do? + +It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run. + +It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices. + +## Implementation + +`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc) diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3df10d801e568834729f902aace483d033340e2d --- /dev/null +++ b/doc/design/file_manager/README.md @@ -0,0 +1,87 @@ +# FileManager设计文档 +## 目标 +在本文档中,我们设计说明了名为FileManager系统,方便用户上传自己的训练数据以进行分布式训练 + +主要功能包括: + +- 提供常用的命令行管理命令管理文件和目录 +- 支持大文件的断点上传、下载 + +## 名词解释 +- PFS:是`Paddlepaddle cloud File System`的缩写,是对用户文件存储空间的抽象,与之相对的是local filesystem。目前我们用CephFS来搭建。 +- [CephFS](http://docs.ceph.com/docs/master/cephfs/):一个POSIX兼容的文件系统。 +- Chunk:逻辑划上文件分块的单位。 + +## 模块 +### 架构图 + + +### PFSClient +- 功能: 详细设计[link](./pfs/pfsclient.md) + - 提供用户管理文件的命令 + - 需要可以跨平台执行 + +- 双向验证 + PFSClient需要和Ingress之间做双向验证[tls](#tls),所以用户需要首先在`cloud.paddlepaddle.org`上注册一下,申请用户空间,并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地,然后才能使用PFSClient。 + +### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) +- 功能: + 提供七层协议的反向代理、基于粘性会话的负载均衡功能。 + +- 透传用户身份的办法 + Ingress需要把PFSClient的身份信息传给PFSServer,配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3) + +### PFSServer +PFSServer提供RESTful API接口,接收处理PFSClient端的文件管理请求,并且把结果返回PFSClient端。 + +RESTful API + +- /api/v1/files + - `GET /api/v1/files`: Get metadata of files or directories. + - `POST /api/v1/files`: Create files or directories. + - `PATCH /api/v1/files`: Update files or directories. + - `DELETE /api/v1/files`: Delete files or directories. + +- /api/v1/file/chunks + - `GET /api/v1/storage/file/chunks`: Get chunks's metadata of a file. + +- /api/v1/storage/files + - `GET /api/v1/storage/files`: Download files or directories. + - `POST /api/v1/storage/files`: Upload files or directories. + +- /api/v1/storage/file/chunks + - `GET /api/v1/storage/file/chunks`: Download chunks's data. + - `POST /api/v1/storage/file/chunks`: Upload chunks's data. + +## 文件传输优化 + +### 分块文件传输 +用户文件可能是比较大的,上传到Cloud或者下载到本地的时间可能比较长,而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题,我们提出了Chunk的概念,一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小(默认256K),完成一个传输动作完成的时间也比较短,不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。 + +一个典型的Chunk如下所示: + +``` +type Chunk struct { + fileOffset int64 + checksum uint32 + len uint32 + data []byte +} +``` + +### 生成sparse文件 +当destination文件不存在或者大小和source文件不一致时,可以用[Fallocate](https://Go.org/pkg/syscall/#Fallocate)生成sparse文件,然后就可以并发写入多个Chunk。 + +### 覆盖不一致的部分 +文件传输的的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致,不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。 + +## 用户使用流程 +参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md) + +## 框架生成 +用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分,以便我们可以把更多的精力放到逻辑本身上。 + +## 参考文档 +- [TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md) +- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/) +- [linux man document](https://linux.die.net/man/) diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md new file mode 100644 index 0000000000000000000000000000000000000000..56bc70c54bbc92b78d66e04fb495b1300cf8ebe0 --- /dev/null +++ b/doc/design/file_manager/pfs/pfsclient.md @@ -0,0 +1,129 @@ +# PFSClient + +## Description +The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud + +## Synopsis +``` +paddle [options] pfs [parameters] +``` + +## Options +``` +--profile (string) + Use a specific profile from your credential file. + +--help (string) + Display more information about command + +--version + Output version information and exit + +--debug + Show detailed debugging log + +--only-show-errors (boolean) + Only errors and warnings are displayed. All other output is suppressed. +``` + +## Path Arguments +When using a command, we need to specify path arguments. There are two path argument type: `localpath` and `pfspath`. + +A `pfspath` begin with `/pfs`, eg: `/pfs/$DATACENTER/home/$USER/folder`. + +[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters. + +## order of Path Arguments +Commonly, if there are two path arguments, the first is the source, and the second is the destination. + +## Subcommonds +- rm - remove files or directories + +``` +Synopsis: + rm [-r] [-v] ... + +Options: + -r + Remove directories and their contents recursively + -v + Cause rm to be verbose, showing files after they are removed. + +Examples: + paddle pfs rm /pfs/$DATACENTER/home/$USER/file + paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder +``` +- mv - move (rename) files + +``` +Synopsis: + mv [-f | -n] [-v] + mv [-f | -n] [-v] ... + mv [-f | -n] [-v] + mv [-f | -n] [-v] ... + mv [-f | -n] [-v] + mv [-f | -n] [-v] ... + +Options: + -f + Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.) + -n + Do not overwrite an existing file. (The -n option overrides previous -f options.) + -v + Cause mv to be verbose, showing files after they are moved. + +Examples: + paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt +``` +- cp - copy files or directories + +``` +Synopsis: + cp [-r] [-f | -n] [-v] [--preserve--links] + cp [-r] [-f | -n] [-v] [--preserve--links] ... + cp [-r] [-f | -n] [-v] [--preserve--links] + cp [-r] [-f | -n] [-v] [--preserve--links] ... + cp [-r] [-f | -n] [-v] [--preserve--links] + cp [-r] [-f | -n] [-v] [--preserve--links] ... + +Options: + -r + Copy directories recursively + -f + Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.) + -n + Do not overwrite an existing file. (The -n option overrides previous -f options.) + -v + Cause cp to be verbose, showing files after they are copied. + --preserve--links + Reserve links when copy links + +Examples: + paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file + paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file +``` +- ls- list files + +``` +Synopsis: + ls [-r] ... + +Options: + -R + List directory(ies) recursively + +Examples: + paddle pfs ls /pfs/$DATACENTER/home/$USER/file + paddle pfs ls /pfs/$DATACENTER/home/$USER/folder +``` + +- mkdir - mkdir directory(ies) +Create intermediate directory(ies) as required. + +``` +Synopsis: + mkdir ... + +Examples: + paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder +``` diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle new file mode 100644 index 0000000000000000000000000000000000000000..7861a33072bc1908f69d12b37c20491dd8663103 Binary files /dev/null and b/doc/design/file_manager/src/filemanager.graffle differ diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png new file mode 100644 index 0000000000000000000000000000000000000000..8139a19f5722f56d3c211f3ab0d3982f751134b9 Binary files /dev/null and b/doc/design/file_manager/src/filemanager.png differ diff --git a/doc/design/float16.md b/doc/design/float16.md new file mode 100644 index 0000000000000000000000000000000000000000..1ea95ed6b5d6792171569b6ff76d09be92fcb13e --- /dev/null +++ b/doc/design/float16.md @@ -0,0 +1,105 @@ +# Design Doc: float16 + +## Why float16 +Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. + +When high precision computation is not required, using float16 data type could potentially + +- reduce storage space, memory bandwidth, and power usages; +- increase the chance of data fitting into a smaller cache of lower latency; +- provide arithmetic speed up if supported by hardware. + +## Survey of current float16 support +A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info. + +The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. + +### Compiler +- nvcc supports `__half` data type after CUDA 7.5. +- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4. +- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9. + +### Hardware +- `__half` is supported on GPU with compute capability >= 5.3. +- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above. +- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018). + +### Libraries +- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. +- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). + +### CUDA version issue +There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. +CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: +``` +typedef struct __align__(2) { + unsigned short x; +} __half; + +typedef __half half; +``` +This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types: +``` +__global__ void Add() { + half a, b, c; + c = __hadd(a, b); // correct + c = a + b; // compiler error: no operator "+" matches these operands +} +``` +CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). + +Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: +``` +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; +public: + // constructors and conversion operators from/to + // __half_raw and other built-in data types +} + +typedef __half half; + +__device__ __forceinline__ +__half operator+(const __half &lh, const __half &rh) { + return __hadd(lh, rh); +} + +// Other overloaded operators +``` +This new design makes `c = a + b` work correctly for CUDA half data type. + +## Implementation +The float16 class holds a 16-bit `uint16_t` data internally. +``` +struct float16 { + uint16_t x; +}; +``` + +float16 supports the following features: + - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. + - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen. + - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. + - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. + +To support the above features, two fundamental conversion functions are provided: +``` +float16 float_to_half_rn(float f); // convert to half precision in round-to-nearest-even mode +float half_to_float(float16 h); +``` +which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion. + +## To do +After float16 class is available, some of the future items are below: + +- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. + +- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16. + +- Create a type-casting operator that can convert the data type in tensor between float16 and other types. diff --git a/doc/design/functions_operators_layers.md b/doc/design/functions_operators_layers.md new file mode 100644 index 0000000000000000000000000000000000000000..984b59f4c6971dfb6f46dfe342f2751f392c0e88 --- /dev/null +++ b/doc/design/functions_operators_layers.md @@ -0,0 +1,100 @@ +# Design Doc: Functions, Operators, and Layers + +In a DL system, we can compose one or more fine grained operators into a coarse grained one. For example, the FC layer can be composed of a multiplication operator and an add operator. + +Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers. But we need a well-defined separation. + +In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions: + +```c++ +template T add(T x, T y) { return x + y; } +template T mul(T x, T y) { return x * y; } +``` + +Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation + +```c++ +#define MAKE_FUNCTION_OPERATOR(mul); +``` + +generates + +```c++ +template class mulOp : public OperatorBase {...}; +REGISTER_OP(mulOp, "mul"); +``` + +so that in Python we can create operator mul by: + +```python +X1 = Var() +X2 = Var() +Y = Var() +paddle.cpp.create_operator("mul", input=[X1, X2], output=Y) +``` + +Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`: + +```c++ +template +class FCOp : public OperatorBase { + public: + void Run(...) { + add(mul(Input("X"), Input("W")), Input("b"); + } +}; +REGISTER_OP(FCOp, "fc"); +``` + +We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API. + +Let's explain using an example. Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`: + +```python +def operator.mul(X1, X2): + O = Var() + paddle.cpp.create_operator("mul", input={X1, Y1}, output=O) + return O + +def operator.add(X1, X2): + O = Var() + paddle.cpp.create_operator("add", input={X1, X2}, output=O) + return O +``` + +Above code snippets are automatically generated. Given them, users can define + +```python +def layer.fc(X): + W = Var() + b = Var() + return operator.add(operator.mul(X, W), b) +``` + +If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated: + +```python +def layer.fc(X): + W = Var() + b = Var() + O1 = Var() + paddle.cpp.create_operator("mul", input=[X, W], output=O1) + O2 = Var() + paddle.cpp.create_operator("add", input=[O1, b], output=O2) + return O2 +``` + +We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example: + + +| C++ functions/functors | mul | add | | | +|------------------------|--------------|--------------|-------------|----------| +| C++ operator class | mulOp | addOp | FCOp | | +| Python binding | operator.mul | operator.add | operator.fc | | +| Python function | | | | layer.fc | + + +This is how we differentiate layer and operators in PaddlePaddle: + +- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas +- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers. diff --git a/doc/design/gan_api.md b/doc/design/gan_api.md new file mode 100644 index 0000000000000000000000000000000000000000..fb41df8615f73d9fd4c32995eab265833eac1a55 --- /dev/null +++ b/doc/design/gan_api.md @@ -0,0 +1,253 @@ +# Design for GAN + +GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. + +It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth. + +In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation. + +

+
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run. +

+ +The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563. + +

+
+Figure 2. Photo borrowed from the original DC-GAN paper. +

+ +## The Conditional-GAN might be a class. +This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure: + +- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API: + +- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well. + +- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen. +Returns a generated image. + +- discriminator(image): +Given an image, decide if it is from a real source or a fake one. +Returns a 0/1 binary label. + +- build_model(self): +build the whole GAN model, define training loss for both generator and discrimator. + +## Discussion on Engine Functions required to build GAN +- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can'be be trained correctly) +- Different optimizers responsible for optimizing different loss. + +To be more detailed, we introduce our design of DCGAN as following: + +### Class member Function: Initializer +- Set up hyper-parameters, including condtional dimension, noise dimension, batch size and so forth. +- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G. +```python +class DCGAN(object): + def __init__(self, y_dim=None): + + # hyper parameters + self.y_dim = y_dim # conditional gan or not + self.batch_size = 100 + self.z_dim = z_dim # input noise dimension + + # define parameters of discriminators + self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer()) + self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.D_W2 = pd.Varialble(np.random.rand(128, 1)) + self.D_b2 = pd.Variable(np.zeros(128)) + self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2] + + # define parameters of generators + self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer()) + self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a numpy data + self.G_W2 = pd.Varialble(np.random.rand(128, 1)) + self.G_b2 = pd.Variable(np.zeros(128)) + self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2] +``` + +### Class member Function: Generator +- Given a noisy input z, returns a fake image. +- Concatenation, batch-norm, FC operations required; +- Deconv layer required, which is missing now... +```python +class DCGAN(object): + def generator(self, z, y = None): + # input z: the random noise + # input y: input data label (optional) + # output G_im: generated fake images + + if not self.y_dim: + z = pd.layer.concat(1, [z, y]) + + G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0) + G_h0_bn = pd.layer.batch_norm(G_h0) + G_h0_relu = pd.layer.relu(G_h0_bn) + + G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1) + G_h1_bn = pd.layer.batch_norm(G_h1) + G_h1_relu = pd.layer.relu(G_h1_bn) + + G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)) + G_im = pd.layer.tanh(G_im) + return G_im +``` + +### Class member function: Discriminator +- Given a noisy input z, returns a fake image. +- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required; +```python +class DCGAN(object): + def discriminator(self, image): + # input image: either generated images or real ones + # output D_h2: binary logit of the label + + D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0) + D_h0_bn = pd.layer.batchnorm(h0) + D_h0_relu = pd.layer.lrelu(h0_bn) + + D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1) + D_h1_bn = pd.layer.batchnorm(D_h1) + D_h1_relu = pd.layer.lrelu(D_h1_bn) + + D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2) + return D_h2 +``` + +### Class member function: Build the model +- Define data readers as placeholders to hold the data; +- Build generator and discriminators; +- Define two training losses for discriminator and generator, respectively. +If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self): + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_t = self.discriminator(self.images) + # generated fake images + self.sampled = self.sampler(self.z, self.y) + self.D_f = self.discriminator(self.G) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_t = self.discriminator(self.images) + # generate fake images + self.sampled = self.sampler(self.z) + self.D_f = self.discriminator(self.images) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake + + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie)) +``` + +If we do not have dependency engine but blocks, the module building our GAN model will be like this: +```python +class DCGAN(object): + def build_model(self, default_block): + # input data in the default block + if self.y_dim: + self.y = pd.data(pd.float32, [self.batch_size, self.y_dim]) + self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size]) + self.z = pd.data(tf.float32, [None, self.z_size]) + + # step 1: generate images by generator, classify real/fake images with discriminator + with pd.default_block().g_block(): + if self.y_dim: # if conditional GAN, includes label + self.G = self.generator(self.z, self.y) + self.D_g = self.discriminator(self.G, self.y) + else: # original version of GAN + self.G = self.generator(self.z) + self.D_g = self.discriminator(self.G, self.y) + self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie)) + + with pd.default_block().d_block(): + if self.y_dim: # if conditional GAN, includes label + self.D_t = self.discriminator(self.images, self.y) + self.D_f = self.discriminator(self.G, self.y) + else: # original version of GAN + self.D_t = self.discriminator(self.images) + self.D_f = self.discriminator(self.G) + + # step 2: define the two losses + self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)) + self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)) + self.d_loss = self.d_loss_real + self.d_loss_fake +``` +Some small confusion and problems with this design: +- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph. +- Requires ability to create a block anytime, rather than in if-else or rnn only; + +## Main function for the demo: +Generally, the user of GAN just need to the following things: +- Define an object as DCGAN class; +- Build the DCGAN model; +- Specify two optimizers for two different losses with respect to different parameters. +```python +# pd for short, should be more concise. +from paddle.v2 as pd +import numpy as np +import logging + +if __name__ == "__main__": + # dcgan class in the default graph/block + # if we use dependency engine as tensorflow + # the codes, will be slightly different like: + # dcgan = DCGAN() + # dcgan.build_model() + with pd.block() as def_block: + dcgan = DCGAN() + dcgan.build_model(def_block) + + # load mnist data + data_X, data_y = self.load_mnist() + + # Two subgraphs required!!! + with pd.block().d_block(): + d_optim = pd.train.Adam(lr = .001, beta= .1) + d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D) + with pd.block.g_block(): + g_optim = pd.train.Adam(lr = .001, beta= .1) + g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G) + + # executor + sess = pd.executor() + + # training + for epoch in xrange(10000): + for batch_id in range(N / batch_size): + idx = ... + # sample a batch + batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size] + # sample z + batch_z = np.random.uniform(-1., 1., [batch_size, z_dim]) + + if batch_id % 2 == 0: + sess.run(d_step, + feed_dict = {dcgan.images: batch_im, + dcgan.y: batch_label, + dcgan.z: batch_z}) + else: + sess.run(g_step, + feed_dict = {dcgan.z: batch_z}) +``` + +# More thinking about dependency engine v.s. block design: +- What if we just want to run an intermediate result? Do we need to run the whole block/graph? +- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage? diff --git a/doc/design/graph.md b/doc/design/graph.md new file mode 100644 index 0000000000000000000000000000000000000000..7519a65df835a39fe14f6ef45530afff170191ff --- /dev/null +++ b/doc/design/graph.md @@ -0,0 +1,70 @@ +# Design Doc: Computations as a Graph + +A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before. + +This document explains that the construction of a graph as three steps: + +- construct the forward part +- construct the backward part +- construct the optimization part + +## The Construction of a Graph + +Let us take the problem of image classification as a simple example. The application program that trains the model looks like: + +```python +x = layer.data("images") +l = layer.data("label") +y = layer.fc(x) +cost = layer.mse(y, l) +optimize(cost) +train(cost, reader=mnist.train()) +``` + +### Forward Part + +The first four lines of above program build the forward part of the graph. + +![](images/graph_construction_example_forward_only.png) + +In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators. + +Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so to run at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch. + +In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message. + +### Backward Part + +The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`. + +`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part. + +![](images/graph_construction_example_forward_backward.png) + +According to the chain rule of gradient computation, `ConstructBackwardGraph` would + +1. create a gradient operator G for each operator F, +1. make all inputs, outputs, and outputs' gradient of F as inputs of G, +1. create gradients for all inputs of F, except for those who don't have gradients, like x and l, and +1. make all these gradients as outputs of G. + +### Optimization Part + +For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. Here results in the complete graph: + +![](images/graph_construction_example_all.png) + +## Block and Graph + +The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block. + +A Block keeps operators in an array `BlockDesc::ops` + +```protobuf +message BlockDesc { + repeated OpDesc ops = 1; + repeated VarDesc vars = 2; +} +``` + +in the order that they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators. diff --git a/doc/design/graph_survey.md b/doc/design/graph_survey.md new file mode 100644 index 0000000000000000000000000000000000000000..6c6db08f463ae0a2b94fc4546f123a1d7c151870 --- /dev/null +++ b/doc/design/graph_survey.md @@ -0,0 +1,232 @@ +## Survey on Graph + +Neural network framework often provides symbolic API for users to write network topology conveniently. This doc manily focus on symbolic API in most popular neural network frameworks, and try to find out how to parse symbolic configuration to a portable file, such as protobuf or json. + +### Mxnet + +The core concept of symbolic API is `Symbol`. Mxnet implements `Symbol` class in C++, and export to Python using C-API. Please refer to the comments in Mxnet: + + +`Symbol` is help class used to represent the operator node in Graph. +`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value. + + +A simple network topology wrote by Symbol is as follows: + +```python +def get_symbol(num_classes=10, **kwargs): + data = mx.symbol.Variable('data') + data = mx.symbol.Flatten(data=data) + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) + act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) + mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') + return mlp +``` + + + +Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null. + +Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph. + +And Symbol can be saved to a Json file. + +Here is a detailed example: + +``` +>>> import mxnet as mx +>>> data = mx.symbol.Variable('data') +>>> print data.debug_str() +Variable:data + +>>> data = mx.symbol.Flatten(data=data) +>>> print data.debug_str() +Symbol Outputs: + output[0]=flatten0(0) +Variable:data +-------------------- +Op:Flatten, Name=flatten0 +Inputs: + arg[0]=data(0) version=0 + +>>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) +>>> print fc1.debug_str() +Symbol Outputs: + output[0]=fc1(0) +Variable:data +-------------------- +Op:Flatten, Name=flatten0 +Inputs: + arg[0]=data(0) version=0 +Variable:fc1_weight +Variable:fc1_bias +-------------------- +Op:FullyConnected, Name=fc1 +Inputs: + arg[0]=flatten0(0) + arg[1]=fc1_weight(0) version=0 + arg[2]=fc1_bias(0) version=0 +Attrs: + num_hidden=128 + +``` + + +### TensorFlow + + +The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow: + +A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session). + +A simple example is as follows: + +```python + # Build a dataflow graph. + c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) + d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) + e = tf.matmul(c, d) + + # Construct a `Session` to execute the graph. + sess = tf.Session() + + # Execute the graph and store the value that `e` represents in `result`. + result = sess.run(e) +``` + + +The main method of `Tensor` is as follows: + + +```python +@property +def op(self): + """The `Operation` that produces this tensor as an output.""" + return self._op + +@property +def dtype(self): + """The `DType` of elements in this tensor.""" + return self._dtype + +@property +def graph(self): + """The `Graph` that contains this tensor.""" + return self._op.graph + +@property +def name(self): + """The string name of this tensor.""" + if not self._op.name: + raise ValueError("Operation was not named: %s" % self._op) + return "%s:%d" % (self._op.name, self._value_index) + +@property +def device(self): + """The name of the device on which this tensor will be produced, or None.""" + return self._op.device +``` + + +Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency. + + +Here is a detailed example: + + +``` +>>> import tensorflow as tf +>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]]) +>>> print c.graph + +>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]]) +>>> print d.graph + +>>> e = tf.matmul(c, d) +>>> print e.graph + +``` + +### Dynet + + +The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++. + + +A simple example is as follows: + +```cpp +ComputationGraph cg; +Expression W = parameter(cg, pW); + +Expression in = input(cg, xs[i]); +Expression label = input(cg, ys[i]); +Expression pred = W * in; +Expression loss = square(pred - label); +``` + +The input data and parameter are also represented by Expression. Every basci Expression corresponds to a Node. And input data is also a Node. + +Expression has a data member ComputationGraph, and ComputationGraph will be modified in users' configuring process. Expression can be a running target, beacuse Expression contains all dependency. + + +Here is a detailed example: + +write topology in C++ + +``` +ComputationGraph cg; +Expression W = parameter(cg, pW); +cg.print_graphviz(); + +Expression pred = W * xs[i]; +cg.print_graphviz(); + +Expression loss = square(pred - ys[i]); +cg.print_graphviz(); +``` + +compile and print + +``` +# first print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; +} +# second print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; +} +# third print +digraph G { + rankdir=LR; + nodesep=.05; + N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"]; + N1 [label="v1 = v0 * -0.98"]; + N0 -> N1; + N2 [label="v2 = -1.88387 - v1"]; + N1 -> N2; + N3 [label="v3 = -v2"]; + N2 -> N3; + N4 [label="v4 = square(v3)"]; + N3 -> N4; +} +``` + +### Conclusion + + +Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are the same level concepts. We use a unified name Expression here, this level concept has following features: + +- Users wirte topoloy with symbolic API, and all return value is Expression, including input data and parameter. +- Expression corresponds with a global Graph, and Expression can also be composed. +- Expression tracks all dependency and can be taken as a run target diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md new file mode 100644 index 0000000000000000000000000000000000000000..26d140f06db4ecefa86be015eaa731ffddc6910c --- /dev/null +++ b/doc/design/if_else_op.md @@ -0,0 +1,51 @@ +# The `IfElse` Operator + +PaddlePaddle's `IfElse` operator differs from TensorFlow's: + +- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas +- the PaddlePaddle version takes a vector of boolean value as the condition, and instances corresponding to true values go to the true branch, those corresponding to false values go to the false branch. + +## Example + +The following PaddlePaddle program shows the usage of the IfElse operator: + +```python +import paddle as pd + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch. + +An equivalent C++ program is as follows: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int d = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} +``` diff --git a/doc/design/images/asgd.gif b/doc/design/images/asgd.gif new file mode 100644 index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e Binary files /dev/null and b/doc/design/images/asgd.gif differ diff --git a/doc/design/images/feed_forward.png b/doc/design/images/feed_forward.png new file mode 100644 index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b Binary files /dev/null and b/doc/design/images/feed_forward.png differ diff --git a/doc/design/images/feed_forward_regularized.png b/doc/design/images/feed_forward_regularized.png new file mode 100644 index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447 Binary files /dev/null and b/doc/design/images/feed_forward_regularized.png differ diff --git a/doc/design/images/graph_construction_example.bash b/doc/design/images/graph_construction_example.bash new file mode 100755 index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e --- /dev/null +++ b/doc/design/images/graph_construction_example.bash @@ -0,0 +1,11 @@ +cat ./graph_construction_example.dot | \ + sed 's/color=red/color=red, style=invis/g' | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_only.png + +cat ./graph_construction_example.dot | \ + sed 's/color=green/color=green, style=invis/g' | \ + dot -Tpng > graph_construction_example_forward_backward.png + +cat ./graph_construction_example.dot | \ + dot -Tpng > graph_construction_example_all.png diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot new file mode 100644 index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9 --- /dev/null +++ b/doc/design/images/graph_construction_example.dot @@ -0,0 +1,68 @@ +digraph ImageClassificationGraph { + ///////// The forward part ///////// + FeedX [label="Feed", color=blue, shape=box]; + FeedY [label="Feed", color=blue, shape=box]; + InitW [label="Init", color=blue, shape=diamond]; + Initb [label="Init", color=blue, shape=diamond]; + FC [label="FC", color=blue, shape=box]; + MSE [label="MSE", color=blue, shape=box]; + + x [label="x", color=blue, shape=oval]; + l [label="l", color=blue, shape=oval]; + y [label="y", color=blue, shape=oval]; + W [label="W", color=blue, shape=doublecircle]; + b [label="b", color=blue, shape=doublecircle]; + cost [label="cost", color=blue, shape=oval]; + + FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; + FeedY -> l [color=blue]; + InitW -> W [color=blue]; + Initb -> b [color=blue]; + W -> FC [color=blue]; + b -> FC [color=blue]; + l -> MSE [color=blue]; + + ////////// The backward part ///////// + MSE_Grad [label="MSE_grad", color=red, shape=box]; + FC_Grad [label="FC_grad", color=red, shape=box]; + + d_cost [label="d cost", color=red, shape=oval]; + d_y [label="d y", color=red, shape=oval]; + d_b [label="d b", color=red, shape=oval]; + d_W [label="d W", color=red, shape=oval]; + + cost -> MSE_Grad [color=red]; + d_cost -> MSE_Grad [color=red]; + l -> MSE_Grad [color=red]; + y -> MSE_Grad -> d_y [color=red]; + + x -> FC_Grad [color=red]; + y -> FC_Grad [color=red]; + d_y -> FC_Grad [color=red]; + W -> FC_Grad -> d_W [color=red]; + b -> FC_Grad -> d_b [color=red]; + + ////////// The optimizaiton part ////////// + + OPT_W [label="SGD", color=green, shape=box]; + OPT_b [label="SGD", color=green, shape=box]; + + W -> OPT_W [color=green]; + b -> OPT_b [color=green]; + d_W -> OPT_W -> W [color=green]; + d_b -> OPT_b -> b [color=green]; + + ////////// Groupings ////////// + + subgraph clusterMSE { + style=invis; + MSE; + MSE_Grad; + } + + subgraph clusterFC { + style=invis; + FC; + FC_Grad; + } +} diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png new file mode 100644 index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2 Binary files /dev/null and b/doc/design/images/graph_construction_example_all.png differ diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png new file mode 100644 index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_backward.png differ diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png new file mode 100644 index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4 Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_only.png differ diff --git a/doc/design/images/l1_regularization.png b/doc/design/images/l1_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972 Binary files /dev/null and b/doc/design/images/l1_regularization.png differ diff --git a/doc/design/images/l2_regularization.png b/doc/design/images/l2_regularization.png new file mode 100644 index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298 Binary files /dev/null and b/doc/design/images/l2_regularization.png differ diff --git a/doc/design/images/loss_equation.png b/doc/design/images/loss_equation.png new file mode 100644 index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e Binary files /dev/null and b/doc/design/images/loss_equation.png differ diff --git a/doc/design/images/replica.png b/doc/design/images/replica.png new file mode 100644 index 0000000000000000000000000000000000000000..ef59e56b01d792a059279e6bb9a29f3db6a59a41 Binary files /dev/null and b/doc/design/images/replica.png differ diff --git a/doc/design/images/theta_star.gif b/doc/design/images/theta_star.gif new file mode 100644 index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2 Binary files /dev/null and b/doc/design/images/theta_star.gif differ diff --git a/doc/design/images/two_phase_commit.png b/doc/design/images/two_phase_commit.png new file mode 100644 index 0000000000000000000000000000000000000000..ef6f7317bd440cc7d9fe08fcbbf2b7a542f99049 Binary files /dev/null and b/doc/design/images/two_phase_commit.png differ diff --git a/doc/design/infer_var_type.md b/doc/design/infer_var_type.md new file mode 100644 index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a --- /dev/null +++ b/doc/design/infer_var_type.md @@ -0,0 +1,78 @@ +# Design Doc: InferVarType + +## The Problem Posed + +The variable in our design can hold variant types. Such as `LoDTensor` and `SelectedRows`. An operator should be able to inference the variable types of its output. + +For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output. + +The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator in compile time. + +## Proposed Solution + +The `InferVarType` is a compile-time function which is registered to each operator. The inferface of that function is: + + +```c++ +using InferVarTypeFN = std::function< + void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>; +``` + +It takes an operator description as its input and will write the output variable type and store them in block description. + +The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be + +```cpp +struct OpInfo { + InferVarTypeFN infer_var_type_; + ... +}; +``` + +The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`. + +```cpp +void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) { + // set the output type of variable as `LoDTensor`. + // ... +} + +struct OpInfo { + InferVarTypeFN infer_var_type_; + InferVarTypeFN GetInferVarType() const { + if (infer_var_type_) { + return infer_var_type_; + } else { + return DefaultInferVarType; + } + } +}; +``` + +## Register InferVarType + +We provide a thin base class for registering an `InferVarTypeFN`. To use a base class will ease the implementation of registry since we can detect the registry entry is an `InferVarTypeFN` or not. + +```cpp +class VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0; +} +``` + +Operator developers can write the specialize `VarTypeInferer` as follow. + +```cpp +class SpecialVarTypeInferer : public VarTypeInferer { +public: + virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const { + // .. own logic + } +} +``` + +Then user can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`. + +``` +REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...); +``` diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..61d453de243c25defc56161641bc4a888a88a3b7 --- /dev/null +++ b/doc/design/mkldnn/README.MD @@ -0,0 +1,211 @@ +# Intel® MKL-DNN on PaddlePaddle: Design Doc + +我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn) +(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle, +充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 + +
+
+Figure 1. PaddlePaddle on IA +
+ +近期目标 + +- 完成常用Layer的MKL-DNN实现。 +- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 + +目前的优化,主要针对PaddlePaddle在重构之前的代码框架以及V1的API。 +具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。 + +## Contents + +- [Overview](#overview) +- [Actions](#actions) + - [CMake](#cmake) + - [Matrix](#matrix) + - [Layers](#layers) + - [Activations](#activations) + - [Parameters](#parameters) + - [Gradients](#gradients) + - [Unit Tests](#unit-tests) + - [Python API](#python-api) + - [Benchmarking](#benchmarking) + - [Others](#others) +- [Design Concerns](#design-concerns) + +## Overview + +我们会把MKL-DNN会作为第三方库集成进PaddlePaddle,与其他第三方库一样,会在编译PaddlePaddle的时候下载并编译MKL-DNN。 + +同时,为了进一步提升PaddlePaddle在基本数学运算的计算速度,我们也将MKLML即(MKL small library\[[1](#references)\]) +作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。 + +MKL,MKLML以及MKL-DNN三者关系如下表: + +| Name | Open Source | License | Descriptions | +| :---------- | :--------------- | :---------- | :------------ | +| MKL | No | Proprietary | Accelerate math processing routines | +| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning | +| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks | + +MKLML可以与MKL-DNN共同使用,以此达到最好的性能。 + +
+
+Figure 2. PaddlePaddle with MKL Engines +
+ +## Actions + +添加的相关文件和目录结构如下: + +```txt +PaddlePaddle/Paddle +├── ... +├── cmake/ +│ ├── external/ +│ │ ├── ... +│ │ ├── mkldnn.cmake +│ │ └── mklml.cmake +└── paddle/ + ├── ... + ├── math/ + │ ├── ... + │ └── MKLDNNMatrix.* + └── gserver/ + ├── ... + ├── layers/ + │ ├── ... + │ └── MKLDNN*Layer.* + ├── activations/ + │ ├── ... + │ └── MKLDNNActivations.* + └── tests/ + ├── ... + ├── MKLDNNTester.* + └── test_MKLDNN.cpp +``` + +### CMake +在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN + +- `WITH_MKLML` 控制是否使用MKLML库。 +当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 +编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。 +MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。 +- `WITH_MKLDNN` 控制是否使用MKL-DNN。 +当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。 +编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。 +MKL-DNN的库目前只有动态库`libmkldnn.so`。 + +### Matrix +目前在PaddlePaddle中数据都是以`NCHW`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 +所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。 + +
+
+Figure 3. MKLDNNMatrix +
+ +### Layers +所有MKL-DNN的Layers都会继承于`MKLDNNLayer`,该类继承于PaddlePaddle的基类`Layer`。 +在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑, +子类只需要使用定义好的接口,实现具体的函数功能即可。 + +
+
+Figure 4. MKLDNNLayer +
+ +每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix: + +- 内部存储(internel memory):`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表输入数据,输入梯度,输出数据和输出梯度。 +- 外部存储(external memory):都是以ext开头,比如`extInVal_`和`extInGrad_`,它们主要是用于, +当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时,转换内存的工作。 +需要注意的是,PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, +所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, +如果不需要外部存储用于转换,那么对应的内部存储也会与它们共享内存。 +- 转换函数(resetXXX): 包括`resetInValue`,`resetInGrad`,`resetOutValue`和`resetOutGrad`, +表示对输入数据,输入梯度,输出数据和输出梯度的转换。 +这些函数会根据输入参数重新设置内部和外部存储,当然这两者也可以相等,即表示不需要转换。 + +注意:每个`MKLDNNlayer`的子类只需要使用内部存储就可以了,所有外部的转换工作都会在reset系列函数中都准备好。 + +### Activations +在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是共用一块内存, +所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`。 + +### Parameters +对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。 +如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式, +在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。 +这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。 + +### Gradients +由于MKL-DNN的操作都是直接覆盖的形式,也就是说输出的结果不会在原来的数据上累加, +这样带来的好处就是不需要一直清空memory,节省了不必要的操作。 +但是注意的是,当网络出现分支且在`backward`的时候,需要累加不同Layer传过来的梯度。 +所以在`MKLDNNlayer`中实现了一个merge的方法,此时每个小分支的`Input Gradient` +会先临时保存在`MKLDNNMatrix`中,由分支处的Layer负责求和,并把结果放到当前层的`output_.grad`中。 +所以整体上,在实现每个子类的时候就不需要关心分支的事情了。 + +
+
+Figure 5. Merge Gradients +
+ +### Unit Tests +我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 +测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 +每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 + +### Python API +目前只考虑**v1 API**。 + +计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择,方便用户选择使用MKL-DNN的layers。 + +具体实现方式比如: + +```python +use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) +if use_mkldnn + self.layer_type = mkldnn_* +``` + +所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 + +同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 + +### Benchmarking +会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image),用于测试和对比在使用MKL-DNN前后的CNN网络性能。 +测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) + +### Others +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。 +2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 + +## Design Concerns + +为了更好的符合PaddlePaddle的代码风格\[[3](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。 + +我们总结出一些特别需要注意的点: + +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数, +我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 +2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 +包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 +4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, +同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。 +在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。 + +## References +1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 +主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 +2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。 +目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。 +3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。 +但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 +所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 + diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkldnn/image/engine.png new file mode 100644 index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b Binary files /dev/null and b/doc/design/mkldnn/image/engine.png differ diff --git a/doc/design/mkldnn/image/gradients.png b/doc/design/mkldnn/image/gradients.png new file mode 100644 index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c Binary files /dev/null and b/doc/design/mkldnn/image/gradients.png differ diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkldnn/image/layers.png new file mode 100644 index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a Binary files /dev/null and b/doc/design/mkldnn/image/layers.png differ diff --git a/doc/design/mkldnn/image/matrix.png b/doc/design/mkldnn/image/matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2 Binary files /dev/null and b/doc/design/mkldnn/image/matrix.png differ diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png new file mode 100644 index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188 Binary files /dev/null and b/doc/design/mkldnn/image/overview.png differ diff --git a/doc/design/model_format.md b/doc/design/model_format.md new file mode 100644 index 0000000000000000000000000000000000000000..e29129fddf775939c9f7a8b49d850d523e6e5a45 --- /dev/null +++ b/doc/design/model_format.md @@ -0,0 +1,36 @@ +# Design Doc: Model Format + +## Motivation + +A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code. + +As a result, In PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. + +## Implementation + +The topology is saved as a plain text in a detailed self-contain protobuf file. + +The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task. + +As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, + +The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format. + +|field name | type | description | +| --- | --- | --- | +| version | uint32_t | Version of saved file. Always 0 now. | +| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. | +| tensor desc | void* | TensorDesc protobuf binary message | +| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` | +| lod_level | uint64_t | Level of LoD | +| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. | +| data of lod[0] | uint64_t* | [Optional] lod[0].data() | +| ... | ... | ... | + + + +## Summary + +- We introduce a model format. +- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message. +- A bunch of specified format binary tensors describe the **parameters**. diff --git a/doc/design/multi_language_interface/00.why_plain_c.md b/doc/design/multi_language_interface/00.why_plain_c.md new file mode 100644 index 0000000000000000000000000000000000000000..a1443093342c5a3ed698fb6b52a751dfc7cb5319 --- /dev/null +++ b/doc/design/multi_language_interface/00.why_plain_c.md @@ -0,0 +1,118 @@ +# Paddle多语言接口实现 +## 背景 + +Paddle需要一个多语言接口,这个接口需要做到: + +* 有标准的,良好的文档 + * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档,golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。 +* 不同语言的接口适应不同语言的特性 + * 例如Java与Python的错误处理是直接扔出来Exception,而对于golang错误处理应该使用返回值。 + +## 基本要求 + +Paddle的多语言接口实现包括一下几个方面: + +* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器,也不使用其他动态库。 +* 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号。 +* 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)。 +* 不使用SWIG这种代码生成器,而是手写多语言绑定。 + + +## 原因 + +### 使用动态库来分发Paddle + +* Paddle的链接方式比较复杂 + * 如果用户要把Paddle的静态库(libpaddle.a)链接到自己的程序里,得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数,来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。 +* 编译型语言,例如C/C++使用静态库和动态库难度差不多。但是解释性语言,例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni),只能调用Paddle的动态库,否则得把Paddle静态库链接到解释器里。 + * 解释性语言实际运行的二进制是解释器本身,如果调用静态库只能将静态库与解释器链接。例如对于Java来说,便是将静态库加入JVM中。这对于通常的Java的开发者来说,是不常见的做法。 + +### 动态库中不嵌入任何其他语言的解释器 + +* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取 +* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析,数据读取均交由其他语言完成 + +现阶段Paddle有一个问题是,Paddle内嵌的Python解释器和外部使用的Python如果版本不同,会直接报错退出。 + +### Paddle动态库中,不引用其他动态库 + +* 即这个动态库是不依赖于其他任何文件的,可以在任何机器上执行的。 + +### 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号 + +* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范,不同版本的编译器之间,对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库),需要有稳定的导出符号。 +* C语言是有导出符号的标准的,并且在常见的平台上,都是ABI调用标准的。 +* 大多数语言都支持使用C语言API +* 使用C99而不使用C89,是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。 +* 使用C99而不使用C11的原因是,[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性,且C99相对于C11使用更加广泛。 + +### 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler) + +* Paddle内部的类为C++书写,直接导出到C的接口比较困难。 +* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。 + +在C的头文件 `paddle_matrix.h` 中: + +```C +typedef void* paddle_matrix; +typedef int paddle_error; + +extern "C" +paddle_error paddle_matrix_get_shape(paddle_matrix matrix, + uint64_t* width, + uint64_t* height); +``` +而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` + +```cpp +#include "paddle/math/matrix.h" +extern "C" +paddle_error paddle_matrix_shape(paddle_matrix matrix, + uint64_t *width, + uint64_t *height) { + auto m = (paddle::capi::CMatrix*)(matrix); + *width = m->width(); + *height = m->height(); +} +``` + +其中`paddle/capi/CMatrix.hpp`文件内容为: + +```cpp +namespace paddle { +namespace math { + +class CMatrix { + std::shared_ptr mat; +}; + +} // namespace math +} // namespace paddle +``` + +### 不使用SWIG这种代码生成器,而是手写多语言绑定 + +* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码,SWIG直接读取C/C++的头文件,生成各种语言的绑定代码。 + * 对于多语言接口,SWIG需要写一个interface文件。这个文件具有独特的语法,学习成本高。且增加一个第三方语言,就需要对这个第三方语言增加一些定义。有的时候,interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。 + * SWIG暴露的接口保留了C++的接口样式,很难保证多语言代码风格的一致性。(函数命名,错误处理) + * 因为SWIG在第三方语言中暴露的函数名,类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里,将大量的`SomeCppClass`重命名成`some_python_class`,或者`SomeGoTypes`。 + * 对于不同语言,错误处理的方式也不尽相同。例如对于Java或者Python,最常见的错误处理方式是Exception,而对于Golang,错误处理方式是返回值。而SWIG只能简单的暴露C++接口,无法做到对于各种语言错误处理方式的适配。 + * 对于大多数语言,直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。 + * SWIG支持的语言或者解释器有局限。例如对于Python,使用SWIG只支持CPython解释器,而不支持PyPy解释器。 + + +## 原因列表 + +| 结论 | 对比 | 原因 | +|---| --- | --- | +| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库,Paddle静态库链接复杂 | +| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器,会导致不同版本Python在一个进程里的bug | +| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 | +| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI,C99是目前C最广泛的使用标准,且C99支持bool类型和定长整数(uint64_t等)类型 | +| 使用void*作为类句柄 | 不显示的写每个类具体包含什么| 实现简单,并且让接口脱离实现细节 | +| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 | + + +## 实现 + +参考[Inference implementation](01.inference_implementation.md) diff --git a/doc/design/multi_language_interface/01.inference_implementation.md b/doc/design/multi_language_interface/01.inference_implementation.md new file mode 100644 index 0000000000000000000000000000000000000000..9820284523246a062581f322616d196f575c9d29 --- /dev/null +++ b/doc/design/multi_language_interface/01.inference_implementation.md @@ -0,0 +1,131 @@ +# C-API 模型推断实现文档 + +本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API,通过模型推断API的实现作为一个样例,来进行讨论。至于为什么需要C-API,请参考[Why Plain C](./00.why_plain_c.md)。 + +## Table of Contents + * [C-API 模型推断实现文档](#c-api-模型推断实现文档) + * [暴露接口原则](#暴露接口原则) + * [目录结构](#目录结构) + * [实现方式](#实现方式) + * [capi.h](#capih) + * [具体某种类型的头文件](#具体某种类型的头文件) + * [capi_private.h](#capi_privateh) + * [具体某种类型的实现文件](#具体某种类型的实现文件) + * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib) + * [libpaddle_capi_whole.a](#libpaddle_capi_wholea) + * [examples](#examples) + * [编译选项](#编译选项) + + +## 暴露接口原则 + +1. 所有的接口均为C接口。即使用`extern "C"` +2. 除构造某种类型的函数(`paddle_matrix_create`等),其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。 +3. 所有类型名为`paddle_类型名`,所有与类型相关的函数,函数名为`paddle_类型名_函数名` +4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言,那么 + * 为了暴露的接口尽量简单。只暴露概念的接口,而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。 + * 暴露这个概念必要函数。`必要`是指,即完成某一个任务的最少函数。 +5. 不在`capi`接口层做过多封装。 + * 如果某一个Paddle概念必须要暴露,但是又过于琐碎。不在`capi`这一层进行封装,而是直接修改Paddle Core。让Paddle核心中,这一概念不再琐碎。 + + +## 目录结构 + +```text +Paddle + `-- paddle + `-- capi + `-- examples # The example project for C-API. + `-- tests # unittests for C-API + `-- capi.h # C-API header file. + `-- capi_private.h # The shared header file between implementation sources. + `-- matrix.{h, cpp} + `-- gradient_machine.{h, cpp} + `-- ... +``` + + +Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件,均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即,安装后的目录结构为 + +```text +`-- include + `-- paddle + `-- capi.h + `-- matrix.h + `-- gradient_machine.h + `-- ... +`-- lib + `-- libpaddle_capi_shared.{so, dylib} # In mac, dynamic libary's file name extention is `dylib` + `-- libpaddle_capi_whole.a # static library for all symbols of Paddle. +``` + +## 实现方式 + +下面分别介绍某一类文件的实现方式。 + +### capi.h + +`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中,引入了类型的头文件,`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时,使用相对路径的引用方式。即`#include "matrix.h"` + +### 具体某种类型的头文件 + +具体某种类型的头文件,即例如`matrix.h`,`gradient_machine.h`等。在这些头文件中,包含了某种类型的类型定义和暴露的全部函数。 + +这个头文件不假设其他文件的引用顺序,即使用户直接引用某种类型的头文件,也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型,例如`gradient_machine`需要引用`matrix`,则直接引入另一种类型的头文件,即`#include "matrix.h"`。 + +### capi_private.h + +`capi_prviate.h`是各个实现中共享的头文件,他主要包含了实际暴露的类型结构。在用户使用C-API时,Paddle的类型全部退化成`void *`,即`typedef paddle_matrix void*`。但,对于每种C-API暴露的类型,均是在`capi_private.h`中实现的结构体。 + +```cpp +struct CMatrix { + int type = MatrixType; + std::shared_ptr mat; +}; +``` + +通常,这个结构体包含两个项目。 + +* `type`是一个类型的标志。对于每种类型,type字段均不尽相同。这样,即使C-API接受的类型全是`void *`,我们也可以确定每一个参数的类型。 + + ```cpp + void some_c_api_function(void* some_instance) { + int* type = (int *) some_instance; + switch (*type) { + case MatrixType: + CMatrix* mat = (CMatrix *) some_instance; + ... + ... + } + } + ``` +* 这个结构体中的另一个项目是,Paddle Core中这一类型接口的智能指针(shared_ptr)。 + * 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例,而不必在意Paddle Core是否还在使用这个实例。 + * 例如,用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后,直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数,这个参数也不会一并删除。 + +### 具体某种类型的实现文件 + +具体某种类型的实现文件,即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中,使用C++ 11实现了C-API的接口,并且使用`extern "C"`导出这些接口。在实现过程中,对输入参数的安全性进行了必要的判断,并将C-API接口的参数转发给`Paddle Core`。 + +### libpaddle\_capi_shared.{so, dylib} + +`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。 + +### libpaddle\_capi_whole.a + +`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。 + + +### examples + +在样例中,使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。 + +## 编译选项 + +C-API的编译选项默认关闭,打开这个编译选项,需要在cmake的时候,设置 + +```bash +cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF +``` + +编译C-API的时候推荐Paddle不嵌入Python解释器,也不生成`SWIG`接口,具体原因参考[Why Plain C](./00.why_plain_c.md)。 diff --git a/doc/design/multi_language_interface/why_plain_c.md b/doc/design/multi_language_interface/why_plain_c.md deleted file mode 100644 index a3f41ca7b93de8a55d927c88812802ef12246182..0000000000000000000000000000000000000000 --- a/doc/design/multi_language_interface/why_plain_c.md +++ /dev/null @@ -1,118 +0,0 @@ -# Paddle多语言接口实现 -## 背景 - -Paddle需要一个多语言接口,这个接口需要做到: - -* 有标准的,良好的文档 - * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档,golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。 -* 不同语言的接口适应不同语言的特性 - * 例如Java与Python的错误处理是直接扔出来Exception,而对于golang错误处理应该使用返回值。 - -## 基本要求 - -Paddle的多语言接口实现包括一下几个方面: - -* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器,也不使用其他动态库。 -* 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号。 -* 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)。 -* 不使用SWIG这种代码生成器,而是手写多语言绑定。 - - -## 原因 - -### 使用动态库来分发Paddle - -* Paddle的链接方式比较复杂 - * 如果用户要把Paddle的静态库(libpaddle.a)链接到自己的程序里,得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数,来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。 -* 编译型语言,例如C/C++使用静态库和动态库难度差不多。但是解释性语言,例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni),只能调用Paddle的动态库,否则得把Paddle静态库链接到解释器里。 - * 解释性语言实际运行的二进制是解释器本身,如果调用静态库只能将静态库与解释器链接。例如对于Java来说,便是将静态库加入JVM中。这对于通常的Java的开发者来说,是不常见的做法。 - -### 动态库中不嵌入任何其他语言的解释器 - -* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取 -* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析,数据读取均交由其他语言完成 - -现阶段Paddle有一个问题是,Paddle内嵌的Python解释器和外部使用的Python如果版本不同,会直接报错退出。 - -### Paddle动态库中,不引用其他动态库 - -* 即这个动态库是不依赖于其他任何文件的,可以在任何机器上执行的。 - -### 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号 - -* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范,不同版本的编译器之间,对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库),需要有稳定的导出符号。 -* C语言是有导出符号的标准的,并且在常见的平台上,都是ABI调用标准的。 -* 大多数语言都支持使用C语言API -* 使用C99而不使用C89,是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。 -* 使用C99而不使用C11的原因是,[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性,且C99相对于C11使用更加广泛。 - -### 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler) - -* Paddle内部的类为C++书写,直接导出到C的接口比较困难。 -* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。 - -在C的头文件 `paddle_matrix.h` 中: - -```C -typedef void* paddle_matrix; -typedef int paddle_error; - -extern "C" -paddle_error paddle_matrix_shape(paddle_matrix matrix, - uint64_t* width, - uint64_t* height); -``` -而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` - -```cpp -#include "paddle/math/matrix.hpp" -extern "C" -paddle_error paddle_matrix_shape(paddle_matrix matrix, - uint64_t *width, - uint64_t *height) { - auto m = (paddle::math::matrix*)(matrix); - *width = m->width(); - *height = m->height(); -} -``` - -其中`paddle/math/matrix.hpp`文件内容为: - -```cpp -namespace paddle { -namespace math { - -class Matrix { - //... -}; - -} // namespace math -} // namespace paddle -``` - -### 不使用SWIG这种代码生成器,而是手写多语言绑定 - -* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码,SWIG直接读取C/C++的头文件,生成各种语言的绑定代码。 - * 对于多语言接口,SWIG需要写一个interface文件。这个文件具有独特的语法,学习成本高。且增加一个第三方语言,就需要对这个第三方语言增加一些定义。有的时候,interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。 - * SWIG暴露的接口保留了C++的接口样式,很难保证多语言代码风格的一致性。(函数命名,错误处理) - * 因为SWIG在第三方语言中暴露的函数名,类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里,将大量的`SomeCppClass`重命名成`some_python_class`,或者`SomeGoTypes`。 - * 对于不同语言,错误处理的方式也不尽相同。例如对于Java或者Python,最常见的错误处理方式是Exception,而对于Golang,错误处理方式是返回值。而SWIG只能简单的暴露C++接口,无法做到对于各种语言错误处理方式的适配。 - * 对于大多数语言,直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。 - * SWIG支持的语言或者解释器有局限。例如对于Python,使用SWIG只支持CPython解释器,而不支持PyPy解释器。 - - -## 原因列表 - -| 结论 | 对比 | 原因 | -|---| --- | --- | -| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库,Paddle静态库链接复杂 | -| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器,会导致不同版本Python在一个进程里的bug | -| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 | -| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI,C99是目前C最广泛的使用标准,且C99支持bool类型和定长整数(uint64_t等)类型 | -| 使用void*作为类句柄 | 不显示的写每个类具体包含什么| 实现简单,并且让接口脱离实现细节 | -| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 | - - -## 简单实现 - -TBD diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505 --- /dev/null +++ b/doc/design/ops/images/2_level_rnn.dot @@ -0,0 +1,56 @@ +digraph G { + + rnn [label="1st level RNN" shape=box] + + subgraph cluster0 { + label = "time step 0" + + sent0 [label="sentence"] + sent1 [label="sentence"] + + rnn1 [label="2nd level RNN" shape=box] + + sent0 -> rnn1 + sent1 -> rnn1 + } + + subgraph cluster1 { + label = "time step 1" + + sent2 [label="sentence"] + sent3 [label="sentence"] + + rnn2 [label="2nd level RNN" shape=box] + + sent2 -> rnn2 + sent3 -> rnn2 + } + + subgraph cluster2 { + label = "time step 2" + + sent4 [label="sentence"] + sent5 [label="sentence"] + + rnn3 [label="2nd level RNN" shape=box] + + sent4 -> rnn3 + sent5 -> rnn3 + } + + + para0 [label="paragraph info 0"] + para1 [label="paragraph info 1"] + para2 [label="paragraph info 2"] + + rnn1 -> para0 + rnn2 -> para1 + rnn3 -> para2 + + para0 -> rnn + para1 -> rnn + para2 -> rnn + + chapter [label="chapter info"] + rnn -> chapter +} diff --git a/doc/design/ops/images/2_level_rnn.png b/doc/design/ops/images/2_level_rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038 Binary files /dev/null and b/doc/design/ops/images/2_level_rnn.png differ diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455 Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ diff --git a/doc/design/ops/images/rnn.dot b/doc/design/ops/images/rnn.dot new file mode 100644 index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5 --- /dev/null +++ b/doc/design/ops/images/rnn.dot @@ -0,0 +1,87 @@ +digraph G { + label = "simple RNN implementation" + + ranksep=2; + + //graph [nodesep=1, ranksep=1]; + + node[nodesep=1] + + subgraph cluster0 { + label = "global scope" + rankdir = TB + W + boot_memory + input + output + } + + subgraph cluster1 { + label = "step-scope 0" + rankdir = TB + memory0[label="memory"] + prememory0[label="pre-memory"] + step_input0[label="step input"] + step_output0[label="step output"] + } + + subgraph cluster2 { + label = "step-scope 1" + rankdir = TB + memory1[label="memory"] + prememory1[label="pre-memory"] + step_input1[label="step input"] + step_output1[label="step output"] + } + + subgraph cluster3 { + label = "step-scope 2" + rankdir = TB + memory2[label="memory"] + prememory2[label="pre-memory"] + step_input2[label="step input"] + step_output2[label="step output"] + } + + stepnet [shape=box] + stepnet0 [shape=box, style=dashed] + stepnet1 [shape=box, style=dashed] + stepnet2 [shape=box, style=dashed] + + + edge[color=blue] + boot_memory -> prememory0 [label="init" color="blue"] + memory0 -> prememory1 [label="copy/reference" color="blue"] + memory1 -> prememory2 [label="copy/reference" color="blue"] + + edge[color=black] + W -> stepnet0[constraint=false, style=dashed] + W -> stepnet1[constraint=false, style=dashed] + W -> stepnet2[constraint=false, style=dashed] + + memory0 -> stepnet0[style=dashed] + prememory0 -> stepnet0 -> step_output0[style=dashed] + + memory1 -> stepnet1[style=dashed] + prememory1 -> stepnet1 -> step_output1[style=dashed] + + memory2 -> stepnet2[style=dashed] + prememory2 -> stepnet2 -> step_output2[style=dashed] + + input -> step_input0 + input -> step_input1 + input -> step_input2 + + step_input0 -> stepnet0 [style=dashed] + step_input1 -> stepnet1[style=dashed] + step_input2 -> stepnet2[style=dashed] + + step_output0 -> output + step_output1 -> output + step_output2 -> output + + stepnet0 -> stepnet[style=dashed] + stepnet1 -> stepnet[style=dashed] + stepnet2 -> stepnet[style=dashed] + +} diff --git a/doc/design/ops/images/rnn.jpg b/doc/design/ops/images/rnn.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840 Binary files /dev/null and b/doc/design/ops/images/rnn.jpg differ diff --git a/doc/design/ops/images/rnn.png b/doc/design/ops/images/rnn.png new file mode 100644 index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe Binary files /dev/null and b/doc/design/ops/images/rnn.png differ diff --git a/doc/design/ops/images/rnn_2level_data.dot b/doc/design/ops/images/rnn_2level_data.dot new file mode 100644 index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297 --- /dev/null +++ b/doc/design/ops/images/rnn_2level_data.dot @@ -0,0 +1,75 @@ +digraph G { + chapter [label="chapter"] + + subgraph cluster0 { + label = "paragraph 0" + + top_rnn0[label="top rnn step 0" shape=box] + + p0 [label="paragraph 0"] + p1 [label="paragraph 1"] + } + + subgraph cluster1{ + label = "paragraph 1" + + top_rnn1[label="top rnn step 1" shape=box] + + p2 [label="paragraph 0"] + p3 [label="paragraph 1"] + } + + subgraph cluster_p0 { + label = "sentence 0" + + low_rnn0 [label="low rnn step 0" shape=box] + s00 [label="sentence 0"] + s01 [label="sentence 1"] + + low_rnn0 -> s00 + low_rnn0 -> s01 + } + + subgraph cluster_p1 { + label = "sentence 1" + low_rnn1 [label="low rnn step 1" shape=box] + s10 [label="sentence 0"] + s11 [label="sentence 1"] + low_rnn1 -> s10 + low_rnn1 -> s11 + } + + subgraph cluster_p2 { + label = "sentence 1" + low_rnn2 [label="low rnn step 0" shape=box] + s20 [label="sentence 0"] + s21 [label="sentence 1"] + low_rnn2 -> s20 + low_rnn2 -> s21 + } + + subgraph cluster_p3 { + label = "sentence 1" + low_rnn3 [label="low rnn step 1" shape=box] + s30 [label="sentence 0"] + s31 [label="sentence 1"] + low_rnn3 -> s30 + low_rnn3 -> s31 + } + + + chapter -> top_rnn0 + chapter -> top_rnn1 + + top_rnn0 -> p0 + top_rnn0 -> p1 + top_rnn1 -> p2 + top_rnn1 -> p3 + + + p0 -> low_rnn0 + p1 -> low_rnn1 + p2 -> low_rnn2 + p3 -> low_rnn3 + +} diff --git a/doc/design/ops/images/rnn_2level_data.png b/doc/design/ops/images/rnn_2level_data.png new file mode 100644 index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6 Binary files /dev/null and b/doc/design/ops/images/rnn_2level_data.png differ diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md new file mode 100644 index 0000000000000000000000000000000000000000..2f4854793fa1f0b02e4dc17b51a48a972be61c06 --- /dev/null +++ b/doc/design/ops/rnn.md @@ -0,0 +1,153 @@ +# RNNOp design + +This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future. + +## RNN Algorithm Implementation + +

+ +

+ +The above diagram shows an RNN unrolled into a full network. + +There are several important concepts here: + +- *step-net*: the sub-graph that runs at each step. +- *memory*, $h_t$, the state of the current step. +- *ex-memory*, $h_{t-1}$, the state of the previous step. +- *initial memory value*, the memory of the first (initial) step. + +### Step-scope + +There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. + +

+
+Figure 2 illustrates the RNN's data flow +

+ +Please be aware that every step runs the same step-net. Each step does the following: + +1. Creates the step-scope. +2. Initializes the local variables including step-outputs, in the step-scope. +3. Runs the step-net, which uses the above mentioned variables. + +The RNN operator will compose its output from step outputs in each of the step scopes. + +### Memory and Ex-memory + +Let's give more details about memory and ex-memory using a simple example: + +$$ +h_t = U h_{t-1} + W x_t +$$, + +where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively. + +In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step, +or copy the memory value of the previous step to the current ex-memory variable. + +### Usage in Python + +For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + +We can define an RNN's step-net using a Block: + +```python +import paddle as pd + +X = some_op() # x is some operator's output and is a LoDTensor +a = some_op() + +# declare parameters +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +rnn = pd.create_rnn_op(output_num=1) +with rnn.stepnet(): + x = rnn.add_input(X) + # declare a memory (rnn's step) + h = rnn.add_memory(init=a) + # h.pre_state(), the previous memory of rnn + new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) + # update current memory + h.update(new_state) + # indicate that h variables in all step scopes should be merged + rnn.add_outputs(h) + +out = rnn() +``` + +Python API functions in above example: + +- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs. +- `rnn.add_memory`: creates a variable used as the memory. +- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output. + +### Nested RNN and LoDTensor + +An RNN whose step-net includes other RNN operators is known as an *nested RNN*. + +For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level. + +The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. + +

+ +

+ +```python +import paddle as pd + +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +W0 = pd.Variable(shape=[20, 30]) +U0 = pd.Variable(shape=[20, 30]) + +# a is output of some op +a = some_op() + +# chapter_data is a set of 128-dim word vectors +# the first level of LoD is sentence +# the second level of LoD is a chapter +chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) + +def lower_level_rnn(paragraph): + ''' + x: the input + ''' + rnn = pd.create_rnn_op(output_num=1) + with rnn.stepnet(): + sentence = rnn.add_input(paragraph, level=0) + h = rnn.add_memory(shape=[20, 30]) + h.update( + pd.matmul(W, sentence) + pd.matmul(U, h.pre_state())) + # get the last state as sentence's info + rnn.add_outputs(h) + return rnn + +top_level_rnn = pd.create_rnn_op(output_num=1) +with top_level_rnn.stepnet(): + paragraph_data = rnn.add_input(chapter_data, level=1) + low_rnn = lower_level_rnn(paragraph_data) + paragraph_out = low_rnn() + + h = rnn.add_memory(init=a) + h.update( + pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) + top_level_rnn.add_outputs(h) + +# output the last step +chapter_out = top_level_rnn(output_all_steps=False) +``` + +In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. + +By default, the `RNNOp` will concatenate the outputs from all the time steps. +If the `output_all_steps` is set to False, it will only output the final time step. + + +

+ +

diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md new file mode 100644 index 0000000000000000000000000000000000000000..9db5fb8e9a9f89b004bf71ddc064cd976c0d0bee --- /dev/null +++ b/doc/design/ops/sequence_decoder.md @@ -0,0 +1,229 @@ +# Design: Sequence Decoder Generating LoDTensors +In tasks such as machine translation and visual captioning, +a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time. + +This documentation describes how to implement the sequence decoder as an operator. + +## Beam Search based Decoder +The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. + +In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users. + +There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. + +During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . + +For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; +the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. + +## Changing LoD's absolute offset to relative offsets +The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level. + +The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, +let's call this format the **absolute-offset LoD** for clarity. + +The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows +```python +[[0, 3, 9] + [0, 2, 3, 3, 3, 9]] +``` +The first level tells that there are two sequences: +- the first's offset is `[0, 3)` +- the second's offset is `[3, 9)` + +while on the second level, there are several empty sequences that both begin and end at `3`. +It is impossible to tell how many empty second-level sequences exist in the first-level sequences. + +There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix. + +So let's introduce another format of LoD, +it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. + +For example, to represent the same sequences of the above data + +```python +[[0, 3, 6] + [0, 2, 3, 3, 3, 9]] +``` + +the first level represents that there are two sequences, +their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. + +The second level is the same with the relative offset example because the lower level is a tensor. +It is easy to find out the second sequence in the first-level LoD has two empty sequences. + +The following examples are based on relative-offset LoD. + +## Usage in a simple machine translation model +Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it. + +The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences. + +**Encoder** +```python +import paddle as pd + +dict_size = 8000 +source_dict_size = dict_size +target_dict_size = dict_size +word_vector_dim = 128 +encoder_dim = 128 +decoder_dim = 128 +beam_size = 5 +max_length = 120 + +# encoder +src_word_id = pd.data( + name='source_language_word', + type=pd.data.integer_value_sequence(source_dict_dim)) +src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim) + +src_word_vec = pd.lookup(src_embedding, src_word_id) + +encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim) + +encoder_ctx = pd.last_seq(encoder_out_seq) +# encoder_ctx_proj is the learned semantic vector +encoder_ctx_proj = pd.fc( + encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None) +``` + +**Decoder** + +```python +def generate(): + decoder = pd.while_loop() + with decoder.step(): + decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory + generated_ids = decoder.memory() # TODO init to batch_size s + generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s + + target_word = pd.lookup(trg_embedding, gendrated_ids) + # expand encoder_ctx's batch to fit target_word's lod + # for example + # decoder_mem.lod is + # [[0 1 3], + # [0 1 3 6]] + # its tensor content is [a1 a2 a3 a4 a5] + # which means there are 2 sentences to translate + # - the first sentence has 1 translation prefixes, the offsets are [0, 1) + # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) + # the target_word.lod is + # [[0, 1, 6] + # [0, 2, 4, 7, 9 12]] + # which means 2 sentences to translate, each has 1 and 5 prefixes + # the first prefix has 2 candidates + # the following has 2, 3, 2, 3 candidates + # the encoder_ctx_expanded's content will be + # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5] + encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) + decoder_input = pd.fc( + act=pd.activation.Linear(), + input=[target_word, encoder_ctx], + size=3 * decoder_dim) + gru_out, cur_mem = pd.gru_step( + decoder_input, mem=decoder_mem, size=decoder_dim) + scores = pd.fc( + gru_out, + size=trg_dic_size, + bias=None, + act=pd.activation.Softmax()) + # K is an config + topk_scores, topk_ids = pd.top_k(scores, K) + topk_generated_scores = pd.add_scalar(topk_scores, generated_scores) + + selected_ids, selected_generation_scores = decoder.beam_search( + topk_ids, topk_generated_scores) + + # update the states + decoder_mem.update(cur_mem) # tells how to update state + generated_ids.update(selected_ids) + generated_scores.update(selected_generation_scores) + + decoder.output(selected_ids) + decoder.output(selected_generation_scores) + +translation_ids, translation_scores = decoder() +``` +The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates, +returns the result of the beam search algorithm. + +In this way, users can customize anything on the input or output of beam search, for example: + +1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. +2. Remove some specific candidate in `selected_ids`. +3. Get the final `translation_ids`, remove the translation sequence in it. + +The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), +so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). + +Both of them are two-level `LoDTensors`: + +- The first level represents `batch_size` of (source) sentences. +- The second level represents the candidate ID sets for translation prefix. + +For example, 3 source sentences to translate, and has 2, 3, 1 candidates. + +Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. + +For example, the previous state: + +* LoD is `[0, 1, 3][0, 2, 5, 6]` +* content of tensor is `a1 a2 b1 b2 b3 c1` + +the current state is stored in `encoder_ctx_expanded`: + +* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` +* the content is + - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) + - a2 a2 + - b1 b1 b1 + - b2 + - b3 b3 + - None (c1 has 0 candidates, so c1 is dropped) + +The benefit from the relative offset LoD is that the empty candidate set can be represented naturally. + +The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is: + +```python +decoder.output(selected_ids) +decoder.output(selected_generation_scores) +``` + +The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences. + +Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate. + +Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. + +## LoD and shape changes during decoding +

+ +

+ +According to the image above, the only phase that changes the LoD is beam search. + +## Beam search design +The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs: + +1. `topk_ids`, the top K candidate ids for each prefix. +2. `topk_scores`, the corresponding scores for `topk_ids` +3. `generated_scores`, the score of the prefixes. + +All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. + +It will return three variables: + +1. `selected_ids`, the final candidate beam search function selected for the next step. +2. `selected_scores`, the scores for the candidates. +3. `generated_scores`, the updated scores for each prefix (with the new candidates appended). + +## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` +The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step, +so it is natural to store them in arrays. + +Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`. + +The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors. +It needs some extensions to support the packing or unpacking an array of `LoDTensors`. diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..202b4b65103c0b7c536a9cb466c4120ce134d8c3 --- /dev/null +++ b/doc/design/optimizer.md @@ -0,0 +1,91 @@ +## Optimizer Design + +### The Problem + +A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: + +1. the forward pass, which computes intermediate results and the cost(s), +1. the backward pass, which derives gradients from intermediate results and costs, and +1. the optimization pass, which update model parameters to optimize the cost(s). + +These works rely on three kinds of operators: + +1. forward operators, +1. gradient operators, and +1. optimization operators. + +It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically. + +In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. + + +### High-level Python API to describe the training process + +1. User write code to describe the network: + + ```python + images = layer.data("images") + labels = layer.data("labels") + w1 = pd.var("w1") + b1 = pd.var("b1") + hidden = layer.fc(images, w=w1, b=b1) + cost = layer.mse(hidden, labels) + ``` + + The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + + +2. Users create a certain kind of Optimizer with some argument. + + ```python + optimizer = AdagradOptimizer(learing_rate=0.001) + ``` + +3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. + + ```python + opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) + ``` + The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session. + +4. Users use Session/Executor to run this opt_op_list as target to do training. + + ```python + sess.run(target= opt_op_list, ...) + ``` + +#### Optimizer Python interface: + +```python +class Optimizer(object): + """Optimizer Base class. + + """ + + def __init__(self): + pass + + def create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to variables. + + Args: + parameters_and_grads: a list of (variable, gradient) pair to update. + + Returns: + optmization_op_list: a list of optimization operator that will update parameter using gradient. + """ + return None + + def minimize(self, loss, parameter_list): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `append_backward_ops()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list) + update_ops = self.create_optimization_pass(params_grads) + return update_ops + +``` + +Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. diff --git a/doc/design/parameter_average.md b/doc/design/parameter_average.md new file mode 100644 index 0000000000000000000000000000000000000000..2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f --- /dev/null +++ b/doc/design/parameter_average.md @@ -0,0 +1,72 @@ +# Averaging Parameter in PaddlePaddle + +## Why Averaging +In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can. + +Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. + +Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for
. The averaging is done as follows: + +
+ +We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above. + +### How to perform Parameter Averaging in PaddlePaddle + +Parameter Averaging in PaddlePaddle works in the following way during training : +1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer +2. The optimizer itself is responsible for updating the parameters. +3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: + 1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches. + 2. However, saving all the N instances of the parameters in memory is not feasible. + 3. Therefore, an approximation algorithm is used. + +Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. + +During the testing/ saving the model phase, we perform the following steps: +1. Perform the delayed operations. +2. Save current values of the parameters to a temporary variable. +3. Replace the values of the parameters with the averaged values. +4. Perform testing and/or save the parameters. +5. Restore the values of the parameters once done. + +### How to implement Averaging of Parameter in PaddlePaddle + +We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training. + + **Advantages**: + - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op. + - Makes it easy for the users to customize and extend the framework. + + **Disadvantages**: + - Implementation requires re-writing the averaging methodology in Python. + +### Low-Level implementation + +In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input: +- the optimizer +- the window_size to keep the updates + +The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. + +The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. + +### Python API implementation for ParameterAverageOptimizer + +Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following: +- Any optimizer (RMSProp , AdaGrad etc.) +- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision. + +Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. +We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc) + +#### Creation of the ParameterAverageOptimizer operator +There are two ways for creating the ParameterAverageOptimizer op: +1. We create the op immediately while building the computation graph. +2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added. + +The proposal is to add the op immediately while building the computation graph. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. diff --git a/doc/design/parameters_in_cpp.md b/doc/design/parameters_in_cpp.md new file mode 100644 index 0000000000000000000000000000000000000000..a7ac3f17c44ca94a669a8f1e283b291bceb42317 --- /dev/null +++ b/doc/design/parameters_in_cpp.md @@ -0,0 +1,41 @@ +# Design Doc: The C++ Class `Parameters` + +`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md). + +We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation: +* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. +* We did not support sharing Parameters while training. We just trigger `memcpy` when start training. + +It is necessary that we implement Parameters in CPP side. However, it could result a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current PaddlePaddle implementation, there are three concepts associated with `Parameters`: + +1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`. +It is evident that we should use `paddle::Parameter` when developing `Parameters`. +However, the `Parameter` class contains many functions and does not have a clear interface. +It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`. +When we developing `Parameters`, we only use `create/store Parameter` functionality. +We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation. + +2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`. +We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies. +Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs. +`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device. + +3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. +So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD). + + +The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one. + +1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters. + +2. Implementation a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` uses `Parameters` as a class member. + +3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies. +Because we need share `Parameters` between topologies, it is `Parameters`'s response to exchange Parameters between GPUs. +`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` only used to train one topology and we need to support train many topologies in Paddle, i.e., there could be many GradientMachines use one `Parameters`. + * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invoke this function, which uses `Parameters` as this function inputs. + * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler. + +4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies. + +5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. In the end of this code refactoring, we could change `ParameterUpdater` directly uses `Parameters` to make `ParameterUpdater`'s implementation clear. diff --git a/doc/design/program.md b/doc/design/program.md new file mode 100644 index 0000000000000000000000000000000000000000..bd2456787c4e336d357a65255a8274a7c9e465cc --- /dev/null +++ b/doc/design/program.md @@ -0,0 +1,139 @@ +# Design Doc: PaddlePaddle Programs + +## Compile and Execution + +A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. + +A simple example PaddlePaddle program can be found in [graph.md](./graph.md): + +```python +x = layer.data("images") +l = layer.data("label") +y = layer.fc(x) +cost = layer.mse(y, l) +optimize(cost) +train(cost, reader=mnist.train()) +``` + +The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message. The last line runs it. + +## Programs and Blocks + +The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. + +- program: some nested blocks +- [block](./block.md): + - some local variable definitions, and + - a sequence of operators + +The concept of block comes from usual programs. For example, the following C++ program has three blocks: + +```c++ +int main() { // block 0 + int i = 0; + if (i < 10) { // block 1 + for (int j = 0; j < 10; j++) { // block 2 + } + } + return 0; +} +``` + +The following PaddlePaddle program has three blocks: + +```python +import paddle as pd // block 0 + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): // block 1 + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): // block 2 + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +## `BlockDesc` and `ProgramDesc` + +All protobuf messages are defined in `framework.proto`. + +`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`. + +```protobuf +message BlockDesc { + required int32 parent = 1; + repeated VarDesc vars = 2; + repeated OpDesc ops = 3; +} +``` + +The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks. + +All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array. + +```protobuf +message ProgramDesc { + repeated BlockDesc blocks = 1; +} +``` + + +### Global Block + +The global block is the first one in the above array. + +## Operators that Use Blocks + +In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch. + +The definition of `OpDesc` shows that an operator could have some attributes: + +```protobuf +message OpDesc { + AttrDesc attrs = 1; + ... +} +``` + +and an attribute could be of type block, which is, in fact, a block ID as described above: + +``` +message AttrDesc { + required string name = 1; + + enum AttrType { + INT = 1, + STRING = 2, + ... + BLOCK = ... + } + required AttrType type = 2; + + optional int32 block = 10; // when type == BLOCK + ... +} +``` + +## InferShape + +With this design, the InferShape function should take the following parameters: + +```c++ +void InferShape(int current_block, + int current_operator, + ProgramDesc* program // might change VarDesc values. + ) { + ... +} +``` + +where + +- `current_block` indices into `ProgramDesc::blocks`, +- `current_operator` indices into `BlockDesc::ops`. diff --git a/doc/design/prune.md b/doc/design/prune.md new file mode 100644 index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a --- /dev/null +++ b/doc/design/prune.md @@ -0,0 +1,63 @@ +# Prune + +## Motivation + +We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement +`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` +and generate a pruned `ProgramDesc`. + +## Challenge + +Pruning need to support both variables and operators being evaluation targets. Consider the following +different situations. + +```python +# Case 1: run foward pass. +cost_np = session.run(target=cost) +# Case 2: run backward passing. +opts_np, _ = session.run(target=[cost, opt]) +# Case 3: run checkpointing +_ = session.run(target=checkpoint) +``` + +## Solution + +To support evaluation of operators, we add `is_target` field in the `OpDesc`. + +```c++ +message OpDesc { + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; +``` + +To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599). +For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being +`fetch_op`'s input. Then we also set `fetch_op` is a target. + +### Algorithm + +If an operator needs to be run, it must fall into one of the following cases: + +1. It is the target. +2. It is depended by some other ops, meaning its output is some other op's input. + +The first case can be checked by `op_desc.is_traget()` . The second case can be implement as + +```c++ +bool HasDependentVar(const OpDesc& op_desc, const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} +``` + +Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc). diff --git a/doc/design/python_api.md b/doc/design/python_api.md new file mode 100644 index 0000000000000000000000000000000000000000..cb5fdc765b7126fc66a1c8978d4b96c0dc5a9f2c --- /dev/null +++ b/doc/design/python_api.md @@ -0,0 +1,284 @@ +# Design Doc: Python API + +Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program. + +| Python classes | Protobuf messages | +| --- | --- | +| Program | ProgramDesc | +| Block | BlockDesc | +| Operator | OpDesc | +| Variable | VarDesc | + +Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages. + +## Core Concepts + +### Program + +A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. + +Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. + +```python +class Program(objects): + def __init__(self): + self.desc = core.NewProgram() # a C++ ProgramDesc pointer. + self.blocks = vector() + self.blocks.append(Block(self, -1)) # the global block + self.current_block = 0 # initialized to the global block + + def global_block(): + return self.blocks[0] + + def current_block(): + return self.get_block(self.current_block) + + def rollback(): + self.current_block = self.current_block().parent_idx + + def create_block(): + new_block_idx = len(self.block) + self.blocks.append(Block(self, self.current_block)) + self.current_block = new_block_idx + return current_block() +``` + +`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`. + +`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block. + +### Block + +A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes + +1. a map from variable names to an instance of the Python `Variable` class, and +1. a list of `Operator` instances. + +```python +class Block(objects): + def __init__(self, program, parent_idx): + self.desc = core.NewBlock(program.desc) + self.program = program + self.vars = map() + self.ops = vector() + self.parent_idx = parent_idx + + def create_var(self, ...): + return Variable(self, ...) + + def _create_global_var(self, ...): + program.global_block().create_var(...) + + def create_parameter(self, name, ...): + # Parameter is a subclass of variable. See Parameter section for details. + self.vars[name] = Parameter(self._create_global_var(...), ...) + return self.vars[name] + + def append_operator(self, ...): + self.ops.append(Operator(self, ...)) + + def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. + self.ops.prepend(Operator(self, ...)) +``` + +`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. + +`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. + +### Operator + +The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. + +```python +class Operator(object): + def __init__(self, + block, # Block + type, # string + inputs, # dict + outputs,# dict + attrs # dict + ): + self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs) + core.infer_shape(self.desc, inputs, outputs) + + def type(self): + return self.desc.type() +``` + +`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. + +### Variable + +Operators take Variables as its inputs and outputs. + +```python +class Variable(object): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + ): + if name is None: + name = unique_name_generator() + self.name = name + self.block = block + self.desc = core.NewVarDesc(block.desc, name, shape, lod_level) + self.writer = None +``` + +Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. + +### Parameter + +A parameter is a global variable with an initializer (or load) operator. + +```python +class Parameter(Variable): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + trainable, # bool + initialize_op_attrs, + optimize_op_attrs): + super(Parameter, self).__init__(block, name, shape, dtype, lod_level) + self.trainable = trainable + self.optimize_op_attrs = optimize_op_attrs + block.prepend(Operator(block, # Block + initialize_op_attrs['type'], # string + None, # no inputs + self, # output is the parameter + initialize_op_attrs) +``` + +When users create a parameter, they can call + +```python +program.create_parameter( + ..., + init_attr={ + type: "uniform_random", + min: -1.0, + max: 1.0, + }) +) +``` + +In above example, `init_attr.type` names an initialize operator. It can also name the load operator + +```python +init_attr={ + type: "load", + filename: "something.numpy", +} +``` + +`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. + +## Layer Function + +A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. + +Layer functions take `Variable` and configuration parameters as its input and return the output variable(s). + +For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. + + +### Necessity for reusing code between layer functions + +There are a lot of code that can be reused. Such as + +* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero. +* Append the activation operator. +* Create a temporary variable. +* Create parameter. +* Generate a unique name. +* Add a bias. +* ... + +A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. + + + +### Comparision between global functions and helper class + +The `FullyConnected` layer will be as follow when we provide global functions: + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + if name is None: + name = unique_name("fc") + input = multiple_input(input) + param_attr = default_param_attr(param_attr) + param_attr = multiple_param_attr(param_attr, len(input)) + + # mul + mul_results = [] + for ipt, attr in zip(input, param_attr): + shape = ipt.shape[1:] + [size] + w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) + tmp = create_tmp_var(name) + g_program.current_block().append_op("mul", {ipt, w}, {tmp}) + mul_results.append(tmp) + + # add sum + ... + # add bias + ... + # add activation + ... + return out +``` + +We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions: + +1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use. +2. Global functions will force layer developers to pass its parameter time by time. + +So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow. + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + helper = LayerHelper(locals()) # pass all parameter to LayerHelper + + mul_results = [] + for ipt, param in helper.iter_multiple_input_and_param(): + w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype) + tmp = helper.create_tmp_variable() + helper.append_op('mul', {ipt, w}, {tmp}) + mul_results.append(tmp) + + pre_bias = helper.add_sum(mul_results) + pre_activation = helper.add_bias(pre_bias) + return helper.add_activation(pre_activation) +``` + +We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor. + + +### Implementation of layer helper + +We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are: + +```python +class LayerHelper(object): + def __init__(self, **kwargs): # kwargs is short for `keyword arguments` + self.kwargs = kwargs + + def add_activation(self, input_var): + act = self.kwargs.get("act", None) # default value is None + if act is None: # do nothing if no act + return input_var + + tmp = self.create_tmp_var(self) + self.append_op(type=act, input=input_var, output=tmp) + return tmp +``` + +## Optimizer + +[Optimizer Design Doc](./optimizer.md) diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index f21f7af520df5171798326818ecb97c3bcd14a12..2cd4b6225b61cf374458e40afabad7745f61ba71 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -1,25 +1,25 @@ # Python Data Reader Design Doc -At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: -- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. -- A *reader creator* is a function that returns a reader function. -- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. -- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide function which converts reader to batch reader, frequently used reader creators and reader decorators. +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. ## Data Reader Interface -Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`): +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: ``` iterable = data_reader() ``` -Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int) +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) -An example implementation for single item data reader creator: +An example implementation for single item data reader creator is as follows: ```python def reader_creator_random_image(width, height): @@ -29,7 +29,7 @@ def reader_creator_random_image(width, height): return reader ``` -An example implementation for multiple item data reader creator: +An example implementation for multiple item data reader creator is as follows: ```python def reader_creator_random_image_and_label(width, height, label): def reader(): @@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label): ## Batch Reader Interface -*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: -Here are valid outputs: ```python # a mini batch of three data items. Each data item consist three columns of data, each of which is 1. [(1, 1, 1), @@ -52,26 +53,28 @@ Here are valid outputs: # a mini batch of three data items, each data item is a list (single column). [([1,1,1],), ([2,2,2],), -([3,3,3],), +([3,3,3],)] ``` Please note that each item inside the list must be a tuple, below is an invalid output: ```python # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three column of datas, each of which is 1. + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. [[1,1,1], [2,2,2], [3,3,3]] ``` -It's easy to convert from reader to batch reader: +It is easy to convert from a reader to a batch reader: + ```python mnist_train = paddle.dataset.mnist.train() mnist_train_batch_reader = paddle.batch(mnist_train, 128) ``` -Also easy to create custom batch reader: +It is also straight forward to create a custom batch reader: + ```python def custom_batch_reader(): while True: @@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader ## Usage -batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: ```python # two data layer is created: @@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ## Data Reader Decorator -*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax. +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. -Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples: +Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: ### Prefetch Data -Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data. +Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. Use `paddle.reader.buffered` to prefetch data: @@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ### Compose Multiple Data Readers -For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). -We can do: +We can do the following : ```python def reader_creator_random_image(width, height): @@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False) reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. -# And we don't care second item at this time. +# And we don't care about the second item at this time. paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle -Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read. +Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. Example: ```python @@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ## Q & A -### Why reader return only a single entry, but not a mini batch? +### Why does a reader return only a single entry, and not a mini batch? -Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). -We provide function `paddle.batch` to turn (single entry) reader into batch reader. +We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. -### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient? +### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? -In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically. +In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. -### Why use a dictionary but not a list to provide mapping? +### Why use a dictionary instead of a list to provide mapping? -We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`). +Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). -### How to create custom data reader creator +### How to create a custom data reader creator ? ```python def image_reader_creator(image_path, label_path, n): @@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) ### How is `paddle.train` implemented -An example implementation of paddle.train could be: +An example implementation of paddle.train is: ```python def train(batch_reader, mapping, batch_size, total_pass): diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md new file mode 100644 index 0000000000000000000000000000000000000000..d9fe7d6bbb0eeb73fcdca3ee749a4f10bcdda682 --- /dev/null +++ b/doc/design/refactor/distributed_architecture.md @@ -0,0 +1,144 @@ +# Design Doc: Distributed Training Architecture + +## Abstract + +PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations: + +1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc. + +2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers. + +3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries. + +This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations. + +## Analysis + +The assumption is that the user writes the trainer program in either Python or C++. + +### Limitation 1 + +There are two basic functionalities in the trainer program: + +1. The training logic such as loading / saving the model and printing out the logs. +2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the + optimizer. + +When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the +training logic as well as the neural network computation logic, is replicated. + +The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not** +replicate the training logic, the limitation mentioned above can be avoided. + +### Limitation 2 + +Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the +inter-model-shard communication between nodes. + +PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the +computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup. + +Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows: + + + +PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component: + + + +The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation. + +### Limitation 3 + +The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. + +This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document - +[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md) + +## Distributed Training Architecture + +The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so: + + + +The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*. + +### PaddlePaddle Python + +PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc. + +```Python +paddle.init() +input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster +img, label = input[0], input[1] +hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh()) +prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax()) +cost = paddle.layer.classification_cost(input=prediction, label=label) +optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01) +session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1) +for i in range(1000): + _, cost_val = session.eval(targets=[cost, optimizer]) + print cost_val +``` + +The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively. + +#### session.eval + +As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation. +The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken. + +The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md). + +### PaddlePaddle Converter + +The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed : + +1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR. + +2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP. + +3. Optimize the computation graph. + +4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user. + +5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries. + +6. Dispatch the partitioned graph to different PaddlePaddle runtimes. + +7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python. + +The output IRs will be cached to optimize the conversion latency. + + +#### Placement Algorithm + +Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible. + +In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm. + + +### PaddlePaddle Runtime + +The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter. + + +### Local Training Architecture + +The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime: + + + + +### Training Data + +In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic. + +When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs. + + +## References: + +[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) + +[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf) diff --git a/doc/design/refactor/parameter_server.md b/doc/design/refactor/parameter_server.md new file mode 100644 index 0000000000000000000000000000000000000000..fa3c5d7990213cf2b0d236e66e592dd2699da876 --- /dev/null +++ b/doc/design/refactor/parameter_server.md @@ -0,0 +1,106 @@ +# Design Doc: Operation Graph Based Parameter Server + +## Abstract + +We propose an approach to implement the parameter server. In this +approach, there is no fundamental difference between the trainer and +the parameter server: they both run subgraphs, but subgraphs of +different purposes. + +## Background + +The previous implementations of the parameter server does not run a +subgraph. parameter initialization, optimizer computation, network +communication and checkpointing are implemented twice on both the +trainer and the parameter server. + +It would be great if we can write code once and use them on both the +trainer and the parameter server: reduces code duplication and +improves extensibility. Given that after the current refactor, we are +representing everything as a computing graph on the +trainer. Representing everything as a computing graph on the parameter +server becomes a natural extension. + +## Design + +### Graph Converter + +The *graph converter* converts the user-defined operation (OP) graph +into subgraphs to be scheduled on different nodes with the following +steps: + +1. OP placement: the OPs will be placed on different nodes according + to heuristic that minimizes estimated total computation + time. Currently we will use a simple heuristic that puts parameter + varable on parameter server workers and everything else on trainer + workers. + +1. Add communication OPs to enable the communication between nodes. + +We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*. + +Below is an example of converting the user defined graph to the +subgraphs for the trainer and the parameter server: + + + +After converting: + + + +1. The parameter variable W and it's optimizer subgraph are placed on the parameter server. +1. Operators are added to the subgraphs. + - *Send* sends data to the connected *Recv* operator. The + scheduler on the receive node will only schedule *Recv* operator + to run when the *Send* operator has ran (the *Send* OP will mark + the *Recv* OP runnable automatically). + - *Enueue* enqueues the input variable, it can block until space + become available in the queue. + - *Dequeue* outputs configurable numbers of tensors from the + queue. It will block until the queue have the required number of + tensors. + + +### Benefits + +- Model parallelism become easier to implement: it's an extension to + the trainer - parameter server approach. we already have the + communication OPs, but need to extend the graph converter's + placement functionality. + +- User-defined optimizer is easier to add - user can now express it as + a subgraph. + +- No more duplication logic inside the trainer and the parameter + server mentioned in the background section. + +### Challenges + +- It might be hard for the graph converter to cut a general graph + (without any hint for which subgraph is the optimizer). We may need + to label which subgraph inside the OP graph is the optimizer. + +- It's important to balance the parameter shards of on multiple + parameter server. If a single parameter is very big (some + word-embedding, fully connected, softmax layer), we need to + automatically partition the single parameter onto different + parameter servers when possible (only element-wise optimizer depends + on the parameter variable). + +### Discussion + +- In the "Aync SGD" figure, the "W" variable on the parameter server + could be read and wrote concurrently, what is our locking strategy? + E.g., each variable have a lock cpp method to be invoked by every + OP, or, have a lock OP. + +- Can the Enqueue OP be implemented under our current tensor design + (puts the input tensor into the queue tensor)? + +- *Dequeue* OP will have variable numbers of output (depends on the + `min_count` attribute), does our current design support it? (similar + question for the *Add* OP) + + +### References: +[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md new file mode 100644 index 0000000000000000000000000000000000000000..1d9a26683c14f54e3b5fe41675cd03b5620646b8 --- /dev/null +++ b/doc/design/refactor/session.md @@ -0,0 +1,180 @@ +# Design Doc: Session + +## Abstract + +The *session* object encapsulates the environment in which the +computation graph is executed. + +We will have the *local* session and *remote* session, they offer the +same [interface](#interface). The local session encapsulates the local +runtime environment and the remote session encapsulates the cluster +runtime environment. + +The local runtime environment contains: + +1. computation devices (i.e., CPU, GPU) handles, and +1. the [scope](../scope.md) which holds all variables. + +The remote runtime environment contains: + +1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster, + and +1. the distributed [scope](../scope.md) in a cluster which holds all + variables. + +The user can create a remote session on Paddle Cloud and evaluate the +computation graph with it. In this way, the user can control the +remote computation resource in a cluster from his local computer. + + +## Background + +The current design has an implicit global session in which +`paddle.eval()` is executed. The pain point is: + +Since the user is not able to explicitly switch between runtime +environments, the user cannot run a topology in two independent +environments. + +For example, in reinforcement learning, the user may want to have a +stale model for inference and a fresh model for training, and only +replace the stale model with the fresh model periodically. + +Furthermore, we have no concept that encapsulates a remote environment +that executes a computation graph. + +We need the session object to address above issues. + + +## Session + +A session is an object that owns the runtime environment. All +computations are executed through `session.eval()`. + + +### Interface + +```python +eval( + targets, + feed_dict=None, +) +``` + +Evaluates the target Operations or Variables in `targets`. + +- *targets*: the evaluation targets. Can be a single Operation or + Variable, or a list with the Operations or Variables as + elements. The value returned by `eval()` has the same shape as the + `target` argument. + + The PaddlePaddle program is represented by + the [ProgramDesc](../design/program.md), `eval()` will infer the + ProgramDesc from the given targets and run the PaddlePaddle + program. Please + see + [this graph](./distributed_architecture.md#local-training-architecture) for + the detailed illustration for the local session + and + [this graph](./distributed_architecture.md#distributed-training-architecture) for + the detailed illustration for the remote session. + +- *feed_dict*: a dictionary that contains the tensors which override + the edges of the computation graph. + + feed_dict not only can provide the input data, it can override any + OP's input as well: + + ```python + a = pd.constant(2.0, name="a") + b = pd.variable(name="b") + c = pd.mul(a,b) + sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0 + ``` + +```python +close() +``` + +Closes the session and releases the scope that the session owns. + + +### Create a Local Session + +```python +session( + devices=None +) +``` + +Creates a new session. One session owns one global scope, so creating +multiple sessions will create different scopes. + +- *devices*: a single `string` or a list of `string` of device names, + the corresponding devices will be the computation devices for + `eval()`. If not specified, all available devices (e.g., all GPUs) + will be used. The user doesn't need to specify the CPU device since + it will be always used. Multiple sessions can use the same device. + + +#### Example + +```Python +a = paddle.constant(1.0) +b = paddle.constant(2.0) +c = a + b +sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"]) +sess.eval(c) +sess.close() +``` + +### Create a Remote Session + +```python +create_cloud_job( + name, + num_trainer, + mem_per_trainer, + gpu_per_trainer, + cpu_per_trainer, + num_ps, + mem_per_ps, + cpu_per_ps, +) +``` + +Creates a Paddle Cloud job. Fails if the job name exists. + +```python +get_cloud_job( + name +) +``` + +Gets a Paddle Cloud job. + +```python +remote_session( + job +) +``` + +- *job*: the Paddle Cloud job. + +#### Example + +```Python +reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud +image = reader.column(0) +label = reader.column(1) +fc1 = paddle.op.fc(image, size=256, act="sigmoid") +fc2 = paddle.op.fc(fc1, size=10, act="softmax") +cost = paddle.op.cross_entropy(fc2, label) +opt = paddle.optimizer.sgd(cost) + +job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1) +sess = paddle.remote_ession(job) +for i in range(1000): + sess.eval(opt) +sess.close() +``` diff --git a/doc/design/refactor/src/compiler.graffle b/doc/design/refactor/src/compiler.graffle new file mode 100644 index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de Binary files /dev/null and b/doc/design/refactor/src/compiler.graffle differ diff --git a/doc/design/refactor/src/compiler.png b/doc/design/refactor/src/compiler.png new file mode 100644 index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe Binary files /dev/null and b/doc/design/refactor/src/compiler.png differ diff --git a/doc/design/refactor/src/dist-graph.graffle b/doc/design/refactor/src/dist-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b Binary files /dev/null and b/doc/design/refactor/src/dist-graph.graffle differ diff --git a/doc/design/refactor/src/dist-graph.png b/doc/design/refactor/src/dist-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7 Binary files /dev/null and b/doc/design/refactor/src/dist-graph.png differ diff --git a/doc/design/refactor/src/distributed_architecture.graffle b/doc/design/refactor/src/distributed_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..f8496e57326c38de7468eb452a7713291d57653c Binary files /dev/null and b/doc/design/refactor/src/distributed_architecture.graffle differ diff --git a/doc/design/refactor/src/distributed_architecture.png b/doc/design/refactor/src/distributed_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..410c4510c6aab301dec95e6427fe80ac24e105fe Binary files /dev/null and b/doc/design/refactor/src/distributed_architecture.png differ diff --git a/doc/design/refactor/src/local-graph.graffle b/doc/design/refactor/src/local-graph.graffle new file mode 100644 index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5 Binary files /dev/null and b/doc/design/refactor/src/local-graph.graffle differ diff --git a/doc/design/refactor/src/local-graph.png b/doc/design/refactor/src/local-graph.png new file mode 100644 index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7 Binary files /dev/null and b/doc/design/refactor/src/local-graph.png differ diff --git a/doc/design/refactor/src/local_architecture.graffle b/doc/design/refactor/src/local_architecture.graffle new file mode 100644 index 0000000000000000000000000000000000000000..cc7783c45381f25ded0b898649322c81418ad317 Binary files /dev/null and b/doc/design/refactor/src/local_architecture.graffle differ diff --git a/doc/design/refactor/src/local_architecture.png b/doc/design/refactor/src/local_architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..4b999538b7825c805292ee28b5e3256d5543bd09 Binary files /dev/null and b/doc/design/refactor/src/local_architecture.png differ diff --git a/doc/design/refactor/src/paddle-compile.graffle b/doc/design/refactor/src/paddle-compile.graffle new file mode 100644 index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6 Binary files /dev/null and b/doc/design/refactor/src/paddle-compile.graffle differ diff --git a/doc/design/refactor/src/paddle-compile.png b/doc/design/refactor/src/paddle-compile.png new file mode 100644 index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf Binary files /dev/null and b/doc/design/refactor/src/paddle-compile.png differ diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md new file mode 100644 index 0000000000000000000000000000000000000000..f93d6155e1764386b01d2f0df3f141ab75cd55d4 --- /dev/null +++ b/doc/design/refactorization.md @@ -0,0 +1,249 @@ +# Design Doc: Refactorization Overview + +The goals of refactoring include: + +1. Making it easy for external contributors to write new elementary computation operations. +1. Making the codebase clean and readable. +1. Designing a new computation representation -- a computation graph of operators and variables. +1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs. + +## Computation Graphs + +1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs. + + 1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example. + +1. Users write Python programs to describe the graphs and run them (locally or remotely). + +1. A graph is composed of *variables* and *operators*. + +1. The description of graphs must be serializable/deserializable, so that: + + 1. It can be sent to the cloud for distributed execution, and + 1. It can be sent to clients for mobile or enterprise deployment. + +1. The Python program does two things + + 1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to + 1. the C++ library `libpaddle.so` for local execution, + 1. the master process of a distributed training job for training, or + 1. the server process of a Kubernetes serving job for distributed serving. + 1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message. + +## Description and Realization of Computation Graph + +At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph. + +At runtime, the C++ program realizes the graph and runs it. + +| | Representation (protobuf messages) | Realization (C++ class objects) | +|---|---|---| +|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)| +|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)| +|Block|BlockDesc|Block| + +The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`). + +## Compilation and Execution + +1. Run a Python program to describe the graph. In particular, the Python application program does the following: + + 1. Create `VarDesc` to represent local/intermediate variables, + 1. Create operators and set attributes, + 1. Validate attribute values, + 1. Infer the type and the shape of variables, + 1. Plan memory-reuse for variables, + 1. Generate the backward graph + 1. Add optimization operators to the computation graph. + 1. Optionally, split the graph for distributed training. + +1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following: + + 1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block, + 1. realize local variables defined in the BlockDesc message in the new scope, + 1. a scope is similar to the stack frame in programming languages, + + 1. Create an instance of class `Block`, in which, + 1. realize operators in the BlockDesc message, + + 1. Run the Block by calling + 1. `Block::Eval(vector* targets)` for forward and backward computations, or + 1. `Block::Eval(vector* targets)` for optimization. + + +## Intermediate Representation (IR) + +```text +Compile Time -> IR -> Runtime +``` + +### Benefits of IR + +- Optimization + ```text + Compile Time -> IR -> Optimized IR -> Runtime + ``` +- Automatically send partitioned IR to different nodes. + - Automatic Data Parallelism + ```text + Compile Time + |-> Single GPU IR + |-> [trainer-IR-0, trainer-IR-1, pserver-IR] + |-> Node-0 (runs trainer-IR-0) + |-> Node-1 (runs trainer-IR-1) + |-> Node-2 (runs pserver-IR) + ``` + - Automatic Model Parallelism (planned for future) + +--- + +# Operator/OpWithKernel/OpKernel + +![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot) + +--- + +# Operator +![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot) + +* `Operator` is the fundamental building block of the user interface. + * Operator stores input/output variable names and attributes. + * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables. + * Use `Run` to compute the `output` variables from the `input` variables. + +--- + +# OpWithKernel/Kernel + +![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot) + +* `OpWithKernel` inherits `Operator`. +* `OpWithKernel` contains a Kernel map. + * `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`. + * `OpKernelKey` is the map key. Only device place now, but may be data type later. + +--- + +# Why separate Kernel and Operator + +* Separate GPU and CPU code. + * Make Paddle capable of running without GPU. +* Make one operator (which is a user interface) and create many implementations. + * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel. +--- + +# Libraries for Kernel development + +* `Eigen::Tensor` contains basic math and element-wise functions. + * Note that `Eigen::Tensor` has broadcast implementation. + * Limit the number of `tensor.device(dev) = ` in your code. +* `thrust::transform` and `std::transform`. + * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels. + * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`. +* Hand-writing `GPUKernel` and `CPU` code + * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) +--- +# Operator Registration + +## Why is registration necessary? +We need a method to build mappings between Op type names and Op classes. + +## How is registration implemented? +Maintaining a map, whose key is the type name and the value is the corresponding Op constructor. + +--- +# The Registry Map + +### `OpInfoMap` + +`op_type(string)` -> `OpInfo` + +`OpInfo`: + +- **`creator`**: The Op constructor. +- **`grad_op_type`**: The type of the gradient Op. +- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes. +- **`checker`**: Used to check attributes. + +--- +# Related Concepts + +### Op_Maker +It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)) + +### Register Macros +```cpp +REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class) +REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) +``` + +--- +# Registration Process +1. Write an Op class and its gradient Op class, if required. +2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. +3. Invoke the macro `REGISTER_OP`. This macro will + 1. Call maker class to complete `proto` and `checker` + 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` + +--- +# Backward Module (1/2) +### Create Backward Operator +- Mapping from forward Op to backward Op +![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png) + +--- +# Backward Module (2/2) +### Build Backward Network +- **Input**: a graph of forward operators +- **Output**: a graph of backward operators +- **Corner cases in construction** + - Shared Variables => insert an `Add` operator to combine gradients + - No Gradient => insert a `fill_zero_grad` operator + - Recursive NetOp => call `Backward` recursively + - RNN Op => recursively call `Backward` on stepnet + - RNN Op => recursively call `Backward` on stepnet + + +--- +# Scope, Variable, Tensor + +* `Tensor` is an n-dimension array with type. + * Only dims and data pointers are stored in `Tensor`. + * All operations on `Tensor` are written in `Operator` or global functions. + * Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) +* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`. + * `step_scopes` in RNN is a variable and not a tensor. +* `Scope` is where variables are stored. + * map + * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. + +--- +# Block (in design) +## the difference between original RNNOp and Block +- As an operator is more intuitive than `RNNOp`, +- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`, +- Fits the compile-time/ runtime separation design paradigm. + - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc` + - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`. + +--- +# Milestone +- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring, +- Model migration + - Framework development gives **priority support** to model migration, for example, + - the MNIST demo needs a Python interface, + - the RNN models require the framework to support `LoDTensor`. + - Determine some timelines, + - Frequently used Ops need to be migrated first, + - Different models can be migrated in parallel. +- Improve the framework at the same time +- Accept imperfection, concentrate on solving the specific problem at the right price. + +--- +# Control the migration quality +- Compare the performance of migrated models with old ones. +- Follow the google C++ style guide. +- Build the automatic workflow of generating Python/C++ documentations. + - The documentation of layers and ops should be written inside the code. + - Take the documentation quality into account when submitting pull requests. + - Preview the documentations, read and improve them from a user's perspective. diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md new file mode 100644 index 0000000000000000000000000000000000000000..8d973eb53178c3e889c845144553a453e11f067c --- /dev/null +++ b/doc/design/register_grad_op.md @@ -0,0 +1,92 @@ +# Design Doc: Gradient Operators Registration + + +## The Problem Posed + +Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance. + +However, we noticed two problems with the current design: + +1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and inserts corresponding `OpDesc` messages into the `ProgramDesc` message. + +1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation. + +## The Current Implementation + +Instances of the C++ class `OpInfo` are stored an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows + +```cpp +struct OpInfo { + std::function creator_; + std::string grad_op_type_; + ... +}; + +map OpInfoMap; + +OperatorBase* CreateGradientOperator(const OperatorBase& op) { + return OpInfoMap.at(op.Type()).creator_(...); +} +``` + +## Proposed Solution + +The mapping relationship between an operator and its gradient operators is a function. The interface of this function is: + +```cpp +// (OpDesc) --> vector +std::function(const OpDescBind&)>; +``` + +The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`. + +The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like + +```cpp +struct OpInfo { + std::function>(const OpDescBind&)> grad_op_maker_; + ... +}; +``` + +The `grad_op_maker_ ` is a `nullptr` if the operator does not have any associated gradient operators. + +We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is + +```cpp +class GradOpDescMakerBase { +public: + GradOpDescMakerBase(const OpDescBind& ); + virtual std::vector> operator()()const = 0; +}; +``` + +We can convert `GradOpDescMakerBase` to `std::function>(const OpDescBind&)>` by + +```cpp +using GradOpMaker = ...; +std::function(const OpDescBind&)> func; +func = [] (const OpDescBind& fwd_op) { + GradOpMaker maker(fwd_op); + return maker(); +}; +``` + +We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator. + +We should change register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`. + +The user interface should be + +```cpp +vector MinusOpGradMaker(OpDesc) {...} +REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker); +// Developers can still manually implement gradient operator. +REGISTER_OPERATOR(minus_grad, MinusGradOp); +``` + +The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside. + +```cpp +REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp); +``` diff --git a/doc/design/regularization.md b/doc/design/regularization.md new file mode 100644 index 0000000000000000000000000000000000000000..21280ac898feb4dd5e5a5d9e88d121e856850f0b --- /dev/null +++ b/doc/design/regularization.md @@ -0,0 +1,72 @@ +# Regularization in PaddlePaddle + +## Introduction to Regularization +A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore. + +### Parameter Norm Penalties +Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows: + +
+ +The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`. + +The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows: + +##### L2 Regularization: +
+ +##### L1 Regularization +
+ +A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html). + +## Regularization Survey + +A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). + +## Proposal for Regularization in PaddlePaddle + +### Low-Level implementation + +In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations: +- L2_regularization_op +- L1_regularization_op + +These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. + +The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. + +### Computation Graph + +Below is an example of a really simple feed forward neural network. + +
+ +The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows: + +
+    +### Python API implementation for Regularization + +Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. + +#### Creation of Regularization ops +There are two possibilities for creating the regularization ops: +1. We create these ops immediately while building the computation graph. +2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. + +The proposal is to add these ops in a lazy manner just before the backward pass. + +#### Storage of Regularization attributes + +Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers). + + + + + + diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md new file mode 100644 index 0000000000000000000000000000000000000000..14c081ea84282e52a2e36475c3c0ea755122d154 --- /dev/null +++ b/doc/design/releasing_process.md @@ -0,0 +1,68 @@ +# PaddlePaddle发行规范 + +PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 + +PaddlePaddle每次发新的版本,遵循以下流程: + +1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` +1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 +1. 对这个版本的提交,做如下几个操作: + * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 + * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 + * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 + * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 + * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步 + * 编译这个版本的python wheel包,并发布到pypi。 + * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 + * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 + * 上传方法: + ``` + cd build/python + pip install twine + twine upload dist/[package to upload] + ``` +1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 +1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 +1. 协同完成Release Note的书写 + + +需要注意的是: + +* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 +* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 + +## PaddlePaddle 分支规范 + +PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 + +* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中: + * `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。 + * `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。 + * `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。 + +* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。 + * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 + * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。 + * 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 + * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 + +* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 + +## PaddlePaddle回归测试列表 + +本列表说明PaddlePaddle发版之前需要测试的功能点。 + +### PaddlePaddle Book中所有章节 + +PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 + +| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| API.V2 + Docker + GPU | | | | | | | | | +| API.V2 + Docker + CPU | | | | | | | | | +| `paddle_trainer` + Docker + GPU | | | | | | | | | +| `paddle_trainer` + Docker + CPU | | | | | | | | | +| API.V2 + Ubuntu + GPU | | | | | | | | | +| API.V2 + Ubuntu + CPU | | | | | | | | | +| `paddle_trainer` + Ubuntu + GPU | | | | | | | | | +| `paddle_trainer` + Ubuntu + CPU | | | | | | | | | diff --git a/doc/design/scope.md b/doc/design/scope.md new file mode 100644 index 0000000000000000000000000000000000000000..4da76eebb74abcd26ec2b8671399e6bc4fb58574 --- /dev/null +++ b/doc/design/scope.md @@ -0,0 +1,124 @@ +# Design of Scope in Paddle + +## Overview + +Scope is an important concept in programming languages, which defines a program region that a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. And in another scope, this name may refer to other entity or nothing at all. It clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes: + +- Scope is an association of a name to variable. +- Variables in a parent scope can be retrieved from local scope. + +A detailed explanation of these two attributes goes as following. + + +## Scope is an association of a name to variable. + +Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variable in the scope. + + +1. Scope only contains a map of a name to variable. + + All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc. + +1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear. + +1. Scope only contains methods that are used to Create and Get Variables. Scope do not contain Operators and have no information to run them. + `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables. + - `Create` is used to create a Variable by its name and add the mapping relation. + - `Get` is used to find a Variable by name. + +1. Every variable only belongs to one certain Scope. + + Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`. + +1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. + + Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed. + +```cpp +class Scope { + public: + Variable* Var(const std::string& name); + const Variable* FindVar(const std::string& name) const; + + private: + std::unordered_map> vars_; +}; +``` + + +## Parent scope and local scope + +Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope. + +1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed. +2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent. + +```cpp +class Scope { + public: + Scope(const std::shared_ptr& scope): parent_(scope) {} + + Variable* FindVar(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } else if (parent_ != nullptr) { + return parent_->FindVar(name); + } else { + return nullptr; + } + } + + private: + std::shared_ptr parent_ {nullptr}; +}; +``` + +In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr. + +A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily. + +# Interface Design + +```cpp +class Variable { + private: + Variable() = default; + friend class Scope; +}; + +class Scope { + private: + Scope(const std::shared_ptr& parent = nullptr); + + public: + static std::shared_ptr Create(const std::shared_ptr& parent = nullptr); + + // return nullptr if not found. + Variable* FindVar(const std::string& name) const; + + // return if already contains same name variable. + Variable* Var(const std::string& name); + + private: + std::shared_ptr parent_; + std::unordered_map> vars_; +}; +``` +## Only scope can create a variable + +To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`. + +## When scope destroyed, all variables inside this scope should be destroyed together + +The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together. + +## Sharing a parent scope + +Local scope contains a `parent_` pointer. It is a linked-list for scopes. Using a `shared_ptr` because when a local scope is using, its parents cannot be destroyed. + +Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer. + +## Orthogonal interface + +`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily. diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md new file mode 100644 index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5 --- /dev/null +++ b/doc/design/selected_rows.md @@ -0,0 +1,74 @@ +# Design Doc: Selected Rows + +`SelectedRows` is a type of sparse tensor data type, which is designed to support `embedding` operators. The gradient of embedding table is a sparse tensor. Only a few rows are non-zero values in this tensor. It is straight-forward to represent a sparse tensor by the following sparse tensor data structure: + +```cpp +class SelectedRows { + private: + vector rows_; + Tensor value_; + int height_; +}; +``` + +The field `height_` is the first dimension of `SelectedRows`. The `rows` are the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`. + +Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be: + +``` +x = SelectedRow { + rows = [73, 84], + value = [[1, 2], [3,4]] +} +``` + + +## SelectedRows in Protobuf + +`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. +So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description. + +```proto +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} + +message LodTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} + +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LodTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## InferShape for Selected Rows + +Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor. + +For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should be like following + +```cpp +void TableLookupGrad::InferShape(context) { + ... + context.SetDataType("Embedding.Grad", kSelectedRows); +} +``` + + +## Sparse Operators + +There are several operators that need to be written to support `SelectedRows`. These are: + +1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`. +2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for both `dense` tensor or `SelectedRows`. diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md new file mode 100644 index 0000000000000000000000000000000000000000..c7aeed7f9b4637e1c29d530f37b42d12500af82f --- /dev/null +++ b/doc/design/simple_op_design.md @@ -0,0 +1,202 @@ +## Interaction between C++ and Python + +Users employ API in Python to describe their own network, however, the network construction actually happens in C++. so Protobuf is introduced to send the message between Python and C++. + +The Interaction between Python and C++ can be simplified as two steps: + +1. C++ tells Python how many Ops there are, and what parameter do users need to offer to initialize a new Op. Python then builds API for each Op at compile time. + +2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task. + +### Message from C++ to Python + +We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can use to describe a whole Op.” + +Following message are necessary: + +1. Op's name, and its simple comment. +2. Input and output variable number; each variable's name, type, and comment. +3. Op's attributes; each attribute includes name, type, comment, **default value** and **value range**. + +So `OpProto` can be defined as follows: + +```proto +enum AttrType { + INT = 1; + FLOAT = 2; + STRING = 3; + INTS = 4; + FLOATS = 5; + STRINGS = 6; +}; + +message AttrValue { + AttrType type = 1; + optional int iv = 2; + optional float fv = 3; + optional string sv = 4; + repeated int ivs = 5; + repeated float fvs = 6; + repeated string svs = 7; +}; + +message AttrProto { + required string name = 1; + required string comment = 2; + required AttrType type = 3; +}; + +message VarProto { + required string name = 1; + required string comment = 2; + required bool is_tensor = 3; +}; + +message OpProto { + repeated VarProto inputs = 1; + repeated VarProto outputs = 2; + repeated AttrProto attrs = 3; + required string type = 4; + required string comment = 5; +}; +``` + +To generate Python code automatically: + +```python +def create_python_ops_creatation_functions(): + op_protos = paddle.framework.OpRegistry.get_all_op_proto() + for type_name in op_protos: + op_proto = op_protos[type_name] + def __impl__(**kwargs): # User must use key word args in Paddle API + inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs] + outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs] + attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs] + opdesc = (input, outputs, type_name, attrs) + return paddle.framework.OpRegistry.CreateOp(opdesc) + __impl__.__doc__ = create_doc_string(op_proto) + globals()[type_name] = __impl__ + +create_python_ops_creatation_functions() +``` + +### Message from Python to C++ + +To hold message needed in the above second step, we define Protobuf message class `OpDesc`. It is used to hold user-specified parameters in Op describing. + +```proto +message OpDesc { + required string type = 1; + repeated string inputs = 2; + repeated string outputs = 3; + map attrs = 4; +}; +``` + +## OpProto Register + +Every Op has its own `OpProto`. For using convenience, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class, in whose constructor we implement the `OpProto`'s building process. `OpMaker`'s constructor will be invoked by another function `OpRegistry::RegisterOp()`. + +```cpp +class OpProtoMaker { +public: + OpProtoMaker(OpProto* proto): proto_(proto) {} +protected: + OpProto* proto_; + void AddInput(const std::string& name, const std::string& desc) {...} + void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...} + void AddComment(const std::string& comment) { ... } +}; + +class OpRegistry { +public: + using OpCreator = std::function; + + template + static void RegisterOp(const std::string& name) { + gCreators_[name] = [](const OpDesc& desc) { + return new OpType(desc); + }; + OpProto& opProto = gProtos_[name]; + OpMaker()(&opProto); + } + + static map gCreators_; + static map gProtos_; +}; + +template +class OpRegister { + public: + OpRegister(std::string type) { + OpRegistry::RegisterOp(type); + } +}; + +#define REGISTER_OP(op_class, op_maker_class, type_name) \ + class op_class##Register { \ + private: \ + const static OpRegister<#op_class, #op_maker_class> reg; \ + }; \ + const Register op_class##Register::reg(#type_name); + +class CosineOp { +// ... +} + +struct CosineOpProtoMaker : public OpProtoMaker { + CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { + AddInput("input", "input of cosine op"); + AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0); + AddType("cos"); + AddComment("This is cos op"); + } +} + +REGISTER_OP(CosineOp, CosineOpProtoMaker, cos); +``` + +In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also `CosineOpProto`. As fields of `CosineOpProto`, the default value and value range of `scale` are also registered here. + +## Python API + +Python APIs are divided into two types, high-level API and low-level API. + +### High-Level API + +High-level API is called by users directly, so it should keep its style consistent with existing V2 APIs. + +Here is a sample about how a define a fc layer: + +```python +hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid"); +``` + +`hd` is the output of `fc_layer` and it's a `variable`. It can be further sent into other layers as input. + +The definition of `fc_layer()`: + +```python +def fc_layer(input, size, with_bias, activation): + attr_map = {"size":size} + check_attrs(attr_map) + w = make_variable('w') + if with_bias: + b = make_variable('b') + else: + b = None + fc_output = make_variable('fc_output'); + fc_op(input, w, b, fc_output, attr_map) + act_output = make_variable('sigmod_output'); + if activation == "sigmod": + sigmod_op(fc_output, act_output); + elif: + # ... + return act_output; +``` + +### Low Leval API + +In above sample, `fc_op` and `sigmod_op` are low-level API. They build `OpDesc` and invoke corresponding C++ code. + +*TODO* diff --git a/doc/design/speech/README.MD b/doc/design/speech/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..7304650e628dba210488cd2dc4836318b5383b2a --- /dev/null +++ b/doc/design/speech/README.MD @@ -0,0 +1,155 @@ +# DeepSpeech2 on PaddlePaddle: Design Doc + +We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine, on PaddlePaddle. For the first-stage plan, we have the following short-term goals: + +- Release a basic distributed implementation of DS2 on PaddlePaddle. +- Contribute a chapter of Deep Speech to PaddlePaddle Book. + +Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan. + +## Table of Contents + +- [Tasks](#tasks) +- [Task Dependency](#task-dependency) +- [Design Details](#design-details) + - [Overview](#overview) + - [Row Convolution](#row-convolution) + - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm) +- [Future Work](#future-work) +- [References](#references) + +## Tasks + +We roughly break down the project into 14 tasks: + +1. Develop an **audio data provider**: + - Json filelist generator. + - Audio file format transformer. + - Spectrogram feature extraction, power normalization etc. + - Batch data reader with SortaGrad. + - Data augmentation (optional). + - Prepare (one or more) public English data sets & baseline. +2. Create a **simplified DS2 model configuration**: + - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*). + - With only bidirectional-GRU (otherwise need *Task 4*). + - With only greedy decoder (otherwise need *Task 5, 6*). +3. Develop to support **variable-shaped** dense-vector (image) batches of input data. + - Update `DenseScanner` in `dataprovider_converter.py`, etc. +4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details): + - Lookahead convolution windows. + - Within-row convolution, without kernels shared across rows. +5. Build KenLM **language model** (5-gram) for beam search decoder: + - Use KenLM toolkit. + - Prepare the corpus & train the model. + - Create infererence interfaces (for Task 6). +6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT: + - Beam search with CTC. + - Beam search with external custom scorer (e.g. LM). + - Try to design a more general beam search interface. +7. Develop a **Word Error Rate evaluator**: + - update `ctc_error_evaluator`(CER) to support WER. +8. Prepare internal dataset for Mandarin (optional): + - Dataset, baseline, evaluation details. + - Particular data preprocessing for Mandarin. + - Might need cooperating with the Speech Department. +9. Create **standard DS2 model configuration**: + - With variable-length audio sequences (need *Task 3*). + - With unidirectional-GRU + row-convolution (need *Task 4*). + - With CTC-LM beam search decoder (need *Task 5, 6*). +10. Make it run perfectly on **clusters**. +11. Experiments and **benchmarking** (for accuracy, not efficiency): + - With public English dataset. + - With internal (Baidu) Mandarin dataset (optional). +12. Time **profiling** and optimization. +13. Prepare **docs**. +14. Prepare PaddlePaddle **Book** chapter with a simplified version. + +## Task Dependency + +Tasks parallelizable within phases: + +Roadmap | Description | Parallelizable Tasks +----------- | :------------------------------------ | :-------------------- +Phase I | Simplified model & components | *Task 1* ~ *Task 8* +Phase II | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12* +Phase III | Documentations | *Task13* ~ *Task14* + +Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed! + +## Design Details + +### Overview + +Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost. + +Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge. + +The classical DS2 network contains 15 layers (from bottom to top): + +- **Two** data layers (audio spectrogram, transcription text) +- **Three** 2D convolution layers +- **Seven** uni-directional simple-RNN layers +- **One** lookahead row convolution layers +- **One** fully-connected layers +- **One** CTC-loss layer + +
+
+Figure 1. Archetecture of Deep Speech 2 Network. +
+ +We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments. + +Key ingredients about the layers: + +- **Data Layers**: + - Frame sequences data of audio **spectrogram** (with FFT). + - Token sequences data of **transcription** text (labels). + - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required. +- **2D Convolution Layers**: + - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension). + - With striding for only the first convlution layer. + - No pooling for all convolution layers. +- **Uni-directional RNNs** + - Uni-directional + row convolution: for low-latency inference. + - Bi-direcitional + without row convolution: if we don't care about the inference latency. +- **Row convolution**: + - For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs. + - Not nessesary if with bi-direcitional RNNs. + - "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across. +- **Batch Normalization Layers**: + - Added to all above layers (except for data and loss layer). + - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration. + + +Required Components | PaddlePaddle Support | Need to Develop +:------------------------------------- | :-------------------------------------- | :----------------------- +Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3) +Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | - +2D Convolution Layer | `paddle.layer.image_conv_layer` | - +DataType Converter (vec2seq) | `paddle.layer.block_expand` | - +Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | - +Row Convolution Layer | Not supported yet. | TBD (Task 4) +CTC-loss Layer | `paddle.layer.warp_ctc` | - +Batch Normalization Layer | `paddle.layer.batch_norm` | - +CTC-Beam search | Not supported yet. | TBD (Task 6) + +### Row Convolution + +TODO by Assignees + +### Beam Search with CTC and LM + +TODO by Assignees + +## Future Work + +- Efficiency Improvement +- Accuracy Improvement +- Low-latency Inference Library +- Large-scale benchmarking + +## References + +1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. +2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. diff --git a/doc/design/speech/image/ds2_network.png b/doc/design/speech/image/ds2_network.png new file mode 100644 index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11 Binary files /dev/null and b/doc/design/speech/image/ds2_network.png differ diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md new file mode 100644 index 0000000000000000000000000000000000000000..37e4f7b90f94fa3eb015e733999cd84c96b2239c --- /dev/null +++ b/doc/design/tensor_array.md @@ -0,0 +1,271 @@ +# Design for TensorArray +This design doc presents the necessity of a new C++ class `TensorArray`. +In addition to the very simple C++ implementation + +```c++ +class TensorArray { + public: + explicit TensorArray(const LoDTensor&); + explicit TensorArray(size_t size); + + private: + vector values_; +}; +``` + +We also need to expose it to PaddlePaddle's Python API, +because users would want to use it with our very flexible operators `WhileLoop`. +An example for a RNN based on dynamic operators is + +```python +input = pd.data(...) +num_steps = Var(12) + +TensorArray states(size=num_steps) +TensorArray step_inputs(unstack_from=input) +TensorArray step_outputs(size=num_steps) + +W = Tensor(...) +U = Tensor(...) +default_state = some_op() + +step = Var(1) + +wloop = paddle.create_whileloop(loop_vars=[step]) +with wloop.frame(): + wloop.break_if(pd.equal(step, num_steps) + pre_state = states.read(step-1, default_state) + step_input = step_inputs.read(step) + state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input)) + states.write(step, state) + step_outputs.write(step, state) # output state + step.update(state+1) + +output = step_outputs.stack() +``` + +## Background +Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, call `states[step_id]` will get the state in `step_id`th time step. + +An RNN can be implemented with the following pseudocode + +```c++ +Array states; +Array input_segments; +Array output_segments; +Parameter W, U; + +step = 1 +seq_len = 12 +while_loop { + if (step == seq_len) break; + states[step] = sigmoid(W * states[step-1] + U * input_segments[step]); + output_segments[step] = states[step] // take state as output + step++; +} +``` +According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support. + +Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`. + + +Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short). +Segmenting the `LoDTensor` is much more complicated than splitting a tensor, that makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support. + +As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences. + +The implementation is similar to `recurrent_op`. +The key difference is the way **the original input `LoDTensors` and outupts are split to get the `input_segments` and the `output_segments`.** + + +Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly, +the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same. + +## Why `TensorArray` +The logic behind splitting the inputs to segments, states and outputs is similar and can be shared in a seperate module. + +The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. + +So there should be an array-like container, which can store the segments of a tensor or LoD tensor. + +**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** . +This is where the notion of `TensorArray` comes from. + +## Introduce TensorArray to uniform all the three RNNs +TensorArray as a new concept is borrowed from TensorFlow, +it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`. + +This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, +such as `recurrent_op`, `RecurrentGradientMachine`. + +In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), +`TensorArray` is used to segment inputs and store states in all time steps. +By providing some methods similar to a C++ array, +the definition of some state-based dynamic models such as RNN can be more natural and highly flexible. + +## Dynamic-operations on TensorArray + +`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented + +```python +# several helper operators for TensorArray +def tensor_array_stack(ta, tensor): + ''' + get a tensor array `ta`, return a packed `tensor`. + ''' + pass + +def tensor_array_unstack(tensor, ta): + ''' + get a `tensor`, unstack it and get a tensor array `ta`. + ''' + pass + +def tensor_array_write(ta, index, tensor, data_shared): + ''' + get a `tensor` and a scalar tensor `index`, write `tensor` into index-th + value of the tensor array `ta`. + `data_shared` is an attribute that specifies whether to copy or reference the tensors. + ''' + pass + +def tensor_array_read(ta, index, tensor): + ''' + get a tensor array `ta`, a scalar tensor `index`, read the index-th value of + `ta` and return as the `tensor`. + ''' + pass + +def tensor_array_size(ta, tensor): + ''' + get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`. + ''' + pass +``` + +It is trivial for users to use so many low-level operators, so some helper methods should be proposed in python wrapper to make `TensorArray` easier to use, +for example + +```python +class TensorArray: + def __init__(self, name): + self.name = name + self.desc = TensorArrayDesc() + + def stack(self, name=None): + ''' + Pack the values in a `TensorArray` into a tensor with rank one higher + than each tensor in `values`. + `stack` can be used to split tensor into time steps for RNN or whileloop. + + @name: str + the name of the variable to output. + ''' + tensor = Var(name) + tensor_array_stack(self.name, tensor) + return tensor + + def unstack(self, input): + ''' + Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. + `unstack` can be used to concatenate all the time steps for RNN or whileloop. + + @input: str + the name of input tensor + ''' + tensor_array_unstack(tensor, self.name) + + def write(self, index, value, data_shared=True): + ''' + Write value into index of the TensorArray. + If `data_shared` is set to True, than the index-th value in TensorArray will + be shared with the tensor passed in. + + @index: str + name of a scalar tensor + @value: str + name of a tensor + @data_shared: bool + ''' + tensor_array_write(self.name, index, value, data_shared) + + def read(self, index, output): + ''' + Read the value at location `index` in the `TensorArray`. + + @index: str + name of a scalar tensor + @output: + name of a output variable + ''' + tensor_array_read(self.name, index, output) + + + def size(self, output): + ''' + Return the number of values. + + @output: str + name of a scalar tensor + ''' + tensor_array_size(self.name, output) +``` + +## LoDTensor-related Supports +The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes varience-length sequences as input, and output sequences too. + +Since each step of RNN can only take a tensor-represented batch of data as input, +some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches. + +Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`, +these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formated as a LoD tensor rather than a tensor. + +Some definitions are like + +```python +def unpack(level): + ''' + Split LodTensor in some `level` and generate batches, if set `sort_by_length`, + will sort by length. + + Returns: + - a new `TensorArray`, whose values are LodTensors and represents batches + of data. + - an int32 Tensor, which stores the map from the new batch's indices to + original LoDTensor + ''' + pass + +def pack(level, indices_map): + ''' + Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` + and `level` and `indices_map`. + ''' + pass +``` + +With these two methods, a varience-length sentence supported RNN can be implemented like + +```c++ +// input is the varient-length data +LodTensor sentence_input(xxx); +TensorArray ta; +Tensor indice_map; +Tensor boot_state = xxx; // to initialize rnn's first state +TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map); +TessorArray step_outputs; +TensorArray states; + +for (int step = 0; step = ta.size(); step++) { + auto state = states.read(step); + // rnnstep is a function which acts like a step of RNN + auto step_input = ta.read(step); + auto step_output = rnnstep(step_input, state); + step_outputs.write(step_output, true/*data_shared*/); +} + +// rnn_output is the final output of an rnn +LoDTensor rnn_output = ta.pack(ta, indice_map); +``` +the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`, +the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend. diff --git a/doc/design/test.dot b/doc/design/test.dot new file mode 100644 index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a --- /dev/null +++ b/doc/design/test.dot @@ -0,0 +1,35 @@ + +digraph Test { + z -> generator -> G_img; + G_img -> discriminator -> D_f -> d_loss_f; + label0 -> d_loss_f -> d_loss; + + img -> discriminator -> D_t -> d_loss_t; + label1 -> d_loss_t -> d_loss; + + d_loss -> d_loss_t[color=red, style=dashed]; + d_loss -> d_loss_f[color=red, style=dashed]; + d_loss_t -> D_t[color=red, style=dashed]; + d_loss_f -> D_f[color=red, style=dashed]; + D_t -> discriminator[color=red, style=dashed]; + D_f -> discriminator[color=red, style=dashed]; + + D_f -> g_loss; + label2 -> g_loss; + + g_loss -> D_f[color=green, style=dashed]; + D_f -> discriminator[color=green, style=dashed]; + discriminator -> G_img[color=green, style=dashed]; + G_img -> generator[color=green, style=dashed]; + + discriminator [color=red, shape=box]; + generator [color=green, shape=box]; + z [shape=diamond]; + img [shape=diamond]; + label0 [shape=diamond]; + label1 [shape=diamond]; + label2 [shape=diamond]; + + d_loss [color=red]; + g_loss [color=green]; +} diff --git a/doc/design/test.dot.png b/doc/design/test.dot.png new file mode 100644 index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55 Binary files /dev/null and b/doc/design/test.dot.png differ diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md new file mode 100644 index 0000000000000000000000000000000000000000..0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f --- /dev/null +++ b/doc/design/var_desc.md @@ -0,0 +1,69 @@ +## Background +PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime. + +PaddlePaddle use proto message to describe compile time graph because + +1. Computation graph should be able to be saved to a file. +1. In distributed training, the graph will be serialized and send to multiple workers. + +The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below. + +| |compile time|runtime| +|---|---|---| +|Data|VarDesc(proto)|Variable(cpp)| +|Operation|OpDesc(proto)|Operator(cpp)| + + +## Definition of VarDesc + +A VarDesc should have a name, and value. The are two kinds of variable type in compile time, they are `LoDTensor` and `SelectedRows`. + +```proto +message VarDesc { + required string name = 1; + enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + } + required VarType type = 2; + optional LoDTensorDesc lod_desc = 3; + optional TensorDesc selected_rows_desc = 4; + optional bool persistable = 5 [ default = false ]; +} +``` + +## Definition of TensorDesc + +```proto +enum DataType { + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; +} + +message TensorDesc { + required DataType data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] +} +``` + +A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md). + +## Definition of LodTensorDesc + +```proto +message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int lod_level = 2; +} +``` + +A LoDTensorDesc contains a tensor and a lod_level. + +## Definition of Variable in Python + +For Variable in Python, please reference [`Python API`](./python_api.md). diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..f1677e216f31d79b53ac29a0afbf6fbb886a0dcd --- /dev/null +++ b/doc/faq/build_and_install/index_cn.rst @@ -0,0 +1,111 @@ +################### +编译安装与单元测试 +################### + +.. contents:: + +1. 运行Docker GPU镜像出现 "CUDA driver version is insufficient" +---------------------------------------------------------------- + +用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。 +具体的解决方法是: + +.. code-block:: bash + + $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu + +更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 + + +2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致 +---------------------------------------------------------------- + +这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是, +用户强制指定特定的Python版本,具体操作如下: + + .. code-block:: bash + + cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= + +用户需要指定本机上Python的路径:````, ````, ```` + +3. CMake源码编译,Paddle版本号为0.0.0 +-------------------------------------- + +如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现 + +.. code-block:: bash + + CMake Warning at cmake/version.cmake:20 (message): + Cannot add paddle version from git tag + +那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。 + +4. paddlepaddle\*.whl is not a supported wheel on this platform. +------------------------------------------------------------------------ + +出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统,并安装了python 2.7和pip 9.0.1。 + +更新 :code:`pip` 包的方法是\: + +.. code-block:: bash + + pip install --upgrade pip + +如果还不行,可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀, +并对比是否和正在安装的后缀一致。 + +如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ,需要升级pip版本到最新; +如果系统支持 :code:`manylinux1_x86_64` 而安装包(本地)是 :code:`linux_x86_64` ,可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。 + +5. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2 +------------------------------------------------------------------------------------------ +先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载: + +pip uninstall py_paddle paddle + +然后安装paddle的python环境, 在build目录下执行 + +pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl + +6. 遇到“非法指令”或者是“illegal instruction” +-------------------------------------------- + +PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。 + +7. python相关的单元测试都过不了 +-------------------------------- + +如果出现以下python相关的单元测试都过不了的情况: + +.. code-block:: bash + + 24 - test_PyDataProvider (Failed) + 26 - test_RecurrentGradientMachine (Failed) + 27 - test_NetworkCompare (Failed) + 28 - test_PyDataProvider2 (Failed) + 32 - test_Prediction (Failed) + 33 - test_Compare (Failed) + 34 - test_Trainer (Failed) + 35 - test_TrainerOnePass (Failed) + 36 - test_CompareTwoNets (Failed) + 37 - test_CompareTwoOpts (Failed) + 38 - test_CompareSparse (Failed) + 39 - test_recurrent_machine_generation (Failed) + 40 - test_PyDataProviderWrapper (Failed) + 41 - test_config_parser (Failed) + 42 - test_swig_api (Failed) + 43 - layers_test (Failed) + +并且查询PaddlePaddle单元测试的日志,提示: + +.. code-block:: bash + + paddle package is already in your PYTHONPATH. But unittest need a clean environment. + Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. + +解决办法是: + +* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 diff --git a/doc/faq/cluster/index_cn.rst b/doc/faq/cluster/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..e59c1e1a54a0c876d1e6e89f88030de59fb9fc1a --- /dev/null +++ b/doc/faq/cluster/index_cn.rst @@ -0,0 +1,17 @@ +############### +集群训练与预测 +############### + +.. contents:: + +1. 集群多节点训练,日志中保存均为网络通信类错误 +------------------------------------------------ + +集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。 +此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查: + +* 从 :code:`train.log` , :code:`server.log` 找到最早报错的地方,查看是否是其他错误引发的报错(比如FPE,内存不足,磁盘空间不足等)。 + +* 如果发现最早的报错就是网络通信的问题,很有可能是非独占方式执行导致的端口冲突,可以联系OP,看当前MPI集群是否支持resource=full参数提交,如果支持增加此参数提交,并更换job 端口。 + +* 如果当前MPI集群并不支持任务独占模式,可以联系OP是否可以更换集群或升级当前集群。 diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index df5e172252277a881480cd2816eb901b711abe6b..9929767cac212237b3e2c3a547ba9a3c9d5f0979 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -1,301 +1,11 @@ -#################### FAQ -#################### +==== -.. contents:: +.. toctree:: + :maxdepth: 1 -1. 如何减少内存占用 ---------------------------------- - -神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。 -PaddlePaddle的内存占用主要分为如下几个方面\: - -* DataProvider缓冲池内存(只针对内存) -* 神经元激活内存(针对内存和显存) -* 参数内存 (针对内存和显存) -* 其他内存杂项 - -其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。 - -减少DataProvider缓冲池内存 -++++++++++++++++++++++++++ - -PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 - -.. graphviz:: - - digraph { - rankdir=LR; - 数据文件 -> 内存池 -> PaddlePaddle训练 - } - -所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 -个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, -那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 - -.. literalinclude:: src/reduce_min_pool_size.py - -这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。 - -神经元激活内存 -++++++++++++++ - -神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。 -在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, -一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 -的时间步信息成正比。 - -所以做法可以有两种: - -* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 -* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, - 但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。 - -参数内存 -++++++++ - -PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 -例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录 -文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 - -可以考虑使用一些优化算法,例如 :code:`momentum`。 - -2. 如何加速PaddlePaddle的训练速度 ---------------------------------- - -加速PaddlePaddle训练可以考虑从以下几个方面\: - -* 减少数据载入的耗时 -* 加速训练速度 -* 利用分布式训练驾驭更多的计算资源 - -减少数据载入的耗时 -++++++++++++++++++ - -使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 -:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。 - -.. literalinclude:: src/reduce_min_pool_size.py - -同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 - - -加速训练速度 -++++++++++++ - -PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True` - -这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\: - -使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: - -.. literalinclude:: src/word2vec_dataprovider.py - -这个任务的配置为\: - -.. literalinclude:: src/word2vec_config.py - - -利用更多的计算资源 -++++++++++++++++++ - -利用更多的计算资源可以分为一下几个方式来进行\: - -* 单机CPU训练 - - * 使用多线程训练。设置命令行参数 :code:`trainer_count`。 - -* 单机GPU训练 - - * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 - * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。 - -* 多机训练 - - * 请参考 :ref:`cluster_train` 。 - - -3. 遇到“非法指令”或者是“illegal instruction” --------------------------------------------- - -PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。 - -4. 如何选择SGD算法的学习率 --------------------------- - -在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。 - -通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。 - -如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。 - - -5. 如何初始化参数 ------------------ - -默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: - -* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` -* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` - -比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 - -.. code-block:: python - - hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), - bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) - -上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。 - -6. 如何共享参数 ---------------- - -PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。 - -简单的全连接网络,参数共享的配置示例为\: - -.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py - -这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 - -7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform. ------------------------------------------------------------------------- - -出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的, -而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。 -更新 :code:`pip` 包的方法是\: - -.. code-block:: bash - - pip install --upgrade pip - -8. python相关的单元测试都过不了 --------------------------------- - -如果出现以下python相关的单元测试都过不了的情况: - -.. code-block:: bash - - 24 - test_PyDataProvider (Failed) - 26 - test_RecurrentGradientMachine (Failed) - 27 - test_NetworkCompare (Failed) - 28 - test_PyDataProvider2 (Failed) - 32 - test_Prediction (Failed) - 33 - test_Compare (Failed) - 34 - test_Trainer (Failed) - 35 - test_TrainerOnePass (Failed) - 36 - test_CompareTwoNets (Failed) - 37 - test_CompareTwoOpts (Failed) - 38 - test_CompareSparse (Failed) - 39 - test_recurrent_machine_generation (Failed) - 40 - test_PyDataProviderWrapper (Failed) - 41 - test_config_parser (Failed) - 42 - test_swig_api (Failed) - 43 - layers_test (Failed) - -并且查询PaddlePaddle单元测试的日志,提示: - -.. code-block:: bash - - paddle package is already in your PYTHONPATH. But unittest need a clean environment. - Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. - -解决办法是: - -* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 - - -9. 运行Docker GPU镜像出现 "CUDA driver version is insufficient" ----------------------------------------------------------------- - -用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。 -具体的解决方法是: - -.. code-block:: bash - - $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu - -更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。 - - -10. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致 ----------------------------------------------------------------- - -这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是, -用户强制指定特定的Python版本,具体操作如下: - - .. code-block:: bash - - cmake .. -DPYTHON_EXECUTABLE= -DPYTHON_LIBRARY= -DPYTHON_INCLUDE_DIR= - -用户需要指定本机上Python的路径:````, ````, ```` - -10. A protocol message was rejected because it was too big ----------------------------------------------------------- - -如果在训练NLP相关模型时,出现以下错误: - -.. code-block:: bash - - [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h. - F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) - -可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似: - -.. code-block:: python - - src_dict = dict() - for line_count, line in enumerate(open(src_dict_path, "r")): - src_dict[line.strip()] = line_count - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={"src_dict": src_dict}) - -解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为: - -.. code-block:: python - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={"src_dict_path": src_dict_path}) - -完整源码可参考 `seqToseq `_ 示例。 - -11. 如何指定GPU设备 -------------------- - -例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU: - -* 方式1:通过 `CUDA_VISIBLE_DEVICES `_ 环境变量来指定特定的GPU。 - -.. code-block:: bash - - env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2 - -* 方式2:通过命令行参数 ``--gpu_id`` 指定。 - -.. code-block:: bash - - paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 - - -12. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办? ------------------------------------------------------------------------- - -Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。 -主要原因包括两个方面: - -* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。 -* 模型一直不收敛,发散到了一个数值特别大的地方。 -* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 - -主要的解决办法是减小学习律或者对数据进行归一化处理。 + build_and_install/index_cn.rst + model/index_cn.rst + parameter/index_cn.rst + local/index_cn.rst + cluster/index_cn.rst diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b331d9d36e6a279881c3b1a5586835e7186957fb --- /dev/null +++ b/doc/faq/local/index_cn.rst @@ -0,0 +1,213 @@ +############### +本地训练与预测 +############### + +.. contents:: + +1. 如何减少内存占用 +------------------- + +神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。 +PaddlePaddle的内存占用主要分为如下几个方面\: + +* DataProvider缓冲池内存(只针对内存) +* 神经元激活内存(针对内存和显存) +* 参数内存 (针对内存和显存) +* 其他内存杂项 + +其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。 + +减少DataProvider缓冲池内存 +++++++++++++++++++++++++++ + +PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 + +.. graphviz:: + + digraph { + rankdir=LR; + 数据文件 -> 内存池 -> PaddlePaddle训练 + } + +所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 +个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, +那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 + +.. literalinclude:: src/reduce_min_pool_size.py + +这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。 + +神经元激活内存 +++++++++++++++ + +神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。 +在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, +一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 +的时间步信息成正比。 + +所以做法可以有两种: + +* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 +* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, + 但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。 + +参数内存 +++++++++ + +PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 +例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录 +文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 + +可以考虑使用一些优化算法,例如 :code:`momentum`。 + +2. 如何加速训练速度 +------------------- + +加速PaddlePaddle训练可以考虑从以下几个方面\: + +* 减少数据载入的耗时 +* 加速训练速度 +* 利用分布式训练驾驭更多的计算资源 + +减少数据载入的耗时 +++++++++++++++++++ + +使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。 +:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。 + +.. literalinclude:: src/reduce_min_pool_size.py + +同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 + + +加速训练速度 +++++++++++++ + +PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True` + +这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\: + +使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: + +.. literalinclude:: src/word2vec_dataprovider.py + +这个任务的配置为\: + +.. literalinclude:: src/word2vec_config.py + + +利用更多的计算资源 +++++++++++++++++++ + +利用更多的计算资源可以分为以下几个方式来进行\: + +* 单机CPU训练 + + * 使用多线程训练。设置命令行参数 :code:`trainer_count`。 + +* 单机GPU训练 + + * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 + * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。 + +* 多机训练 + + * 请参考 :ref:`cluster_train` 。 + +3. 如何指定GPU设备 +------------------ + +例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU: + +* 方式1:通过 `CUDA_VISIBLE_DEVICES `_ 环境变量来指定特定的GPU。 + +.. code-block:: bash + + env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2 + +* 方式2:通过命令行参数 ``--gpu_id`` 指定。 + +.. code-block:: bash + + paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 + + +4. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办? +------------------------------------------------------------------------ + +Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。 +主要原因包括两个方面: + +* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。 +* 模型一直不收敛,发散到了一个数值特别大的地方。 +* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 + +这里有两种有效的解决方法: + +1. 设置 :code:`gradient_clipping_threshold` 参数,示例代码如下: + +.. code-block:: python + +optimizer = paddle.optimizer.RMSProp( + learning_rate=1e-3, + gradient_clipping_threshold=10.0, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + +具体可以参考 `nmt_without_attention `_ 示例。 + +2. 设置 :code:`error_clipping_threshold` 参数,示例代码如下: + +.. code-block:: python + +decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size * 3, + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) + +完整代码可以参考示例 `machine translation `_ 。 + +两种方法的区别: + +1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用; +2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度; + +除此之外,还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。 + +5. 如何调用 infer 接口输出多个layer的预测结果 +----------------------------------------------- + +* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入,代码如下: + +.. code-block:: python + + inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters) + +* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例,代码如下: + +.. code-block:: python + + out = inferer.infer(input=data_batch, field=["value"]) + +需要注意的是: + +* 如果指定了2个layer作为输出层,实际上需要的输出结果是两个矩阵; +* 假设第一个layer的输出A是一个 N1 * M1 的矩阵,第二个 Layer 的输出B是一个 N2 * M2 的矩阵; +* paddle.v2 默认会将A和B 横向拼接,当N1 和 N2 大小不一样时,会报如下的错误: + +.. code-block:: python + + ValueError: all the input array dimensions except for the concatenation axis must match exactly + +多个层的输出矩阵的高度不一致导致拼接失败,这种情况常常发生在: + +* 同时输出序列层和非序列层; +* 多个输出层处理多个不同长度的序列; + +此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤,来解决上面的问题。这时,infer接口的返回值是一个python list: + +* list 中元素的个数等于网络中输出层的个数; +* list 中每个元素是一个layer的输出结果矩阵,类型是numpy的ndarray; +* 每一个layer输出矩阵的高度,在非序列输入时:等于样本数;序列输入时等于:输入序列中元素的总数;宽度等于配置中layer的size; diff --git a/doc/faq/src/reduce_min_pool_size.py b/doc/faq/local/src/reduce_min_pool_size.py similarity index 100% rename from doc/faq/src/reduce_min_pool_size.py rename to doc/faq/local/src/reduce_min_pool_size.py diff --git a/doc/faq/src/word2vec_config.py b/doc/faq/local/src/word2vec_config.py similarity index 100% rename from doc/faq/src/word2vec_config.py rename to doc/faq/local/src/word2vec_config.py diff --git a/doc/faq/src/word2vec_dataprovider.py b/doc/faq/local/src/word2vec_dataprovider.py similarity index 100% rename from doc/faq/src/word2vec_dataprovider.py rename to doc/faq/local/src/word2vec_dataprovider.py diff --git a/doc/faq/model/index_cn.rst b/doc/faq/model/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b47bbe05bdb39d1ade9434a7e54bf6ca88a91cc9 --- /dev/null +++ b/doc/faq/model/index_cn.rst @@ -0,0 +1,69 @@ +######### +模型配置 +######### + +.. contents:: + +1. 出现 :code:`Duplicated layer name` 错误怎么办 +-------------------------------------------------- + +出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时,先找出参数 :code:`name` 取值相同的layer,然后将这些layer的参数 :code:`name` 设置为不同的值。 + +2. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用 +------------------------------------------------------------- + +* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出,该layer是通过参数 :code:`name` 指定,即,:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer,并将该layer上一时间步的输出作为自身当前时间步的输出。 + +* PaddlePaddle的所有layer都有唯一的name,用户通过参数 :code:`name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer,其name由参数 :code:`memory_name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer,需要用户显式设定。 + +3. 两种使用 drop_out 的方法有何区别 +------------------------------------ + +* 在PaddlePaddle中使用dropout有两种方式 + + * 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`,以 :code:`paddle.layer.fc` 为例,代码如下: + + .. code-block:: python + + fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5)) + + * 使用 :code:`paddle.layer.dropout`,以 :code:`paddle.layer.fc` 为例,代码如下: + + .. code-block:: python + + fc = paddle.layer.fc(input=input) + drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5) + +* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`,并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。 + +* PaddlePaddle在激活函数里实现dropout,而不是在layer里实现。 + +* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活,所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout,可采用第二种方式,即使用 :code:`paddle.layer.dropout`。 + +4. 不同的 recurrent layer 的区别 +---------------------------------- +以LSTM为例,在PaddlePaddle中包含以下 recurrent layer: + +* :code:`paddle.layer.lstmemory` +* :code:`paddle.networks.simple_lstm` +* :code:`paddle.networks.lstmemory_group` +* :code:`paddle.networks.bidirectional_lstm` + +按照具体实现方式可以归纳为2类: + +1. 由 recurrent_group 实现的 recurrent layer: + + * 用户在使用这一类recurrent layer时,可以访问由recurrent unit在一个时间步内计算得到的中间值(例如:hidden states, memory cells等); + * 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ; + +2. 将recurrent layer作为一个整体来实现: + + * 用户在使用这一类recurrent layer,只能访问它们的输出值; + * 上述的 :code:`paddle.networks.lstmemory_group` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现; + +将recurrent layer作为一个整体来实现, 能够针对CPU和GPU的计算做更多优化, 所以相比于recurrent group的实现方式, 第二类 recurrent layer 计算效率更高。 在实际应用中,如果用户不需要访问LSTM的中间变量,而只需要获得recurrent layer计算的输出,我们建议使用第二类实现。 + +此外,关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元: + + * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程,它并不是一个完整的recurrent layer,也不能接收序列数据作为输入; + * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用; diff --git a/doc/faq/parameter/index_cn.rst b/doc/faq/parameter/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..6fa0c64413be1616a435640b0347904a49873349 --- /dev/null +++ b/doc/faq/parameter/index_cn.rst @@ -0,0 +1,201 @@ +######### +参数设置 +######### + +.. contents:: + +1. 如何选择SGD算法的学习率 +-------------------------- + +在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。 + +通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。 + +如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。 + +2. 如何设置学习率退火(learning rate annealing) +------------------------------------------------ + +在相应的优化算法里设置learning_rate_schedule及相关参数,以使用Adam算法为例,代码如下: + +.. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_decay_a=0.5, + learning_rate_decay_b=0.75, + learning_rate_schedule="poly",) + +PaddlePaddle目前支持8种learning_rate_schedule,这8种learning_rate_schedule及其对应学习率计算方式如下: + +* "constant" + + lr = learning_rate + +* "poly" + + lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b) + + 其中,num_samples_processed为已训练样本数,下同。 + +* "caffe_poly" + + lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b) + +* "exp" + + lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b) + +* "discexp" + + lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b)) + +* "linear" + + lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b) + +* "manual" + + 这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: + + .. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_schedule="manual", + learning_rate_args="1000:1.0,2000:0.9,3000:0.8",) + + 在该示例中,当已训练样本数小于等于1000时,学习率为 :code:`1e-3 * 1.0`;当已训练样本数大于1000小于等于2000时,学习率为 :code:`1e-3 * 0.9`;当已训练样本数大于2000时,学习率为 :code:`1e-3 * 0.8`。 + +* "pass_manual" + + 这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下: + + .. code-block:: python + + optimizer = paddle.optimizer.Adam( + learning_rate=1e-3, + learning_rate_schedule="pass_manual", + learning_rate_args="1:1.0,2:0.9,3:0.8",) + + 在该示例中,当已训练pass数小于等于1时,学习率为 :code:`1e-3 * 1.0`;当已训练pass数大于1小于等于2时,学习率为 :code:`1e-3 * 0.9`;当已训练pass数大于2时,学习率为 :code:`1e-3 * 0.8`。 + +3. 如何初始化参数 +----------------- + +默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: + +* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` +* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` + +比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 + +.. code-block:: python + + hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), + bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) + +上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。 + +4. 如何共享参数 +--------------- + +PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。 + +简单的全连接网络,参数共享的配置示例为\: + +.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py + +这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 + +5. 如何加载预训练参数 +------------------------ + +* 对加载预训练参数的层,设置其参数属性 :code:`is_static=True`,使该层的参数在训练过程中保持不变。以embedding层为例,代码如下: + +.. code-block:: python + + emb_para = paddle.attr.Param(name='emb', is_static=True) + paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para) + + +* 从模型文件将预训练参数载入 :code:`numpy.array`,在创建parameters后,使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息,用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例,代码如下: + +.. code-block:: python + + def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. + return np.fromfile(f, dtype=np.float32).reshape(h, w) + + parameters = paddle.parameters.create(my_cost) + parameters.set('emb', load_parameter(emb_param_file, 30000, 256)) + +6. 存储的参数格式是什么,如何和明文进行相互转化 +-------------------------------------------------- + +PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中,1~4字节表示PaddlePaddle版本信息,请直接填充0;5~8字节表示每个参数占用的字节数,当保存的网络参数为float类型时为4,double类型时为8;9~16字节表示保存的参数总个数。 + +将PaddlePaddle保存的模型参数还原回明文时,可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数,此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时,未指定按照double精度编译,默认情况下按照float精度计算,保存的参数也是float类型。这时在使用 :code:`numpy.array` 时,一般设置 :code:`dtype=float32` 。示例如下: + +.. code-block:: python + + def read_parameter(fname, width): + s = open(fname).read() + # skip header + vec = np.fromstring(s[16:], dtype=np.float32) + # width is the size of the corresponding layer + np.savetxt(fname + ".csv", vec.reshape(width, -1), + fmt="%.6f", delimiter=",") + + +将明文参数转化为PaddlePaddle可加载的模型参数时,首先构造头信息,再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。 + +.. code-block:: python + + def gen_rand_param(param_file, width, height, need_trans): + np.random.seed() + header = struct.pack("iil", 0, 4, height * width) + param = np.float32(np.random.rand(height, width)) + with open(param_file, "w") as fparam: + fparam.write(header + param.tostring()) + +7. A protocol message was rejected because it was too big +------------------------------------------------------------ + +如果在训练NLP相关模型时,出现以下错误: + +.. code-block:: bash + + [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h. + F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) + +可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似: + +.. code-block:: python + + src_dict = dict() + for line_count, line in enumerate(open(src_dict_path, "r")): + src_dict[line.strip()] = line_count + + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={"src_dict": src_dict}) + +解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为: + +.. code-block:: python + + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={"src_dict_path": src_dict_path}) + +完整源码可参考 `seqToseq `_ 示例。 + + diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst deleted file mode 100644 index 428f58830e0b10c024f31238b7404c6df193eecd..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_cn.rst +++ /dev/null @@ -1,108 +0,0 @@ -经典的线性回归任务 -================== - -PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。 - -任务简介 --------- - -我们展示如何用PaddlePaddle解决 `单变量的线性回归 `_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。 - -一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。 - -准备数据 ------------ - -假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。 - -.. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # 定义输入数据的类型: 2个浮点数 - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -训练模型 ------------ - -为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。 - -在PaddlePaddle里,该模型的网络配置如下。 - -.. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. 定义数据来源,调用上面的process函数获得观测数据 - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. 学习算法。控制如何改变模型参数 w 和 b - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. 神经网络配置 - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - # 线性计算网络层: ȳ = wx + b - ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - # 计算误差函数,即 ȳ 和真实 y 之间的距离 - cost = mse_cost(input= ȳ, label=y) - outputs(cost) - - -这段简短的配置展示了PaddlePaddle的基本用法: - -- 第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的 `process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。 - -- 第二部分主要是选择学习算法,它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法,这里使用一个基于momentum的随机梯度下降(SGD)算法,该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。 - -- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层,所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元: - - - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。 - - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。 - - **回归误差代价层**:回归误差代价层 `mse_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 - -定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令: - -.. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - -PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 ` - -模型检验 ------------ - -训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。 - -PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。 - -.. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - -.. image:: ./parameters.png - :align: center - :scale: 80 % - -从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。 - -这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。 diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst deleted file mode 100644 index 6775da20c2f51000f305b095d40abd27b8fa6c0e..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_en.rst +++ /dev/null @@ -1,101 +0,0 @@ -Simple Linear Regression -======================== - -PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on. - -Problem Background ------------------- - -Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression `_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets. - -Prepare the Data ------------------ - -Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types. - - .. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # define data types of input: 2 real numbers - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -Train a NeuralNetwork ----------------------- - -To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle: - - .. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. read data. Suppose you saved above python code as dataprovider.py - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. learning algorithm - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. Network configuration - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - cost = mse_cost(input=y_predict, label=y) - outputs(cost) - -Some of the most fundamental usages of PaddlePaddle are demonstrated: - -- The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly. - -- The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time. - -- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration: - - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``. - - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model. - - **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters. - -Now that everything is ready, you can train the network with a simple command line call: - - .. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - - -This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess. - - -Evaluate the Model -------------------- - -Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly. - -In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass. - - .. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - - .. image:: parameters.png - :align: center - -Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer. - -There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data. diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png deleted file mode 100644 index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..c875c807b8ab2e420dec189ef32d41533f58fa6d --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -0,0 +1,141 @@ +从源码编译 +====================== + +.. _build_step: + +编译方法 +---------------- + +PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。 +我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 +可以在 `这里 `_ 找到。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 + +编译PaddlePaddle,需要执行: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制 + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # 如果不使用Docker编译环境,执行下面的命令 + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install build/python/dist/*.whl + + +.. _run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + +如果不使用Docker,可以执行ctest命令即可: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + ctest + # 指定执行其中一个单元测试 test_mul_op + ctest -R test_mul_op + +.. _compile_deps: + +编译依赖 +---------------- + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", ">=3.5", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "可选" + + +.. _build_options: + +编译选项 +---------------- + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "ON" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md deleted file mode 100644 index 69f4501f370dcc9d603ec54a63d68568d66e832e..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ /dev/null @@ -1,222 +0,0 @@ -Installing from Sources -========================== - -* [1. Download and Setup](#download) -* [2. Requirements](#requirements) -* [3. Build on Ubuntu](#ubuntu) -* [4. Build on Centos](#centos) - - -## Download and Setup -You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle). - -```bash -git clone https://github.com/PaddlePaddle/Paddle paddle -cd paddle -``` -## Requirements - -To compile the source code, your computer must be equipped with the following dependencies. - -- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler -- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X) -- **BLAS**: MKL, OpenBlas or ATLAS -- **Python**: only support Python 2.7 - -**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! -For CUDA 8.0, GCC versions later than 5.3 are not supported! - -### Options - -PaddlePaddle supports some build options. - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionalDescription
WITH_GPUCompile PaddlePaddle with NVIDIA GPU
WITH_AVXCompile PaddlePaddle with AVX intrinsics
WITH_DSOCompile PaddlePaddle with dynamic linked CUDA
WITH_TESTINGCompile PaddlePaddle with unit testing
WITH_SWIG_PYCompile PaddlePaddle with inference api
WITH_STYLE_CHECKCompile PaddlePaddle with style check
WITH_PYTHONCompile PaddlePaddle with python interpreter
WITH_DOUBLECompile PaddlePaddle with double precision
WITH_RDMACompile PaddlePaddle with RDMA support
WITH_TIMERCompile PaddlePaddle with stats timer
WITH_PROFILERCompile PaddlePaddle with GPU profiler
WITH_DOCCompile PaddlePaddle with documentation
WITH_COVERAGECompile PaddlePaddle with code coverage
COVERALLS_UPLOADPackage code coverage data to coveralls
ON_TRAVISExclude special unit test on Travis CI
- - -**Note:** - - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5. - - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported. - - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.** - -As a simple example, consider the following: - -1. **BLAS Dependencies(optional)** - - CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically. - To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. - - ```bash - # specify MKL - cmake .. -DMKL_ROOT= - # or specify OpenBLAS - cmake .. -DOPENBLAS_ROOT= - ``` - -2. **Doc Dependencies(optional)** - - To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows: - - ```bash - pip install 'sphinx>=1.4.0' - pip install sphinx_rtd_theme recommonmark - - # install doxygen on Ubuntu - sudo apt-get install doxygen - # install doxygen on Mac OS X - brew install doxygen - - # active docs in cmake - cmake .. -DWITH_DOC=ON` - ``` - -## Build on Ubuntu 14.04 - -### Install Dependencies - -- **Paddle Dependencies** - - ```bash - # necessary - sudo apt-get update - sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake - sudo apt-get install -y python python-pip python-numpy libpython-dev bison - sudo pip install 'protobuf==3.1.0.post1' - - # install cmake 3.4 - curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \ - cd .. && rm -rf cmake-3.4.1 - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a gcc compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` -## Build on Centos 7 - -### Install Dependencies - -- **CPU Dependencies** - - ```bash - # necessary - sudo yum update - sudo yum install -y epel-release - sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git - sudo pip install wheel numpy - sudo pip install 'protobuf>=3.0.0' - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a gcc compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake3 .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..f194f84ce7c961bb8644d7c077a7c71730220ea2 --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -0,0 +1,159 @@ +Build from Sources +========================== + +.. _build_step: + +How To Build +---------------- + +PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile +tools. We recommend you to use our pre-built Docker image to run the build +to avoid installing dependencies by yourself. We have several build environment +Docker images `here `_ . + +If you choose not to use Docker image for your build, you need to install the +below `Compile Dependencies`_ before run the build. + +Then run: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # run the following command to build a CPU-Only binaries if you are using docker + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # else run these commands + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +When the compile finishes, you can get the output whl package under +build/python/dist, then you can choose to install the whl on local +machine or copy it to the target machine. + +.. code-block:: bash + + pip install build/python/dist/*.whl + + +.. _run_test: + +Run Tests +---------------- + +If you wish to run the tests, you may follow the below steps: + +When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build. +Set :code:`WITH_GPU=ON` Can also run tests on GPU. + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh + +If you don't use Docker, just run ctest will start the tests: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=ON .. + make + ctest + # run a single test like test_mul_op + ctest -R test_mul_op + + +.. _compile_deps: + +Compile Dependencies +---------------- + +PaddlePaddle need the following dependencies when compiling, other dependencies +will be downloaded automatically. + +.. csv-table:: PaddlePaddle Compile Dependencies + :header: "Dependency", "Version", "Description" + :widths: 10, 15, 30 + + "CMake", ">=3.5", "" + "GCC", "4.8.2", "Recommend devtools2 for CentOS" + "Python", "2.7.x", "Need libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "Optional" + + +.. _build_options: + +Build Options +---------------- + +Build options include whether build binaries for CPU or GPU, which BLAS +library to use etc. You may pass these settings when running cmake. +For detailed cmake tutorial please refer to `here `_ 。 + +.. _build_options_bool: + +Bool Type Options +---------------- + +You can add :code:`-D` argument to pass such options, like: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: Bool Type Options + :header: "Option", "Description", "Default" + :widths: 1, 7, 2 + + "WITH_GPU", "Build with GPU support", "ON" + "WITH_C_API", "Build only CAPI", "OFF" + "WITH_DOUBLE", "Build with double precision", "OFF" + "WITH_DSO", "Dynamically load CUDA libraries", "ON" + "WITH_AVX", "Build with AVX support", "ON" + "WITH_PYTHON", "Build with integrated Python interpreter", "ON" + "WITH_STYLE_CHECK", "Check code style when building", "ON" + "WITH_TESTING", "Build unit tests", "ON" + "WITH_DOC", "Build documentations", "OFF" + "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" + "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" + "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" + + +BLAS ++++++ + +PaddlePaddle supports `MKL `_ and +`OpenBlAS `_ as BLAS library。By default it uses MKL. +If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded +and used, for more `details `_ . + +If you choose not to use MKL, then OpenBlAS will be used. + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle will automatically find CUDA and cuDNN when compiling and running. +parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect SM architecture +automatically in order to speed up the build. + +PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to +keep on with latest cuDNN versions. Be sure to run with the same version of cuDNN +you built. + +Pass Compile Options +++++++++++++++ + +You can pass compile options to use intended BLAS/CUDA/Cudnn libraries. +When running cmake command, it will search system paths like +:code:`/usr/lib:/usr/local/lib` and then search paths that you +passed to cmake, i.e. + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png deleted file mode 100644 index a58cd09ad99cf27cc1ca5785fe54d726b83a82f6..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst deleted file mode 100644 index be0c1ffa451b2901ec06621dd4d886f800b4562e..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst +++ /dev/null @@ -1,43 +0,0 @@ -PaddlePaddle的编译选项 -====================== - -PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。 - -Bool型的编译选项 ----------------- -用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如 - -.. code-block:: bash - - cmake .. -DWITH_GPU=OFF - -.. csv-table:: Bool型的编译选项 - :widths: 1, 7, 2 - :file: compile_options.csv - -BLAS/CUDA/Cudnn的编译选项 --------------------------- -BLAS -+++++ - -PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBlAS `_ 和 `REFERENCE BLAS `_ 。 - -.. csv-table:: BLAS路径相关的编译选项 - :widths: 1, 2, 7 - :file: cblas_settings.csv - -CUDA/Cudnn -+++++++++++ - -PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。 - -编译选项的设置 -++++++++++++++ - -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 - -.. code-block:: bash - - cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 - -注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv deleted file mode 100644 index a6356baf16a0d3d2499e39d2055d8ee878dcaef2..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv +++ /dev/null @@ -1,5 +0,0 @@ -编译选项,描述,注意 -MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 -ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 -OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 -REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv deleted file mode 100644 index 463b825470579d0c3736a408b1e82dd33e6f8d42..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/compile_options.csv +++ /dev/null @@ -1,12 +0,0 @@ -选项,说明,默认值 -WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 -WITH_DOUBLE,是否使用双精度浮点数。,否 -WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 -WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 -WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 -WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 -WITH_RDMA,是否开启RDMA,否 -WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 -WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST -WITH_DOC,是否编译中英文文档,否 -WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index da2d4234658b6ea4730346e721437cc1633c4362..f78b1fb0e11aa028a4b7abb5270740b97f8039e9 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -1,183 +1,139 @@ -PaddlePaddle的Docker容器使用方式 +使用Docker安装运行 ================================ -PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 +您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 +如果您在使用Windows,可以参考 +`这篇 `_ +教程,完成在Windows上安装和使用Docker。 -PaddlePaddle发布的Docker镜像使用说明 ------------------------------- +在了解Docker的基本使用方法之后,即可开始下面的步骤: -我们把PaddlePaddle的编译环境打包成一个镜像,称为开发镜像,里面涵盖了 -PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜 -像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 -PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 -行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国 -内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您 -在国内,请把文档里命令中的paddlepaddle/paddle替换成 -docker.paddlepaddle.org/paddle。 - -1. 开发镜像::code:`paddlepaddle/paddle:-dev` - - 这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布, - 文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。 - 开发镜像包含了以下工具: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - 很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作, - 也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发: - - 以交互容器方式运行开发镜像: +.. _docker_pull: - .. code-block:: bash +获取PaddlePaddle的Docker镜像 +------------------------------ - docker run -it --rm paddlepaddle/paddle:-dev /bin/bash +执行下面的命令获取最新的PaddlePaddle Docker镜像 - 或者,可以以后台进程方式运行容器: + .. code-block:: bash - .. code-block:: bash + docker pull paddlepaddle/paddle - docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:-dev +对于国内用户,我们提供了加速访问的镜像源: - 然后用密码 :code:`root` SSH进入容器: + .. code-block:: bash - .. code-block:: bash + docker pull docker.paddlepaddle.org/paddle - ssh -p 2202 root@localhost +下载GPU版本的Docker镜像: - SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 + .. code-block:: bash -2. 生产镜像:根据CPU、GPU和非AVX区分了如下4个镜像: + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` +选择下载使用不同的BLAS库的Docker镜像: - 纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: + .. code-block:: bash - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas - 如果输出是No,就需要选择使用no-AVX的镜像 +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: - 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 - 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 + .. code-block:: bash - .. code-block:: bash + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash +.. _docker_run: - 注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: - - .. code-block:: bash - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:-gpu - -3. 运行以及发布您的AI程序 - - 假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行: +在Docker中执行PaddlePaddle训练程序 +------------------------------ - .. code-block:: bash +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: - docker run -it -v $PWD:/work paddle /work/a.py + .. code-block:: bash - 如果要使用GPU,请运行: + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 - .. code-block:: bash +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: - nvidia-docker run -it -v $PWD:/work paddle /work/a.py + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** - 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:` - 创建和发布自己的AI程序镜像。 +.. _docker_run_book: -运行PaddlePaddle Book ---------------------- - -Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 +使用Docker启动PaddlePaddle Book教程 +------------------------------ -PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Nodebook。 +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 +PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book 然后在浏览器中输入以下网址: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ 就这么简单,享受您的旅程! -通过Docker容器开发PaddlePaddle ------------------------------- - -开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 - -1. 制作PaddlePaddle开发镜像 - - PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。 - 生成Docker镜像的方式有两个,一个是直接把一个容器转换成镜像,另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷,适合自己实验,可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚,其他人很容易看懂镜像生成过程,持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行: - - .. code-block:: bash - - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build -t paddle:dev . +.. _docker_run_gpu: - docker build这个命令的-t指定了生成的镜像的名字,这里我们用paddle:dev。到此,PaddlePaddle开发镜像就被构建完毕了。 +使用Docker执行GPU训练 +------------------------------ -2. 制作PaddlePaddle生产镜像 +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 - 生产镜像的生成分为两步,第一步是运行: + .. code-block:: bash - .. code-block:: bash - - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev + nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash - 以上命令会编译PaddlePaddle,生成运行程序,以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU,“WITH_AVX”控制生成的生产镜像是否支持AVX,”WITH_TEST“控制是否生成单元测试。 +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** - 第二步是运行: + .. code-block:: bash - .. code-block:: bash - - docker build -t paddle:prod -f build/Dockerfile ./build + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu - 以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置,最终生成名为paddle:prod的生产镜像。 +**关于AVX:** -3. 运行单元测试 +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。 - 运行以下指令: +以下指令能检查Linux电脑是否支持AVX: .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" - -文档 ----- -Paddle的Docker开发镜像带有一个通过 `woboq code browser -`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。 - -只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码: - -.. code-block:: bash - - docker run -d --name paddle-cpu-doc paddle:-dev - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 03df497506099d2fb758bd0ab437d2c082f2b537..d7acc7aeb744b19d83acb520d07c8551168dd096 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -1,270 +1,146 @@ -PaddlePaddle in Docker Containers +Run in Docker Containers ================================= -Docker container is currently the only officially-supported way to -running PaddlePaddle. This is reasonable as Docker now runs on all -major operating systems including Linux, Mac OS X, and Windows. -Please be aware that you will need to change `Dockers settings -`_ to make full use -of your hardware resource on Mac OS X and Windows. +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . -Working With Docker -------------------- +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. -Docker is simple as long as we understand a few basic concepts: +After you've read above tutorials you may proceed the following steps. -- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type +.. _docker_pull: - .. code-block:: bash - - docker images +Pull PaddlePaddle Docker Image +------------------------------ - to list all images in the system. We can also run +Run the following command to download the latest Docker images: .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0rc2 - to download a Docker image, paddlepaddle/paddle in this example, - from Dockerhub.com. + docker pull paddlepaddle/paddle -- *container*: considering a Docker image a program, a container is a - "process" that runs the image. Indeed, a container is exactly an - operating system process, but with a virtualized filesystem, network - port space, and other virtualized environment. We can type +For users in China, we provide a faster mirror: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0rc2 - - to start a container to run a Docker image, paddlepaddle/paddle in this example. + docker pull docker.paddlepaddle.org/paddle -- By default docker container have an isolated file system namespace, - we can not see the files in the host file system. By using *volume*, - mounted files in host will be visible inside docker container. - Following command will mount current dirctory into /data inside - docker container, run docker container from debian image with - command :code:`ls /data`. +Download GPU version images: .. code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -Usage of CPU-only and GPU Images ----------------------------------- - -We package PaddlePaddle's compile environment into a Docker image, -called the develop image, it contains all compiling tools that -PaddlePaddle needs. We package compiled PaddlePaddle program into a -Docker image as well, called the production image, it contains all -runtime environment that running PaddlePaddle needs. For each version -of PaddlePaddle, we release both of them. Production image includes -CPU-only version and a CUDA GPU version and their no-AVX versions. - -We put the docker images on `dockerhub.com -`_. You can find the -latest versions under "tags" tab at dockerhub.com. If you are in -China, you can use our Docker image registry mirror to speed up the -download process. To use it, please replace all paddlepaddle/paddle in -the commands to docker.paddlepaddle.org/paddle. - -1. Production images, this image might have multiple variants: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - Please be aware that the CPU-only and the GPU images both use the - AVX instruction set, but old computers produced before 2008 do not - support AVX. The following command checks if your Linux computer - supports AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - - To run the CPU-only image as an interactive container: - - .. code-block:: bash - - docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash - - Above method work with the GPU image too -- the recommended way is - using `nvidia-docker `_. - - Please install nvidia-docker first following this `tutorial - `_. - - Now you can run a GPU image: - - .. code-block:: bash - - nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash - -2. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - - -Train Model Using Python API ----------------------------- - -Our official docker image provides a runtime for PaddlePaddle -programs. The typical workflow will be as follows: - -Create a directory as workspace: - -.. code-block:: bash - - mkdir ~/workspace - -Edit a PaddlePaddle python program using your favourite editor - -.. code-block:: bash - - emacs ~/workspace/example.py + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu -Run the program using docker: +Choose between different BLAS version: -.. code-block:: bash - - docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py - -Or if you are using GPU for training: - -.. code-block:: bash - - nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py - -Above commands will start a docker container by running :code:`python -/workspace/example.py`. It will stop once :code:`python -/workspace/example.py` finishes. - -Another way is to tell docker to start a :code:`/bin/bash` session and -run PaddlePaddle program interactively: - -.. code-block:: bash - - docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash - # now we are inside docker container - cd /workspace - python example.py - -Running with GPU is identical: - -.. code-block:: bash - - nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash - # now we are inside docker container - cd /workspace - python example.py - - -Develop PaddlePaddle or Train Model Using C++ API ---------------------------------------------------- - -We will be using PaddlePaddle development image since it contains all -compiling tools and dependencies. - -1. Build PaddlePaddle develop image + .. code-block:: bash - Use following command to build PaddlePaddle develop image: + # image using MKL by default + docker pull paddlepaddle/paddle + # image using OpenBLAS + docker pull paddlepaddle/paddle:latest-openblas - .. code-block:: bash - git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle - docker build -t paddle:dev . +If you want to use legacy versions, choose a tag from +`DockerHub `_ +and run: -2. Build PaddlePaddle production image + .. code-block:: bash - There are two steps for building production image, the first step is to run: + docker pull paddlepaddle/paddle:[tag] + # i.e. + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - .. code-block:: bash +.. _docker_run: - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev +Launch your training program in Docker +------------------------------ - The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. +Assume that you have already written a PaddlePaddle program +named :code:`train.py` under directory :code:`/home/work` (refer to +`PaddlePaddleBook `_ +for more samples), then run the following command: - The second step is to run: + .. code-block:: bash - .. code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py - docker build -t paddle:prod -f build/Dockerfile ./build +In the above command, :code:`-it` means run the container interactively; +:code:`-v $PWD:/work` means mount the current directory ($PWD will expand +to current absolute path in Linux) under :code:`/work` in the container. +:code:`paddlepaddle/paddle` to specify image to use; finnally +:code:`/work/train.py` is the command to run inside docker. - The above command will generate the production image by copying the compiled PaddlePaddle program into the image. +Also, you can go into the container shell, run or debug your code +interactively: -3. Run unit test + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - Following command will run unit test: +**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.** - .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +.. _docker_run_book: PaddlePaddle Book ------------------ -The Jupyter Notebook is an open-source web application that allows -you to create and share documents that contain live code, equations, -visualizations and explanatory text in a single browser. - -PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. -We already exposed port 8888 for this book. If you want to +You can create a container serving PaddlePaddle Book using Jupyter Notebook in +one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook +for users and developers.If you want to dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. We provide a packaged book image, simply issue the command: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book Then, you would back and paste the address into the local browser: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ That's all. Enjoy your journey! +.. _docker_run_gpu: -Documentation -------------- +Train with Docker with GPU +------------------------------ -Paddle Docker images include an HTML version of C++ source code -generated using `woboq code browser -`_. This makes it easy -for users to browse and understand the C++ source code. +We recommend using +`nvidia-docker `_ +to run GPU training jobs. Please ensure you have latest +GPU driver installed before move on. -As long as we give the Paddle Docker container a name, we can run an -additional Nginx Docker container to serve the volume from the Paddle -container: + .. code-block:: bash -.. code-block:: bash + nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash - docker run -d --name paddle-cpu-doc paddle: - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx +**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** + .. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu + +**About AVX:** + +AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations. +The latest PaddlePaddle Docker image turns AVX on by default, so, if your +computer doesn't support AVX, you'll probably need to +`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`. -Then we can direct our Web browser to the HTML version of source code -at http://localhost:8088/paddle/ +The following command will tell you whether your computer supports AVX. + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index a24df6c518fad84a48061ecb34ee46cb312a4995..c9ba84c842b530162c92713046e64fdf82bd441b 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -6,24 +6,28 @@ 安装流程 ++++++++ -PaddlePaddle提供数个预编译的二进制来进行安装,包括Docker镜像,ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境,同时欢迎贡献更多的安装包。 +PaddlePaddle提供pip和Docker的安装方式: .. toctree:: :maxdepth: 1 - - docker_install_cn.rst - ubuntu_install_cn.rst - + pip_install_cn.rst + docker_install_cn.rst + ../../howto/dev/build_cn.md 编译流程 ++++++++ .. warning:: - 编译流程主要推荐高级用户查看,普通用户请走安装流程。 + 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。 .. toctree:: :maxdepth: 1 - cmake/build_from_source_cn.rst + build_from_source_cn.rst + +常见问题解答 +++++++++++ + +`常见问题解答 `_ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index 1bfd4f75c0b9b82d61d28a30f03181f7be159f24..32d66d63dd5b2a30d5de4a088dc80b680830cb84 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -1,23 +1,34 @@ Install and Build ================= -Install PaddlePaddle ----------------------- +.. _install_steps: -.. toctree:: - :maxdepth: 1 +Install Steps +++++++++ + +You can choose either pip or Docker to complete your install: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + docker_install_en.rst + ../../howto/dev/build_en.md - docker_install_en.rst - ubuntu_install_en.rst Build from Source ----------------- .. warning:: - Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. + We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary. .. toctree:: :maxdepth: 1 build_from_source_en.md + +FAQ +++++++++++ + +`FAQ `_ diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b270e2c2f0b0cbfd6fb4b9b0750d207952f84d76 --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -0,0 +1,86 @@ +使用pip安装 +================================ + +PaddlePaddle可以使用常用的Python包管理工具 +`pip `_ +完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 + +.. _pip_install: + +使用pip安装 +------------------------------ + + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 + + .. code-block:: bash + + pip install paddlepaddle + + +如果需要安装支持GPU的版本,需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, +您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: 各个版本最新的whl包 + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +运行环境依赖 +------------------------------ + +PaddlePaddle安装包由于不仅仅包含.py程序,而且包含了C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。 + +PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。 + +.. csv-table:: PaddlePaddle环境依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上" + "Python", "2.7.x", "暂时不支持Python3" + "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号" + "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号" + +.. _pip_faq: + +安装常见问题和解决方法 +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + 出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip: + + .. code-block:: bash + + pip install --upgrade pip + + 如果仍然存在问题,可以执行: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + 获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。 + + 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..70f601a11c610e0a2b5dcc8b73d2c3ea19e195e1 --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -0,0 +1,104 @@ +Install Using pip +================================ + +You can use current widely used Python package management +tool `pip `_ +to install PaddlePaddle. This method can be used in +most of current Linux systems or MacOS. + +.. _pip_install: + +Install Using pip +------------------------------ + +Run the following command to install PaddlePaddle on the current +machine, it will also download requirements. + + .. code-block:: bash + + pip install paddlepaddle + + +If you wish to install GPU version, just run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +If you wish to install the latest develop branch PaddlePaddle, +you can download the latest whl package from our CI system. Access +the below links, log in as guest, then click at the "Artifact" +tab, you'll find the download link of whl packages. + +If the links below shows up the login form, just click "Log in as guest" to start the download: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: whl package of each version + :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +Runtime Dependency +------------------------------ + +PaddlePaddle installation packages (whl) does not only contain .py files, +but also binaries built from C++ code. We ensure that PaddlePaddle can +run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04 +and MacOS 10.12. + +PaddlePaddle whl packages are trying to satisfy +`manylinux1 `_ +standard, which uses CentOS 5 as default build environment. But CUDA libraries +seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime, +so we use CentOS 6 as default build environment. + +.. csv-table:: PaddlePaddle Runtime Deps + :header: "Dependency", "version", "description" + :widths: 10, 15, 30 + + "OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later" + "Python", "2.7.x", "Currently Python3 is not supported" + "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols" + "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols" + +.. _pip_faq: + +FAQ +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + The main cause of this issue is that your current platform is + not supported. Please check that you are using Python 2.7 series. + Besides, pypi only supports manylinux1 standard, you'll need to + upgrade your pip to >9.0.0. Then run the below command: + + .. code-block:: bash + + pip install --upgrade pip + + If the problem still exists, run the following command: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + Then you'll get supported package suffixes, then check if it matches + the file name of the whl package. You can find default whl package at + `here `_ + + If your system supports linux_x86_64 but the whl package is manylinux1_x86_64, + you'll need to update pip to the latest version; If your system supports + manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the + file to manylinux1_x86_64 suffix and then install. diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst deleted file mode 100644 index 9e39ccb00f5d5655c30148900a3d76a22aacfc01..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst +++ /dev/null @@ -1,71 +0,0 @@ -Ubuntu部署PaddlePaddle -=================================== - -PaddlePaddle提供了ubuntu 14.04 deb安装包。 - -安装 ------- - -安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases - -它包含四个版本\: - -* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。 - -* cpu-noavx版本:支持主流x86处理器平台,没有使用avx指令集。 - -* gpu版本:支持主流x86处理器平台,支持nvidia cuda平台,使用了avx指令集。 - -* gpu-noavx版本:支持主流x86处理器平台,支持nvidia cuda平台,没有使用avx指令集。 - -下载完相关安装包后,执行: - -.. code-block:: shell - - sudo apt-get install gdebi - gdebi paddle-*-cpu.deb - -或者: - -.. code-block:: shell - - dpkg -i paddle-*-cpu.deb - apt-get install -f - - -在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的, -在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。 - -安装完成后,可以使用命令 :code:`paddle version` 查看安装后的paddle 版本: - -.. code-block:: shell - - PaddlePaddle 0.8.0b1, compiled with - with_avx: ON - with_gpu: OFF - with_double: OFF - with_python: ON - with_rdma: OFF - with_timer: OFF - with_predict_sdk: - - -可能遇到的问题 --------------- - -libcudart.so/libcudnn.so找不到 -++++++++++++++++++++++++++++++ - -安装完成后,运行 :code:`paddle train` 报错\: - -.. code-block:: shell - - 0831 12:36:04.151525 1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH. - -原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle,请安装CUDA 7.5 和CUDNN 5到本地环境中,并设置: - -.. code-block:: shell - - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - diff --git a/doc/getstarted/build_and_install/ubuntu_install_en.rst b/doc/getstarted/build_and_install/ubuntu_install_en.rst deleted file mode 100644 index ea8042085bf458be96e71017d229d88ad867695b..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/ubuntu_install_en.rst +++ /dev/null @@ -1,25 +0,0 @@ -Debian Package installation guide -================================= - -PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too. - -There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/ - - -After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install. - -.. code-block:: bash - - gdebi paddle-*.deb - -If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it. - -Or you can use following commands to install PaddlePaddle. - -.. code-block:: bash - - dpkg -i paddle-*.deb - apt-get install -f - -And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. - diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py index 679d0a931a7d650108ea89a04080a55d2976f72e..8aceb23406a476f08639cc6223cdf730b728a705 100644 --- a/doc/getstarted/concepts/src/train.py +++ b/doc/getstarted/concepts/src/train.py @@ -8,7 +8,7 @@ paddle.init(use_gpu=False) x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2)) y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) -cost = paddle.layer.mse_cost(input=y_predict, label=y) +cost = paddle.layer.square_error_cost(input=y_predict, label=y) # create parameters parameters = paddle.parameters.create(cost) @@ -31,7 +31,7 @@ def event_handler(event): # define training dataset reader def train_reader(): train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) - train_y = np.array([-2, -3, -7, -7]) + train_y = np.array([[-2], [-3], [-7], [-7]]) def reader(): for i in xrange(train_y.shape[0]): diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst index e63ca11102c8ce457afcc3c262fa5f159361c01d..c243083794bb3c4659242de99b3b2715af9d7c24 100644 --- a/doc/getstarted/concepts/use_concepts_cn.rst +++ b/doc/getstarted/concepts/use_concepts_cn.rst @@ -81,9 +81,9 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 .. code-block:: bash y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) - cost = paddle.layer.mse_cost(input=y_predict, label=y) + cost = paddle.layer.square_error_cost(input=y_predict, label=y) -其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上均方误差层。 +其中,x与y为之前描述的输入层;而y_predict是接收x作为输入,接上一个全连接层;cost接收y_predict与y作为输入,接上平方误差层。 最后一层cost中记录了神经网络的所有拓扑结构,通过组合不同的layer,我们即可完成神经网络的搭建。 @@ -111,7 +111,7 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 # define training dataset reader def train_reader(): train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) - train_y = np.array([-2, -3, -7, -7]) + train_y = np.array([[-2], [-3], [-7], [-7]]) def reader(): for i in xrange(train_y.shape[0]): yield train_x[i], train_y[i] @@ -147,4 +147,4 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 .. literalinclude:: src/train.py :linenos: -有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 \ No newline at end of file +有关线性回归的实际应用,可以参考PaddlePaddle book的 `第一章节 `_。 diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index 0cb27f802c40ef123fdc9c6799aad3b2a5f554c0..a9087be6f350c5656cabb0c64ba0f200d1c666cc 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -1,10 +1,61 @@ 新手入门 ============ +.. _quick_install: + +快速安装 +++++++++ + +PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 +执行下面的命令完成快速安装: + + .. code-block:: bash + + pip install paddlepaddle + +如果需要安装支持GPU的版本,需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +更详细的安装和编译方法参考: + .. toctree:: :maxdepth: 1 build_and_install/index_cn.rst - concepts/use_concepts_cn.rst -- `深度学习入门课程 `_ +.. _quick_start: + +快速开始 +++++++++ + +创建一个 housing.py 并粘贴此Python代码: + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。 + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_cn.rst diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index 9f771e93e8b63eb98e31ec12667bd1aa007af20e..d14e3f5c0cc90792fce9cb82e65da482c44dc433 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -1,9 +1,61 @@ GET STARTED ============ +.. _quick_install: + +Quick Install +---------------------- + +You can use pip to install PaddlePaddle with a single command, supports +CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. +Simply run the following command to install: + + .. code-block:: bash + + pip install paddlepaddle + +If you need to install GPU version, run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: + .. toctree:: :maxdepth: 1 build_and_install/index_en.rst -- `Deep Learning 101 `_ + +.. _quick_start: + +Quick Start +++++++++ + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data. diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst index 79048e92482851af6c2dd7d055868ebcaa7a298b..e05173c2006ff47ecb6ca5a4fe1502de750acc59 100644 --- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst +++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst @@ -28,17 +28,17 @@ pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API seq_pool = pooling(input=layer, pooling_type=pooling.Max(), - agg_level=AggregateLevel.EACH_SEQUENCE) + agg_level=AggregateLevel.TO_SEQUENCE) - `pooling_type` 目前支持两种,分别是:pooling.Max()和pooling.Avg()。 -- `agg_level=AggregateLevel.EACH_TIMESTEP` 时(默认值): +- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 - 输入:一个双层序列,或一个单层序列 - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) -- `agg_level=AggregateLevel.EACH_SEQUENCE` 时: +- `agg_level=AggregateLevel.TO_SEQUENCE` 时: - 作用:一个双层序列经过运算变成一个单层序列 - 输入:必须是一个双层序列 @@ -52,15 +52,15 @@ last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详 .. code-block:: bash last = last_seq(input=layer, - agg_level=AggregateLevel.EACH_SEQUENCE) + agg_level=AggregateLevel.TO_SEQUENCE) -- `agg_level=AggregateLevel.EACH_TIMESTEP` 时(默认值): +- `agg_level=AggregateLevel.TO_NO_SEQUENCE` 时(默认值): - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 - 输入:一个双层序列或一个单层序列 - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 -- `agg_level=AggregateLevel.EACH_SEQUENCE` 时: +- `agg_level=AggregateLevel.TO_SEQUENCE` 时: - 作用:一个双层序列经过运算变成一个单层序列 - 输入:必须是一个双层序列 - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 @@ -74,9 +74,9 @@ expand 的使用示例如下,详细见 :ref:`api_v2.layer_expand` 配置API。 ex = expand(input=layer1, expand_as=layer2, - expand_level=ExpandLevel.FROM_TIMESTEP) + expand_level=ExpandLevel.FROM_NO_SEQUENCE) -- `expand_level=ExpandLevel.FROM_TIMESTEP` 时(默认值): +- `expand_level=ExpandLevel.FROM_NO_SEQUENCE` 时(默认值): - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2 可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 diff --git a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst index 96e52b910a22576fd75c9d4e1bef6e2cf74bc84f..efdc44455ea4dc81a87b4d4fc8a81e78b15cb06a 100644 --- a/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst +++ b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst @@ -81,7 +81,7 @@ * 在本例中,我们将原始数据的每一组,通过\ :code:`recurrent_group`\ 进行拆解,拆解成的每一句话再通过一个LSTM网络。这和单层RNN的配置是等价的。 -* 与单层RNN的配置类似,我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同,我们是对每一个子序列取最后一个元素,因此\ :code:`agg_level=AggregateLevel.EACH_SEQUENCE`\ 。 +* 与单层RNN的配置类似,我们只需要使用LSTM encode成的最后一个向量。所以对\ :code:`recurrent_group`\ 进行了\ :code:`last_seq`\ 操作。但和单层RNN不同,我们是对每一个子序列取最后一个元素,因此\ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ 。 * 至此,\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。 diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst index 9e805ca85191b793c8798a239927a318c70b96f5..9ecab5594cff47cde4700b7ce0f58013a960a16e 100644 --- a/doc/howto/deep_model/rnn/index_cn.rst +++ b/doc/howto/deep_model/rnn/index_cn.rst @@ -4,6 +4,7 @@ RNN相关模型 .. toctree:: :maxdepth: 1 + rnn_config_cn.rst recurrent_group_cn.md hierarchical_layer_cn.rst hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst index 13a153b05c578e0af82ee29db5ea27fd4b6d6f59..7adc79873d699fdfd5a85034bcef964dd1f19132 100644 --- a/doc/howto/deep_model/rnn/index_en.rst +++ b/doc/howto/deep_model/rnn/index_en.rst @@ -1,2 +1,7 @@ RNN Models ========== + +.. toctree:: + :maxdepth: 1 + + rnn_config_en.rst diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst index ac2bd0775f4ab2e0a0c37462e2c23001123b152b..63fa161fafed0f3a8ec8799af21304cbec62d813 100644 --- a/doc/howto/deep_model/rnn/rnn_config_cn.rst +++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst @@ -5,36 +5,13 @@ RNN配置 中配置循环神经网络(RNN)。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中,您将了解如何: -- 准备用来学习循环神经网络的序列数据。 - 配置循环神经网络架构。 - 使用学习完成的循环神经网络模型生成序列。 我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence -模型的代码可以在\ ``demo / seqToseq``\ 找到。 - -准备序列数据 ------------- - -PaddlePaddle -不需要对序列数据进行任何预处理,例如填充。唯一需要做的是将相应类型设置为输入。例如,以下代码段定义了三个输入。 -它们都是序列,它们的大小是\ ``src_dict``\ ,\ ``trg_dict``\ 和\ ``trg_dict``\ : - -.. code:: python - - settings.input_types = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), - integer_value_sequence(len(settings.trg_dict))] - -在\ ``process``\ 函数中,每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列: - -.. code:: python - - yield src_ids, trg_ids, trg_ids_next - -有关如何编写数据提供程序的更多细节描述,请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在 -``demo/seqToseq/dataprovider.py``\ 。 +模型的代码可以在 `book/08.machine_translation `_ 找到。 +wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py `_ 。 配置循环神经网络架构 -------------------- @@ -44,7 +21,7 @@ PaddlePaddle 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。 -.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg +.. image:: src/bi_lstm.jpg :align: center 一般来说,循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。 @@ -85,19 +62,19 @@ vanilla act=None, rnn_layer_attr=None): def __rnn_step__(ipt): - out_mem = memory(name=name, size=size) - rnn_out = mixed_layer(input = [full_matrix_projection(ipt), - full_matrix_projection(out_mem)], - name = name, - bias_attr = rnn_bias_attr, - act = act, - layer_attr = rnn_layer_attr, - size = size) + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) return rnn_out - return recurrent_group(name='%s_recurrent_group' % name, - step=__rnn_step__, - reverse=reverse, - input=input) + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) PaddlePaddle 使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。 @@ -119,7 +96,7 @@ Sequence to Sequence Model with Attention 我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。 -.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png +.. image:: src/encoder-decoder-attention-model.png :align: center 在这个模型中,源序列 :math:`S = \{s_1, \dots, s_T\}` @@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention .. code:: python # 定义源语句的数据层 - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) # 计算每个词的词向量 - src_embedding = embedding_layer( + src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) # 应用前向循环神经网络 - src_forward = grumemory(input=src_embedding, size=encoder_size) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) # 应用反向递归神经网络(reverse=True表示反向循环神经网络) - src_backward = grumemory(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) # 将循环神经网络的前向和反向部分混合在一起 - encoded_vector = concat_layer(input=[src_forward, src_backward]) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) # 投射编码向量到 decoder_size - encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], - size = decoder_size) + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) # 计算反向RNN的第一个实例 - backward_first = first_seq(input=src_backward) + backward_first = paddle.layer.first_seq(input=src_backward) # 投射反向RNN的第一个实例到 decoder size - decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在 ``gru_decoder_with_attention`` 中定义: .. code:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] - trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) group_inputs.append(trg_embedding) # 对于配备有注意力机制的解码器,在训练中, @@ -185,9 +171,10 @@ Sequence to Sequence Model with Attention # StaticInput 意味着不同时间步的输入都是相同的值, # 否则它以一个序列输入,不同时间步的输入是不同的。 # 所有输入序列应该有相同的长度。 - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) 单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义 attention,门控循环单元单步函数和输出函数: @@ -198,27 +185,32 @@ attention,门控循环单元单步函数和输出函数: # 定义解码器的Memory # Memory的输出定义在 gru_step 内 # 注意 gru_step 应该与它的Memory名字相同 - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) # 计算 attention 加权编码向量 - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) # 混合当前词向量和attention加权编码向量 - decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), - full_matrix_projection(current_word)], - size = decoder_size * 3) + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) # 定义门控循环单元循环神经网络单步函数 - gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) # 定义输出函数 - out = mixed_layer(input=[full_matrix_projection(input=gru_step)], - size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) return out 生成序列 @@ -238,41 +230,32 @@ attention,门控循环单元单步函数和输出函数: - ``beam_size``: beam search 算法中的beam大小。 - ``max_length``: 生成序列的最大长度。 -- 使用 ``seqtext_printer_evaluator`` - 根据索引矩阵和字典打印文本。这个函数需要设置: - - - ``id_input``: 数据的整数ID,用于标识生成的文件中的相应输出。 - - ``dict_file``: 用于将词ID转换为词的字典文件。 - - ``result_file``: 生成结果文件的路径。 - 代码如下: .. code:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] # 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。 # 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。 # 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 。 - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, # Beginnning token. - eos_id=1, # End of sentence token. - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) - -注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。 - -完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。 + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginnning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) + + return beam_gen + +注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 `book/06.understand_sentiment `_ 了解更多详细信息。 + +完整的配置文件在 `book/08.machine_translation/train.py `_ 。 diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst index 73f5d5371fcd3ce95253cad47b0d8e738284441c..f92edd108ff5c10a31b5f181f0f6dcb7a3f119f3 100644 --- a/doc/howto/deep_model/rnn/rnn_config_en.rst +++ b/doc/howto/deep_model/rnn/rnn_config_en.rst @@ -3,34 +3,11 @@ RNN Configuration This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to: -- prepare sequence data for learning recurrent neural networks. - configure recurrent neural network architecture. - generate sequence with learned recurrent neural network models. -We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`. - -===================== -Prepare Sequence Data -===================== - -PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`: - -.. code-block:: python - - settings.input_types = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), - integer_value_sequence(len(settings.trg_dict))] - - -Then at the :code:`process` function, each :code:`yield` function will return three integer lists. Each integer list is treated as a sequence of integers: - -.. code-block:: python - - yield src_ids, trg_ids, trg_ids_next - - -For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`. +We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at `book/08.machine_translation `_ . +And the data preparation of this model can be found at `python/paddle/v2/dataset/wmt14.py `_ =============================================== Configure Recurrent Neural Network Architecture @@ -42,7 +19,7 @@ Simple Gated Recurrent Neural Network Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below. -.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg +.. image:: src/bi_lstm.jpg :align: center Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`. @@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output. act=None, rnn_layer_attr=None): def __rnn_step__(ipt): - out_mem = memory(name=name, size=size) - rnn_out = mixed_layer(input = [full_matrix_projection(ipt), - full_matrix_projection(out_mem)], - name = name, - bias_attr = rnn_bias_attr, - act = act, - layer_attr = rnn_layer_attr, - size = size) + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) return rnn_out - return recurrent_group(name='%s_recurrent_group' % name, - step=__rnn_step__, - reverse=reverse, - input=input) + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) PaddlePaddle uses memory to construct step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and a **input**. The output of memory at the current time step is utilized as the input of the memory at the next time step. A memory can also has a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of :code:`out_mem` memory. @@ -101,7 +78,7 @@ Sequence to Sequence Model with Attention ----------------------------------------- We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure. -.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png +.. image:: src/encoder-decoder-attention-model.png :align: center In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`. @@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge .. code-block:: python # Define the data layer of the source sentence. - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) # Calculate the word embedding of each word. - src_embedding = embedding_layer( + src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) # Apply forward recurrent neural network. - src_forward = grumemory(input=src_embedding, size=encoder_size) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) # Apply backward recurrent neural network. reverse=True means backward recurrent neural network. - src_backward = grumemory(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) # Mix the forward and backward parts of the recurrent neural network together. - encoded_vector = concat_layer(input=[src_forward, src_backward]) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) # Project encoding vector to decoder_size. - encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], - size = decoder_size) + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) # Compute the first instance of the backward RNN. - backward_first = first_seq(input=src_backward) + backward_first = paddle.layer.first_seq(input=src_backward) # Project the first instance of backward RNN to decoder size. - decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`: .. code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] - trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) group_inputs.append(trg_embedding) # For decoder equipped with attention mechanism, in training, @@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. # StaticInput means the same value is utilized at different time steps. # Otherwise, it is a sequence input. Inputs at different time steps are different. # All sequence inputs should have the same length. - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function: @@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th # Defines the memory of the decoder. # The output of this memory is defined in gru_step. # Notice that the name of gru_step should be the same as the name of this memory. - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) # Compute attention weighted encoder vector. - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) # Mix the current word embedding and the attention weighted encoder vector. - decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), - full_matrix_projection(current_word)], - size = decoder_size * 3) + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) # Define Gated recurrent unit recurrent neural network step function. - gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) # Defines the output function. - out = mixed_layer(input=[full_matrix_projection(input=gru_step)], - size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) return out @@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice - :code:`eos_id`: the end token. Every sentence ends with the end token. - :code:`beam_size`: the beam size used in beam search. - :code:`max_length`: the maximum length of the generated sentences. - -* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set: - - - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files. - - :code:`dict_file`: the dictionary file for converting word id to word. - - :code:`result_file`: the path of the generation result file. The code is listed below: .. code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] # In generation, decoder predicts a next target word based on # the encoded source sequence and the last generated target word. # The encoded source sequence (encoder's output) must be specified by # StaticInput which is a read-only memory. # Here, GeneratedInputs automatically fetchs the last generated word, # which is initialized by a start mark, such as . - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, # Beginnning token. - eos_id=1, # End of sentence token. - beam_size=beam_size, - max_length=max_length) + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginnning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) + return beam_gen -Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details. +Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment `_ for more details. -The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`. +The full configuration file is located at `book/08.machine_translation/train.py `_ . diff --git a/doc/tutorials/sentiment_analysis/bi_lstm.jpg b/doc/howto/deep_model/rnn/src/bi_lstm.jpg similarity index 100% rename from doc/tutorials/sentiment_analysis/bi_lstm.jpg rename to doc/howto/deep_model/rnn/src/bi_lstm.jpg diff --git a/doc/tutorials/text_generation/encoder-decoder-attention-model.png b/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png similarity index 100% rename from doc/tutorials/text_generation/encoder-decoder-attention-model.png rename to doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..4a80a5245102fb992f513a749f6a02e1130188af --- /dev/null +++ b/doc/howto/dev/build_cn.md @@ -0,0 +1,124 @@ +# 用Docker编译和测试PaddlePaddle + +## 需要的软硬件 + +为了开发PaddlePaddle,我们需要 + +1. 一台电脑,可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统,以及 +1. Docker。 + +不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker image 里。 + +## 总体流程 + +1. 获取源码 + + ```bash + git clone https://github.com/paddlepaddle/paddle + ``` + +2. 安装开发工具到 Docker image 里 + + ```bash + cd paddle; docker build -t paddle:dev . + ``` + + 请注意这个命令结尾处的 `.`;它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile),按照其内容创建一个名为 `paddle:dev` 的 Docker image,并且把各种开发工具安装进去。 + +3. 编译 + + 以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image,同时把当前目录(源码树根目录)映射为 container 里的 `/paddle` 目录,并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码,结果输出到 `/paddle/build`,也就是本地的源码树根目录里的 `build` 子目录。 + + ```bash + docker run --rm -v $PWD:/paddle paddle:dev + ``` + + 上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本,可以用 + + ```bash + docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev + ``` + +4. 运行单元测试 + + 用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试: + + ```bash + NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + 如果编译的时候我们用了 `WITH_GPU=OFF` 选项,那么编译过程只会产生 CPU-based 单元测试,那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要: + + ```bash + docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + 有时候我们只想运行一个特定的单元测试,比如 `memory_test`,我们可以 + + ```bash + nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test" + ``` + +5. 清理 + + 有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要: + + ```bash + rm -rf build + ``` + +## 为什么要 Docker 呀? + +- 什么是 Docker? + + 如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。 + +- Docker 还是虚拟机? + + 有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。 + +- 为什么用 Docker? + + 把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。 + + 另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。 + +- 我可以选择不用Docker吗? + + 当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。 + +- 学习 Docker 有多难? + + 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。 + +- 我可以用 IDE 吗? + + 当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。 + + 很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行 + + ```emacs + (global-set-key "\C-cc" 'compile) + (setq compile-command + "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + ``` + + 就可以按 `Ctrl-C` 和 `c` 键来启动编译了。 + +- 可以并行编译吗? + + 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。 + +## 可能碰到的问题 + +- Docker 需要 sudo + + 如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。 + +- 在 Windows/MacOS 上编译很慢 + + Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。 + +- 磁盘不够 + + 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。 diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md new file mode 100644 index 0000000000000000000000000000000000000000..91c41ef8ce3abdec5d69a9cbcebbc49b17d8f663 --- /dev/null +++ b/doc/howto/dev/build_en.md @@ -0,0 +1,124 @@ +# Build using Docker + +## What Developers Need + +To contribute to PaddlePaddle, you need + +1. A computer -- Linux, BSD, Windows, MacOS, and +1. Docker. + +Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image. + +## General Process + +1. Retrieve source code. + + ```bash + git clone https://github.com/paddlepaddle/paddle + ``` + +2. Install build tools into a Docker image. + + ```bash + cd paddle; docker build -t paddle:dev . + ``` + + Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it. + +3. Build from source. + + This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile. `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer. + + ```bash + docker run -v $PWD:/paddle paddle:dev + ``` + + Above command builds a CUDA-enabled version. If we want to build a CPU-only version, we can type + + ```bash + docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev + ``` + +4. Run unit tests. + + To run all unit tests using the first GPU of a node: + + ```bash + NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them. We can just run + + ```bash + docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + Sometimes we want to run a specific unit test, say `memory_test`, we can run + + ```bash + nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test" + ``` + +5. Clean Build. + + Sometimes, we might want to clean all thirt-party dependents and built binaries. To do so, just + + ```bash + rm -rf build + ``` + +## Docker, Or Not? + +- What is Docker? + + If you haven't heard of it, consider it something like Python's virtualenv. + +- Docker or virtual machine? + + Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance. + +- Why Docker? + + Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help. + + Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want. + +- Can I choose not to use Docker? + + Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer. This document exists because Docker would make the development way easier. + +- How difficult is it to learn Docker? + + It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have. + +- Can I use my favorite IDE? + + Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like. + + Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file: + + ```emacs + (global-set-key "\C-cc" 'compile) + (setq compile-command + "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + ``` + + so they could type `Ctrl-C` and `c` to build PaddlePaddle from source. + +- Does Docker do parallel building? + + Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores. + +## Some Gotchas + +- Docker requires sudo + + An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo. + +- Docker on Windows/MacOS builds slowly + + On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details. + +- Not enough disk space + + Examples in this article uses option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/). diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index 775938612e8d213b92e2eb69dae805838dc5ae96..699390145226ec2b65fdf5122db187e1d30d669e 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -7,6 +7,7 @@ - 确保编译器选项 `WITH_STYLE_CHECK` 已打开,并且编译能通过代码样式检查。 - 所有代码必须具有单元测试。 - 通过所有单元测试。 +- 请遵守[提交代码的一些约定](#提交代码的一些约定)。 以下教程将指导您提交代码。 ## [Fork](https://help.github.com/articles/fork-a-repo/) @@ -83,7 +84,7 @@ no changes added to commit (use "git add" and/or "git commit -a") ➜ docker build -t paddle:dev . ``` -随后可以用这个开发镜像开build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: +随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash ➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev @@ -217,3 +218,22 @@ upstream ``` 至此,我们就完成了一次代码贡献的过程。 + +## 提交代码的一些约定 + +为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: +1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。 +2. 提交PUll Request前: + - 请注意commit的数量: + - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 + - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。 + - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。 +3. 如果解决了某个Issue的问题,请在该PUll Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 + +此外,在回复评审人意见时,请您遵守以下约定: +1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢): + - 对评审意见同意且按其修改完的,给个简单的`Done`即可; + - 对评审意见不同意的,请给出您自己的反驳理由。 +2. 如果评审意见比较多: + - 请给出总体的修改情况。 + - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md deleted file mode 100644 index 9b0d3e83c0dc264650eda73e6801c60a75439b4a..0000000000000000000000000000000000000000 --- a/doc/howto/dev/contribute_to_paddle_en.md +++ /dev/null @@ -1,146 +0,0 @@ -# Contribute Code - -We sincerely appreciate your contributions. You can use fork and pull request -workflow to merge your code. - -## Code Requirements -- Your code must be fully documented by - [doxygen](http://www.stack.nl/~dimitri/doxygen/) style. -- Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler - passes the code style check. -- All code must have unit test. -- Pass all unit tests. - -The following tutorial guides you into submitting your contibution. - -## [Creating a Fork](https://help.github.com/articles/fork-a-repo/) - -Just head over to the GitHub page and click the "Fork" button. -It's just that simple. - -## Clone - -Paddle is currently using [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/). -The **develop** is the main branch, and other user's branches are feature branches. - -Once you've created a fork, you can use your favorite git client to clone your -repo or just head straight to the command line: - -```shell -# Clone your fork to your local machine -git clone --branch develop https://github.com/USERNAME/Paddle.git -``` -If your repository doesn't contain **develop** branch, just create it by your own. - -```shell -git clone https://github.com/USERNAME/Paddle.git Paddle -cd Paddle -git checkout -b develop # create develop branch. -git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle -git pull upstream develop # update to upstream -``` - -Then you can start to develop by making a local developement branch - -```shell -git checkout -b MY_COOL_STUFF_BRANCH -``` - -## Using `pre-commit` hook - -Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git -pre-commit hooks. It can help us format source codes (cpp, python), check some -basic thing before commit (only one EOL for each file, do not add a huge file -in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every -PR doesn't fit hook can not be merged into Paddle. - -To use [pre-commit](http://pre-commit.com/), you should install it by -`pip install pre-commit`, and currently, Paddle uses `clang-format` to format -c/cpp sources. Please make sure clang-format 3.8+ installed. - -Then just run `pre-commit install` in your Paddle clone directory. When you -commit your code, the pre-commit hook will check the local code if there is -anything not suitable to commit, and so on. - -## Commit - -Commit your changes by following command lines: - -```shell -# show the working tree status -git status -# add modified files -git add xx -env EDITOR=vim git commit # You can write your comments by vim/nano/emacs. -``` -The first line of commit infomation is the title. The second and later lines -are the details if any. - -## Keeping Fork Up to Date - -Before pull your request, you should sync your code from the latest PaddlePaddle. -To do this, you'll need to add a remote at first: - -```shell -# see the current configured remote repository -git remote -v -# add upstream repository -git remote add upstream https://github.com/PaddlePaddle/Paddle.git -# verify the new upstream -git remote -v -``` - -Update your fork with the latest upstream changes: - -```shell -git pull --rebase upstream develop -``` - -If there are no unique commits locally, git will simply perform a fast-forward. -However, if you have been making changes (in the vast majority of cases you -probably shouldn't be), you may have to deal with conflicts. - -Now, your local master branch is up-to-date with everything modified upstream. - -## Push to GitHub - -```shell -# push to your repository in Github -git push -u origin MY_COOL_STUFF_BRANCH # create remote branch MY_COOL_STUFF_BRANCH to origin. -``` - -## Pull Request - -Go to the page for your fork on GitHub, select your development branch, -and click the **pull request button**. - -## Update your pull request with the lastest version - -During the code review, your pull request may become stale because new commits in -baidu/Paddle. GitHub allows autmotic update if there is no conflict. You can do this -by clicking the "Update Branch" button in your pull request page. However, in the case -of conflict, you need to do the update manually. You need to do the following on -your local repository: -```shell -git checkout MY_COOL_STUFF_BRANCH -git pull upstream develop -# You may need to resolve the conflict according to the git prompt. -# Make and test your code. -git push origin MY_COOL_STUFF_BRANCH -``` -Now your Pull Request is updated with the latest version. - -## Revise your pull request - -When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull requrest and the old pull request. - -The possible commands are - -```shell -git checkout MY_COOL_STUFF_BRANCH -git pull upstream develop # update local to newest code base. -# May be some conflicts will occured. -# And develop your cool stuff -env EDITOR=vim git commit # add your revise log -git push origin MY_COOL_STUFF_BRANCH -``` diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md new file mode 120000 index 0000000000000000000000000000000000000000..c97564d93a7f0a753a23cd97d2467d595bd154ff --- /dev/null +++ b/doc/howto/dev/contribute_to_paddle_en.md @@ -0,0 +1 @@ +../../../CONTRIBUTING.md \ No newline at end of file diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst index 9489a921c70ad6ee5709f46445554f5d9640162c..75037e693b32f923ee7dc9dfec322495fe4ce10a 100644 --- a/doc/howto/dev/new_layer_cn.rst +++ b/doc/howto/dev/new_layer_cn.rst @@ -37,7 +37,7 @@ \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -假设 :math:`z = f(W^T x + b)` ,那么 +假设 :math:`z = W^T x + b` ,那么 .. math:: diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644 --- a/doc/howto/dev/new_layer_en.rst +++ b/doc/howto/dev/new_layer_en.rst @@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu. -The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. +The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. Suppose our loss function is :math:`c(y)`, then @@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -Suppose :math:`z = f(W^T x + b)`, then +Suppose :math:`z = W^T x + b`, then .. math:: @@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class. Then, for fully connected layer, we need to compute: .. math:: - + \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`. @@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes /* weight */ true); } } - + If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. .. code-block:: bash diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..6cfc9536f20e88571a9845a50be0341fe4d9f78b --- /dev/null +++ b/doc/howto/dev/new_op_cn.md @@ -0,0 +1,332 @@ +# 如何写新的Operator + + - [概念简介](#概念简介) + - [实现C++类](#实现C++类) + - [定义ProtoMaker类](#定义ProtoMaker类) + - [定义Operator类](#定义Operator类) + - [定义OpKernel类](#定义OpKernel类) + - [注册Operator](#注册Operator) + - [编译](#编译) + - [绑定Python](#绑定Python) + - [实现单元测试](#实现单元测试) + - [前向Operator单测](#前向Operator单测) + - [反向Operator单测](#反向Operator单测) + - [编译和执行](#编译和执行) + + +## 概念简介 + +简单介绍需要用到基类,详细介绍请参考设计文档。 + +- `framework::OperatorBase`: Operator(简写,Op)基类。 +- `framework::OpKernel`: Op计算函数的基类,称作Kernel。 +- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 +- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 + +依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: + + + 内容 | 定义位置 +-------------- | :---------------------- +OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake +Op定义 | `.cc`文件 +Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。 +注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中 + + +实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** + + +下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 + + +## 实现C++类 + + +### 1. 定义ProtoMaker类 + +矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 + +首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释: + +```cpp +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), 2D tensor of size (M x K)"); + AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); + AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); + AddComment(R"DOC( +Two Element Mul Operator. +The equation is: Out = X * Y +)DOC"); + } +}; +``` + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数: + + - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 + - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 + +构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。 + +上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。 + + +再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例: + +```cpp +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of scale operator.").NotInGradient(); + AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); + AddComment(R"DOC(Scale operator +The equation is: Out = scale*X +)DOC"); + AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + } +}; +``` + +这个例子有两处不同: + +- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中,如果Op的某个输入不参与反向梯度的计算,请显示地调用`.NotInGradient()`进行设置。 + +- `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 + + +### 2. 定义Operator类 + +下面的点实现了MulOp的定义: + +```cpp +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; +``` + +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员: + +```cpp +using framework::OperatorWithKernel::OperatorWithKernel; +``` + +这句表示使用基类`OperatorWithKernel`的构造函数,也可写成: + +```cpp +MulOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} +``` + +还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是: + + - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。 + - 2). 设置输出Tensor的形状。 + +通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 + +### 3. 定义OpKernel类 + +`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: + +- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 + +- `typename T` : 表示数据类型,如`float`, `double`等。 + +需要为`MulKernel`类重写`Compute`接口。 +- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`。 +- 与`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。 +- `Compute`函数里实现`OpKernel`的具体计算逻辑。 + +下面是 `MulKernel` `Compute`的实现: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto* device_context = + const_cast(context.device_context_); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } + }; + ``` + +需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** + +`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 + +为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 + + +到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 +反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 + +### 4. 注册Operator + +- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 + + ```cpp + namespace ops = paddle::operators; + REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + 在上面的代码中: + + - `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。 + - `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。 + - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 + + +- 在 `.cu`文件中注册GPU Kernel。 + - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: + + ```cpp + // if use Eigen unsupported module before include head files + // #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_GPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + +### 5. 编译 + +运行下面命令可以进行编译: + +``` +make mul_op +``` + +## 绑定Python + +系统会对新增的op自动绑定Python,并链接到生成的lib库中。 + +## 实现单元测试 + +单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 + +### 前向Operator单元测试 + +前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要: + +1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 +2. 生成随机的输入数据。 +3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 + + + ```python + import unittest + import numpy as np + from gradient_checker import GradientChecker, create_op + from op_test_util import OpTestMeta + + class TestMulOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + ``` + +上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: + +- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 +- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 +- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 + + +### 反向Operator单元测试 + +反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。 + +```python +class TestMulGradOp(GradientChecker): + def setUp(self): + self.op = create_op("mul") + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + + def test_check_grad_normal(self): + # mul op will enlarge the relative error + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) +``` + +下面解释代码中一些关键的地方: + +- 调用`create_op("mul")`创建反向Op对应的前向Op。 +- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 + - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 + - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 + - 第三个参数`max_relative_error`:指定检测梯度时能容忍的最大错误值。 +- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。 + + +### 编译和执行单元测试 + +`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 + +请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试: + +```bash +make test ARGS="-R test_mul_op -V" +``` + +或者: + +```bash +ctest -R test_mul_op +``` + +## 注意事项 + +- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。 +- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。 +- 如果Op没有实现GPU Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 +- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md new file mode 100644 index 0000000000000000000000000000000000000000..1e88e1f5b4df710f1b69f0305d8d8a2921c4249a --- /dev/null +++ b/doc/howto/dev/new_op_en.md @@ -0,0 +1,342 @@ +# How to write a new operator + + - [Background](#background) + - [Implementing C++ Types](#implementing-c++-types) + - [Defining ProtoMaker](#defining-protoMaker) + - [Defining Operator](#defining-operator) + - [Registering Operator](#registering-operator) + - [Compilation](#compilation) + - [Python Binding](#python-binding) + - [Unit Tests](#unit-tests) + - [Testing Forward Operators](#testing-forward-operators) + - [Testing Backward Operators](#testing-backward-operators) + - [Compiling and Running](#compiling-and-running) + - [Remarks](#remarks) +## Background + +Here are the base types needed. For details, please refer to the design docs. + +- `framework::OperatorBase`: Operator (Op)base class. +- `framework::OpKernel`: Base class for Op computation. +- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation. +- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API. + +An operator can be differentiated by whether in has kernel methods. An operator with kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information: + + + Information | Where is it defined +-------------- | :---------------------- +OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface. +Op definition | `.cc` files +Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files. +Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation. + + +New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. ** + + +Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel. + + +## Implementing C++ Types + + +### 1. Defining Class ProtoMaker + +Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output. + +First, define `ProtoMaker` to describe the Operator's input, output, and additional comments: + +```cpp +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), 2D tensor of size (M x K)"); + AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); + AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); + AddComment(R"DOC( +Two Element Mul Operator. +The equation is: Out = X * Y +)DOC"); + } +}; +``` + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor: + + - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces. + - `framework::OpAttrChecker` is used to validate variable attributes. + +The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`. + +The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). + + +An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows: + +```cpp +template +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of scale operator.").NotInGradient(); + AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); + AddComment(R"DOC(Scale operator +The equation is: Out = scale*X +)DOC"); + AddAttr("scale", "scale of scale operator.").SetDefault(1.0); + } +}; +``` + +There are two changes in this example: + +- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`. + +- `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. + + +### 2. Defining Operator + +The following code defines the interface for MulOp: + +```cpp +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; +``` + +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member + +```cpp +using framework::OperatorWithKernel::OperatorWithKernel; +``` + +expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as + +```cpp +MulOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} +``` + +`InferShape` interface needs to be re-written.`InferShape` is a constant method and cannot modify Op's member variables, its constant member `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. It functions to + + - 1). validate and error out early: it checks input data dimensions and types. + - 2). configures the tensor shape in the output. + +Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. + +### 3. Defining OpKernel + +`MulKernel` inherits `framework::OpKernel`, which includes the following templates: + +- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). + +- `typename T` denotes data type, such as `float` or `double`. + +`MulKernel` types need to rewrite the interface for `Compute`. +- `Compute` takes one input variable `const framework::ExecutionContext& context`. +- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables. +- `Compute` implements the computation logics of an `OpKernel`. + +`MulKernel`'s implementation of `Compute` is as follows: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto* device_context = + const_cast(context.device_context_); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } + }; + ``` + +Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** + +`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). + +To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). + + +This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file. + +The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**. + +### 4. Registering Operator + +- In `.cc` files, register forward and backward operator classes and the CPU kernel. + + ```cpp + namespace ops = paddle::operators; + REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + In that code block, + + - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`. + - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient. + - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. + + +- Registering GPU Kernel in `.cu` files + - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as + + ```cpp + // if use Eigen unsupported module before include head files + #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_GPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + +### 5. Compilation + +Run the following commands to compile. + +``` +make mul_op +``` + +## Python Binding + +The system will automatically bind to Python and link it to a generated library. + +## Unit Tests + +Unit tests for an operator include + +1. comparing a forward operator's implementations on different devices, + +2. comparing a backward operator's implementation on different devices, and + +3. a scaling test for the backward operator. + +Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py). + +### Testing Forward Operators + +A forward operator unit test inherits `unittest.TestCase` and defines metaclass `__metaclass__ = OpTestMeta`. More concrete tests are performed in `OpTestMeta`. Testing a forward operator requires the following: + +1. Defining input, output and relevant attributes in `setUp` method. + +2. Generating random input data. + +3. Implementing the same computation logic in a Python script: + + ```python + import unittest + import numpy as np + from gradient_checker import GradientChecker, create_op + from op_test_util import OpTestMeta + + class TestMulOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + ``` +Get its output, and compare it with the forward operator's own output. + +The code above first loads required packages. In addition, we have + +- `self.type = "mul" ` defines the type that is identical to what the operator's registered type. +- `self.inputs` defines input, with type `numpy.array` and initializes it. +- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. + +### Testing Backward Operators + +A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**. + +```python +class TestMulGradOp(GradientChecker): + def setUp(self): + self.op = create_op("mul") + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + + def test_check_grad_normal(self): + # mul op will enlarge the relative error + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) +``` + +Some key points in the code above include: + +- `create_op("mul")` creates the backward operator's corresponding forward operator. +- `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. + - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. + - The second variable `"Out"` points to the network's final output target `Out`. + - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests. +- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input. + +### Compiling and Running + + +Any new unit testing file of the format `test_*.py` added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile. + +Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`. + +After successfully compiling the project, run the following command to run unit tests: + +```bash +make test ARGS="-R test_mul_op -V" +``` + +Or, + +```bash +ctest -R test_mul_op +``` + +## Remarks + +- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. +- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. +- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. +- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/howto/dev/use_eigen_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..1367323b71277984834d9d4f0d9bea0f69478479 --- /dev/null +++ b/doc/howto/dev/use_eigen_cn.md @@ -0,0 +1,146 @@ +## 在Paddle中如何使用Eigen + +神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。 + + +### Eigen Tensor模块 + +Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 + +关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) + + +### paddle::framework::Tensor + +Paddle Tensor定义在framework目录下,其主要接口如下: + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ + DDim dim_; +}; +``` + +`Placeholder`的作用是延迟分配内存,即我们可以先定义一个Tensor,然后使用Resize接口设置Tensor的大小,最后再调用mutable_data接口分配实际的内存。 + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + +### paddle::framework::Tensor使用样例 +下面以AddOp为例说明Tensor的使用过程: + +- InferShape + +在运行神经网络计算图时,我们先调用每个`Operator`的`InferShape`接口,根据输入Tensor的大小来设置输出Tensor的大小,`Resize`接口会被调用。 + +```cpp +void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); +} +``` + + +- Run + +`Operator`的`Run`接口最终会调用对应`OpKernel`的`Compute`接口,在这时真正的分配内存,`mutable_data`接口会被调用。 + +```cpp +void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input0); + auto y = EigenVector::Flatten(*input1); + auto z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + z.device(place) = x + y; +} +``` + + +### paddle::framework::Tensor到EigenTensor的转换 + +如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。 + +以EigenTensor为例,做一个介绍 + +```cpp +Tensor t; +float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); +for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); +} + +EigenTensor::Type et = EigenTensor::From(t); +``` + +From是EigenTensor模板提供的一个接口,可以实现从paddle::framework::Tensor到对EigenTensor的转换。由于Tensor的rank是模板参数,因此在转换时需要显示的指定。 + +在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。 + +更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)。 + + + +### 实现计算 + +当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。 + +```cpp +auto x = EigenVector::Flatten(*input0); +auto y = EigenVector::Flatten(*input1); +auto z = EigenVector::Flatten(*output); +auto place = context.GetEigenDevice(); +z.device(place) = x + y; +``` + +在这段代码中,input0/input1/output可以是任意维度的Tensor。我们调用了EigenVector的Flatten接口,把任意维度的Tensor转为了一维的EigenVector。而在计算结束之后,input0/input1/output的原有shape信息不变。如果想改变原有Tensor的shape信息,可以调用Resize接口进行改变。 + +由于Eigen Tensor模块的文档较少,我们可以参考TensorFlow的[kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels)模块下的相关`OpKernel`的计算代码。 diff --git a/doc/howto/dev/use_eigen_en.md b/doc/howto/dev/use_eigen_en.md new file mode 100644 index 0000000000000000000000000000000000000000..e169106e12f5d62696f1f0e7163562793b32c18c --- /dev/null +++ b/doc/howto/dev/use_eigen_en.md @@ -0,0 +1,146 @@ +## How to use Eigen in Paddle + +Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`. + + +### Eigen Tensor Module + +The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU. + +Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse. + +For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md). + + +### paddle::framework::Tensor + +Paddle Tensor's is defined in the framework directory with the following interface: + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ + DDim dim_; +}; +``` + +`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory. + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data(place); +``` + +### paddle::framework::Tensor Usage +`AddOp` demonstrates Tensor's usage. + +- InferShape + +When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor. + +```cpp +void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); +} +``` + + +- Run + +```cpp +void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input0); + auto y = EigenVector::Flatten(*input1); + auto z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + z.device(place) = x + y; +} +``` + + +### paddle::framework::Tensor到EigenTensor的转换 + +As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. + +Using EigenTensor as an example: + +```cpp +Tensor t; +float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); +for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); +} + +EigenTensor::Type et = EigenTensor::From(t); +``` + +`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation. + +In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector. + +For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file. + + + +### Implementing Computation + +While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor. + +```cpp +auto x = EigenVector::Flatten(*input0); +auto y = EigenVector::Flatten(*input1); +auto z = EigenVector::Flatten(*output); +auto place = context.GetEigenDevice(); +z.device(place) = x + y; +``` + +In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface. + +Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels). diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index 5051a892304fdc8b0f1a19a7d4560d5ee007c47d..1bc947c260d7adb75ee5a2bb10e6b91bc0be2d4c 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -3,52 +3,109 @@ ################## PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 +也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下 +如何构建文档 +============ -如何构建PaddlePaddle的文档 -========================== +PaddlePaddle的文档构建有三种方式。 -PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。构建PaddlePaddle文档需要准备的环境相对较复杂,所以我们推荐使用基于Docker来构建PaddlePaddle的文档。 +使用PaddlePaddle.org工具 +-------------- +这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 -使用Docker构建PaddlePaddle的文档 --------------------------------- +文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具 -使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 + +如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle -.. code-block:: bash + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git - cd TO_YOUR_PADDLE_CLONE_PATH - cd paddle/scripts/tools/build_docs - bash build_docs.sh + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver -编译完成后,该目录下会生成如下两个子目录\: +工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 -* doc 英文文档目录 -* doc_cn 中文文档目录 +想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 +使用Docker构建 +-------------- + +使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即 + +.. code-block:: bash + + cd TO_YOUR_PADDLE_CLONE_PATH + cd paddle/scripts/tools/build_docs + sh build_docs.sh + +编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 打开浏览器访问对应目录下的index.html即可访问本地文档。 -.. code-block:: bash +直接构建 +-------- + +如果提示正确,可以执行以下命令编译生成文档,即 - open doc_cn/index.html +.. code-block:: bash + cd TO_YOUR_PADDLE_CLONE_PATH + mkdir -p build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON + make gen_proto_py + make paddle_docs paddle_docs_cn -直接构建PaddlePaddle的文档 --------------------------- +编译完成之后,会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。 +打开浏览器访问对应目录下的index.html即可访问本地文档。 -TBD -如何书写PaddlePaddle的文档 -========================== +如何书写文档 +============ -TBD +PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 -如何更新www.paddlepaddle.org文档 -================================ +如何更新www.paddlepaddle.org +============================ -TBD +更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 -.. _cmake: https://cmake.org/ -.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..b3ef07eb1d0012827df8e6a4f27c5fa643649492 --- /dev/null +++ b/doc/howto/dev/write_docs_en.rst @@ -0,0 +1,80 @@ +################## +Contribute Documentation +################## + +PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. +Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories. +When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content + +How to Build Documentations +============ + +We recommend using PaddlePaddle.org tool to build documentation + + +Use PaddlePaddle.org tool +-------------- +This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser. + +The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories. You may only clone the contents you need + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + + +If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + +If you want to learn more on the PaddlePaddle.org, please `click here `_ 。 + +How to write Documentations +============ + +PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail. + + +How to update www.paddlepaddle.org +============================ + +Please create PRs and submit them to github, please check `Contribute Code `_ 。 +PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and +`English Docs `_ 。 + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 26449a6365843b526b3ac3111b337d2f17524c9d..991b9e2596a3b499846b963152c838d66260265d 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -19,8 +19,8 @@ .. toctree:: :maxdepth: 1 - dev/write_docs_cn.rst dev/contribute_to_paddle_cn.md + dev/write_docs_cn.rst 模型配置 -------- diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst index 1fbfcd260b912078f00ed5b720ed607db725c4e2..61bf25ccd12eeedffc747fdd4ce84fa4adde07ee 100644 --- a/doc/howto/index_en.rst +++ b/doc/howto/index_en.rst @@ -20,6 +20,7 @@ Development dev/new_layer_en.rst dev/contribute_to_paddle_en.md + dev/write_docs_en.rst Configuration ------------- diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md new file mode 100644 index 0000000000000000000000000000000000000000..1775374cf6e518586c28bbd8e04946c74df7e4c5 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling.md @@ -0,0 +1,197 @@ +This tutorial introduces techniques we use to profile and tune the +CPU performance of PaddlePaddle. We will use Python packages +`cProfile` and `yep`, and Google's `perftools`. + +Profiling is the process that reveals performance bottlenecks, +which could be very different from what's in the developers' mind. +Performance tuning is done to fix these bottlenecks. Performance optimization +repeats the steps of profiling and tuning alternatively. + +PaddlePaddle users program AI applications by calling the Python API, which calls +into `libpaddle.so.` written in C++. In this tutorial, we focus on +the profiling and tuning of + +1. the Python code and +1. the mixture of Python and C++ code. + +## Profiling the Python Code + +### Generate the Performance Profiling File + +We can use Python standard +package, [`cProfile`](https://docs.python.org/2/library/profile.html), +to generate Python profiling file. For example: + +```bash +python -m cProfile -o profile.out main.py +``` + +where `main.py` is the program we are going to profile, `-o` specifies +the output file. Without `-o`, `cProfile` would outputs to standard +output. + +### Look into the Profiling File + +`cProfile` generates `profile.out` after `main.py` completes. We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file. + +Open the Web browser and points to the local IP and the specifies +port, we will see the output like the following: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +where each line corresponds to Python function, and the meaning of +each column is as follows: + +| column | meaning | +| --- | --- | +| ncalls | the number of calls into a function | +| tottime | the total execution time of the function, not including the + execution time of other functions called by the function | +| percall | tottime divided by ncalls | +| cumtime | the total execution time of the function, including the execution time of other functions being called | +| percall | cumtime divided by ncalls | +| filename:lineno(function) | where the function is defined | + +### Identify Performance Bottlenecks + +Usually, `tottime` and the related `percall` time is what we want to +focus on. We can sort above profiling file by tottime: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At this +moment, let's look into the third function `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: + +``` +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +The lists of the callers of `sync_with_cpp` might help us understand +how to improve the function definition. + +## Profiling Python and C++ Code + +### Generate the Profiling File + +To profile a mixture of Python and C++ code, we can use a Python +package, `yep`, that can work with Google's `perftools`, which is a +commonly-used profiler for C/C++ code. + +In Ubuntu systems, we can install `yep` and `perftools` by running the +following commands: + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +Then we can run the following command + +```bash +python -m yep -v main.py +``` + +to generate the profiling file. The default filename is +`main.py.prof`. + +Please be aware of the `-v` command line option, which prints the +analysis results after generating the profiling file. By examining the + the print result, we'd know that if we stripped debug +information from `libpaddle.so` at build time. The following hints +help make sure that the analysis results are readable: + +1. Use GCC command line option `-g` when building `libpaddle.so` so to + include the debug information. The standard building system of + PaddlePaddle is CMake, so you might want to set + `CMAKE_BUILD_TYPE=RelWithDebInfo`. + +1. Use GCC command line option `-O2` or `-O3` to generate optimized + binary code. It doesn't make sense to profile `libpaddle.so` + without optimization, because it would anyway run slowly. + +1. Profiling the single-threaded binary file before the + multi-threading version, because the latter often generates tangled + profiling analysis result. You might want to set environment + variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically + starting multiple threads. + +### Examining the Profiling File + +The tool we used to examine the profiling file generated by +`perftools` is [`pprof`](https://github.com/google/pprof), which +provides a Web-based GUI like `cprofilev`. + +We can rely on the standard Go toolchain to retrieve the source code +of `pprof` and build it: + +```bash +go get github.com/google/pprof +``` + +Then we can use it to profile `main.py.prof` generated in the previous +section: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +Where `-http` specifies the IP and port of the HTTP service. +Directing our Web browser to the service, we would see something like +the following: + +![result](./pprof_1.png) + +### Identifying the Performance Bottlenecks + +Similar to how we work with `cprofilev`, we'd focus on `tottime` and +`cumtime`. + +![kernel_perf](./pprof_2.png) + +We can see that the execution time of multiplication and the computing +of the gradient of multiplication takes 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. + +`pprof` would mark performance critical parts of the program in +red. It's a good idea to follow the hints. diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..14eba0e2f34b115f5cd24920b5b1af07ec953d00 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling_cn.md @@ -0,0 +1,155 @@ +此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 + +Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 + +PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: + +* Python 代码的性能分析 +* Python 与 C++ 混合代码的性能分析 + + +## Python代码的性能分析 + +### 生成性能分析文件 + +Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: + +```bash +python -m cProfile -o profile.out main.py +``` + +其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 + +### 查看性能分析文件 + +`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 + +用Web浏览器访问对应网址,即可显示性能分析的结果: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +每一列的含义是: + +| 列名 | 含义 | +| --- | --- | +| ncalls | 函数的调用次数 | +| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | +| percall | tottime的每次调用平均时间 | +| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | +| percall | cumtime的每次调用平均时间 | +| filename:lineno(function) | 文件名, 行号,函数名 | + + +### 寻找性能瓶颈 + +通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 + +将性能分析结果按照tottime排序,效果如下: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 + +```text +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 + + + +## Python与C++混合代码的性能分析 + +### 生成性能分析文件 + +C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 + +使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +安装完毕后,我们可以通过 + +```bash +python -m yep -v main.py +``` + +生成性能分析文件。生成的性能分析文件为`main.py.prof`。 + +命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: + +1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 +2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 +3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 + +### 查看性能分析文件 + +在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 + +安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: + +```bash +go get github.com/google/pprof +``` + +进而我们可以使用如下命令开启一个HTTP服务: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 + +访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: + +![result](./pprof_1.png) + + +### 寻找性能瓶颈 + +与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 + +例如下图中, + +![kernel_perf](./pprof_2.png) + +在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 + +在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 diff --git a/doc/howto/optimization/pprof_1.png b/doc/howto/optimization/pprof_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/howto/optimization/pprof_1.png differ diff --git a/doc/howto/optimization/pprof_2.png b/doc/howto/optimization/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/howto/optimization/pprof_2.png differ diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md index 274452fbf0c595ad7b4dbeffe85ad9038f12b458..2e98b3de3fe2284375f87e883ff4bac19255dbeb 100644 --- a/doc/howto/usage/cluster/cluster_train_cn.md +++ b/doc/howto/usage/cluster/cluster_train_cn.md @@ -1,145 +1,225 @@ -```eval_rst -.. _cluster_train: +# PaddlePaddle分布式训练 + +* [概述](#概述) +* [环境准备](#环境准备) +* [启动参数说明](#启动参数说明) + * [启动参数服务器](#启动参数服务器) + * [启动计算节点](#启动计算节点) + * [准备数据集](#准备数据集) + * [准备训练程序](#准备训练程序) +* [使用分布式计算平台或工具](#使用分布式计算平台或工具) + * [使用Fabric启动集群作业](#使用fabric启动集群作业) + * [准备一个Linux集群](#准备一个linux集群) + * [启动集群作业](#启动集群作业) + * [终止集群作业](#终止集群作业) + * [检查集群训练结果](#检查集群训练结果) + * [检查模型输出](#检查模型输出) + * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业) + * [准备OpenMPI集群](#准备OpenMPI集群) + * [启动集群作业](#启动集群作业-1) + * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业) + +## 概述 +本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示: + + + +- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。 +- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。 +- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。 + +这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。 + +在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。 + +## 环境准备 + +1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。 +1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。 + +安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`): +```bash +$ paddle version +PaddlePaddle 0.10.0, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF ``` -# 运行分布式训练 +下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。 -在本文中,我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。 +## 启动参数说明 +### 启动参数服务器 +执行以下的命令启动一个参数服务器并等待和计算节点的数据交互 +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 +``` -在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统(如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) )的用户参考。 +如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行: +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` -## 前提条件 +| 参数 | 是否必选 | 默认值 | 说明 | +| ------------- | ------------- | ------------- | ------------- | +| port | 必选 | 7164 | pserver监听的起始端口,根据ports_num决定
总端口个数,从起始端口监听多个端口用于通信 | +| ports_num | 必选 | 1 | 监听的端口个数 | +| ports_num_for_sparse | 必选 | 1 | 用于稀疏类型参数通信的端口个数 | +| num_gradient_servers | 必选 | 1 | 当前训练任务pserver总数 | + +### 启动计算节点 +执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py) +```bash +$ python train.py +``` -1. 上述脚本使用 Python 库 [fabric](http://www.fabfile.org/) 来运行 SSH 命令。 我们使用 `pip` 来安装 fabric: +trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量(https://zh.wikipedia.org/wiki/环境变量 )或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。 - ```bash - pip install fabric - ``` +使用环境变量: -2. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,需要在 `/usr/local/cuda` 中安装 CUDA; 否则 Paddle 将在运行时报错。 +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +``` -3. 在 [`cluster_train/conf.py`] 中设置 `ROOT_DIR`, 该 ROOT_DIR 要在所有节点上存在。为了方便起见,我们通常在所有节点上创建一个 Unix 用户 `paddle`,并设置 `ROOT_DIR=/home/paddle`。这样,我们可以将 SSH 公钥写入 `/home/paddle/.ssh/authorized_keys`,以便用户 `paddle` 可以 SSH 到所有节点而不用密码。 +使用参数: -## 准备工作空间 +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") +``` -我们将放置依赖库、配置等文件的目录视为 *工作空间(workspace)*。 +| 参数 | 是否必选 | 默认 | 说明 | +| ------------- | ------------- | ------------- | ------------- | +| use_gpu | 可选 | False | 是否启用GPU训练 | +| trainer_count | 必选 | 1 | 当前训练任务trainer总个数 | +| port | 必选 | 7164 | 连接到pserver的端口 | +| ports_num | 必选 | 1 | 连接到pserver的端口个数 | +| ports_num_for_sparse | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数 | +| num_gradient_servers | 必选 | 1 | 当前训练任务pserver总数 | +| trainer_id | 必选 | 0 | 每个trainer的唯一ID,从0开始的整数 | +| pservers | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开 | -这些 `train/test` 数据应该在启动集群作业之前准备好。 为了满足训练/测试数据放置在工作空间中不同目录的要求,PADDLE 根据在模型配置文件中使用的名为 `train.list/test.list` 的索引文件引用训练/测试数据,所以训练/测试数据也包含 train.list/test.list 两个列表文件。所有本地训练 demo 已经提供了脚本来帮助您创建这两个文件,并且集群作业中的所有节点将在正常情况下处理具有相同逻辑代码的文件。 -通常,你可以使用本地训练中的相同模型文件进行集群训练。请记住,在模型文件的 `setting`函数中设置的 `batch_size` 表示在集群作业**每个**节点中的 batch 大小,而不是使用同步 SGD 的总 batch 大小。 +### 准备数据集 -以下步骤基于 demo 目录中的 [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation)。 +参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。 -你只需完成 demo/recommendation 教程文档到 `Train` 的部分,之后你会得到训练/测试数据和模型配置文件。最后,只需使用 demo/recommendation 作为集群训练的工作空间。 +在线上系统中,通常会使用MapReduce任务的输出结果作为训练结果,这样训练文件的个数会比较多,而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件: -最后,你的工作空间应如下所示: -``` -. -|-- common_utils.py -|-- data -| |-- config.json -| |-- config_generator.py -| |-- meta.bin -| |-- meta_config.json -| |-- meta_generator.py -| |-- ml-1m -| |-- ml_data.sh -| |-- ratings.dat.test -| |-- ratings.dat.train -| |-- split.py -| |-- test.list -| `-- train.list -|-- dataprovider.py -|-- evaluate.sh -|-- prediction.py -|-- preprocess.sh -|-- requirements.txt -|-- run.sh -`-- trainer_config.py +```python +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) ``` -虽然这些文件并非都需要集群训练,但是也没有必要删除无用的文件。 -`trainer_config.py` -表示模型配置文件。 +示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`): +``` +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` -`train.list` 和 `test.list` -文件索引。它存储当前节点所有训练/测试数据的所有相对或绝对文件路径。 +在进行分布式训练时,每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中,系统会提供一个分布式存储服务,这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储,则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。 -`dataprovider.py` -用于读取训练/测试样本。这与本地训练相同。 +对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。 -`data` -数据目录中的所有文件被 train.list/test.list 引用。 +### 准备训练程序 +我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。 -## 准备集群作业配置 +最后,工作空间应如下所示: +``` +. +|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 +``` -以下选项必须在 cluster_train/conf.py 中认真设置 +- `my_lib.py`:会被`train.py`调用的一些用户定义的库函数,比如PIL库等。 +- `word_dict.pickle`:在`train.py`中会使用到的字典数据文件。 +- `train.py`:训练程序,代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)。***注意:*** 对于本样例代码,在使用不同的分布式计算平台时,您可能需要修改`train.py`开头的部分(如下),以便获得训练数据的位置和获取环境变量配置: -`HOSTS` 所有节点运行集群作业的主机名或 IP 。你还可以将用户和 ssh 端口附加到主机名上,例如 root@192.168.100.17:9090。 + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` -`ROOT_DIR` 用于放置 JOB 工作空间目录的工作空间 ROOT 目录 +- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。 +- `test_data_dir`:包含测试数据集的目录。 -`PADDLE_NIC` 集群通信通道的 NIC(Network Interface Card, 网络接口卡) 接口名称,例如以太网的 eth0,infiniband 的 ib0。 +## 使用分布式计算平台或工具 -`PADDLE_PORT` 集群通信通道的端口号 +PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括: +- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。 +- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。 +- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。 -`PADDLE_PORTS_NUM` 用于集群通信通道的端口数。 如果集群节点数量少(少于5〜6个节点),建议将其设置为较大,如2〜8,以获得更好的网络性能。 +对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。 -`PADDLE_PORTS_NUM_FOR_SPARSE` 用于 sparse remote updater 集群通信信道的端口数。如果使用 sparse remote update,则可以像 `PADDLE_PORTS_NUM` 一样设置。 +在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。 -`LD_LIBRARY_PATH` 为集群作业设置额外的 LD_LIBRARY_PATH。你可以使用它来设置 CUDA 库的路径。 +### 使用Fabric启动集群作业 -默认配置如下: +#### 准备一个Linux集群 +可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。 -```python -HOSTS = [ - "root@192.168.100.17", - "root@192.168.100.18", - ] - -''' -工作空间配置 -''' - -#工作空间根目录 -ROOT_DIR = "/home/paddle" - -''' -网络配置 -''' -#pserver NIC -PADDLE_NIC = "eth0" -#pserver 端口 -PADDLE_PORT = 7164 -#pserver 端口数 -PADDLE_PORTS_NUM = 2 -#pserver sparse ports num -PADDLE_PORTS_NUM_FOR_SPARSE = 2 - -#集群作业中所有进程的环境设置 -LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64" -``` +#### 启动集群作业 -### 启动集群作业 -`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为```paddle.py``` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 +`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。 `paddle.py` 为方便作业启动提供了两个独特的命令选项。 -`job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 conf.py 中设置的所有节点。 它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 -`job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 +- `job_dispatch_package` 设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。 +- `job_workspace` 设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。 -`cluster_train/run.sh` 提供了命令样例来运行 `demo/recommendation` 集群工作,只需用你定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: +`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后: ``` sh run.sh ``` 集群作业将会在几秒后启动。 -### 终止集群作业 +#### 终止集群作业 `paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。 -### 检查集群训练结果 +#### 检查集群训练结果 详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。 `paddle_trainer.INFO` @@ -149,11 +229,57 @@ sh run.sh 提供 pserver 运行日志,有助于诊断分布式错误。 `server.log` -提供 pserver 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 +提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。 `train.log` 提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。 -### 检查模型输出 +#### 检查模型输出 运行完成后,模型文件将被写入节点 0 的 `output` 目录中。 工作空间中的 `nodefile` 表示当前集群作业的节点 ID。 + +### 在OpenMPI集群中提交训练作业 + +#### 准备OpenMPI集群 + +执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点: + +```bash +paddle/scripts/cluster_train_v2/openmpi/docker_cluster +kubectl create -f head.yaml +kubectl create -f mpi-nodes.yaml +``` + +然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。 + +#### 启动集群作业 + +您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务: + +```bash +# 获得head和node节点的IP地址 +kubectl get po -o wide +# 将node节点的IP地址保存到machines文件中 +kubectl get po -o wide | grep nodes | awk '{print $6}' > machines +# 拷贝必要的文件到head节点 +scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ +# ssh 登录到head节点 +ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] +# --------------- 以下操作均在head节点中执行 --------------- +# 准备训练数据 +python prepare.py +# 拷贝训练程序和字典文件到每台MPI节点 +cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial +# 创建日志目录 +mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs +# 拷贝训练数据到各自的节点 +scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial +scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial +scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial +# 启动训练任务 +mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh +``` + +### 在Kubernetes集群中提交训练作业 + +此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。 diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md index c60876721cbf5565d6e48c8061811aacada748cd..baa97c0c02ae490fff8587071bd2d4adfb5325e3 100644 --- a/doc/howto/usage/cluster/cluster_train_en.md +++ b/doc/howto/usage/cluster/cluster_train_en.md @@ -1,129 +1,220 @@ -# Run Distributed Training +# PaddlePaddle Distributed Training + +* [Introduction](#introduction) +* [Preparations](#preparations) +* [Command-line arguments](#command-line-arguments) + * [Starting parameter server](#starting-parameter-server) + * [Starting trainer](#starting-trainer) + * [Prepare Training Dataset](#prepare-training-dataset) + * [Prepare Training program](#prepare-training-program) +* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools) + * [Cluster Training Using Fabric](#cluster-training-using-fabric) + * [Prepare a Linux cluster](#prepare-a-linux-cluster) + * [Launching Cluster Job](#launching-cluster-job) + * [Kill Cluster Job](#kill-cluster-job) + * [Check Cluster Training Result](#check-cluster-training-result) + * [Check Model Output](#check-model-output) + * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi) + * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster) + * [Launching Cluster Job](#launching-cluster-job-1) + * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes) + +## Introduction + +In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job: + + + +- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job. +- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training. +- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers. + +PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD. + +When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient. + +## Preparations +1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". +2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst). + +After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`): + +```bash +$ paddle version +PaddlePaddle 0.10.0rc, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` -In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation). +We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API. -[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s). +## Command-line arguments -## Prerequisite +### Starting parameter server -1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands. We can use `pip` to install fabric: +Type the below command to start a parameter server which will wait for trainers to connect: - ```bash - pip install fabric - ``` +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 +``` -1. We need to install PaddlePaddle on all nodes in the cluster. To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime. +If you wish to run parameter servers in background, and save a log file, you can type: +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` -1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes. For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`. In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password. +| param | required | default | description | +| ------------- | ------------- | ------------- | ------------- | +| port | required | 7164 | port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput | +| ports_num | required | 1 | total number of ports will listen on | +| ports_num_for_sparse | required | 1 | number of ports which serves sparse parameter update | +| num_gradient_servers | required | 1 | total number of gradient servers | -## Prepare Job Workspace +### Starting trainer +Type the command below to start the trainer(name the file whatever you want, like "train.py") -We refer to the directory where we put dependent libraries, config files, etc., as *workspace*. +```bash +$ python train.py +``` -These `train/test` data should be prepared before launching cluster job. To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as `train.list/test.list` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition. +Trainers' network need to be connected with parameter servers' network to finish the job. Trainers need to know port and IPs to locate parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass to `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables. -Generally, you can use same model file from local training for cluster training. What you should have in mind that, the `batch_size` set in `setting` function in model file means batch size in `each` node of cluster job instead of total batch size if synchronization SGD was used. +Use environment viriables: -Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory. +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +python train.py +``` -You just go through demo/recommendation tutorial doc until `Train` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training. +Pass arguments: -At last your workspace should look like as follow: +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") ``` -. -|-- common_utils.py -|-- data -| |-- config.json -| |-- config_generator.py -| |-- meta.bin -| |-- meta_config.json -| |-- meta_generator.py -| |-- ml-1m -| |-- ml_data.sh -| |-- ratings.dat.test -| |-- ratings.dat.train -| |-- split.py -| |-- test.list -| `-- train.list -|-- dataprovider.py -|-- evaluate.sh -|-- prediction.py -|-- preprocess.sh -|-- requirements.txt -|-- run.sh -`-- trainer_config.py + +| param | required | default | description | +| ------------- | ------------- | ------------- | ------------- | +| use_gpu | optional | False | set to "True" to enable GPU training | +| trainer_count | required | 1 | total count of trainers in the training job | +| port | required | 7164 | port to connect to parameter server | +| ports_num | required | 1 | number of ports for communication | +| ports_num_for_sparse | required | 1 | number of ports for sparse type caculation | +| num_gradient_servers | required | 1 | total number of gradient server | +| trainer_id | required | 0 | ID for every trainer, start from 0 | +| pservers | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," | + +### Prepare Training Dataset + +Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files. + +In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers: + +```python +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) ``` -Not all of these files are needed for cluster training, but it's not necessary to remove useless files. -`trainer_config.py` -Indicates the model config file. +Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and`-00002`: -`train.list` and `test.list` -File index. It stores all relative or absolute file paths of all train/test data at current node. +``` +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` -`dataprovider.py` -used to read train/test samples. It's same as local training. +When job started, every trainer needs to get it's own part of data. In some distributed systems a storage service will be provided, so the date under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node. -`data` -all files in data directory are refered by train.list/test.list which are refered by data provider. +Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job. +### Prepare Training program -## Prepare Cluster Job Configuration +We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory. -The options below must be carefully set in cluster_train/conf.py -`HOSTS` all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090. +Your workspace may looks like: +``` +. +|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 +``` -`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory +- `my_lib.py`: user defined libraries, like PIL libs. This is optional. +- `word_dict.pickle`: dict file for training word embeding. +- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrive configuration environment variables: -`PADDLE_NIC` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband. + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + ``` -`PADDLE_PORT` port number for cluster commnunication channel +- `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here. +- `test_data_dir`: containing testing data. -`PADDLE_PORTS_NUM` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance. +## Use cluster platforms or cluster management tools -`PADDLE_PORTS_NUM_FOR_SPARSE` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like `PADDLE_PORTS_NUM` +PaddlePaddle supports running jobs on several platforms including: +- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google. +- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework. +- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster. -`LD_LIBRARY_PATH` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path. +We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2). -Default Configuration as follow: +These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc. -```python -HOSTS = [ - "root@192.168.100.17", - "root@192.168.100.18", - ] - -''' -workspace configuration -''' - -#root dir for workspace -ROOT_DIR = "/home/paddle" - -''' -network configuration -''' -#pserver nics -PADDLE_NIC = "eth0" -#pserver port -PADDLE_PORT = 7164 -#pserver ports num -PADDLE_PORTS_NUM = 2 -#pserver sparse ports num -PADDLE_PORTS_NUM_FOR_SPARSE = 2 - -#environments setting for all processes in cluster job -LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64" -``` +### Cluster Training Using Fabric -### Launching Cluster Job -`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. +#### Prepare a Linux cluster + +Run `kubectl -f ssh_servers.yaml` under the directory: `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get IP addresses of these nodes. + +#### Launching Cluster Job +`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. `paddle.py`provides two distinguished command option for easy job launching. -`job_dispatch_package` set it with local `workspace`directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy. -`job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy +- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which is set in `conf.py`. It could be helpful for frequently manipulating workspace files. otherwise, frequent multi-nodes workspace deployment is very annoying. +- `job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy dispatch latency. `cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: @@ -133,24 +224,70 @@ sh run.sh The cluster Job will start in several seconds. -### Kill Cluster Job -`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should mannally kill job if program crashed. +#### Kill Cluster Job +`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed. -### Check Cluster Training Result +#### Check Cluster Training Result Check log in $workspace/log for details, each node owns same log structure. `paddle_trainer.INFO` -It provides almost all interal output log for training, same as local training. Check runtime model convergence here. +It provides almost all internal output log for training, same as local training. Check runtime model convergence here. `paddle_pserver2.INFO` -It provides pserver running log, which could help to diagnose distributed error. +It provides parameter server running log, which could help to diagnose distributed error. `server.log` -It provides stderr and stdout of pserver process. Check error log if training crashs. +It provides stderr and stdout of parameter server process. Check error log if training crashes. `train.log` -It provides stderr and stdout of trainer process. Check error log if training crashs. +It provides stderr and stdout of trainer process. Check error log if training crashes. -### Check Model Output -After one pass finished, model files will be writed in `output` directory in node 0. +#### Check Model Output +After one pass finished, model files will be written in `output` directory in node 0. `nodefile` in workspace indicates the node id of current cluster job. + +### Cluster Training Using OpenMPI + +#### Prepare an OpenMPI cluster + +Run the following command to start a 3-node MPI cluster and one "head" node. + +```bash +cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster +kubectl create -f head.yaml +kubectl create -f mpi-nodes.yaml +``` + +Then you can log in to every OpenMPI node using ssh without input any passwords. + +#### Launching Cluster Job + +Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\ + +```bash +# find out node IP addresses +kubectl get po -o wide +# generate a "machines" file containing node IP addresses +kubectl get po -o wide | grep nodes | awk '{print $6}' > machines +# copy necessary files onto "head" node +scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ +# login to head node using ssh +ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] +# --------------- in head node --------------- +# prepare training data +python prepare.py +# copy training data and dict file to MPI nodes +cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial +# creat a directory for storing log files +mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs +# copy training data to every node +scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial +scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial +scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial +# start the job +mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh +``` + +### Cluster Training Using Kubernetes + +The details can be found [here](../k8s/k8s_cn.md) diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/usage/cluster/src/trainer.png new file mode 100644 index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0 Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer.png differ diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/usage/cluster/src/trainer_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer_cn.png differ diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..c0940f0e56eafa22f8aeb7052c0ddc79d8862917 --- /dev/null +++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py @@ -0,0 +1,100 @@ +import gzip +import math + +import paddle.v2 as paddle + +embsize = 32 +hiddensize = 256 +N = 5 + + +def wordemb(inlayer): + wordemb = paddle.layer.embedding( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", + initial_std=0.001, + learning_rate=1, + l2_rate=0, + sparse_update=True)) + return wordemb + + +def main(): + # for local training + cluster_train = False + + if not cluster_train: + paddle.init(use_gpu=False, trainer_count=1) + else: + paddle.init( + use_gpu=False, + trainer_count=2, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1) + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. / math.sqrt(embsize * 8), + learning_rate=1)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + with gzip.open("batch-" + str(event.batch_id) + ".tar.gz", + 'w') as f: + trainer.save_parameter_to_tar(f) + result = trainer.test( + paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), 32)) + print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, + parameters, + adagrad, + is_local=not cluster_train) + trainer.train( + paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), + num_passes=30, + event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..2e6d8887124a5524505b097803a60a35478ca644 --- /dev/null +++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py @@ -0,0 +1,123 @@ +import math +import os +import paddle.v2 as paddle +import pickle + +embsize = 32 +hiddensize = 256 +N = 5 +cluster_train_file = "./train_data_dir/train/train.txt" +cluster_test_file = "./test_data_dir/test/test.txt" +node_id = os.getenv("OMPI_COMM_WORLD_RANK") +if not node_id: + raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK") + + +def wordemb(inlayer): + wordemb = paddle.layer.embedding( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", + initial_std=0.001, + learning_rate=1, + l2_rate=0, + sparse_update=True)) + return wordemb + + +def cluster_reader_cluster(filename, node_id): + def cluster_reader(): + with open("-".join([filename, "%05d" % int(node_id)]), "r") as f: + for l in f: + csv_data = [int(cell) for cell in l.split(",")] + yield tuple(csv_data) + + return cluster_reader + + +def main(): + # get arguments from env + + # for local training + TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"] + cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH + use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") + + if not cluster_train: + paddle.init( + use_gpu=use_gpu, + trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1"))) + else: + paddle.init( + use_gpu=use_gpu, + trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")), + port=int(os.getenv("PADDLE_INIT_PORT", "7164")), + ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")), + ports_num_for_sparse=int( + os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")), + num_gradient_servers=int( + os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")), + trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")), + pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1")) + fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r") + word_dict = pickle.load(fn) + fn.close() + dict_size = len(word_dict) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. / math.sqrt(embsize * 8), + learning_rate=1)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + result = trainer.test( + paddle.batch( + cluster_reader_cluster(cluster_test_file, node_id), 32)) + print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, + parameters, + adagrad, + is_local=not cluster_train) + trainer.train( + paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32), + num_passes=30, + event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/usage/cluster/src/word2vec/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..24f5c5b26d37ea03de3ab4dc2d967a4bd009eef0 --- /dev/null +++ b/doc/howto/usage/cluster/src/word2vec/prepare.py @@ -0,0 +1,41 @@ +import paddle.v2 as paddle +import tarfile +import os +import pickle + +SPLIT_COUNT = 3 +N = 5 + + +def file_len(fd): + for i, l in enumerate(fd): + pass + return i + 1 + + +def split_from_reader_by_line(filename, reader, split_count): + fn = open(filename, "w") + for batch_id, batch_data in enumerate(reader()): + batch_data_str = [str(d) for d in batch_data] + fn.write(",".join(batch_data_str)) + fn.write("\n") + fn.close() + + fn = open(filename, "r") + total_line_count = file_len(fn) + fn.close() + per_file_lines = total_line_count / split_count + 1 + cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename) + os.system(cmd) + + +word_dict = paddle.dataset.imikolov.build_dict() +with open("word_dict.pickle", "w") as dict_f: + pickle.dump(word_dict, dict_f) + +split_from_reader_by_line("train.txt", + paddle.dataset.imikolov.train(word_dict, N), + SPLIT_COUNT) +split_from_reader_by_line("test.txt", + paddle.dataset.imikolov.test(word_dict, N), + SPLIT_COUNT) diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md index f7aa525054468670f59309ddf9206af55bb77869..2dea231ca5487978d59a4d0a570431722ed6b3bf 100644 --- a/doc/howto/usage/cmd_parameter/arguments_cn.md +++ b/doc/howto/usage/cmd_parameter/arguments_cn.md @@ -63,7 +63,7 @@ -训练dot_period +训练dot_period √√ diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md index 3121b3f59df650c0a22d0bd305a6f793b202d30e..a9bebf09558b06993119803458977abedbbfbdd0 100644 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ b/doc/howto/usage/k8s/k8s_distributed_cn.md @@ -213,7 +213,7 @@ I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. [INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] -[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__] +[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__] I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process diff --git a/doc/index_cn.rst b/doc/index_cn.rst index 9279bac7f4b2898c18979630a8d6dfcb2dba70e0..ada51c2d73263898b2c748437f8eb0f30b537073 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -8,3 +8,4 @@ PaddlePaddle 文档 howto/index_cn.rst api/index_cn.rst faq/index_cn.rst + mobile/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 168c7667c61da677905585d6c4b5037ce80b3765..23b64b6cadf776d44c4d0aa5a550ffe24be13b18 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -7,4 +7,4 @@ PaddlePaddle Documentation getstarted/index_en.rst howto/index_en.rst api/index_en.rst - about/index_en.rst + mobile/index_en.rst diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..424d7718c64438496cf0895397babd5408e1ca02 --- /dev/null +++ b/doc/mobile/cross_compiling_for_android_cn.md @@ -0,0 +1,168 @@ +# Android平台编译指南 + +用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: +- 基于Docker容器的编译方式 +- 基于Linux交叉编译环境的编译方式 + +## 基于Docker容器的编译方式 +Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。 + +### 构建PaddlePaddle的Android开发镜像 +我们把PaddlePaddle的交叉编译环境打包成一个镜像,称为开发镜像,里面涵盖了交叉编译Android版PaddlePaddle库需要的所有编译工具。 + +```bash +$ git clone https://github.com/PaddlePaddle/Paddle.git +$ cd Paddle +$ docker build -t username/paddle-android:dev . -f Dockerfile.android +``` + +### 编译PaddlePaddle C-API库 +构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。 +Android的Docker开发镜像向用户提供两个可配置的参数: + + ++ + + + + + + + + + + + + + + + + + + + + + + +
ArgumentOptional ValuesDefault
ANDROID_ABIarmeabi-v7a, arm64-v8aarmeabi-v7a
ANDROID_API>= 2121
+ +- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库 + ```bash + $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev + ``` + +- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库 + ```bash + $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev + ``` + +执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。 + +## 基于Linux交叉编译环境的编译方式 +本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。 + +### 准备交叉编译环境 + +从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn),用户可自行前往下载预编译好的版本,也可通过以下命令获取: + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +Android NDK中包含了所有Android API级别、所有架构(arm/arm64/x86/mips)需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别,构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。 + +- 构建`armeabi-v7a`、 `Android API 21`的独立工具链: + +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain +``` + +此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 + +- 构建`arm64-v8a`、 `Android API 21`的独立工具链: +```bash +your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain +``` + +此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。 + +注意:**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。 + +### 配置交叉编译参数 + +CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。 + +交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数: +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 +- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。 +- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。 + +Android平台可选配置参数: + +- `ANDROID_STANDALONE_TOOLCHAIN`,独立工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别;否则,用户需要在cmake时手动设置这些值。无默认值。 +- `ANDROID_TOOLCHAIN`,目标工具链。可设置`gcc/clang`,默认值为`clang`。 + - CMake 3.7以上,将会始终使用`clang`工具链;CMake 3.7以下,可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。 + - Android官方提供的`clang`编译器要求系统支持`GLIBC 2.15`以上。 +- `ANDROID_ABI`,目标架构ABI。目前支持`armeabi-v7a`和`arm64-v8a`,默认值为`armeabi-v7a`。 +- `ANDROID_NATIVE_API_LEVEL`,工具链的Android API级别。若没有显式设置,PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。 +- `ANROID_ARM_MODE`,是否使用ARM模式。 + - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`; + - `ANDROID_ABI=arm64-v8a`时,不需要设置。 +- `ANDROID_ARM_NEON`,是否使用NEON指令。 + - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`; + - `ANDROID_ABI=arm64-v8a`时,不需要设置。 + +其他配置参数: + +- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 + +常用的cmake配置如下: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +``` +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。 + +**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: +- 设置`CMAKE_BUILD_TYPE`为`Release` +- 使用`clang`编译工具链 +- `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算 + +### 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含`include`、`lib`和`third_party`目录,其中`include`中包含C-API的头文件,`lib`中包含若干个不同Android ABI的PaddlePaddle库,`third_party`中包含所依赖的所有第三方库。自此,PaddlePaddle的已经安装完成,用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md new file mode 100644 index 0000000000000000000000000000000000000000..26858581fc1d77a9391520ac0dfd80fbd98f508c --- /dev/null +++ b/doc/mobile/cross_compiling_for_android_en.md @@ -0,0 +1,175 @@ +# Build PaddlePaddle for Android + +There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker. + +## Cross-Compiling Using Docker + +Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows. + +### Build the Docker Image + +The following steps pack all the tools that we need to build PaddlePaddle into a Docker image. + +```bash +$ git clone https://github.com/PaddlePaddle/Paddle.git +$ cd Paddle +$ docker build -t paddle:dev-android . -f Dockerfile.android +``` + +### Build the Inference Library + +We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: + +```bash +$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android +``` + +The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: + + ++ + + + + + + + + + + + + + + + + + + + + + + +
ArgumentOptional ValuesDefault
ANDROID_ABIarmeabi-v7a, arm64-v8aarmeabi-v7a
ANDROID_API>= 2121
+ +The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API. + +The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading. + +The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`. + +## Cross-Compiling on Linux + +The Linux-base approach to cross-compile is to run steps in `Dockerfile.android` manually on a Linux x64 computer. + +### Setup the Environment + +To build for Android's, we need [Android NDK]( +https://developer.android.com/ndk/downloads/index.html): + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which in then used to build PaddlePaddle for Android. (We plan to remove the intermediate stage of building the standalone toolchain in the near future.) + +- To build the standalone toolchain for `armeabi-v7a` and Android API level 21: + + ```bash + your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain + ``` + + The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`. + +- To build the standalone toolchain for `arm64-v8a` and Android API level 21: + + ```bash + your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain + ``` + + The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. + +**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.** + +### Cross-Compiling Arguments + +CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling). + +Some other CMake arguments you need to know: + +- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`. +- `WITH_C_API` must be `ON`, to build the C-based inference library for Android. +- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API. + +Some Android-specific arguments: + +- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory. PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument. +- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`. + - For CMake >= 3.7, it should anyway be `clang`. For older versions, it could be `gcc`. + - Android's official `clang` requires `glibc` >= 2.15. +- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`. +- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`. +- `ANROID_ARM_MODE`: + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. +- `ANDROID_ARM_NEON`: indicates if to use NEON instructions. + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. + +Other useful arguments: + +- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`. +- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`. + +Some frequent configurations for your reference: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +``` +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + + +There are some other arguments you might want to configure. + +- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of library. +- `CMAKE_BUILD_TYPE-Release` optimizes the runtime performance. + +Our own tip for performance optimization to use clang and Eigen or OpenBLAS: +- `CMAKE_BUILD_TYPE=Release` +- `ANDROID_TOOLCHAIN=clang` +- `USE_EIGEN_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`. + +### Build and Install + +After running `cmake`, we can run `make; make install` to build and install. + +Before building, you might want to remove the `third_party` and `build` directories including pre-built libraries for other architectures. + +After building,in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories: + +- `include`: the header file of the inference library, +- `lib`: the inference library built for various Android ABIs, +- `third_party`: dependent third-party libraries built for Android. diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..9da48e7f2119ce901fbb3abab73400df27be16d2 --- /dev/null +++ b/doc/mobile/cross_compiling_for_ios_cn.md @@ -0,0 +1,117 @@ +# iOS平台编译指南 +交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。 + +## 准备交叉编译环境 +Apple官方为iOS开发提供了完整的交叉编译工具和集成开发环境,用户从App Store下载安装Xcode即可。也可自行前往官网下载,[Xcode](https://developer.apple.com/cn/xcode/)。安装完成之后,可在命令行执行`xcodebuild -version`,判断是否安装成功。 + +```bash +$ xcodebuild -version +Xcode 9.0 +Build version 9A235 +``` + +## 配置交叉编译参数 + +PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake),以提供一些默认的编译器和编译参数配置。 + +交叉编译iOS版本的PaddlePaddle库时,有一些必须配置的参数: + +- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`iOS`。在设置`CMAKE_SYSTEM_NAME=iOS`后,PaddlePaddle的CMake系统会自动编译所有的第三方依赖库,并且强制设置一些PaddlePaddle参数的值(`WITH_C_API=ON`、`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。 +- `WITH_C_API`,是否编译C-API预测库,必须设置为ON。在iOS平台上只支持使用C-API来预测。 +- `WITH_SWIG_PY`,必须设置为ON。在iOS平台上不支持通过swig调用来训练或者预测。 + +iOS平台可选配置参数: + +- `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`。 + - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。 + - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。 +- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构: + + + + + + + + + + + + + + + + + + + + + + +
IOS_PLATFORMIOS_ARCH
OSarmv7, armv7s, arm64
SIMULATORi386, x86_64
+ +- `IOS_DEPLOYMENT_TARGET`,最小的iOS部署版本,默认值为`7.0`。 +- `IOS_ENABLE_BITCODE`,是否使能[Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3),可设置`ON/OFF`,默认值为`ON`。 +- `IOS_USE_VECLIB_FOR_BLAS`,是否使用[vecLib](https://developer.apple.com/documentation/accelerate/veclib)框架进行BLAS矩阵计算,可设置`ON/OFF`,默认值为`OFF`。 +- `IOS_DEVELOPMENT_ROOT`,`Developer`目录,可显式指定为`/path/to/platform/Developer`。若未显式指定,PaddlePaddle将会根据`IOS_PLATFORM`自动选择`Xcode`对应`platform`的`Developer`目录。 +- `IOS_SDK_ROOT`,所使用`SDK`的根目录,可显式指定为`/path/to/platform/Developer/SDKs/SDK`。若未显式指定,PaddlePaddle将会自动选择`IOS_DEVELOPMENT_ROOT`目录下最新的`SDK`版本。 + +其他配置参数: + +- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算,在`IOS_USE_VECLIB_FOR_BLAS=OFF`时有效。可设置`ON/OFF`,默认值为`OFF`。 +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。默认值为环境变量`CC/CXX`的值;若环境变量`CC/CXX`未设置,则使用`cc/c++`编译器。 + +常用的cmake配置如下: + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=OS \ + -DIOS_ARCH="armv7;arm64" \ + -DIOS_ENABLE_BITCODE=ON \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +```bash +cmake -DCMAKE_SYSTEM_NAME=iOS \ + -DIOS_PLATFORM=SIMULATOR \ + -DIOS_ARCH="x86_64" \ + -DIOS_USE_VECLIB_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_TESTING=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望得到最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。 + +**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议: + +- 设置`CMAKE_BUILD_TYPE`为`Release` +- 设置`IOS_USE_VECLIB_FOR_BLAS=ON`,调用`vecLib`框架提供的BLAS函数进行矩阵计算。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。 + +``` +$ make +$ make install +``` + +注意:如果你曾在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含以下内容: + +- `include`目录,其中包含所有C-API的头文件 +- `lib`目录,其中包含PaddlePaddle的C-API静态库 +- `third_party`目录,其中包含所依赖的所有第三方库 + +注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。 + +自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。 diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..f8ef9dc8031613831437745995268f3abc392f5b --- /dev/null +++ b/doc/mobile/cross_compiling_for_raspberry_cn.md @@ -0,0 +1,62 @@ +# Raspberry Pi平台编译指南 + +通常有两个方法来构建基于 Rasspberry Pi 的版本: + +1. 通过ssh等方式登录到Raspberry Pi系统上来构建。所需的开发工具和第三方库可以参考 [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)。 + +1. 另一个方法是交叉编译。这篇文档介绍在 Linux/x64 上交叉编译Raspberry Pi平台上适用的PaddlePaddle的方法和步骤。 + +## 安装交叉编译器 + +克隆下面 Github repo + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +即可在 `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` 目录里找到交叉编译器 arm-linux-gnueabihf-gcc 4.8.3。运行该编译工具链需要一台 Linux x64 机器上以及 2.14版本以上的 glibc。 + +## 配置交叉编译参数 + +CMake[支持交叉编译](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。PaddlePaddle for Raspberry Pi的配置信息在[cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake)。 + +交叉编译Raspberry Pi版本PaddlePaddle库时,有一些必须配置的参数: + +- `CMAKE_SYSTEM_NAME`:CMake编译的目标平台,必须配置为`RPi`。在设置`CMAKE_SYSTEM_NAME=RPi`后,PaddlePaddle的CMake系统才认为在是在交叉编译Raspberry Pi系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及目标机版OpenBLAS库。 + +- `RPI_TOOLCHAIN`:编译工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动设置需要使用的交叉编译器;否则,用户需要在cmake时手动设置这些值。无默认值。 + +- `RPI_ARM_NEON`:是否使用NEON指令。目前必须设置成`ON`,默认值为`ON`。 + +- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。 + +一个常用的CMake配置如下: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +其中`WITH_C_API=ON`表示需要构建推理库。 + +用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。 + +## 编译和安装 + +CMake配置完成后,执行以下命令,PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle。 + +```bash +make +make install +``` + +注意:如果你曾经在源码目录下编译过其他平台的PaddlePaddle库,请先使用`rm -rf`命令删除`third_party`目录和`build`目录,以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。 + +执行完安装命令后,`your/path/to/install`目录中会包含`include`和`lib`目录,其中`include`中包含C-API的头文件,`lib`中包含一个Raspberry Pi版本的库。 diff --git a/doc/mobile/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md new file mode 100644 index 0000000000000000000000000000000000000000..3c1a5950ff9553bb725d5a96e3fdf2e5e9f6f95c --- /dev/null +++ b/doc/mobile/cross_compiling_for_raspberry_en.md @@ -0,0 +1,62 @@ +# Build PaddlePaddle for Raspberry Pi + +You may use any of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi: + +1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed in here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). + +1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail in this article. + +## The Cross-Compiling Toolchain + +Step 1. Clone the Github repo by running the following command. + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +Step 2. Use the pre-built cross-compiler found in `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`. To run it on a Linux computer, glibc version >= 2.14 is needed. + +## CMake Arguments + +CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake). + +Some important arguments that need to be set: + +- `CMAKE_SYSTEM_NAME`: The target platform. Must be `RPi`. + +- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain. + +- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument and set default to `ON`. + +- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host. It is used to build building tools running on the host, for example, protoc. + +A commonly-used CMake configuration is as follows: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`. + +You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`. + +## Build and Install + +The following commands build the inference library of PaddlePaddle for Raspberry Pi and third-party dependencies. + +```bash +make +make install +``` + + The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built it for other platforms like Android or iOS, you may want to clear these directories by running the command: `rm -rf build`. + +The infernece library will be in `your/path/to/install/lib`, with related header files in `your/path/to/install/include`. diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..1d99666e58b7043b85b0203ee0dfcd1957710161 --- /dev/null +++ b/doc/mobile/index_cn.rst @@ -0,0 +1,9 @@ +MOBILE +====== + +.. toctree:: + :maxdepth: 1 + + cross_compiling_for_android_cn.md + cross_compiling_for_ios_cn.md + cross_compiling_for_raspberry_cn.md diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..3c08d736717cfe8d5fdf449dc58015086befbe60 --- /dev/null +++ b/doc/mobile/index_en.rst @@ -0,0 +1,8 @@ +MOBILE +====== + +.. toctree:: + :maxdepth: 1 + + cross_compiling_for_android_en.md + cross_compiling_for_raspberry_en.md diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md new file mode 100644 index 0000000000000000000000000000000000000000..1cd9962700bb49866f1ed6987abc28b27888a23f --- /dev/null +++ b/doc/survey/cluster_bootstrapping_tools.md @@ -0,0 +1,71 @@ +# Cluster bootstrapping tool survey +## Abstract +In order to bring up a cluster from bare metal machine to a fully functional kubernetes cluster for Paddlepaddle to run, we need to utilize some tools. Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer) + +## Basic assumptions +Here are some basic assumptions before we move on to details +1. You are an administrator of a bare metal machine cluster, which means: + * you have full control to each of the machines. + * you have full control to the network which machines are connected to. +2. Machines can be booted from network with PEX or iPXE +3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster) + +if your cluster is able to mark above items with checkmarks, then keep reading. + +## Comparing Sextant and Tectonic installer +### Sextant +Sextant is an end2end solution to bring up a bare metal cluster to a fully functional k8s cluster, it integrates DHCP, name service, PEX, cloud-config-service, docker registry services altogether. + +#### Pros +1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster. +2. Offline cluster configuration: Sextant has 2 phases during working with it, config time and deploy time. when admin is configuring, it requires admin's machine has internet connectivity, which will download some images, etc. But in deploy time, it's completely OK to go offline since all dependencies are ready during config time. +3. docker registry integrated. +4. GPU machine took care of. + +### Cons +1. k8s API server is not deployed with high availability in considering by default. +2. No grouping support. +3. No API interface, a one-off service. + + +### Tectonic installer +First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes. + +Tectonic is a suite of software which wraps around k8s and providing more utility regarding dev ops, ie, +Tectonic installer as it's named, it installs Tectonic to a bare metal cluster which means it's not totally an equivalent of Sextant. At the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper. + +Matchbox's Approach is similar to Sexstant. + +### Pros +1. supports grouping machines. +2. supports running provisioning service in rtk. (not a big deal though). +3. supports http/gRPC API interface. +4. supports multi-template. + +### Cons +1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software. +2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet. + +## Conclusion +Sextant is a better solution overall for paddle cloud deploying to a bare metal cluster. It would be great if Sextant can also 1) deploy k8s api server with high availability by default; 2) not designed as a one-off service. + + + +## Appendix: General procedure to bring up a cluster +It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry: +1. setup a bootstrap machine with static IP in the cluster, which has following services: + * DHCP: assigns ip address for rest of the nodes. + * name service: to map node name to a IP + * PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol. + * cluster config service: this is for providing cluster node with OS config via http + * optional docker registry: a built-in docker registry makes the whole cluster independent from connecting internet, and speeds up software distribution. +2. New node powers on, it will + * broadcast the request for an IP address + * DHCP server assigns the IP address, and deliver the PXE booting related info to the node. + * cluster node will request config files with booting info delivered with DHCP via the TFTP service, and in most of the cases, the config file will point to a http service for the booting image. + * Since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or K8s installations. + * then restart the node. + +For further understanding, following 2 links from Matchbox are some good readings: +* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md) +* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md) diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 95cad835b11816f4d2e256c2abd662a545a5bad2..41b35b5b233abd737db07aaeb6c6dd4bf6d42b08 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -13,22 +13,18 @@ # serve to show the default. import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index b477f0120c4fa0544012080b7cfb8572d3c44b04..5822c2481dd61da2084b0de76f6f65aa4e32e033 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -13,15 +13,11 @@ # serve to show the default. import sys import os, subprocess +sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python')) import shlex from recommonmark import parser, transform -try: - import py_paddle - import paddle - import paddle.v2 -except ImportError: - print("Must install paddle python package before generating documentation") - sys.exit(1) +import paddle +import paddle.v2 MarkdownParser = parser.CommonMarkParser @@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -templates_path = ["@PROJ_ROOT@/doc_theme/templates"] +templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"] # -- General configuration ------------------------------------------------ @@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['@PROJ_ROOT@/doc_theme/static'] +html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static'] # Output file base name for HTML help builder. htmlhelp_basename = project + 'doc' diff --git a/doc/tutorials/image_classification/cifar.png b/doc/tutorials/image_classification/cifar.png deleted file mode 100644 index f54a0c58837cb3385b32dc57d02cec92666ef0f1..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/cifar.png and /dev/null differ diff --git a/doc/tutorials/image_classification/image_classification.png b/doc/tutorials/image_classification/image_classification.png deleted file mode 100644 index 14f255805081c1b4fab27eaf336fd389fa93ca19..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/image_classification.png and /dev/null differ diff --git a/doc/tutorials/image_classification/index_cn.md b/doc/tutorials/image_classification/index_cn.md deleted file mode 100644 index 87f465522a0fa21c8c03754b4be8dcb035c4de81..0000000000000000000000000000000000000000 --- a/doc/tutorials/image_classification/index_cn.md +++ /dev/null @@ -1,205 +0,0 @@ -图像分类教程 -========== - -在本教程中,我们将使用CIFAR-10数据集训练一个卷积神经网络,并使用这个神经网络来对图片进行分类。如下图所示,卷积神经网络可以辨识图片中的主体,并给出分类结果。 -
![Image Classification](./image_classification.png)
- -## 数据准备 -首先下载CIFAR-10数据集。下面是CIFAR-10数据集的官方网址: - - - -我们准备了一个脚本,可以用于从官方网站上下载CIFAR-10数据集,转为jpeg文件并存入特定的目录。使用这个脚本前请确认已经安装了pillow及相关依赖模块。可以参照下面的命令进行安装: - -1. 安装pillow - -```bash -sudo apt-get install libjpeg-dev -pip install pillow -``` - -2. 下载数据集 - -```bash -cd demo/image_classification/data/ -sh download_cifar.sh -``` - -CIFAR-10数据集包含60000张32x32的彩色图片。图片分为10类,每个类包含6000张。其中50000张图片作为训练集,10000张作为测试集。 - -下图展示了所有的图片类别,每个类别中随机抽取了10张图片。 -
![Image Classification](./cifar.png)
- -脚本运行完成后,我们应当会得到一个名为cifar-out的文件夹,其下子文件夹的结构如下 - - -``` -train ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -test ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -``` - -cifar-out下包含`train`和`test`两个文件夹,其中分别包含了CIFAR-10中的训练集和测试集。这两个文件夹下各自有10个子文件夹,每个子文件夹下存储相应分类的图片。将图片按照上述结构存储好之后,我们就可以着手对分类模型进行训练了。 - -## 预处理 -数据下载之后,还需要进行预处理,将数据转换为Paddle的格式。我们可以通过如下命令进行预处理工作: - -``` -cd demo/image_classification/ -sh preprocess.sh -``` - -其中`preprocess.sh` 调用 `./demo/image_classification/preprocess.py` 对图片进行预处理 -```sh -export PYTHONPATH=$PYTHONPATH:../../ -data_dir=./data/cifar-out -python preprocess.py -i $data_dir -s 32 -c 1 -``` - -`./demo/image_classification/preprocess.py` 使用如下参数: - -- `-i` 或 `--input` 给出输入数据所在路径; -- `-s` 或 `--size` 给出图片尺寸; -- `-c` 或 `--color` 标示图片是彩色图或灰度图 - -## 模型训练 -在开始训练之前,我们需要先创建一个模型配置文件。下面我们给出了一个配置示例。**注意**,这里的列出的和`vgg_16_cifar.py`文件稍有差别,因为该文件可适用于预测。 - -```python -from paddle.trainer_config_helpers import * -data_dir='data/cifar-out/batches/' -meta_path=data_dir+'batches.meta' -args = {'meta':meta_path, 'mean_img_size': 32, - 'img_size': 32, 'num_classes': 10, - 'use_jpeg': 1, 'color': "color"} -define_py_data_sources2(train_list=data_dir+"train.list", - test_list=data_dir+'test.list', - module='image_provider', - obj='processData', - args=args) -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128)) - -img = data_layer(name='image', size=3*32*32) -lbl = data_layer(name="label", size=10) -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -outputs(classification_cost(input=predict, label=lbl)) -``` - -在第一行中我们载入用于定义网络的函数。 -```python -from paddle.trainer_config_helpers import * -``` - -之后定义的`define_py_data_sources2`使用Python数据提供器,其中 `args`将在`image_provider.py`进行使用,该文件负责产生图片数据并传递给Paddle系统 - - `meta`: 训练集平均值。 - - `mean_img_size`: 平均特征图的高度及宽度。 - - `img_size`:输入图片的高度及宽度。 - - `num_classes`:类别个数。 - - `use_jpeg`:处理过程中数据存储格式。 - - `color`:标示是否为彩色图片。 - - `settings`用于设置训练算法。在下面的例子中,learning rate被设置为0.1除以batch size,而weight decay则为0.0005乘以batch size。 - - ```python -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128) -) -``` - -`small_vgg`定义了网络结构。这里我们使用的是一个小的VGG网络。关于VGG卷积神经网络的描述可以参考:[http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/)。 -```python -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -``` -配置创建完毕后,可以运行脚本train.sh来训练模型。 - -```bash -config=vgg_16_cifar.py -output=./cifar_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=1 \ ---save_dir=$output \ -2>&1 | tee $log - -python -m paddle.utils.plotcurve -i $log > plot.png -``` -- 这里我们使用的是GPU模式进行训练。如果你没有GPU环境,可以设置`use_gpu=0`。 -- `./demo/image_classification/vgg_16_cifar.py`是网络和数据配置文件。各项参数的详细说明可以在命令行参数相关文档中找到。 -- 脚本`plotcurve.py`依赖于python的`matplotlib`模块。因此如果这个脚本运行失败,也许是因为需要安装`matplotlib`。 -在训练完成后,训练及测试误差曲线图会被`plotcurve.py`脚本保存在 `plot.png`中。下面是一个误差曲线图的示例: - -
![Training and testing curves.](./plot.png)
- -## 预测 -在训练完成后,模型及参数会被保存在路径`./cifar_vgg_model/pass-%05d`下。例如第300个pass的模型会被保存在`./cifar_vgg_model/pass-00299`。 - -要对一个图片的进行分类预测,我们可以使用`predict.sh`,该脚本将输出预测分类的标签: - -``` -sh predict.sh -``` - -predict.sh: -``` -model=cifar_vgg_model/pass-00299/ -image=data/cifar-out/test/airplane/seaplane_s_000978.png -use_gpu=1 -python prediction.py $model $image $use_gpu -``` - -## 练习 -在CUB-200数据集上使用VGG模型训练一个鸟类图片分类模型。相关的鸟类数据集可以从如下地址下载,其中包含了200种鸟类的照片(主要来自北美洲)。 - - - - - - -## 细节探究 -### 卷积神经网络 -卷积神经网络是一种使用卷积层的前向神经网络,很适合构建用于理解图片内容的模型。一个典型的神经网络如下图所示: - -![Convolutional Neural Network](./lenet.png) - -一个卷积神经网络包含如下层: - -- 卷积层:通过卷积操作从图片或特征图中提取特征 -- 池化层:使用max-pooling对特征图下采样 -- 全连接层:使输入层到隐藏层的神经元是全部连接的。 - -卷积神经网络在图片分类上有着惊人的性能,这是因为它发掘出了图片的两类重要信息:局部关联性质和空间不变性质。通过交替使用卷积和池化处理, 卷积神经网络能够很好的表示这两类信息。 - -关于如何定义网络中的层,以及如何在层之间进行连接,请参考Layer文档。 diff --git a/doc/tutorials/image_classification/index_en.md b/doc/tutorials/image_classification/index_en.md deleted file mode 100644 index 60c81a6a539944634773f38ec4c9a59709dd4afc..0000000000000000000000000000000000000000 --- a/doc/tutorials/image_classification/index_en.md +++ /dev/null @@ -1,221 +0,0 @@ -Image Classification Tutorial -============================== - -This tutorial will guide you through training a convolutional neural network to classify objects using the CIFAR-10 image classification dataset. -As shown in the following figure, the convolutional neural network can recognize the main object in images, and output the classification result. - -
![Image Classification](./image_classification.png)
- -## Data Preparation -First, download CIFAR-10 dataset. CIFAR-10 dataset can be downloaded from its official website. - - - -We have prepared a script to download and process CIFAR-10 dataset. The script will download CIFAR-10 dataset from the official dataset. -It will convert it to jpeg images and organize them into a directory with the required structure for the tutorial. Make sure that you have installed pillow and its dependents. -Consider the following commands: - -1. install pillow dependents - -```bash -sudo apt-get install libjpeg-dev -pip install pillow -``` - -2. download data and preparation - -```bash -cd demo/image_classification/data/ -sh download_cifar.sh -``` - -The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. - -Here are the classes in the dataset, as well as 10 random images from each: -
![Image Classification](./cifar.png)
- - -After downloading and converting, we should find a directory (cifar-out) containing the dataset in the following format: - -``` -train ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -test ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -``` - -It has two directories:`train` and `test`. These two directories contain training data and testing data of CIFAR-10, respectively. Each of these two folders contains 10 sub-folders, ranging from `airplane` to `truck`. Each sub-folder contains images with the corresponding label. After the images are organized into this structure, we are ready to train an image classification model. - -## Preprocess -After the data has been downloaded, it needs to be pre-processed into the Paddle format. We can run the following command for preprocessing. - -``` -cd demo/image_classification/ -sh preprocess.sh -``` - -`preprocess.sh` calls `./demo/image_classification/preprocess.py` to preprocess image data. -```sh -export PYTHONPATH=$PYTHONPATH:../../ -data_dir=./data/cifar-out -python preprocess.py -i $data_dir -s 32 -c 1 -``` - -`./demo/image_classification/preprocess.py` has the following arguments - -- `-i` or `--input` specifes the input data directory. -- `-s` or `--size` specifies the processed size of images. -- `-c` or `--color` specifes whether images are color images or gray images. - - -## Model Training -We need to create a model config file before training the model. An example of the config file (vgg_16_cifar.py) is listed below. **Note**, it is slightly different from the `vgg_16_cifar.py` which also applies to the prediction. - -```python -from paddle.trainer_config_helpers import * -data_dir='data/cifar-out/batches/' -meta_path=data_dir+'batches.meta' -args = {'meta':meta_path, 'mean_img_size': 32, - 'img_size': 32, 'num_classes': 10, - 'use_jpeg': 1, 'color': "color"} -define_py_data_sources2(train_list=data_dir+"train.list", - test_list=data_dir+'test.list', - module='image_provider', - obj='processData', - args=args) -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128)) - -img = data_layer(name='image', size=3*32*32) -lbl = data_layer(name="label", size=10) -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -outputs(classification_cost(input=predict, label=lbl)) -``` - -The first line imports python functions for defining networks. -```python -from paddle.trainer_config_helpers import * -``` - -Then define an `define_py_data_sources2` which use python data provider -interface. The arguments in `args` are used in `image_provider.py` which -yeilds image data and transform them to Paddle. - - `meta`: the mean value of training set. - - `mean_img_size`: the size of mean feature map. - - `img_size`: the height and width of input image. - - `num_classes`: the number of classes. - - `use_jpeg`: the data storage type when preprocessing. - - `color`: specify color image. - -`settings` specifies the training algorithm. In the following example, -it specifies learning rate as 0.1, but divided by batch size, and the weight decay -is 0.0005 and multiplied by batch size. -```python -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128) -) -``` - -The `small_vgg` specifies the network. We use a small version of VGG convolutional network as our network -for classification. A description of VGG network can be found here [http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/). -```python -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -``` -After writing the config, we can train the model by running the script train.sh. - -```bash -config=vgg_16_cifar.py -output=./cifar_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=1 \ ---save_dir=$output \ -2>&1 | tee $log - -python -m paddle.utils.plotcurve -i $log > plot.png -``` - -- Here we use GPU mode to train. If you have no gpu environment, just set `use_gpu=0`. - -- `./demo/image_classification/vgg_16_cifar.py` is the network and data configuration file. The meaning of the other flags can be found in the documentation of the command line flags. - -- The script `plotcurve.py` requires the python module of `matplotlib`, so if it fails, maybe you need to install `matplotlib`. - - -After training finishes, the training and testing error curves will be saved to `plot.png` using `plotcurve.py` script. An example of the plot is shown below: - -
![Training and testing curves.](./plot.png)
- - -## Prediction -After we train the model, the model file as well as the model parameters are stored in path `./cifar_vgg_model/pass-%05d`. For example, the model of the 300-th pass is stored at `./cifar_vgg_model/pass-00299`. - -To make a prediction for an image, one can run `predict.sh` as follows. The script will output the label of the classfiication. - -``` -sh predict.sh -``` - -predict.sh: -``` -model=cifar_vgg_model/pass-00299/ -image=data/cifar-out/test/airplane/seaplane_s_000978.png -use_gpu=1 -python prediction.py $model $image $use_gpu -``` - -## Exercise -Train a image classification of birds using VGG model and CUB-200 dataset. The birds dataset can be downloaded here. It contains an image dataset with photos of 200 bird species (mostly North American). - - - - - - -## Delve into Details -### Convolutional Neural Network -A Convolutional Neural Network is a feedforward neural network that uses convolution layers. It is very suitable for building neural networks that process and understand images. A standard convolutional neural network is shown below: - -![Convolutional Neural Network](./lenet.png) - -Convolutional Neural Network contains the following layers: - -- Convolutional layer: It uses convolution operation to extract features from an image or a feature map. -- Pooling layer: It uses max-pooling to downsample feature maps. -- Fully Connected layer: It uses fully connected connections to transform features. - -Convolutional Neural Network achieves amazing performance for image classification because it exploits two important characteristics of images: *local correlation* and *spatial invariance*. By iteratively applying convolution and max-pooing operations, convolutional neural network can well represent these two characteristics of images. - - -For more details of how to define layers and their connections, please refer to the documentation of layers. diff --git a/doc/tutorials/image_classification/lenet.png b/doc/tutorials/image_classification/lenet.png deleted file mode 100644 index 1e6f2b32bad797f3fccb929c72a121fc935b0cbb..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/lenet.png and /dev/null differ diff --git a/doc/tutorials/image_classification/plot.png b/doc/tutorials/image_classification/plot.png deleted file mode 100644 index a31f99791c670e18bb8c62b7604ec8cb0284ffb4..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/plot.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/cifar.png b/doc/tutorials/image_classification/src/cifar.png deleted file mode 100644 index f54a0c58837cb3385b32dc57d02cec92666ef0f1..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/cifar.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/image_classification.png b/doc/tutorials/image_classification/src/image_classification.png deleted file mode 100644 index 14f255805081c1b4fab27eaf336fd389fa93ca19..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/image_classification.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/lenet.png b/doc/tutorials/image_classification/src/lenet.png deleted file mode 100644 index 1e6f2b32bad797f3fccb929c72a121fc935b0cbb..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/lenet.png and /dev/null differ diff --git a/doc/tutorials/image_classification/src/plot.png b/doc/tutorials/image_classification/src/plot.png deleted file mode 100644 index a31f99791c670e18bb8c62b7604ec8cb0284ffb4..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/image_classification/src/plot.png and /dev/null differ diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md deleted file mode 100644 index 6a27004d58d24cc466d930322be8cdbb2f434c74..0000000000000000000000000000000000000000 --- a/doc/tutorials/index_cn.md +++ /dev/null @@ -1,13 +0,0 @@ -# 完整教程 - -* [快速入门](quick_start/index_cn.rst) -* [个性化推荐](rec/ml_regression_cn.rst) -* [图像分类](image_classification/index_cn.md) -* [情感分析](sentiment_analysis/index_cn.md) -* [语义角色标注](semantic_role_labeling/index_cn.md) -* [机器翻译](text_generation/index_cn.md) - -## 常用模型 - -* [ResNet模型](imagenet_model/resnet_model_cn.md) -* [词向量模型](embedding_model/index_cn.md) diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md deleted file mode 100644 index 77331a703b6f0fdf92921ebcc476325b7327e976..0000000000000000000000000000000000000000 --- a/doc/tutorials/index_en.md +++ /dev/null @@ -1,14 +0,0 @@ -# TUTORIALS -There are several examples and demos here. - -* [Quick Start](quick_start/index_en.md) -* [MovieLens Regression](rec/ml_regression_en.rst) -* [Image Classification](image_classification/index_en.md) -* [Sentiment Analysis](sentiment_analysis/index_en.md) -* [Semantic Role Labeling](semantic_role_labeling/index_en.md) -* [Text Generation](text_generation/index_en.md) -* [Image Auto-Generation](gan/index_en.md) - -## Model Zoo -* [ImageNet: ResNet](imagenet_model/resnet_model_en.md) -* [Embedding: Chinese Word](embedding_model/index_en.md) diff --git a/doc/tutorials/rec/ml_dataset_cn.md b/doc/tutorials/rec/ml_dataset_cn.md deleted file mode 100644 index 2207a776f0774e72aba15169e59258dd04583637..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_dataset_cn.md +++ /dev/null @@ -1,105 +0,0 @@ -```eval_rst -.. _demo_ml_dataset: - -``` - -# MovieLens数据集 - -[MovieLens 数据集](http://grouplens.org/datasets/movielens/)由GroupLens Research实验室搜集整理。 -该数据集包含一些用户信息、电影信息以及电影评分\[1-5\]。根据数据量规模,该数据及有很多不同的版本。 -我们用[MovieLens 百万数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)作为示例数据 -集,其中包含6,000位用户对4,000部电影的1,000,000条评价。该数据集于2003年2月发布。 - -## 数据集特征 - -在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中有许多的特征。在[ml-1m 数据集] -(http://files.grouplens.org/datasets/movielens/ml-1m.zip)中的这些数据文件(含有".dat"的后缀)实际上是CSV文件, -分隔符为"::"。以下我们翻译数据集网站中README文件的描述: - -### 评分文件描述(ratings.dat) - - -所有的评分数据都包含在"ratings.dat"文件中,遵循如下的格式: - -用户ID::电影ID::评分::时间戳 - -- 用户ID范围从1到6040 -- 电影ID范围从1到3952 -- 评分被调整为5星的规模(只允许整数的星级) -- 时间戳表示为从1970-01-01(UTC)来的秒数,与time(2)的返回值一致 -- 每位用户至少有20条评分 - -### 用户文件描述(users.dat) - -所有的用户信息都包含在"users.dat"文件中,遵循如下的格式: - -用户ID::性别::年龄::职业::邮编 - -所有的人口统计学信息由用户自愿提供,没有进行正确性的检查。只有含有人 -口统计学信息的用户才被包含在数据集中。 - -- 性别,用"M"表示男性,"F"表示女性 -- 年龄从下列列表范围中选取: - - * 1: "18岁以下" - * 18: "18-24岁" - * 25: "25-34岁" - * 35: "35-44岁" - * 45: "45-49岁" - * 50: "50-55岁" - * 56: "56+" - -- 职业从下面所列中选择: - - * 0: "其他"或不确定 - * 1: "学术/教育工作者" - * 2: "艺术家" - * 3: "文书工作/管理员" - * 4: "大学生/研究生" - * 5: "客户服务" - * 6: "医生/医疗保健" - * 7: "行政工作/管理人员" - * 8: "农民" - * 9: "操持家务者" - * 10: "高中毕业生" - * 11: "律师" - * 12: "程序员" - * 13: "退休人员" - * 14: "销售/市场" - * 15: "科学家" - * 16: "自由职业者" - * 17: "技术员/工程师" - * 18: "推销员/手工艺者" - * 19: "无业人士" - * 20: "作家" - -### 电影文件描述(movies.dat) - -所有的电影信息都包含在"movies.dat"文件中,遵循如下的格式: - -电影ID::电影名称::电影类型 - -- 电影名称(包括发行时间)与IMDB网站提供的一致 -- 电影类型如符合多种用管道符号|分割,选自下列类型: - - * 动作片 - * 冒险片 - * 动画片 - * 儿童片 - * 喜剧片 - * 犯罪片 - * 纪录片 - * 戏剧 - * 奇幻片 - * 黑色电影 - * 恐怖片 - * 音乐剧 - * 悬疑片 - * 浪漫片 - * 科幻片 - * 惊险电影 - * 战争片 - * 西部片 - -- 由于意外的副本记录和测试记录,有些电影ID可能与实际电影不相符合 -- 电影大部分是手工输入数据,因此可能会有一些错误和不一致发生 diff --git a/doc/tutorials/rec/ml_dataset_en.md b/doc/tutorials/rec/ml_dataset_en.md deleted file mode 100644 index 25dea5c4afbf1ce1c1ac6195cbd245b116459e2e..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_dataset_en.md +++ /dev/null @@ -1,111 +0,0 @@ -```eval_rst -.. _demo_ml_dataset: -``` - -# MovieLens Dataset - -The [MovieLens Dataset](http://grouplens.org/datasets/movielens/) was collected by GroupLens Research. -The data set contains some user information, movie information, and many movie ratings from \[1-5\]. -The data sets have many version depending on the size of set. -We use [MovieLens 1M Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) as a demo dataset, which contains -1 million ratings from 6000 users on 4000 movies. Released 2/2003. - -## Dataset Features - -In [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip), there are many features in these dataset. -The data files (which have ".dat" extension) in [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) -is basically CSV file that delimiter is "::". The description in README we quote here. - -### RATINGS FILE DESCRIPTION(ratings.dat) - - -All ratings are contained in the file "ratings.dat" and are in the -following format: - -UserID::MovieID::Rating::Timestamp - -- UserIDs range between 1 and 6040 -- MovieIDs range between 1 and 3952 -- Ratings are made on a 5-star scale (whole-star ratings only) -- Timestamp is represented in seconds since the epoch as returned by time(2) -- Each user has at least 20 ratings - -### USERS FILE DESCRIPTION(users.dat) - -User information is in the file "users.dat" and is in the following -format: - -UserID::Gender::Age::Occupation::Zip-code - -All demographic information is provided voluntarily by the users and is -not checked for accuracy. Only users who have provided some demographic -information are included in this data set. - -- Gender is denoted by a "M" for male and "F" for female -- Age is chosen from the following ranges: - - * 1: "Under 18" - * 18: "18-24" - * 25: "25-34" - * 35: "35-44" - * 45: "45-49" - * 50: "50-55" - * 56: "56+" - -- Occupation is chosen from the following choices: - - * 0: "other" or not specified - * 1: "academic/educator" - * 2: "artist" - * 3: "clerical/admin" - * 4: "college/grad student" - * 5: "customer service" - * 6: "doctor/health care" - * 7: "executive/managerial" - * 8: "farmer" - * 9: "homemaker" - * 10: "K-12 student" - * 11: "lawyer" - * 12: "programmer" - * 13: "retired" - * 14: "sales/marketing" - * 15: "scientist" - * 16: "self-employed" - * 17: "technician/engineer" - * 18: "tradesman/craftsman" - * 19: "unemployed" - * 20: "writer" - -### MOVIES FILE DESCRIPTION(movies.dat) - -Movie information is in the file "movies.dat" and is in the following -format: - -MovieID::Title::Genres - -- Titles are identical to titles provided by the IMDB (including -year of release) -- Genres are pipe-separated and are selected from the following genres: - - * Action - * Adventure - * Animation - * Children's - * Comedy - * Crime - * Documentary - * Drama - * Fantasy - * Film-Noir - * Horror - * Musical - * Mystery - * Romance - * Sci-Fi - * Thriller - * War - * Western - -- Some MovieIDs do not correspond to a movie due to accidental duplicate -entries and/or test entries -- Movies are mostly entered by hand, so errors and inconsistencies may exist diff --git a/doc/tutorials/rec/ml_regression_cn.rst b/doc/tutorials/rec/ml_regression_cn.rst deleted file mode 100644 index 9278c9f603b648099f448963bc2246b8dc014ab7..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_regression_cn.rst +++ /dev/null @@ -1,349 +0,0 @@ -MovieLens数据集评分回归模型 -=========================== - -这里我们在MovieLens数据集描述一种 **余弦相似度回归** 任务。 -该示例将展示paddle如何进行词向量嵌入,处理相似度回归,针对文本 -的单词级别的卷积神经网络,以及paddle如何处理多种类型的输入。 -需要注意的是,该模型网络只是用于进行demo展示paddle如何工作,而 -没有进行结构的微调。 - - -**我们非常欢迎您用PADDLEPADDLE构建更好的示例,如果您有好的建议来 -让这个示例变得更好,希望能让我们知晓。** - -数据准备 -````````` -下载并解压数据集 -''''''''''''''''' -这里我们使用 :ref:`demo_ml_dataset` 。 -要下载和解压数据集,只需要简单的运行下面的命令即可。 - -.. code-block:: bash - - cd demo/recommendation/data - ./ml_data.sh - -:code:`demo/recommendation/data/ml-1m` 的目录结构为: - -.. code-block:: text - - +--ml-1m - +--- movies.dat # 电影特征 - +--- ratings.dat # 评分 - +--- users.dat # 用户特征 - +--- README # 数据集描述 - -字段配置文件 -''''''''''''' -**字段配置文件** 用来具体说明数据集的字段和文件格式, -例如,说明每个特征文件具体字段是 **什么** 类型。 - -ml-1m的字段配置文件在目录 :code:`demo/recommendation/data/config.json` 中。 -其具体说明了字段类型和文件名称: - -1) 用户文件中有四种类型的字段\: 编号,性别,年龄和职业; - -2) 文件名称为"users.dat",文件的分隔符为"::"。 - -.. include:: ../../../demo/recommendation/data/config.json - :code: json - :literal: - -准备数据 -````````` -你需要安装python的第三方库。 -**强烈推荐使用VIRTUALENV来创造一个干净的python环境。** - -.. code-block:: bash - - pip install -r requirements.txt - -预处理数据一般的命令为: - -.. code-block:: bash - - cd demo/recommendation - ./preprocess.sh - -下面介绍预处理过程具体的步骤。 - -提取电影或用户的特征并生成python对象 -''''''''''''''''''''''''''''''''''''' - -在movielens 1m数据集中,电影和用户有许多的特征。 -评分文件的每一行仅仅提供电影或用户的编号来代表相应的电影或用户。 -我们首先处理电影或用户的特征文件,然后用pickle命令将特征( **Meta** )对象存储为文件。 - -Meta配置文件 -............. - -**Meta配置文件** 用来具体描述 **如何** 解析数据集中的每一个字段。 -该文件可以从字段配置文件生成,或是手动编辑生成。文件的格式可以 -为json或yaml格式。解析器能通过文件的扩展名自动识别文件的格式。 - -要将字段配置文件转化为meta配置文件,只需要运行: - -.. code-block:: bash - - cd demo/recommendation/data - python config_generator.py config.json > meta_config.json - -生成的meta配置文件如下所示: - -.. include:: ../../../demo/recommendation/data/meta_config.json - :code: json - :literal: - -在meta文件中有两种特征\: 电影和用户。 - -* 在电影文件movies.dat中 - * 我们仅用"::"来分隔每一行 - * pos 0 代表编号 - * pos 1 特征: - * name是电影名 - * 利用正则表达式来解析该特征 - * 基于字母的词嵌入特征 - * 是序列 - * pos 2 特征: - * name是体裁 - * type是one hot稠密向量 - * dictionary由解析自动生成,每一个key由'|'分隔 -* 在用户文件users.dat中 - * 我们仅用"::"来分隔每一行 - * pos 0 代表编号 - * pos 1 特征: - * name是性别 - * 简单的基于字母的词嵌入 - * pos 2 特征: - * name是年龄 - * 是整个的词嵌入 - * 嵌入编号会根据单词排序 - * pos 3 特征: - * name是职业 - * 简单的整个词嵌入 - - -Meta文件 -'''''''' - -有了meta配置文件之后,我们可以生成 **Meta文件** ,该文件是python的pickle对象, -存储着电影或用户信息。可以运行下面的命令来生成。 - -.. code-block:: bash - - python meta_generator.py ml-1m meta.bin --config=meta_config.json - -meta文件 :code:`meta.bin` 的结构如下: - -.. code-block:: text - - +--+ movie - | +--+ __meta__ - | | +--+ raw_meta # 每个特征的meta配置。列表 - | | | + - | | | | # 编号字段,我们用编号作为key - | | | +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1} - | | | | - | | | | # 电影名字段,嵌入特征字典 - | | | +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'} - | | | | - | | | | # 体裁字段,体裁字典 - | | | +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'} - | | | - | | +--+ feature_map [1, 2] # a list for raw_meta index for feature field. - | | # it means there are 2 features for each key. - | | # * 0 offset of feature is raw_meta[1], Title. - | | # * 1 offset of feature is raw_meta[2], Genres. - | | - | +--+ 1 # 电影1的特征 - | | + - | | +---+ [[...], [...]] # title ids, genres dense vector - | | - | +--+ 2 - | | - | +--+ ... - | - +--- user - +--+ __meta__ - | + - | +--+ raw_meta - | | + - | | +--+ id field as user - | | | - | | +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'} - | | - | +--+ feature_map [1, 2, 3] - | - +--+ 1 # 用户1的特征 - | - +--+ 2 - +--+ ... - - -分割训练/测试文件 -'''''''''''''''''' - -我们将 :code:`ml-1m/ratings.dat` 文件分割为训练和测试文件。分割文件的方法是:对于每位用户,我们将评分分成两部分。 -这样的话每位用户在测试文件中将与训练文件含有同样的信息。 - -用 :code:`separate.py` 来分离训练和测试文件。 - -.. code-block:: bash - - python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1 - -这样就会生成两个文件::code:`ml-1m/ratings.dat.train` 和 :code:`ml-1m/ratings.data.test` 。 -将他们移动到目录 :code:`data` ,然后进行随机打乱,再为paddle的训练过程提供文件列表。 - -.. code-block:: bash - - shuf ml-1m/ratings.dat.train > ratings.dat.train - cp ml-1m/ratings.dat.test . - echo "./data/ratings.dat.train" > train.list - echo "./data/ratings.dat.test" > test.list - - -神经网络结构配置 -````````````````` - -训练器配置文件 -''''''''''''''' - -网络结构如下图所示: - -.. image:: rec_regression_network.png - :align: center - :alt: rec_regression_network - -该示例的神经网络配置文件 :code:`trainer_config.py` 如下所示: - -.. literalinclude:: ../../../demo/recommendation/trainer_config.py - :language: python - :lines: 15- - -在文件 :code:`trainer_config.py` 中,我们仅仅是将每个特征种类映射到一个特征向量中,以下 -展示了如何将每个特征映射到一个向量。 - -* :code:`id` \: 仅仅是简单的嵌入,然后添加一个全连接层。 -* :code:`embedding` \: - - 如果是序列,则先做嵌入,然后再做一次文本卷积网络操作, - 然后得到平均采样的结果。 - - 如果不是序列,则先做嵌入,然后添加一个全连接层。 -* :code:`one_host_dense` \: - - 仅仅是两个全连接层。 - -然后我们利用多输入的:code:`fc_layer` 全连接层将电影的每个特征结合成一个电影特征, -并且对用户的特征做同样的操作,也得到一个用户特征。然后我们求这两个特征的余弦相似度。 - -在这些网络中,我们用以下的一些:ref:`api_trainer_config` 中的接口。 - -* 数据层, :ref:`api_trainer_config_helpers_layers_data_layer` -* 全连接层, :ref:`api_trainer_config_helpers_layers_fc_layer` -* 嵌入层, :ref:`api_trainer_config_helpers_layers_embedding_layer` -* 文本投影层, :ref:`api_trainer_config_helpers_layers_context_projection` -* 采样层, :ref:`api_trainer_config_helpers_layers_pooling_layer` -* 余弦相似度层, :ref:`api_trainer_config_helpers_layers_cos_sim` -* 文本卷积采样层, :ref:`api_trainer_config_helpers_network_text_conv_pool` -* 声明Python数据源, :ref:`api_trainer_config_helpers_data_sources` - -数据提供脚本 -''''''''''''' - -.. literalinclude:: ../../../demo/recommendation/dataprovider.py - :language: python - :lines: 15- - -数据提供脚本仅仅是读取meta.bin和评分文件,生成训练需要的样本。 -在脚本 :code:`dataprovider.py` 中,我们需要设置: - -* obj.slots\: 特征的类型和维度。 -* use_seq\: :code:`dataprovider.py` 中的数据是否为序列模式。 -* process\: 返回数据的每一条样本给 :code:`paddle` 。 - -数据提供脚本的细节文档可以参考 :ref:`api_pydataprovider2` 。 - -训练 -```` - -准备好数据,配置了网络,编写好数据提供脚本后,现在我们可以开始paddle训练了。 - -代码 :code:`run.sh` 如下: - -.. literalinclude:: ../../../demo/recommendation/run.sh - :language: bash - :lines: 16- - -该脚本仅仅是开始一个paddle训练过程,将日志写入文件 :code:`log.txt` ,然后 -打印在屏幕上。 - -脚本 :code:`run.sh` 中的每一行命令,请参考页面 :ref:`cmd_line_index` 。 -这些参数的简短介绍如下: - -* config\: 告诉paddle哪个文件是神经网络的配置文件。 -* save_dir\: 告诉paddle将模型保存在: code:`./output` 中。 -* use_gpu\: 是否使用GPU,默认为不使用。 -* trainer_count\: 一台机器上面的线程数量。 -* test_all_data_in_one_period\: 每一个测试周期测试一次所有数据。否则, - 每个测试周期测试: code:`batch_size` 批次的数据。 -* log_period\: 在训练了: code:`log_period` 批次后打印日志。 -* dot_period\: 在每训练: code:`dot_period` 个批次后打印一个 :code:`.` 。 -* num_passes\: 训练至多: code:`num_passes` 轮。 - -如果训练过程启动成功的话,输出应该类似如下: - -.. code-block:: text - - I0601 08:07:22.832059 10549 TrainerInternal.cpp:157] Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval: CurrentEval: - - I0601 08:07:50.672627 10549 TrainerInternal.cpp:157] Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval: CurrentEval: - - I0601 08:08:18.877369 10549 TrainerInternal.cpp:157] Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval: CurrentEval: - - I0601 08:08:46.863963 10549 TrainerInternal.cpp:157] Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval: CurrentEval: - - I0601 08:09:15.413025 10549 TrainerInternal.cpp:157] Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval: CurrentEval: - I0601 08:09:36.058670 10549 TrainerInternal.cpp:181] Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval: - I0601 08:09:46.215489 10549 Tester.cpp:101] Test samples=97383 cost=3.32155 Eval: - I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000 - I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000 - I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000 - I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py - -模型被保存在 :code:`output/` 目录中。你可以在任何时候用 :code:`Ctrl-C` 来停止训练。 - -模型评估和预测 -``````````````` - -在训练了几个轮次以后,你可以对模型进行评估,得到最好轮次下的模型。运行下面命令即可: - -.. code-block:: bash - - ./evaluate.sh - -你将看到如下的信息: - -.. code-block:: text - - Best pass is 00009, error is 3.06949, which means predict get error as 0.875998002281 - evaluating from pass output/pass-00009 - -然后,你可以预测任何用户对于任何一部电影的评价,运行下面命令即可: - -.. code-block:: bash - - python prediction.py 'output/pass-00009/' - -预测程序将读取用户的输入,然后输出预测分数。用户预测的命令行界面如下: - -.. code-block:: text - - Input movie_id: 9 - Input user_id: 4 - Prediction Score is 2.56 - Input movie_id: 8 - Input user_id: 2 - Prediction Score is 3.13 diff --git a/doc/tutorials/rec/ml_regression_en.rst b/doc/tutorials/rec/ml_regression_en.rst deleted file mode 100644 index 993b9a516f134ff8b59e8755b721f76c8f32f0fd..0000000000000000000000000000000000000000 --- a/doc/tutorials/rec/ml_regression_en.rst +++ /dev/null @@ -1,348 +0,0 @@ -Regression MovieLens Ratting -============================ - -Here we demonstrate a **Cosine Similarity Regression** job in movie lens dataset. -This demo will show how paddle does (word) embedding job, -handles the similarity regression, -the character-level convolutional networks for text, and how does paddle handle -multiple types of inputs. -Note that the model structure is not fine-tuned and just a demo to show how paddle works. - - -YOU ARE WELCOME TO BUILD A BETTER DEMO -BY USING PADDLEPADDLE, AND LET US KNOW TO MAKE THIS DEMO BETTER. - -Data Preparation -```````````````` -Download and extract dataset -'''''''''''''''''''''''''''' -We use :ref:`demo_ml_dataset` here. -To download and unzip the dataset, simply run the following commands. - -.. code-block:: bash - - cd demo/recommendation/data - ./ml_data.sh - -And the directory structure of :code:`demo/recommendation/data/ml-1m` is: - -.. code-block:: text - - +--ml-1m - +--- movies.dat # movie features - +--- ratings.dat # ratings - +--- users.dat # user features - +--- README # dataset description - -Field config file -''''''''''''''''' -**Field config file** is used to specify the fields of the dataset and the file format, -i.e, specific **WHAT** type it is in each feature file. - -The field config file of ml-1m shows in :code:`demo/recommendation/data/config.json`. -It specifics the field types and file names: 1) there are four types of field for user file\: id, gender, age and occupation; -2) the filename is "users.dat", and the delimiter of file is "::". - -.. include:: ../../../demo/recommendation/data/config.json - :code: json - :literal: - -Preprocess Data -``````````````` -You need to install python 3rd party libraries. -IT IS HIGHLY RECOMMEND TO USE VIRTUALENV MAKE A CLEAN PYTHON ENVIRONMENT. - -.. code-block:: bash - - pip install -r requirements.txt - -The general command for preprocessing the dataset is: - -.. code-block:: bash - - cd demo/recommendation - ./preprocess.sh - -And the detail steps are introduced as follows. - -Extract Movie/User features to python object -''''''''''''''''''''''''''''''''''''''''''''' - -There are many features in movie or user in movielens 1m dataset. -Each line of rating file just provides a Movie/User id to refer each movie or user. -We process the movie/user feature file first, and pickle the feature (**Meta**) object as a file. - -Meta config file -................ - -**Meta config file** is used to specific **HOW** to parse each field in dataset. -It could be translated from field config file, or written by hand. -Its file format could be either json or yaml syntax file. Parser will automatically choose the file format by extension name. - -To convert Field config file to meta config file, just run: - -.. code-block:: bash - - cd demo/recommendation/data - python config_generator.py config.json > meta_config.json - -The meta config file shows below: - -.. include:: ../../../demo/recommendation/data/meta_config.json - :code: json - :literal: - -There are two kinds of features in meta\: movie and user. - -* in movie file, whose name is movies.dat - * we just split each line by "::" - * pos 0 is id. - * pos 1 feature: - * name is title. - * it uses regex to parse this feature. - * it is a char based word embedding feature. - * it is a sequence. - * pos 2 feature: - * name is genres. - * type is one hot dense vector. - * dictionary is auto generated by parsing, each key is split by '|' -* in user file, whose name is users.dat - * we just split each line by "::" - * pos 0 is id. - * pos 1 feature: - * name is gender - * just simple char based embedding. - * pos 2 feature: - * name is age - * just whole word embedding. - * embedding id will be sort by word. - * pos 3 feature: - * name is occupation. - * just simple whole word embedding. - - -Meta file -''''''''' - -After having meta config file, we can generate **Meta file**, a python pickle object which stores movie/user information. -The following commands could be run to generate it. - -.. code-block:: bash - - python meta_generator.py ml-1m meta.bin --config=meta_config.json - -And the structure of the meta file :code:`meta.bin` is: - -.. code-block:: text - - +--+ movie - | +--+ __meta__ - | | +--+ raw_meta # each feature meta config. list - | | | + - | | | | # ID Field, we use id as key - | | | +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1} - | | | | - | | | | # Titile field, the dictionary list of embedding. - | | | +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'} - | | | | - | | | | # Genres field, the genres dictionary - | | | +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'} - | | | - | | +--+ feature_map [1, 2] # a list for raw_meta index for feature field. - | | # it means there are 2 features for each key. - | | # * 0 offset of feature is raw_meta[1], Title. - | | # * 1 offset of feature is raw_meta[2], Genres. - | | - | +--+ 1 # movie 1 features - | | + - | | +---+ [[...], [...]] # title ids, genres dense vector - | | - | +--+ 2 - | | - | +--+ ... - | - +--- user - +--+ __meta__ - | + - | +--+ raw_meta - | | + - | | +--+ id field as user - | | | - | | +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'} - | | - | +--+ feature_map [1, 2, 3] - | - +--+ 1 # user 1 features - | - +--+ 2 - +--+ ... - - -Split Training/Testing files -'''''''''''''''''''''''''''' - -We split :code:`ml-1m/ratings.dat` into a training and testing file. The way to split file is for each user, we split the -rating by two parts. So each user in testing file will have some rating information in training file. - -Use :code:`separate.py` to separate the training and testing file. - -.. code-block:: bash - - python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1 - -Then two files will be generated\: :code:`ml-1m/ratings.dat.train` and :code:`ml-1m/rating.data.test`. -Move them to workspace :code:`data`, shuffle the train file, and prepare the file list for paddle train. - -.. code-block:: bash - - shuf ml-1m/ratings.dat.train > ratings.dat.train - cp ml-1m/ratings.dat.test . - echo "./data/ratings.dat.train" > train.list - echo "./data/ratings.dat.test" > test.list - - -Neural Network Configuration -```````````````````````````` - -Trainer Config File -''''''''''''''''''' - -The network structure shows below. - -.. image:: rec_regression_network.png - :align: center - :alt: rec_regression_network - -The demo's neural network config file :code:`trainer_config.py` show as below. - -.. literalinclude:: ../../../demo/recommendation/trainer_config.py - :language: python - :lines: 15- - -In this :code:`trainer_config.py`, we just map each feature type to -a feature vector, following shows how to map each feature to a vector shows below. - -* :code:`id`\: Just simple embedding, and then add to fully connected layer. -* :code:`embedding`\: - - if is_sequence, get the embedding and do a text convolutional operation, - get the average pooling result. - - if not sequence, get the embedding and add to fully connected layer. -* :code:`one_host_dense`\: - - just two fully connected layer. - -Then we combine each features of movie into one movie feature by a -:code:`fc_layer` with multiple inputs, and do the same thing to user features, -get one user feature. Then we calculate the cosine similarity of these two -features. - -In these networks, we use several APIs in :ref:`api_trainer_config` . There are - -* Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer` -* Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer` -* Embedding Layer, :ref:`api_trainer_config_helpers_layers_embedding_layer` -* Context Projection Layer, :ref:`api_trainer_config_helpers_layers_context_projection` -* Pooling Layer, :ref:`api_trainer_config_helpers_layers_pooling_layer` -* Cosine Similarity Layer, :ref:`api_trainer_config_helpers_layers_cos_sim` -* Text Convolution Pooling Layer, :ref:`api_trainer_config_helpers_network_text_conv_pool` -* Declare Python Data Sources :ref:`api_trainer_config_helpers_data_sources`. - -Data Provider -''''''''''''' - -.. literalinclude:: ../../../demo/recommendation/dataprovider.py - :language: python - :lines: 15- - -The data provider just read the meta.bin and rating file, yield each sample for training. -In this :code:`dataprovider.py`, we should set\: - -* obj.slots\: The feature types and dimension. -* use_seq\: Whether this :code:`dataprovider.py` in sequence mode or not. -* process\: Return each sample of data to :code:`paddle`. - -The data provider details document see :ref:`api_pydataprovider2`. - -Train -````` - -After prepare data, config network, writting data provider, now we can run paddle training. - -The :code:`run.sh` is shown as follow: - -.. literalinclude:: ../../../demo/recommendation/run.sh - :language: bash - :lines: 16- - -It just start a paddle training process, write the log to :code:`log.txt`, -then print it on screen. - -Each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. The short description of these arguments is shown as follow. - -* config\: Tell paddle which file is neural network configuration. -* save_dir\: Tell paddle save model into :code:`./output`. -* use_gpu\: Use gpu or not. Default is false. -* trainer_count\: The compute thread in one machine. -* test_all_data_in_one_period\: Test All Data during one test period. Otherwise, - will test a :code:`batch_size` data in one test period. -* log_period\: Print log after train :code:`log_period` batches. -* dot_period\: Print a :code:`.` after train :code:`dot_period` batches. -* num_passes\: Train at most :code:`num_passes`. - -If training process starts successfully, the output likes follow: - -.. code-block:: text - - I0601 08:07:22.832059 10549 TrainerInternal.cpp:157] Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval: CurrentEval: - - I0601 08:07:50.672627 10549 TrainerInternal.cpp:157] Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval: CurrentEval: - - I0601 08:08:18.877369 10549 TrainerInternal.cpp:157] Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval: CurrentEval: - - I0601 08:08:46.863963 10549 TrainerInternal.cpp:157] Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval: CurrentEval: - - I0601 08:09:15.413025 10549 TrainerInternal.cpp:157] Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval: CurrentEval: - I0601 08:09:36.058670 10549 TrainerInternal.cpp:181] Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval: - I0601 08:09:46.215489 10549 Tester.cpp:101] Test samples=97383 cost=3.32155 Eval: - I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000 - I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000 - I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000 - I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py - -The model is saved in :code:`output/` directory. You can use :code:`Ctrl-C` to stop training whenever you want. - -Evaluate and Predict -```````````````````` - -After training several passes, you can evaluate them and get the best pass. Just run - -.. code-block:: bash - - ./evaluate.sh - -You will see messages like this: - -.. code-block:: text - - Best pass is 00009, error is 3.06949, which means predict get error as 0.875998002281 - evaluating from pass output/pass-00009 - -Then, you can predict what any user will rate a movie. Just run - -.. code-block:: bash - - python prediction.py 'output/pass-00009/' - -Predictor will read user input, and predict scores. It has a command-line user interface as follows: - -.. code-block:: text - - Input movie_id: 9 - Input user_id: 4 - Prediction Score is 2.56 - Input movie_id: 8 - Input user_id: 2 - Prediction Score is 3.13 diff --git a/doc/tutorials/rec/rec_regression_network.png b/doc/tutorials/rec/rec_regression_network.png deleted file mode 100644 index 7d2b54d4fcf560cd5b667628f0012c3822efd9b2..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/rec/rec_regression_network.png and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/feature.jpg b/doc/tutorials/semantic_role_labeling/feature.jpg deleted file mode 100644 index 0e3310e4ace5613917e7779d3198ccbb3cdc5ada..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/feature.jpg and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/index_cn.md b/doc/tutorials/semantic_role_labeling/index_cn.md deleted file mode 100644 index f6061766c038a7bb6e4ae376685a10cd5669d2ed..0000000000000000000000000000000000000000 --- a/doc/tutorials/semantic_role_labeling/index_cn.md +++ /dev/null @@ -1,201 +0,0 @@ -# 语义角色标注教程 # - -语义角色标注(Semantic role labeling, SRL)是浅层语义解析的一种形式,其目的是在给定的输入句子中发现每个谓词的谓词论元结构。 SRL作为很多自然语言处理任务中的中间步骤是很有用的,如信息提取、文档自动分类和问答。 实例如下 [1]: - - [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ]. - -- V: 动词 -- A0: 接受者 -- A1: 接受的东西 -- A2: 从……接受 -- A3: 属性 -- AM-MOD: 情态动词 -- AM-NEG: 否定 - -给定动词“accept”,句子中的组块将会扮演某些语义角色。这里,标签方案来自 Penn Proposition Bank。 - -到目前为止,大多数成功的SRL系统是建立在某种形式的句法分析结果之上的,使用了基于句法结构的预定义特征模板。 本教程将介绍使用深度双向长短期记忆(DB-LSTM)模型[2]的端到端系统来解决SRL任务,这在很大程度上优于先前的最先进的系统。 这个系统将SRL任务视为序列标注问题。 - -## 数据描述 -相关论文[2]采用 CoNLL-2005&2012 共享任务中设置的数据进行训练和测试。由于数据许可的原因,演示采用 CoNLL-2005 的测试数据集,可以在网站上找到。 - -用户只需执行以下命令就可以下载并处理原始数据: - -```bash -cd data -./get_data.sh -``` -`data `目录会出现如下几个新的文件: -```bash -conll05st-release:the test data set of CoNll-2005 shared task -test.wsj.words:the Wall Street Journal data sentences -test.wsj.props: the propositional arguments -feature: the extracted features from data set -``` - -## 训练 -### DB-LSTM -请参阅情感分析的演示以了解有关长期短期记忆单元的更多信息。 - -与在 Sentiment Analysis 演示中使用的 Bidirectional-LSTM 不同,DB-LSTM 采用另一种方法来堆叠LSTM层。首先,标准LSTM以正向处理该序列。该 LSTM 层的输入和输出作为下一个 LSTM 层的输入,并被反向处理。这两个标准 LSTM 层组成一对 LSTM。然后我们堆叠一对对的 LSTM 层后得到深度 LSTM 模型。 - -下图展示了时间扩展的2层 DB-LSTM 网络。 -
-![pic](./network_arch.png) -
- -### 特征 -两个输入特征在这个流程中起着至关重要的作用:predicate(pred)和argument(arguments)。 还采用了两个其他特征:谓词上下文(ctx-p)和区域标记(mr)。 因为单个谓词不能精确地描述谓词信息,特别是当相同的词在句子中出现多于一次时。 使用谓词上下文,可以在很大程度上消除歧义。类似地,如果它位于谓词上下文区域中,则使用区域标记 mr = 1 来表示参数位置,反之则 mr = 0。这四个简单的特征是我们的SRL系统所需要的。上下文大小设置为1的一个样本的特征如下[2]所示: -
-![pic](./feature.jpg) -
- -在这个示例中,相应的标记句子是: - -[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] . - -在演示中, 我们采用上面的特征模板, 包括: `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` 并使用 `B/I/O` 方案来标记每个参数。这些特征和标签存储在 `feature` 文件中, 用`\t`分割。 - -### 数据提供 - -`dataprovider.py` 是一个包装数据的 Python 文件。 函数 `hook()` 定义了网络的数据槽。六个特征和标签都是索引槽。 -``` -def hook(settings, word_dict, label_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - #all inputs are integral and sequential type - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(predicate_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(2), - integer_value_sequence(len(label_dict))] -``` -相应的数据迭代器如下: -``` -@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, - can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot -``` -函数 `process` 返回8个特征list和1个标签list。 - -### 神经网络配置 - -`db_lstm.py` 是在训练过程中加载字典并定义数据提供程序模块和网络架构的神经网络配置文件。 - -九个 `data_layer` 从数据提供程序加载实例。八个特征分别转换为向量,并由`mixed_layer`混合。 深度双向LSTM层提取softmax层的特征。目标函数是标签的交叉熵。 - -### 训练 -训练的脚本是 `train.sh`,用户只需执行: -```bash - ./train.sh -``` -`train.sh` 中的内容: -``` -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=5000 \ - --save_dir=./output \ - --num_passes=10000 \ - --average_test_period=10000000 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' -``` - -- \--config=./db_lstm.py : 网络配置文件 -- \--use_gpu=false: 使用 CPU 训练(如果已安装 PaddlePaddle GPU版本并想使用 GPU 训练可以设置为true,目前 crf_layer 不支持 GPU) -- \--log_period=500: 每20个batch输出日志 -- \--trainer_count=1: 设置线程数(或 GPU 数) -- \--show_parameter_stats_period=5000: 每100个batch显示参数统计 -- \--save_dir=./output: 模型输出路径 -- \--num_passes=10000: 设置数据遍历次数,一个pass意味着PaddlePaddle训练数据集中的所有样本被遍历一次 -- \--average_test_period=10000000: 每个 average_test_period 批次对平均参数进行测试 -- \--init_model_path=./data: 参数初始化路径 -- \--load_missing_parameter_strategy=rand: 随机初始不存在的参数 -- \--test_all_data_in_one_period=1: 在一个周期内测试所有数据 - - -训练后,模型将保存在目录`output`中。 我们的训练曲线如下: -
-![pic](./src/curve.jpg) -
- -### 测试 -测试脚本是 `test.sh`, 执行: -```bash - ./test.sh -``` -`tesh.sh` 的主要部分: -``` -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --config_args=is_test=1 \ -``` - - - \--config=./db_lstm.py: 网络配置文件 - - \--model_list=$model_list.list: 模型列表文件 - - \--job=test: 指示测试任务 - - \--config_args=is_test=1: 指示测试任务的标记 - - \--test_all_data_in_one_period=1: 在一个周期内测试所有数据 - - -### 预测 -预测脚本是 `predict.sh`,用户只需执行: -```bash - ./predict.sh - -``` -在`predict.sh`中,用户应该提供网络配置文件,模型路径,标签文件,字典文件,特征文件。 -``` -python predict.py - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file -``` - -`predict.py` 是主要的可执行python脚本,其中包括函数:加载模型,加载数据,数据预测。网络模型将输出标签的概率分布。 在演示中,我们使用最大概率的标签作为结果。用户还可以根据概率分布矩阵实现柱搜索或维特比解码。 - -预测后,结果保存在 `predict.res` 中。 - -## 引用 -[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. - -[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. diff --git a/doc/tutorials/semantic_role_labeling/index_en.md b/doc/tutorials/semantic_role_labeling/index_en.md deleted file mode 100644 index 92d7c634832119c718711a57c16f69492d405f28..0000000000000000000000000000000000000000 --- a/doc/tutorials/semantic_role_labeling/index_en.md +++ /dev/null @@ -1,204 +0,0 @@ -```eval_rst -.. _semantic_role_labeling: -``` - -# Semantic Role labeling Tutorial # - -Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering. An instance is as following [1]: - - [ A0 He ] [ AM-MOD would ][ AM-NEG n’t ] [ V accept] [ A1 anything of value ] from [A2 those he was writing about ]. - -- V: verb -- A0: acceptor -- A1: thing accepted -- A2: accepted-from -- A3: Attribute -- AM-MOD: modal -- AM-NEG: negation - -Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. - -To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. - -## Data Description -The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license, the demo adopts the test data set of CoNLL-2005, which can be reached on website. - -To download and process the original data, user just need to execute the following command: - -```bash -cd data -./get_data.sh -``` -Several new files appear in the `data `directory as follows. -```bash -conll05st-release:the test data set of CoNll-2005 shared task -test.wsj.words:the Wall Street Journal data sentences -test.wsj.props: the propositional arguments -feature: the extracted features from data set -``` - -## Training -### DB-LSTM -Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. - -Unlike Bidirectional-LSTM that used in Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. - -The following figure shows a temporal expanded 2-layer DB-LSTM network. -
-![pic](./src/network_arch.png) -
- -### Features -Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote the argument position if it locates in the predicate context region, or mr = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]: -
-![pic](./src/feature.jpg) -
- -In this sample, the coresponding labelled sentence is: - -[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] . - -In the demo, we adopt the feature template as above, consists of : `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`. - -### Data Provider - -`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The Six features and label are all IndexSlots. -``` -def hook(settings, word_dict, label_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - #all inputs are integral and sequential type - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(predicate_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(2), - integer_value_sequence(len(label_dict))] -``` -The corresponding data iterator is as following: -``` -@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, - can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot -``` -The `process`function yield 9 lists which are 8 features and label. - -### Neural Network Config -`db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure. - -Nine `data_layer` load instances from data provider. Eight features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels. - -### Run Training -The script for training is `train.sh`, user just need to execute: -```bash - ./train.sh -``` -The content in `train.sh`: -``` -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=5000 \ - --save_dir=./output \ - --num_passes=10000 \ - --average_test_period=10000000 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' -``` - -- \--config=./db_lstm.py : network config file. -- \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train, until now crf_layer do not support GPU -- \--log_period=500: print log every 20 batches. -- \--trainer_count=1: set thread number (or GPU count). -- \--show_parameter_stats_period=5000: show parameter statistic every 100 batches. -- \--save_dir=./output: output path to save models. -- \--num_passes=10000: set pass number, one pass in PaddlePaddle means training all samples in dataset one time. -- \--average_test_period=10000000: do test on average parameter every average_test_period batches -- \--init_model_path=./data: parameter initialization path -- \--load_missing_parameter_strategy=rand: random initialization unexisted parameters -- \--test_all_data_in_one_period=1: test all data in one period - - -After training, the models will be saved in directory `output`. Our training curve is as following: -
-![pic](./src/curve.jpg) -
- -### Run testing -The script for testing is `test.sh`, user just need to execute: -```bash - ./test.sh -``` -The main part in `tesh.sh` -``` -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --config_args=is_test=1 \ -``` - - - \--config=./db_lstm.py: network config file - - \--model_list=$model_list.list: model list file - - \--job=test: indicate the test job - - \--config_args=is_test=1: flag to indicate test - - \--test_all_data_in_one_period=1: test all data in 1 period - - -### Run prediction -The script for prediction is `predict.sh`, user just need to execute: -```bash - ./predict.sh - -``` -In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file -``` -python predict.py - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file -``` - -`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix. - -After prediction, the result is saved in `predict.res`. - -## Reference -[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. - -[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. diff --git a/doc/tutorials/semantic_role_labeling/network_arch.png b/doc/tutorials/semantic_role_labeling/network_arch.png deleted file mode 100644 index 4ae7864212f2a0a38102ee7ff600527ea99fec82..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/network_arch.png and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/src/curve.jpg b/doc/tutorials/semantic_role_labeling/src/curve.jpg deleted file mode 100644 index baa35ae7f0a0b6c246f3a0d331735477ab8bcd70..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/src/curve.jpg and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/src/feature.jpg b/doc/tutorials/semantic_role_labeling/src/feature.jpg deleted file mode 100644 index 0e3310e4ace5613917e7779d3198ccbb3cdc5ada..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/src/feature.jpg and /dev/null differ diff --git a/doc/tutorials/semantic_role_labeling/src/network_arch.png b/doc/tutorials/semantic_role_labeling/src/network_arch.png deleted file mode 100644 index 4ae7864212f2a0a38102ee7ff600527ea99fec82..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/semantic_role_labeling/src/network_arch.png and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/index_cn.md b/doc/tutorials/sentiment_analysis/index_cn.md deleted file mode 100644 index 1323ec1a6abb2e7b5eeb2fbfff9cce5fe78a2c06..0000000000000000000000000000000000000000 --- a/doc/tutorials/sentiment_analysis/index_cn.md +++ /dev/null @@ -1,325 +0,0 @@ -# 情感分析教程 - -情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性,给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如:把用户在购物网站、旅游网站、团购网站(亚马逊、天猫、淘宝等)上发表的评论分成正面评论和负面评论两类。 - -情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如,研究人员分析了几个关于消费者信心和政治观点的调查,结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。 - -另一方面,抓取产品的用户评论并分析他们的情感,有助于理解用户对不同公司,不同产品,甚至不同竞争对手产品的偏好。 - -本教程将指导您完成长期短期记忆(LSTM)网络的训练过程,以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)(有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf))的句子的情感 。 此数据集包含电影评论及其相关联的类别标签,即正面和负面。 - -## 数椐准备 - -### IMDB 数椐介绍 - -训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本,它不仅能够处理IMDB数据,还能处理其他用户自定义的数据。 为了使用提前编写的脚本,需要将标记的训练和测试样本移动到另一个路径,这已经在`get_imdb.sh`中完成。 - -``` -cd demo/sentiment/data -./get_imdb.sh -``` -如果数椐获取成功,你将在目录```./demo/sentiment/data```中看到下面的文件: - -``` -aclImdb get_imdb.sh imdb mosesdecoder-master -``` - -* aclImdb: 从外部网站上下载的原始数椐集。 -* imdb: 仅包含训练和测试数椐集。 -* mosesdecoder-master: Moses 工具。 - -IMDB数据集包含25,000个已标注过的高极性电影评论用于训练,25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7,总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下: - -``` -imdbEr.txt imdb.vocab README test train -``` -* train: 训练数椐集。 -* test : 测试数椐集。 -* imdb.vocab: 字典文件。 -* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。 -* README: 数椐说明文档。 - -测试集和训练集目录包含下面的文件: - -``` -labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt -``` - -* pos: 正面评价样本,包含12,500个txt文件,每个文件是一个电影评论。 -* neg: 负面评价样本,包含12,500个txt文件,每个文件是一个电影评论。 -* unsup: 未标记的评价样本,包含50,000个txt文件。 -* urls_xx.txt: 每个评论的网址。 -* xxBow.feat: 用于统计词频的Bow模型特征。 - -### IMDB 数椐准备 - -在这个例子中,我们只使用已经标注过的训练集和测试集,且默认在训练集上构建字典,而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。 - -``` -cd demo/sentiment/ -./preprocess.sh -``` -preprocess.sh: - -``` -data_dir="./data/imdb" -python preprocess.py -i data_dir -``` - -* data_dir: 输入数椐所在目录。 -* preprocess.py: 预处理脚本。 - -运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下: - -``` -dict.txt labels.list test.list test_part_000 train.list train_part_000 -``` -* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集, 训练集已经随机打乱。 -* train.list and test.list: 训练集和测试集文件列表。 -* dict.txt: 利用训练集生成的字典。 -* labels.txt: neg 0, pos 1, 含义:标签0表示负面的评论,标签1表示正面的评论。 - -### 用户自定义数椐预处理 - -如果你执行其它的用情感分析来分类文本的任务,可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。 - -``` -dataset -|----train -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -|----test -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -``` -* dataset: 一级目录。 -* train, test: 二级目录。 -* class1,class2,...: 三级目录。 -* text_files: 文本格式的实例文件。 - -所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例,每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号,如果你不需要这个操作,在运行`preprocess.sh`时加上`-t False`参数即可。 - -## 训练模型 - -在这步任务中,我们使用了循环神经网络(RNN)的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元,忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息,而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内,存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。 - -
![LSTM](src/lstm.png)
-
图表 1. LSTM [3]
- -情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ,仅仅是一些关键词,如形容词和副词,在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长,例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先,它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二,它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三,它直接学习段落表示,而不是组合上下文级别信息。 - -在本演示中,我们提供两个网络,即双向LSTM和三层堆叠LSTM。 - -#### 双向LSTM - -图2是双向LSTM网络,后面连全连接层和softmax层。 - -
![BiLSTM](src/bi_lstm.jpg)
-
图 2. Bidirectional-LSTM
- -#### Stacked-LSTM -图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来,连接三个LSTM隐藏层,并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后,使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。 - -
![StackedLSTM](src/stacked_lstm.jpg)
-
图 3. Stacked-LSTM for sentiment analysis
- -**配置** - -进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。 - -trainer_config.py: - -```python -from sentiment_net import * - -data_dir = "./data/pre-imdb" -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) -dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) - -#################### Network Config ###################### -stacked_lstm_net(dict_dim, class_dim=class_dim, - stacked_num=3, is_predict=is_predict) -#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) -``` - -* **数椐定义**: - * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。 - * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档 - -* **算法配置**: - * 使用随机梯度下降(sgd)算法。 - * 使用 adam 优化。 - * 设置batch size大小为128。 - * 设置平均sgd窗口。 - * 设置全局学习率。 -* **网络配置**: - * dict_dim: 获取字典维度。 - * class_dim: 设置类别数,IMDB有两个标签,即正面评价标签和负面评价标签。 - * `stacked_lstm_net`: 预定义网络如图3所示,默认情况下使用此网络 - * `bidirectional_lstm_net`: 预定义网络,如图2所示。 - -**训练** - -首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。 - -``` -cd demo/sentiment/ -./train.sh -``` - -train.sh: - -``` -config=trainer_config.py -output=./model_output -paddle train --config=$config \ - --save_dir=$output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=20 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -* \--config=$config: 设置网络配置。 -* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。 -* \--job=train: 设置工作模式为训练。 -* \--use\_gpu=false: 使用CPU训练,如果你安装GPU版本的PaddlePaddle,并想使用GPU来训练设置为true。 -* \--trainer\_count=4:设置线程数(或GPU个数)。 -* \--num\_passes=15: 设置pass,PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。 -* \--log\_period=20: 每20个batch打印一次日志。 -* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。 -* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。 - -如果运行成功,输出日志保存在路径 `demo/sentiment/train.log`中,模型保存在目录`demo/sentiment/model_output/`中。 输出日志说明如下: - -``` -Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875 CurrentEval: classification_error_evaluator=0.36875 -... -Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922 -Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406 -``` -- Batch=xx: 表示训练了xx个Batch。 -- samples=xx: 表示训练了xx个样本。。 -- AvgCost=xx: 从第0个batch到当前batch的平均损失。 -- CurrentCost=xx: 最新log_period个batch处理的当前损失。 -- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。 -- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。 -- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。 - -默认情况下,我们使用`stacked_lstm_net`网络,当传递相同的样本数时,它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM,只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。 - -## 测试模型 - -测试模型是指使用训练出的模型评估已标记的验证集。 - -``` -cd demo/sentiment -./test.sh -``` - -test.sh: - -```bash -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' -``` - -函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中,我们默认使用IMDB的测试数据集作为验证。 与训练不同,它需要在这里指定`--job = test`和模型路径,即`--model_list = $model_list`。如果运行成功,日志将保存在“demo / sentiment / test.log”的路径中。例如,在我们的测试中,最好的模型是`model_output / pass-00002`,分类误差是0.115645,如下: - -``` -Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645 -``` - -## 预测 - -`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下: - -``` -cd demo/sentiment -./predict.sh -``` -predict.sh: - -``` -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 -``` - -* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。 -* `predict.py` : 预测接口脚本。 -* `--tconf=$config` : 设置网络配置。 -* `--model=$model` : 设置模型路径。 -* `--label=$label` : 设置标签类别字典,这个字典是整数标签和字符串标签的一个对应。 -* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。 -* `--batch_size=1` : 设置batch size。 - -注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。 - -本示例的预测结果: - -``` -Loading parameters from model_output/pass-00002/ -./data/aclImdb/test/pos/10014_7.txt: predicting label is pos -``` -我们真诚地感谢您的关注,并欢迎您来参与贡献。 - -## 参考文档 -[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010.
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.
-[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019.
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015.
diff --git a/doc/tutorials/sentiment_analysis/index_en.md b/doc/tutorials/sentiment_analysis/index_en.md deleted file mode 100644 index bb7681db44ca6f286ad6935ddfecb9becb429192..0000000000000000000000000000000000000000 --- a/doc/tutorials/sentiment_analysis/index_en.md +++ /dev/null @@ -1,328 +0,0 @@ -# Sentiment Analysis Tutorial - -Sentiment analysis has many applications. A basic task in sentiment analysis is classifying the polarity of a given text at the document, sentence or feature/aspect level. One simple example is to classify the customer reviews in a shopping website, a tourism website, and group buying websites like Amazon, TaoBao, Tmall etc. - -Sentiment analysis is also used to monitor social media based on large amount of reviews or blogs. For example, the researchers analyzed several surveys on consumer confidence and political opinion, found they correlate to sentiment word frequencies in contemporaneous Twitter messages [1]. Another example is to forecast stock movements through analyzing the text content of a daily Twitter blog [2]. - -On the other hand, grabbing the user comments of products and analyzing their sentiment are useful to understand user preferences for companies, products, even competing products. - -This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy. - -## Data Preparation - -### IMDB Data Introduction - -Before training models, we need to preprocess the data and build a dictionary. First, you can use following script to download IMDB dataset and [Moses](http://www.statmt.org/moses/) tool, which is a statistical machine translation system. We provide a data preprocessing script, which is capable of handling not only IMDB data, but also other user-defined data. In order to use the pre-written script, it needs to move labeled train and test samples to another path, which has been done in `get_imdb.sh`. - -``` -cd demo/sentiment/data -./get_imdb.sh -``` -If the data is obtained successfuly, you will see the following files at ```./demo/sentiment/data```: - -``` -aclImdb get_imdb.sh imdb mosesdecoder-master -``` - -* aclImdb: raw dataset downloaded from website. -* imdb: only contains train and test data. -* mosesdecoder-master: Moses tool. - -IMDB dataset contains 25,000 highly polar movie reviews for training, and 25,000 for testing. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. After running `./get_imdb.sh`, we can find the dataset has the following structure in `aclImdb`. - -``` -imdbEr.txt imdb.vocab README test train -``` -* train: train sets. -* test : test sets. -* imdb.vocab: dictionary. -* imdbEr.txt: expected rating for each token in imdb.vocab. -* README: data documentation. - -The file in train set directory is as follows. The test set also contains them except `unsup` and `urls_unsup.txt`. - -``` -labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt -``` - -* pos: positive samples, contains 12,500 txt files, each file is one movie review. -* neg: negative samples, contains 12,500 txt files, each file is one movie review. -* unsup: unlabeled samples, contains 50,000 txt files. -* urls_xx.txt: urls of each reviews. -* xxBow.feat: already-tokenized bag of words (BoW) features. - -### IMDB Data Preparation - -In this demo, we only use labled train and test set and not use imdb.vocab as dictionary. By default, dictionary is builded on train set. Train set is shuffled and test set is not. `tokenizer.perl` in Moses tool is used to tokenize the words and punctuation. Simply execute the following command to preprcess data. - -``` -cd demo/sentiment/ -./preprocess.sh -``` -preprocess.sh: - -``` -data_dir="./data/imdb" -python preprocess.py -i data_dir -``` - -* data_dir: input data directory. -* preprocess.py: preprocess script. - -If running successfully, you will see `demo/sentiment/data/pre-imdb` directory as follows: - -``` -dict.txt labels.list test.list test_part_000 train.list train_part_000 -``` -* test\_part\_000 and train\_part\_000: all labeled test and train sets. Train sets have be shuffled. -* train.list and test.list: train and test file lists. -* dict.txt: dictionary generated on train sets by default. -* labels.txt: neg 0, pos 1, means label 0 is negative review, label 1 is positive review. - -### User-defined Data Preparation - -If you perform other sentiment classifcation task, you can prepare data as follows. We have provided the scripts to build dictionary and preprocess data. So just organize data as follows. - -``` -dataset -|----train -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -|----test -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -``` -* dataset: 1st directory. -* train, test: 2nd directory. -* class1,class2,...: 3rd directory. -* text_files: samples with text file format. - -All samples with text files format under the same folder are same category. Each text file contains one or more samples and each line is one sample. In order to shuffle fully, the preprocessing is a little different for data with multiple lines in one text file, which needs to set `-m True` in `preprocess.sh`. And tokenizer.perl is used by default. If you don't need it, only set `-t False` in `preprocess.sh'. - -## Training - -In this task, we use Recurrent Neural Network (RNN) of LSTM architecure to train sentiment analysis model. LSTM model was introduced primarily in order to overcome the problem of vanishing gradients. LSTM network resembles a standard recurrent neural network with a hidden layer, but each ordinary node in the hidden layer is replaced by a memory cell. Each memory cell contains four main elements: an input gate, a neuron with a self-recurrent connection, a forget gate and an output gate. More details can be found in the literature [4]. The biggest advantage of the LSTM architecture is that it learns to memorize information over long time intervals without the loss of short time memory. At each time step with a new coming word, historical information stored in the memory block is updated to iteratively learn the sequence representation. - -
![LSTM](./lstm.png)
-
Figure 1. LSTM [3]
- -Sentiment analysis is among the most typical problems in natural language understanding. It aims at predicting the attitude expressed in a sequence. Usually, only some key words, like adjectives and adverbs words, play a major role in predicting the sentiment of sequences or paragraphs. However, some review or comment contexts are very long, such as IMDB dataset. We use LSTM to perform this task for its improved design with the gate mechanism. First, it is able to summarize the representation from word level to context level with variable context length which is adapted by the gate values. Second, it can utilize the expanded context at the sentence level, while most methods are good at utilizing n-gram level knowledge. Third, it learns the paragraph representation directly rather than combining the context level information. This results in this end-to-end framework. - -In this demo we provide two network, namely bidirectional-LSTM and three layers of stacked-LSTM. - -#### Bidirectional-LSTM - -One is a bidirectional LSTM network, connected by fully connected layer and softmax, as shown in Figure 2. - -
![BiLSTM](./bi_lstm.jpg)
-
Figure 2. Bidirectional-LSTM
- -#### Stacked-LSTM -Another is three-layer LSTM structure in Figure 3. The bottom of the figure is word embedding. Next, three LSTM-Hidden layers are connected and the second LSTM is reversed. Then extract the maximum hidden vectors of all time step of hidden and LSTM layer as the representation for the entire sequence. Finally, a fully connected feed forward layer with softmax activation is used to perform the classification task. This network is refered to paper [5]. - -
![StackedLSTM](./stacked_lstm.jpg)
-
Figure 3. Stacked-LSTM for sentiment analysis
- -**Config** - -Switch into `demo/sentiment` directory, `trainer_config.py` file is an example of the config, containing algorithm and newtork configure. The first line imports predefined networks from `sentiment_net.py`. - -trainer_config.py: - -```python -from sentiment_net import * - -data_dir = "./data/pre-imdb" -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) -dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - average_window=0.5, - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) - -#################### Network Config ###################### -stacked_lstm_net(dict_dim, class_dim=class_dim, - stacked_num=3, is_predict=is_predict) -#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) -``` - -* **Data Definition**: - * get\_config\_arg(): get arguments setted by `--config_args=xx` in commandline argument. - * Define data provider, here using Python interface to load data. For details, you can refer to the document of PyDataProvider2. - -* **Algorithm Configuration**: - * set batch size of 128. - * set global learning rate. - * use adam optimization. - * set average sgd window. - * set L2 regularization. - * set gradient clipping threshold. -* **Network Configuration**: - * dict_dim: dictionary dimension. - * class_dim: category number, IMDB has two label, namely positive and negative label. - * `stacked_lstm_net`: predefined network as shown in Figure 3, use this network by default. - * `bidirectional_lstm_net`: predefined network as shown in Figure 2. - -**Training** - -Install PaddlePaddle first if necessary. Then you can use script `train.sh` as follows to launch local training. - -``` -cd demo/sentiment/ -./train.sh -``` - -train.sh: - -``` -config=trainer_config.py -output=./model_output -paddle train --config=$config \ - --save_dir=$output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=20 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -* \--config=$config: set network config. -* \--save\_dir=$output: set output path to save models. -* \--job=train: set job mode to train. -* \--use\_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train. -* \--trainer\_count=4: set thread number (or GPU count). -* \--num\_passes=15: set pass number, one pass in PaddlePaddle means training all samples in dataset one time. -* \--log\_period=20: print log every 20 batches. -* \--show\_parameter\_stats\_period=100: show parameter statistic every 100 batches. -* \--test\_all_data\_in\_one\_period=1: test all data every testing. - -If the run succeeds, the output log is saved in path of `demo/sentiment/train.log` and model is saved in path of `demo/sentiment/model_output/`. The output log is explained as follows. - -``` -Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875 CurrentEval: classification_error_evaluator=0.36875 -... -Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922 -Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406 -``` -- Batch=xx: means passing xx batches. -- samples=xx: means passing xx samples. -- AvgCost=xx: averaged cost from 0-th batch to current batch. -- CurrentCost=xx: current cost of latest log_period batches. -- Eval: classification\_error\_evaluator=xx: means classfication error from 0-th batch ro current batch. -- CurrentEval: classification\_error\_evaluator: current classfication error of the lates log_period batches. -- Pass=0: Going through all training set one time is called one pass. 0 means going through training set first time. - -By default, we use the `stacked_lstm_net` network, which converges at a faster rate than `bidirectional_lstm_net` when passing same sample number. If you want to use bidirectional LSTM, just remove comment in the last line and comment `stacked_lstm_net`. - -## Testing - -Testing means evaluating the labeled validation set using trained model. - -``` -cd demo/sentiment -./test.sh -``` - -test.sh: - -```bash -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' -``` - -The function `get_best_pass` gets the best model by classification error rate for testing. In this example, We use test dataset of IMDB as validation by default. Unlike training, it needs to specify `--job=test` and model path, namely `--model_list=$model_list` here. If running successfully, the log is saved in path of `demo/sentiment/test.log`. For example, in our test, the best model is `model_output/pass-00002`, the classification error is 0.115645 as follows. - -``` -Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645 -``` - -## Prediction - -`predict.py` provides a predicting interface. You should install python api of PaddlePaddle before using it. One example to predict unlabeled review of IMDB is as follows. Simply running: - -``` -cd demo/sentiment -./predict.sh -``` -predict.sh: - -``` -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 -``` - -* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample. -* `predict.py` : predicting interface. -* `--tconf=$config` : set network configure. -* ` --model=$model` : set model path. -* `--label=$label` : set dictionary about corresponding relation between integer label and string label. -* `--dict=data/pre-imdb/dict.txt` : set dictionary. -* `--batch_size=1` : set batch size. - -Note you should make sure the default model path `model_output/pass-00002` -exists or change the model path. - -Predicting result of this example: - -``` -Loading parameters from model_output/pass-00002/ -./data/aclImdb/test/pos/10014_7.txt: predicting label is pos -``` -We sincerely appreciate your interest and welcome your contributions. - -## Reference -[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010.
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.
-[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019.
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015.
diff --git a/doc/tutorials/sentiment_analysis/lstm.png b/doc/tutorials/sentiment_analysis/lstm.png deleted file mode 100644 index aaf1fc690da2ffb8418cde5ed81848ddb5263030..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/lstm.png and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg deleted file mode 100644 index adec1606d64d6e35ffe7e62abfa9a09309b05c84..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/src/lstm.png b/doc/tutorials/sentiment_analysis/src/lstm.png deleted file mode 100644 index aaf1fc690da2ffb8418cde5ed81848ddb5263030..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/src/lstm.png and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg deleted file mode 100644 index 4239055050966e0095e188a8c81d860711bce29d..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg and /dev/null differ diff --git a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg deleted file mode 100644 index 4239055050966e0095e188a8c81d860711bce29d..0000000000000000000000000000000000000000 Binary files a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg and /dev/null differ diff --git a/doc/tutorials/text_generation/index_cn.md b/doc/tutorials/text_generation/index_cn.md deleted file mode 100644 index 41a87b926db399d692d677e5278e7d5a0b7b5594..0000000000000000000000000000000000000000 --- a/doc/tutorials/text_generation/index_cn.md +++ /dev/null @@ -1,339 +0,0 @@ -# 文本生成教程 # - -在语言生成领域中,“序列到序列”(sequence to sequence)的方法已被证明是一种强大的模型。它可以被应用于进行机器翻译(machine translation)、query改写(query rewriting)、图像描述(image captioning)等等。 - -本篇教程将会指导你通过训练一个“序列到序列”的神经网络机器翻译(NMT)模型来将法语翻译成英语。 - -我们遵循 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) 这篇文章,其中详细说明了模型架构,以及在WMT-14数据集上得到良好表现的训练过程。本篇教程在PaddlePaddle中重现了这一良好的训练结果。 - -我们感谢@caoying的pull request,其中定义了模型架构和solver配置。 - -## 数据准备 ## -### 下载与解压缩 ### -从该链接 [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/) 下载WMT-14数据集,然后解压,并将Develop和Test数据分别放入不同的文件夹。 - -- **Train data**: [bitexts (选择过后的)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) -- **Develop and Test data**: [dev 与 test 数据](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) - -在Linux下,只需要简单地运行以下命令。否则你需要自己下载、解压、拆分到不同文件夹、并且分别重命名文件后缀。 - -```bash -cd demo/seqToseq/data -./wmt14_data.sh -``` - -我们会发现数据集 `wmt14` 中包含如下表所示的3个文件夹。 - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
folder nameFrench-English parallel corpora filenumber of total filesize
train_dataccb2_pc30.src, ccb2_pc30.trg, etc123.55G
test_datantst1213.src, ntst1213.trg21636k
gen_datantst14.src, ntst14.trg2864k
-
- -- 每个文件夹都包含法语到英语的平行语料库 -- **XXX.src** 是原始法语文件;**XXX.trg** 是目标英语文件 -- **XXX.src** 和 **XXX.trg** 的行数应该一致 -- 每行都是一个法语或者英语的句子 -- **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都有着一一对应的关系 - -### 用户自定义数据集 ### - -如果你想进行诸如语义转述(Paraphrasing)等其他“序列到序列”的任务,你只需要按照如下方式组织数据,并将它们放在`demo/seqToseq/data`目录下: - - dataset - train - file1.src file1.trg - file2.src file2.trg - ...... - test - file1.src file1.trg - file2.src file2.trg - ...... - gen - file1.src file1.trg - file2.src file2.trg - ...... - -- 一级目录:数据集文件夹名称 -- 二级目录:train、test和gen这三个文件夹是固定的 -- 三级目录:源语言到目标语言的平行语料库文件 - - **XXX.src** 是源语言的文件,**XXX.trg** 时目标语言的文件 - - 文件中的每行都必须是一个句子 - - **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都必须有着一一对应的关系 - -## 数据预处理 ## -### 预处理工作流程 ### -- 将每个源语言到目标语言的平行语料库文件合并为一个文件: - - 合并每个 **XXX.src** 和 **XXX.trg** 文件为 **XXX** - - **XXX** 中的第i行 = **XXX.src** 中的第i行 + '\t' + **XXX.trg**中的第i行 -- 创建训练数据的“源字典”和“目标字典”,每个字典都有DICTSIZE个单词,包括: - - 词频最高的(DICTSIZE - 3)个单词 - - 3个特殊符号 - - ``:序列的开始 - - ``:序列的结束 - - ``:未包含在字典中的单词 - -### 预处理命令和结果 -对数据集进行预处理的基本命令是: - -```python -cd demo/seqToseq/ -python preprocess.py -i INPUT [-d DICTSIZE] [-m] -``` - -- `-i INPUT`:输入的原始数据集路径 -- `-d DICTSIZE`:指定的字典单词数,如果没有设置,字典会包含输入数据集中的所有单词 -- `-m --mergeDict`:合并 “源字典”和“目标字典”,使得两个字典有相同的上下文 - -你将会看到如下消息: - - concat parallel corpora for dataset - build source dictionary for train data - build target dictionary for train data - dictionary size is XXX - -然后你只需要运行以下命令: - -```python -python preprocess.py -i data/wmt14 -d 30000 -``` - -这将花费数分钟的时间,并且将预处理好的数据集存放在`demo/seqToseq/data/pre-wmt14`目录下。目录结构如下: - - train test gen train.list test.list gen.list src.dict trg.dict# Text generation Tutorial # - -- **train, test, gen**:分别包含了法语到英语的平行语料库的训练数据、测试数据和生成数据。文件夹中的每个文件的每一行包含两部分,首先是法语序列,然后是对应的英语序列。 -- **train.list, test.list, gen.list**:分别为train,test,gen文件夹中的文件列表 -- **src.dict, trg.dict**:源(法语)/目标(英语)字典,每个字典包含总共30000个单词:29997个最高频单词和3个特殊符号 - -## 模型训练 ## -### 简介### - -神经网络机器翻译(NMT)旨在建立一个可以被协同调至最优翻译效果的单神经元网络。近期提出的NMT模型通常都属于编解码模型(encoder–decoder models)的一种。编解码模型将一个源语句编码为一个定长的向量,然后解码器通过这个向量生成一个目标语句。 - -在这个任务中,我们使用了一个编解码模型的扩展,它同时学习排列(align)与翻译。每当模型在翻译过程中生成了一个单词,它就会在源语句中搜索出最相关信息的位置的集合。解码器根据上下文向量预测出一个目标单词,这个向量与源中搜索出的位置和所有之前生成的目标单词有关。如想了解更多详细的解释,可以参考 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473)。 - -这个模型对于编解码模型来说,最不同的特色是它并没有将输入语句编码为一个单独的定长向量。相反,它将输入语句编码为向量的序列,其中每个向量对应输入语句中的一个元素。然后在解码被翻译的语句时,会自适应地从这些向量中选择一个子集出来。这使得NMT模型得以解放出来,不必再将任意长度源语句中的所有信息压缩至一个定长的向量中。该模型在长语句翻译的场景下效果提升更加明显,在任意长度语句翻译的场景下都可以观察到其效果的提升。 -
![](./encoder-decoder-attention-model.png)
-
Figure 1. Encoder-Decoder-Attention-Model
- -### 使用PaddlePaddle训练模型 ### -我们在训练之前需要常见一个模型配置文件,这里是一个例子`demo/seqToseq/translation/train.conf`。前三行import了定义network,job_mode和attention_mode的python函数。 - -```python -from seqToseq_net import * -is_generating = False - -### Data Definiation -train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating) -``` - -1. **Data Definiation**:在示例中我们定义了一个序列到序列的训练和测试数据。它返回train_conf作为配置,其输入参数如下: - - data_dir:训练数据和测试数据的目录 - - is_generating:这个配置是否用来生成,这里设置为False -2. **Algorithm Configuration**:在示例中我们使用SGD训练算法(默认),和ADAM学习方法,指定batch_size为50,learning_rate为5e-4 -3. **Network Architecture**:在示例中我们使用attention版本的GRU编解码网络。它包括了一个双向的GRU作为编码器和解码器,它模拟了解码翻译过程中在源语句中的搜索。 - -### 训练模型的命令与结果### -写完模型配置之后,我们可以通过以下命令来训练模型: - -```bash -cd demo/seqToseq/translation -./train.sh -``` - -`train.sh` 的内容如下所示: - -```bash -paddle train \ ---config='translation/train.conf' \ ---save_dir='translation/model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'translation/train.log' -``` -- config: 设置神经网络的配置文件 -- save_dir: 设置保存模型的输出路径 -- use_gpu: 是否使用GPU训练,这里设置为使用CPU -- num_passes: 设置passes的数量。paddle中的一条pass表示训练数据集中所有的样本一次 -- show_parameter_stats_period: 这里每隔100个batch显示一次参数统计信息 -- trainer_count: 设置CPU线程数或者GPU设备数 -- log_period: 这里每隔10个batch打印一次日志 -- dot_period: 这里每个5个batch打印一个点"." - -训练的损失函数默认每隔10个batch打印一次,你将会看到如下消息: - - I0719 19:16:45.952062 15563 TrainerInternal.cpp:160] Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155 CurrentEval: classification_error_evaluator=0.737155 - I0719 19:17:56.707319 15563 TrainerInternal.cpp:160] Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392 CurrentEval: classification_error_evaluator=0.659065 - ..... -- AvgCost:从第0个batch到当前batch的平均cost -- CurrentCost::当前batch的cost -- classification\_error\_evaluator(Eval):从第0个评估到当前评估中,每个单词的预测错误率 -- classification\_error\_evaluator(CurrentEval):当前评估中,每个单词的预测错误率 - -当classification\_error\_evaluator的值低于0.35时,模型就训练成功了。 - -## 文本生成 ## -### 简介### - -一般而言,NMT模型受制于源语句的编码,并且通过给出当前目标单词来预测下一个目标单词。在训练过程中,当前单词在相比之下总是被当作真值(ground truth)。在生成过程中,当前单词是解码器最后一步的输出,这来自于PaddlePaddle的内存中。 - -而且,我们使用集束搜索(Beam Search)来生成序列。集束搜索使用广度优先搜索来构建搜索树。对于树的每一层,生成当前层的所有后继状态,并将它们按照启发代价(heuristic cost)升序排列。但是这种方法在每层只保存预设数量的最优状态(这个数量称为beam size)。 - -### 预训练的模型 ### -我们在拥有50个节点的集群中训练模型,每个节点有两个6核CPU。我们在5天里训练了16个pass,其中每条pass花费了7个小时。model_dir中有16个子目录,每个里面都包含202MB的全部的模型参数。然后我们发现pass-00012的模型有着最高的BLEU值27.77(参考文献[BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf))。要下载解压这个模型,只需在linux下运行如下命令: - -```bash -cd demo/seqToseq/data -./wmt14_model.sh -``` - -### 使用PaddlePaddle生成模型 ### -在翻译法语句子之前,我们需要创建模型配置文件。这里是一个例子`demo/seqToseq/translation/gen.conf`。前三行import了定义network,job_mode和attention_mode的python函数。 - -```python -from seqToseq_net import * -is_generating = True - -################## Data Definiation ##################### -gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating, - gen_result = "./translation/gen_result") - -############## Algorithm Configuration ################## -settings( - learning_method = AdamOptimizer(), - batch_size = 1, - learning_rate = 0) - -################# Network configure ##################### -gru_encoder_decoder(gen_conf, is_generating) -``` - -1. **Data Definiation**:在示例中我们定义了一个序列到序列的生成数据。它返回gen_conf作为配置,其输入参数如下: - - data_dir:生成数据的目录 -  - is_generating:这个配置是否用来生成,这里设置为True -  - gen_result:保存生成结果的文件 -2. **Algorithm Configuration**:在生成过程中我们使用SGD训练算法,并指定batch_size为1(每次生成1个序列),learning_rate为0 -3. **Network Architecture**:本质上与训练模型一样 - -### 生成模型的命令与结果 ### -写完模型配置之后,我们可以通过以下命令来进行从法语到英语的文本翻译: - -```bash -cd demo/seqToseq/translation -./gen.sh -``` - - `gen.sh` 的内容如下所示。与训练模型不同的是,这里有一些不同的参数需要指定: - -```bash -paddle train \ ---job=test \ ---config='translation/gen.conf' \ ---save_dir='data/wmt14_model' \ ---use_gpu=true \ ---num_passes=13 \ ---test_pass=12 \ ---trainer_count=1 \ -2>&1 | tee 'translation/gen.log' -``` -- job:设置任务的模式为测试 -- save_dir:存储模型的路径 -- num_passes and test_pass:从test_pass到(num_passes - 1)加载模型参数,这里只加载 `data/wmt14_model/pass-00012` - -你将会看到这样的消息: - - I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012 - I0706 14:48:40.012039 31441 Tester.cpp:125] Batch=100 samples=100 AvgCost=0 - I0706 14:48:48.898632 31441 Tester.cpp:125] Batch=200 samples=200 AvgCost=0 - ... - -然后在`demo/seqToseq/translation/gen_result`中的生成结果如下所示: - - 0 - 0 -11.1314 The about the width of the seats while large controls are at stake - 1 -11.1519 The on the width of the seats while large controls are at stake - 2 -11.5988 The about the width of the seats while large controls are at stake . - - 1 - 0 -24.4149 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of the Dubai . - 1 -26.9524 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s . - 2 -27.9574 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s Dubai . - ... - -- 这是集束搜索的结果,其中beam size是3 -- 第一行的“0”和第6行的“1”表示生成数据的序列id -- 其他六行列出了集束搜索的结果 - - 第二列是集束搜索的得分(从大到小) - - 第三列是生成的英语序列 -- 有两个特殊标识: - - ``:序列的结尾 - - ``:不包含在字典中的单词 - -### BLEU评估 ### -对机器翻译的人工评估工作很广泛但也很昂贵。一篇论文 [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) 展示了一种方法,当需要快速或者频繁的评估时,使用自动的替补来替代经验丰富的人工评判。[Moses](http://www.statmt.org/moses/) 是一个统计学的机器翻译系统,我们使用其中的 [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) 来做BLEU评估。运行以下命令来下载这个脚本: - -```bash -cd demo/seqToseq/translation -./moses_bleu.sh -``` - -由于标准的翻译结果已经下载到这里`data/wmt14/gen/ntst14.trg`,我们可以运行以下命令来做BLEU评估。 - -```bash -cd demo/seqToseq/translation -./eval_bleu.sh FILE BEAMSIZE -``` - -- FILE:生成的结果文件 -- BEAMSIZE:集束搜索中的扩展广度 diff --git a/doc/tutorials/text_generation/index_en.md b/doc/tutorials/text_generation/index_en.md deleted file mode 100644 index 5d8e667c20bd1fda64a6e11a88517d52112b72fa..0000000000000000000000000000000000000000 --- a/doc/tutorials/text_generation/index_en.md +++ /dev/null @@ -1,338 +0,0 @@ -# Text generation Tutorial # - -Sequence to sequence has been proven to be a powerful model for language generation. It can be used for machine translation, query rewriting, image captioning, etc. - -This tutorial guides you through training a sequence to sequence model for neural machine translation (NMT) network that translates French to English. - -We follow the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) , which details the model architecture and training procedure for good performance on WMT-14 dataset. This tutorial reproduces this result in PaddlePaddle. - -We thank @caoying for the pull request that defines the model architecture and solver configurations. - -## Data Preparation ## -### Download and Extract ### -Download the WMT-14 dataset from [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), extract it, and divide Develop and Test data into separate folder. - -- **Train data**: [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) -- **Develop and Test data**: [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) - -To do this, simply run the following commands in linux, otherwise, you need to download, extract, divide, and rename the file suffix respectively. - -```bash -cd demo/seqToseq/data -./wmt14_data.sh -``` - -We should find that the dataset `wmt14` has three folders as shown in the following table. - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
folder nameFrench-English parallel corpora filenumber of total filesize
train_dataccb2_pc30.src, ccb2_pc30.trg, etctwelve3.55G
test_datantst1213.src, ntst1213.trgtwo1636k
gen_datantst14.src, ntst14.trgtwo864k
-
- -- Each folder has French-English parallel corpora -- **XXX.src** are source French files; **XXX.trg** are target English files. -- The number of lines of **XXX.src** and **XXX.trg** should be the same. -- Each line is a French/English sentence. -- There is a one-to-one correspondence between the sentence at the i-th line of **XXX.src** and **XXX.trg**. - -### User Defined Dataset ### - -If you need to do other sequence-to-sequence tasks, such as Paraphrasing, you only need to organize the data as follows, and place them in `demo/seqToseq/data`: - - dataset - train - file1.src file1.trg - file2.src file2.trg - ...... - test - file1.src file1.trg - file2.src file2.trg - ...... - gen - file1.src file1.trg - file2.src file2.trg - ...... -- 1st directory: dataset folder name -- 2nd directory: folder of train, test, and gen. The names of these three folders are fixed. -- 3rd file: Source-Target parallel corpora files. - - **XXX.src** are source files, **XXX.trg** are target files. - - Each line of the file must be a sequence. - - There should be a one-to-one correspondence between the i-th sequence of **XXX.src** and **XXX.trg**. - -## Data Preprocess ## -### Preprocessing Workflow ### -- Concat each Source-Target parallel corpora to be one file: - - concat each **XXX.src** and **XXX.trg** to be **XXX**. - - the i-th line of **XXX** = the i-th line of **XXX.src** + '\t' + the i-th line of **XXX.trg** -- Build source and target dictionary of train data, each dictionary has DICTSIZE words: - - the most frequent (DICTSIZE-3) words - - 3 special token: - - ``: the start of a sequence - - ``: the end of a sequence - - ``: a word not included in dictionary - -### Preprocessing Command and Result -The general command for preprocessing the dataset is: - -```python -cd demo/seqToseq/ -python preprocess.py -i INPUT [-d DICTSIZE] [-m] -``` - -- `-i INPUT`: the path of input original dataset -- `-d DICTSIZE`: the specified word count of dictionary, if not set, dictionary will contain all the words in input dataset -- `-m --mergeDict`: merge source and target dictionary, thus, two dictionaries have the same context - -And you will see messages like this: - - concat parallel corpora for dataset - build source dictionary for train data - build target dictionary for train data - dictionary size is XXX - -Here, you can simply run the command: - -```python -python preprocess.py -i data/wmt14 -d 30000 -``` - -It will take several minutes, and store the preprocessed dataset in `demo/seqToseq/data/pre-wmt14`, the directory has following structure. - - train test gen train.list test.list gen.list src.dict trg.dict - -- **train, test, gen**: folder contains French-English parallel corpora of train data, test data and gen data respectively. Each line of file in folder contains two parts, the former is a French sequence, and the latter is a corresponding English sequence. -- **train.list, test.list, gen.list**: text contains a file list in train folder, test folder and gen folder respectively -- **src.dict, trg.dict**: source (French) / target (English) dictionary, each dictionary has 30000 words: the most frequent 29997 words and 3 special token - -## Model Training ## -### Introduction ### - -Neural machine translation (NMT) aims at building a single neural network that can be jointly tuned to maximize translation performance. Recently proposed NMT models often belong to a family of encoder–decoder models. Encoder-Decoder models encode a source sentence into a fixed-length vector from which a decoder generates a target sentence. - -In this task, we use an extension to the encoder–decoder model which learns to align and translate jointly. Each time the model generates a word in a translation, it searches for a set of positions in the source sentence for the most relevant information. The decoder predicts a target word based on the context vectors associated with these source positions and all the previous generated target words. For more detailed explanation, readers can refer to paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473). - -The most distinguishing feature of this model is that it doesn't encode an input sentence into a single fixed-length vector. Instead, it encodes the input sentence into a sequence of vectors, where one vector corresponds to an input element. A subset of these vectors is chosen adaptively while decoding the translated sentence. This frees a NMT model from having to squash all the information of a source sentence, regardless of its length, into a fixed-length vector. The improvement of this model is more apparent for longer sentences, but the improvement can be observed for sentences of any length. -
![](./encoder-decoder-attention-model.png)
-
Figure 1. Encoder-Decoder-Attention-Model
- -### Training Model in PaddlePaddle ### -We need to create a model config file before training. Here is an example `demo/seqToseq/translation/train.conf`. The first three lines import python function for defining network, and define the job_mode and attention_mode. - -```python -from seqToseq_net import * -is_generating = False - -### Data Definiation -train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating) -``` - -1. **Data Definiation**: We define a SeqToSeq train and test data in our example. It returns train_conf as the configuration, following is its input arguments: - - data_dir: directory of train data and test data - - is\_generating: whether this config is used for generating, here is false -2. **Algorithm Configuration**: We use the SGD training algorithm (default), ADAM learning method in our example, specify batch_size as 50, and learning rate as 5e-4. -3. **Network Architecture**: We use an attention version of GRU Encoder-Decoder network in our example. It consists a bidirectional GRU as an encoder and a decoder that emulates searching through a source sentence during decoding a translation. - -### Training Command and Result### -After writing the model config, we can train the model by running the command: - -```bash -cd demo/seqToseq/translation -./train.sh -``` - -The `train.sh` is shown as follows: - -```bash -paddle train \ ---config='translation/train.conf' \ ---save_dir='translation/model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'translation/train.log' -``` -- config: set config of neural network -- save_dir: set output path to save models -- use_gpu: whether to use GPU to train, here use CPU -- num_passes: set number of passes. One pass in paddle means training all samples in dataset one time -- show_parameter_stats_period: here show parameter statistic every 100 batches -- trainer_count: set number of CPU threads or GPU devices -- log_period: here print log every 10 batches -- dot_period: here print '.' every 5 batches - -The training loss function is printed every 10 batch by default, and you will see messages like this: - - I0719 19:16:45.952062 15563 TrainerInternal.cpp:160] Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155 CurrentEval: classification_error_evaluator=0.737155 - I0719 19:17:56.707319 15563 TrainerInternal.cpp:160] Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392 CurrentEval: classification_error_evaluator=0.659065 - ..... -- AvgCost: Average Cost from 0th batch to current batch -- CurrentCost: Cost in current batch -- classification\_error\_evaluator(Eval): False prediction rate for each word from 0th evaluation to current evaluation -- classification\_error\_evaluator(CurrentEval): False prediction rate for each word in current evaluation - -And when the classification\_error\_evaluator is less than 0.35, the model is trained sucessfully. - -## Text Generation ## -### Introduction ### - -Generally speaking, the NMT model is conditioned on the encodings of the source sentence, and then to predict the next target word by given the current target word. In the training process, the current word is always knowns as the ground truth, by contrast. In the generating process, the current word is the output of the decoder in last time step, which is accessed to from a memory in PaddlePaddle. - -Besides, we use Beam Search to generate sequences. Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level, sorting them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level (called the beam size). - -### Pretrained model ### -We trained the model on a cluster with 50 nodes, each node has two 6-core CPUs. We trained 16 passes in 5 days, where each pass takes 7 hours. The model_dir has 16 sub-folder, each of which contains the whole model parameters with 202MB size. And we find pass-00012 model has the highest BLEU 27.77 (see paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)). To download and extract this model, simply run the following commands in linux. - -```bash -cd demo/seqToseq/data -./wmt14_model.sh -``` - -### Generating Model in PaddlePaddle ### -We need to create a model config file before translating French sequence. Here is an example `demo/seqToseq/translation/gen.conf`, the first three lines import python function for defining network, and define the job\_mode and attention\_mode. - -```python -from seqToseq_net import * -is_generating = True - -################## Data Definiation ##################### -gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating, - gen_result = "./translation/gen_result") - -############## Algorithm Configuration ################## -settings( - learning_method = AdamOptimizer(), - batch_size = 1, - learning_rate = 0) - -################# Network configure ##################### -gru_encoder_decoder(gen_conf, is_generating) -``` - -1. **Data Definiation**: We defines an SeqToSeq gen data in our example. It returns gen_conf as the configuration, following is its input arguments: - - data\_dir: directory of gen data -   - is\_generating: whether this config is used for generating, here is true -   - gen\_result: file to store the generation result -2. **Algorithm Configuration**: We use SGD traing algorithm in generation, and specify batch_size as 1 (each time generate one sequence), and learning rate as 0. -3. **Network Architecture**: Essentially the same as the training model. - -### Generating Command and Result ### -After writing the model config, we can do text translation from French to English by running the command: - -```bash -cd demo/seqToseq/translation -./gen.sh -``` - -The `gen.sh` is shown as follows, unlike training, there are some different arguments to specify: - -```bash -paddle train \ ---job=test \ ---config='translation/gen.conf' \ ---save_dir='data/wmt14_model' \ ---use_gpu=true \ ---num_passes=13 \ ---test_pass=12 \ ---trainer_count=1 \ -2>&1 | tee 'translation/gen.log' -``` -- job: set job mode to test -- save_dir: the path of saved models -- num_passes and test_pass: loading model parameters from test_pass to (num_passes - 1), here only loads `data/wmt14_model/pass-00012` - -You will see messages like this: - - I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012 - I0706 14:48:40.012039 31441 Tester.cpp:125] Batch=100 samples=100 AvgCost=0 - I0706 14:48:48.898632 31441 Tester.cpp:125] Batch=200 samples=200 AvgCost=0 - ... - -And the generating result in `demo/seqToseq/translation/gen_result` likes: - - 0 - 0 -11.1314 The about the width of the seats while large controls are at stake - 1 -11.1519 The on the width of the seats while large controls are at stake - 2 -11.5988 The about the width of the seats while large controls are at stake . - - 1 - 0 -24.4149 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of the Dubai . - 1 -26.9524 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s . - 2 -27.9574 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s Dubai . - ... - -- This is the beam search result, where beam size is 3 -- '0' in 1st-line and '1' in 6th-line mean the sequence-id in gen data -- Other six lines list the beam search results - - The 2nd-column is the score of beam search (from large to small) - - The 3rd-colunm is the generating English sequence -- There is 2 special tokens: - - ``: the end of a sequence - - ``: a word not included in dictionary - -### Bleu Evalutaion ### -Human evaluations of machine translation are extensive but expensive. Paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) presents a method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations. [Moses](http://www.statmt.org/moses/) is a statistical machine translation system, and we use [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) of it to do Bleu Evalution. To download this script, simply run the following command: - -```bash -cd demo/seqToseq/translation -./moses_bleu.sh -``` - -Since the standard translation is alrealy downloaded as `data/wmt14/gen/ntst14.trg`, we can do Bleu Evalution by running the command: - -```bash -cd demo/seqToseq/translation -./eval_bleu.sh FILE BEAMSIZE -``` - -- FILE: the generation result file -- BEAMSIZE: expand width in beam search diff --git a/doc/v1_api_tutorials/README.md b/doc/v1_api_tutorials/README.md new file mode 100644 index 0000000000000000000000000000000000000000..071b8da61fbcab3e88819273008b4526546202ad --- /dev/null +++ b/doc/v1_api_tutorials/README.md @@ -0,0 +1,5 @@ +The tutorials in v1_api_tutorials are using v1_api currently, and will be upgraded to v2_api later. +Thus, v1_api_tutorials is a temporary directory. We decide not to maintain it and will delete it in future. + +Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and +[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle. diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/v1_api_tutorials/embedding_model/index_cn.md similarity index 100% rename from doc/tutorials/embedding_model/index_cn.md rename to doc/v1_api_tutorials/embedding_model/index_cn.md diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/v1_api_tutorials/embedding_model/index_en.md similarity index 100% rename from doc/tutorials/embedding_model/index_en.md rename to doc/v1_api_tutorials/embedding_model/index_en.md diff --git a/doc/tutorials/embedding_model/neural-n-gram-model.png b/doc/v1_api_tutorials/embedding_model/neural-n-gram-model.png similarity index 100% rename from doc/tutorials/embedding_model/neural-n-gram-model.png rename to doc/v1_api_tutorials/embedding_model/neural-n-gram-model.png diff --git a/doc/tutorials/gan/gan.png b/doc/v1_api_tutorials/gan/gan.png similarity index 100% rename from doc/tutorials/gan/gan.png rename to doc/v1_api_tutorials/gan/gan.png diff --git a/doc/tutorials/gan/index_en.md b/doc/v1_api_tutorials/gan/index_en.md similarity index 100% rename from doc/tutorials/gan/index_en.md rename to doc/v1_api_tutorials/gan/index_en.md diff --git a/doc/tutorials/gan/mnist_sample.png b/doc/v1_api_tutorials/gan/mnist_sample.png similarity index 100% rename from doc/tutorials/gan/mnist_sample.png rename to doc/v1_api_tutorials/gan/mnist_sample.png diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/v1_api_tutorials/gan/uniform_sample.png similarity index 100% rename from doc/tutorials/gan/uniform_sample.png rename to doc/v1_api_tutorials/gan/uniform_sample.png diff --git a/doc/tutorials/imagenet_model/resnet_block.jpg b/doc/v1_api_tutorials/imagenet_model/resnet_block.jpg similarity index 100% rename from doc/tutorials/imagenet_model/resnet_block.jpg rename to doc/v1_api_tutorials/imagenet_model/resnet_block.jpg diff --git a/doc/tutorials/imagenet_model/resnet_model_cn.md b/doc/v1_api_tutorials/imagenet_model/resnet_model_cn.md similarity index 100% rename from doc/tutorials/imagenet_model/resnet_model_cn.md rename to doc/v1_api_tutorials/imagenet_model/resnet_model_cn.md diff --git a/doc/tutorials/imagenet_model/resnet_model_en.md b/doc/v1_api_tutorials/imagenet_model/resnet_model_en.md similarity index 100% rename from doc/tutorials/imagenet_model/resnet_model_en.md rename to doc/v1_api_tutorials/imagenet_model/resnet_model_en.md diff --git a/doc/tutorials/quick_start/index_cn.rst b/doc/v1_api_tutorials/quick_start/index_cn.rst similarity index 100% rename from doc/tutorials/quick_start/index_cn.rst rename to doc/v1_api_tutorials/quick_start/index_cn.rst diff --git a/doc/tutorials/quick_start/index_en.md b/doc/v1_api_tutorials/quick_start/index_en.md similarity index 100% rename from doc/tutorials/quick_start/index_en.md rename to doc/v1_api_tutorials/quick_start/index_en.md diff --git a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetContinuous_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/NetContinuous_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/NetContinuous_cn.jpg diff --git a/doc/tutorials/quick_start/src/NetContinuous_en.png b/doc/v1_api_tutorials/quick_start/src/NetContinuous_en.png similarity index 100% rename from doc/tutorials/quick_start/src/NetContinuous_en.png rename to doc/v1_api_tutorials/quick_start/src/NetContinuous_en.png diff --git a/doc/tutorials/quick_start/src/NetConv_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetConv_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/NetConv_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/NetConv_cn.jpg diff --git a/doc/tutorials/quick_start/src/NetConv_en.png b/doc/v1_api_tutorials/quick_start/src/NetConv_en.png similarity index 100% rename from doc/tutorials/quick_start/src/NetConv_en.png rename to doc/v1_api_tutorials/quick_start/src/NetConv_en.png diff --git a/doc/tutorials/quick_start/src/NetLR_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetLR_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/NetLR_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/NetLR_cn.jpg diff --git a/doc/tutorials/quick_start/src/NetLR_en.png b/doc/v1_api_tutorials/quick_start/src/NetLR_en.png similarity index 100% rename from doc/tutorials/quick_start/src/NetLR_en.png rename to doc/v1_api_tutorials/quick_start/src/NetLR_en.png diff --git a/doc/tutorials/quick_start/src/NetRNN_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetRNN_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/NetRNN_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/NetRNN_cn.jpg diff --git a/doc/tutorials/quick_start/src/NetRNN_en.png b/doc/v1_api_tutorials/quick_start/src/NetRNN_en.png similarity index 100% rename from doc/tutorials/quick_start/src/NetRNN_en.png rename to doc/v1_api_tutorials/quick_start/src/NetRNN_en.png diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/PipelineNetwork_cn.jpg diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_en.jpg similarity index 100% rename from doc/tutorials/quick_start/src/PipelineNetwork_en.jpg rename to doc/v1_api_tutorials/quick_start/src/PipelineNetwork_en.jpg diff --git a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineTest_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/PipelineTest_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/PipelineTest_cn.jpg diff --git a/doc/tutorials/quick_start/src/PipelineTest_en.png b/doc/v1_api_tutorials/quick_start/src/PipelineTest_en.png similarity index 100% rename from doc/tutorials/quick_start/src/PipelineTest_en.png rename to doc/v1_api_tutorials/quick_start/src/PipelineTest_en.png diff --git a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineTrain_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/PipelineTrain_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/PipelineTrain_cn.jpg diff --git a/doc/tutorials/quick_start/src/PipelineTrain_en.png b/doc/v1_api_tutorials/quick_start/src/PipelineTrain_en.png similarity index 100% rename from doc/tutorials/quick_start/src/PipelineTrain_en.png rename to doc/v1_api_tutorials/quick_start/src/PipelineTrain_en.png diff --git a/doc/tutorials/quick_start/src/Pipeline_cn.jpg b/doc/v1_api_tutorials/quick_start/src/Pipeline_cn.jpg similarity index 100% rename from doc/tutorials/quick_start/src/Pipeline_cn.jpg rename to doc/v1_api_tutorials/quick_start/src/Pipeline_cn.jpg diff --git a/doc/tutorials/quick_start/src/Pipeline_en.jpg b/doc/v1_api_tutorials/quick_start/src/Pipeline_en.jpg similarity index 100% rename from doc/tutorials/quick_start/src/Pipeline_en.jpg rename to doc/v1_api_tutorials/quick_start/src/Pipeline_en.jpg diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html index 65e61c5f298e19adc6330c378779a6edf418752e..9fca69dc4e7f0827acfc755a97a662350214b90e 100644 --- a/doc_theme/templates/layout.html +++ b/doc_theme/templates/layout.html @@ -101,7 +101,7 @@